@cgmb
Created February 7, 2025 01:04
rocblas and llama.cpp performance profiling
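The bench-replay lines and the YAML profile records that follow are what rocBLAS emits when its logging layers are enabled while llama.cpp runs. As a rough sketch of how such a capture can be set up (the rocBLAS environment variables are documented; the llama-cli flags are assumptions inferred from the run logged further down):

# Sketch only: capture rocBLAS bench-replay and profile logs around a llama.cpp run.
# ROCBLAS_LAYER is a bitmask: 2 = bench logging, 4 = profile logging, 6 = both.
export ROCBLAS_LAYER=6
export ROCBLAS_LOG_BENCH_PATH=./rocblas_bench.log       # produces lines like the rocblas-bench calls below
export ROCBLAS_LOG_PROFILE_PATH=./rocblas_profile.yaml  # produces entries like the YAML records below
# Assumed llama.cpp invocation; model path, context size, and offload count are taken from the log below.
./llama-cli -m /home/cgmb/ws/Meta-Llama-3.1-8B-Instruct-Q6_K.gguf -ngl 33 -c 2048

The bench log captured from one run follows.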
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f32_r --ldc --d_type f32_r --ldd 32 --batch_count 32 --compute_type f32_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
- { rocblas_function: "rocblas_gemm_batched_ex", atomics_mode: atomics_allowed, a_type: "f16_r", b_type: "f16_r", c_type: "f16_r", d_type: "f16_r", compute_type: "f16_r", transA: 'T', transB: 'N', M: 128, N: 5, K: 32, alpha: 1.0, lda: 2048, ldb: 32, beta: 0.0, ldc: 128, ldd: 128, batch_count: 32, algo: 0, solution_index: 0, flags: pack_int, call_count: 32 }
- { rocblas_function: "rocblas_gemm_batched_ex", atomics_mode: atomics_allowed, a_type: "f16_r", b_type: "f16_r", c_type: "f32_r", d_type: "f32_r", compute_type: "f32_r", transA: 'T', transB: 'N', M: 32, N: 5, K: 128, alpha: 1.0, lda: 1024, ldb: 4096, beta: 0.0, ldc: 32, ldd: 32, batch_count: 32, algo: 0, solution_index: 0, flags: pack_int, call_count: 32 }
- { rocblas_function: "rocblas_gemm_batched_ex", atomics_mode: atomics_allowed, a_type: "f16_r", b_type: "f16_r", c_type: "f16_r", d_type: "f16_r", compute_type: "f16_r", transA: 'T', transB: 'N', M: 128, N: 2, K: 32, alpha: 1.0, lda: 2048, ldb: 32, beta: 0.0, ldc: 128, ldd: 128, batch_count: 32, algo: 0, solution_index: 0, flags: pack_int, call_count: 32 }
- { rocblas_function: "rocblas_gemm_batched_ex", atomics_mode: atomics_allowed, a_type: "f16_r", b_type: "f16_r", c_type: "f32_r", d_type: "f32_r", compute_type: "f32_r", transA: 'T', transB: 'N', M: 32, N: 2, K: 128, alpha: 1.0, lda: 1024, ldb: 4096, beta: 0.0, ldc: 32, ldd: 32, batch_count: 32, algo: 0, solution_index: 0, flags: pack_int, call_count: 32 }
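Note that the captured rocblas-bench lines above are missing the value after --ldc, so they will not run verbatim; the profile records above carry the missing leading dimension (ldc: 32 for the f32-accumulate shape, ldc: 128 for the f16 shape). A hand-replay of the first shape might look like the following sketch, assuming rocblas-bench from the same rocBLAS build:

# Sketch: replay of the M=32, N=2, K=128 batched GEMM, with ldc filled in from the
# matching profile entry above (ldc: 32). All other parameters are copied from the log.
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 \
  --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 \
  --c_type f32_r --ldc 32 --d_type f32_r --ldd 32 --batch_count 32 \
  --compute_type f32_r --algo 0 --solution_index 0 --flags 1

The llama.cpp run that generated these calls is logged below.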
ROCm calling rocblas_initialize as a workaround for a rocBLAS bug
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 ROCm devices:
Device 0: AMD Radeon RX 6800 XT, gfx1030 (0x1030), VMM: no, Wave Size: 32
build: 4621 (6eecde3c) with cc (Debian 14.2.0-12) 14.2.0 for x86_64-linux-gnu
main: llama backend init
main: load the model and apply lora adapter, if any
llama_model_load_from_file_impl: using device ROCm0 (AMD Radeon RX 6800 XT) - 16046 MiB free
llama_model_loader: loaded meta data with 33 key-value pairs and 292 tensors from /home/cgmb/ws/Meta-Llama-3.1-8B-Instruct-Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Meta Llama 3.1 8B Instruct
llama_model_loader: - kv 3: general.finetune str = Instruct
llama_model_loader: - kv 4: general.basename str = Meta-Llama-3.1
llama_model_loader: - kv 5: general.size_label str = 8B
llama_model_loader: - kv 6: general.license str = llama3.1
llama_model_loader: - kv 7: general.tags arr[str,6] = ["facebook", "meta", "pytorch", "llam...
llama_model_loader: - kv 8: general.languages arr[str,8] = ["en", "de", "fr", "it", "pt", "hi", ...
llama_model_loader: - kv 9: llama.block_count u32 = 32
llama_model_loader: - kv 10: llama.context_length u32 = 131072
llama_model_loader: - kv 11: llama.embedding_length u32 = 4096
llama_model_loader: - kv 12: llama.feed_forward_length u32 = 14336
llama_model_loader: - kv 13: llama.attention.head_count u32 = 32
llama_model_loader: - kv 14: llama.attention.head_count_kv u32 = 8
llama_model_loader: - kv 15: llama.rope.freq_base f32 = 500000.000000
llama_model_loader: - kv 16: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 17: general.file_type u32 = 18
llama_model_loader: - kv 18: llama.vocab_size u32 = 128256
llama_model_loader: - kv 19: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 20: tokenizer.ggml.model str = gpt2
llama_model_loader: - kv 21: tokenizer.ggml.pre str = llama-bpe
llama_model_loader: - kv 22: tokenizer.ggml.tokens arr[str,128256] = ["!", "\"", "#", "$", "%", "&", "'", ...
llama_model_loader: - kv 23: tokenizer.ggml.token_type arr[i32,128256] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 24: tokenizer.ggml.merges arr[str,280147] = ["Ġ Ġ", "Ġ ĠĠĠ", "ĠĠ ĠĠ", "...
llama_model_loader: - kv 25: tokenizer.ggml.bos_token_id u32 = 128000
llama_model_loader: - kv 26: tokenizer.ggml.eos_token_id u32 = 128009
llama_model_loader: - kv 27: tokenizer.chat_template str = {{- bos_token }}\n{%- if custom_tools ...
llama_model_loader: - kv 28: general.quantization_version u32 = 2
llama_model_loader: - kv 29: quantize.imatrix.file str = /models_out/Meta-Llama-3.1-8B-Instruc...
llama_model_loader: - kv 30: quantize.imatrix.dataset str = /training_dir/calibration_datav3.txt
llama_model_loader: - kv 31: quantize.imatrix.entries_count i32 = 224
llama_model_loader: - kv 32: quantize.imatrix.chunks_count i32 = 125
llama_model_loader: - type f32: 66 tensors
llama_model_loader: - type q6_K: 226 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = Q6_K
print_info: file size = 6.14 GiB (6.56 BPW)
load: special tokens cache size = 256
load: token to piece cache size = 0.7999 MB
print_info: arch = llama
print_info: vocab_only = 0
print_info: n_ctx_train = 131072
print_info: n_embd = 4096
print_info: n_layer = 32
print_info: n_head = 32
print_info: n_head_kv = 8
print_info: n_rot = 128
print_info: n_swa = 0
print_info: n_embd_head_k = 128
print_info: n_embd_head_v = 128
print_info: n_gqa = 4
print_info: n_embd_k_gqa = 1024
print_info: n_embd_v_gqa = 1024
print_info: f_norm_eps = 0.0e+00
print_info: f_norm_rms_eps = 1.0e-05
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 0.0e+00
print_info: n_ff = 14336
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: causal attn = 1
print_info: pooling type = 0
print_info: rope type = 0
print_info: rope scaling = linear
print_info: freq_base_train = 500000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 131072
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 0
print_info: ssm_d_inner = 0
print_info: ssm_d_state = 0
print_info: ssm_dt_rank = 0
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 8B
print_info: model params = 8.03 B
print_info: general.name = Meta Llama 3.1 8B Instruct
print_info: vocab type = BPE
print_info: n_vocab = 128256
print_info: n_merges = 280147
print_info: BOS token = 128000 '<|begin_of_text|>'
print_info: EOS token = 128009 '<|eot_id|>'
print_info: EOT token = 128009 '<|eot_id|>'
print_info: EOM token = 128008 '<|eom_id|>'
print_info: LF token = 198 'Ċ'
print_info: EOG token = 128008 '<|eom_id|>'
print_info: EOG token = 128009 '<|eot_id|>'
print_info: max token length = 256
load_tensors: offloading 32 repeating layers to GPU
load_tensors: offloading output layer to GPU
load_tensors: offloaded 33/33 layers to GPU
load_tensors: ROCm0 model buffer size = 5871.99 MiB
load_tensors: CPU_Mapped model buffer size = 410.98 MiB
llama_init_from_model: n_seq_max = 1
llama_init_from_model: n_ctx = 2048
llama_init_from_model: n_ctx_per_seq = 2048
llama_init_from_model: n_batch = 2048
llama_init_from_model: n_ubatch = 512
llama_init_from_model: flash_attn = 0
llama_init_from_model: freq_base = 500000.0
llama_init_from_model: freq_scale = 1
llama_init_from_model: n_ctx_per_seq (2048) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_init: kv_size = 2048, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 32, can_shift = 1
llama_kv_cache_init: ROCm0 KV buffer size = 256.00 MiB
llama_init_from_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB
llama_init_from_model: ROCm_Host output buffer size = 0.49 MiB
llama_init_from_model: ROCm0 compute buffer size = 258.50 MiB
llama_init_from_model: ROCm_Host compute buffer size = 12.01 MiB
llama_init_from_model: graph nodes = 1030
llama_init_from_model: graph splits = 2
common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
main: llama threadpool init, n_threads = 16
system_info: n_threads = 16 (n_threads_batch = 16) / 32 | ROCm : NO_VMM = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | LLAMAFILE = 1 | OPENMP = 1 | AARCH64_REPACK = 1 |
sampler seed: 2360744227
sampler params:
repeat_last_n = 64, repeat_penalty = 1.100, frequency_penalty = 0.000, presence_penalty = 0.000
dry_multiplier = 0.000, dry_base = 1.750, dry_allowed_length = 2, dry_penalty_last_n = 2048
top_k = 40, top_p = 0.950, min_p = 0.050, xtc_probability = 0.000, xtc_threshold = 0.100, typical_p = 1.000, temp = 0.700
mirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000
sampler chain: logits -> logit-bias -> penalties -> dry -> top-k -> typical -> top-p -> min-p -> xtc -> temp-ext -> dist
generate: n_ctx = 2048, n_batch = 2048, n_predict = -1, n_keep = 1
llama_perf_sampler_print: sampling time = 328.84 ms / 693 runs ( 0.47 ms per token, 2107.40 tokens per second)
llama_perf_context_print: load time = 1635.05 ms
llama_perf_context_print: prompt eval time = 33.80 ms / 5 tokens ( 6.76 ms per token, 147.93 tokens per second)
llama_perf_context_print: eval time = 12717.05 ms / 687 runs ( 18.51 ms per token, 54.02 tokens per second)
llama_perf_context_print: total time = 13167.60 ms / 692 tokens
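The remainder of the log is Tensile's debug trace for one of these batched GEMMs: code-object loads, tensor descriptors, the solution-selection search, and the kernel-argument dump. This trace is typically enabled through the TENSILE_DB environment variable; the exact bitmask used for this run is not recorded, so the value below is an assumption:

# Sketch: enable Tensile's verbose debug output around the same run. TENSILE_DB is a
# bitmask whose bits differ between rocBLAS/Tensile versions; 0xFFFF (assumed) turns on
# most categories, including solution selection and kernel arguments.
TENSILE_DB=0xFFFF ./llama-cli -m /home/cgmb/ws/Meta-Llama-3.1-8B-Instruct-Q6_K.gguf -ngl 33 -c 2048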
loaded code object /lib/x86_64-linux-gnu/rocblas/2.47.0/library/Kernels.so-000-gfx1030.hsaco
loaded code object /lib/x86_64-linux-gnu/rocblas/2.47.0/library/TensileLibrary_gfx1030.co
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
AMDGPU(matches: And(Processor(gfx90a): 0, CUCount(104): 0): 0
): 0
AMDGPU(matches: Processor(gfx1030): 1): 1
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows)
TruePred: 1
TruePred: 1
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Float == Float&& d:Float == Float): 0, HighPrecisionAccumulate(0): 0): 0
And(TypesEqual(a:Half == Half&& b:Half == Half&& c:Float == Half&& d:Float == Half): 0, HighPrecisionAccumulate(0): 0): 0
And(TypesEqual(a:Half == Half&& b:Half == Half&& c:Float == Half&& d:Float == Half): 0): 0
And(TypesEqual(a:Half == Int8&& b:Half == Int8&& c:Float == Int32&& d:Float == Int32): 0): 0
AMDGPU(matches: Processor(gfx1100): 0): 0
AMDGPU(matches: Processor(gfx1101): 0): 0
AMDGPU(matches: Processor(gfx1102): 0): 0
AMDGPU(matches: Processor(gfx803): 0): 0
AMDGPU(matches: Processor(gfx900): 0): 0
AMDGPU(matches: Processor(gfx906): 0): 0
AMDGPU(matches: Processor(gfx908): 0): 0
AMDGPU(matches: Processor(gfx90a): 0): 0
TruePred: 1
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows)
TruePred: 1
TruePred: 1
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Float == Float&& d:Float == Float): 0, HighPrecisionAccumulate(0): 0): 0
And(TypesEqual(a:Half == Double&& b:Half == Double&& c:Float == Double&& d:Float == Double): 0, HighPrecisionAccumulate(0): 0): 0
And(TypesEqual(a:Half == ComplexFloat&& b:Half == ComplexFloat&& c:Float == ComplexFloat&& d:Float == ComplexFloat): 0, HighPrecisionAccumulate(0): 0): 0
And(TypesEqual(a:Half == ComplexDouble&& b:Half == ComplexDouble&& c:Float == ComplexDouble&& d:Float == ComplexDouble): 0, HighPrecisionAccumulate(0): 0): 0
And(): 1
TruePred: 1
Object key: 32, 2, 128
Key: 32, 2, 128
Starting point: 127, 1, 63
Rightward search...
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 63: 13251 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 63: 13251 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 64: 13122 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 64: 13122 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 65: 12995 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 65: 12995 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 127, 1: 40779 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 127, 1: 40779 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 208 | 127, 127, 63: 28875 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 208 | 127, 127, 63: 28875 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(): 1
speed: 179 | 127, 127, 64: 28746 == 28746
speed: 179 | 127, 127, 64: 28746 == 28746
TruePred: 1And(): 1
speed: 213 | 127, 127, 65: 28619 == 28619
speed: 213 | 127, 127, 65: 28619 == 28619
speed: 3 | 127, 128, 1: 41030 > 28619
speed: 3 | 127, 128, 1: 41030 > 28619
speed: 3 | 127, 129, 1: 41283 > 28619
speed: 3 | 127, 129, 1: 41283 > 28619
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 63: 13442 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 63: 13442 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 64: 13313 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 64: 13313 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 65: 13186 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 65: 13186 < 28619 <-- Best distance, but no matching solution
speed: 3 | 128, 127, 1: 40970 > 28619
speed: 3 | 128, 127, 1: 40970 > 28619
speed: 3 | 128, 128, 1: 41221 > 28619
speed: 3 | 128, 128, 1: 41221 > 28619
speed: 200 | 128, 128, 63: 29317 > 28619
speed: 200 | 128, 128, 63: 29317 > 28619
speed: 176 | 128, 128, 64: 29188 > 28619
speed: 176 | 128, 128, 64: 29188 > 28619
speed: 220 | 128, 128, 65: 29061 > 28619
speed: 220 | 128, 128, 65: 29061 > 28619
speed: 3 | 128, 129, 1: 41474 > 28619
speed: 3 | 128, 129, 1: 41474 > 28619
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 63: 13635 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 63: 13635 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 64: 13506 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 64: 13506 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 65: 13379 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 65: 13379 < 28619 <-- Best distance, but no matching solution
speed: 3 | 129, 127, 1: 41163 > 28619
speed: 3 | 129, 127, 1: 41163 > 28619
speed: 3 | 129, 128, 1: 41414 > 28619
speed: 3 | 129, 128, 1: 41414 > 28619
speed: 3 | 129, 129, 1: 41667 > 28619
speed: 3 | 129, 129, 1: 41667 > 28619
speed: 176 | 129, 129, 63: 29763 > 28619
speed: 176 | 129, 129, 63: 29763 > 28619
speed: 194 | 129, 129, 64: 29634 > 28619
speed: 194 | 129, 129, 64: 29634 > 28619
speed: 218 | 129, 129, 65: 29507 > 28619
speed: 218 | 129, 129, 65: 29507 > 28619
Leftward search...
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 65: 21059 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 65: 21059 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 64: 21186 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 64: 21186 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 63: 21315 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 63: 21315 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 65: 20806 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 65: 20806 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 64: 20933 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 64: 20933 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 63: 21062 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 63: 21062 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 65: 20555 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 65: 20555 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 64: 20682 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 64: 20682 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 63: 20811 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 63: 20811 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 1, 1: 17091 < 28619 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 1, 1: 17091 < 28619 <-- Best distance, but no matching solution
Considered 100% of entries.
Solution index selected: 26093
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 00 20 88 e1 7f 00 00 (0x7fe188200000)
[24..31] batchB: 00 01 20 88 e1 7f 00 00 (0x7fe188200100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
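
The argument dump above prints each kernel argument as raw little-endian bytes followed by the decoded value in parentheses, and the launch line multiplies the workgroup shape l by the grid shape g per axis. A quick check of a few of the printed values, using only the bytes shown:

import struct

# Little-endian decoding of fields from the dump above.
assert struct.unpack('<I', bytes.fromhex('00040000'))[0] == 1024    # strideA1
assert struct.unpack('<I', bytes.fromhex('00800000'))[0] == 32768   # strideA2
assert struct.unpack('<f', bytes.fromhex('0000803f'))[0] == 1.0     # alpha as fp32 (0x3f800000)

# "l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)" is a per-axis product of
# workgroup shape and workgroup grid.
l, g = (256, 1, 1), (1, 1, 32)
print(tuple(a * b for a, b in zip(l, g)))  # (256, 1, 32)
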
AMDGPU(matches: And(Processor(gfx90a): 0, CUCount(104): 0): 0
): 0
AMDGPU(matches: Processor(gfx1030): 1): 1
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows)
TruePred: 1
TruePred: 1
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Half == Float&& d:Half == Float): 0): 0
And(): 1
TruePred: 1
Object key: 128, 2, 32
Key: 128, 2, 32
Starting point: 128, 64, 256
Rightward search...
TruePred: 1And(): 1
speed: 1.245 | 128, 64, 256: 54020 == 54020
speed: 1.245 | 128, 64, 256: 54020 == 54020
speed: 1.96 | 128, 64, 1280: 1.56135e+06 > 54020
speed: 1.96 | 128, 64, 1280: 1.56135e+06 > 54020
speed: 2.184 | 128, 64, 3328: 1.08675e+07 > 54020
speed: 2.184 | 128, 64, 3328: 1.08675e+07 > 54020
speed: 2.463 | 128, 128, 256: 66052 > 54020
speed: 2.463 | 128, 128, 256: 66052 > 54020
speed: 3.797 | 128, 128, 1280: 1.57338e+06 > 54020
speed: 3.797 | 128, 128, 1280: 1.57338e+06 > 54020
speed: 4.133 | 128, 128, 3328: 1.08795e+07 > 54020
speed: 4.133 | 128, 128, 3328: 1.08795e+07 > 54020
speed: 4.778 | 128, 256, 256: 114692 > 54020
speed: 4.778 | 128, 256, 256: 114692 > 54020
speed: 7.072 | 128, 256, 1280: 1.62202e+06 > 54020
speed: 7.072 | 128, 256, 1280: 1.62202e+06 > 54020
speed: 7.679 | 128, 256, 3328: 1.09281e+07 > 54020
speed: 7.679 | 128, 256, 3328: 1.09281e+07 > 54020
speed: 7.81 | 128, 448, 256: 249092 > 54020
speed: 7.81 | 128, 448, 256: 249092 > 54020
speed: 10.765 | 128, 448, 1280: 1.75642e+06 > 54020
speed: 10.765 | 128, 448, 1280: 1.75642e+06 > 54020
speed: 11.536 | 128, 448, 3328: 1.10625e+07 > 54020
speed: 11.536 | 128, 448, 3328: 1.10625e+07 > 54020
speed: 10.575 | 128, 512, 784: 825604 > 54020
speed: 10.575 | 128, 512, 784: 825604 > 54020
speed: 10.842 | 128, 704, 256: 542980 > 54020
speed: 10.842 | 128, 704, 256: 542980 > 54020
speed: 13.551 | 128, 704, 1280: 2.05031e+06 > 54020
speed: 13.551 | 128, 704, 1280: 2.05031e+06 > 54020
speed: 14.201 | 128, 704, 3328: 1.13564e+07 > 54020
speed: 14.201 | 128, 704, 3328: 1.13564e+07 > 54020
speed: 13.713 | 128, 1024, 256: 1.09466e+06 > 54020
speed: 13.713 | 128, 1024, 256: 1.09466e+06 > 54020
speed: 18.196 | 128, 1024, 1280: 2.60199e+06 > 54020
speed: 18.196 | 128, 1024, 1280: 2.60199e+06 > 54020
speed: 20.197 | 128, 1024, 3328: 1.19081e+07 > 54020
speed: 20.197 | 128, 1024, 3328: 1.19081e+07 > 54020
speed: 16.111 | 128, 1408, 256: 2.02701e+06 > 54020
speed: 16.111 | 128, 1408, 256: 2.02701e+06 > 54020
speed: 20.093 | 128, 1408, 1280: 3.53434e+06 > 54020
speed: 20.093 | 128, 1408, 1280: 3.53434e+06 > 54020
speed: 21.3 | 128, 1408, 3328: 1.28405e+07 > 54020
speed: 21.3 | 128, 1408, 3328: 1.28405e+07 > 54020
speed: 19.543 | 128, 1856, 256: 3.48749e+06 > 54020
speed: 19.543 | 128, 1856, 256: 3.48749e+06 > 54020
speed: 24.172 | 128, 1856, 1280: 4.99482e+06 > 54020
speed: 24.172 | 128, 1856, 1280: 4.99482e+06 > 54020
speed: 25.184 | 128, 1856, 3328: 1.43009e+07 > 54020
speed: 25.184 | 128, 1856, 3328: 1.43009e+07 > 54020
speed: 19.109 | 128, 2368, 256: 5.64813e+06 > 54020
speed: 19.109 | 128, 2368, 256: 5.64813e+06 > 54020
speed: 28.546 | 128, 2368, 1280: 7.15546e+06 > 54020
speed: 28.546 | 128, 2368, 1280: 7.15546e+06 > 54020
speed: 31.934 | 128, 2368, 3328: 1.64616e+07 > 54020
speed: 31.934 | 128, 2368, 3328: 1.64616e+07 > 54020
speed: 23.005 | 128, 2944, 256: 8.70554e+06 > 54020
speed: 23.005 | 128, 2944, 256: 8.70554e+06 > 54020
speed: 35.983 | 128, 2944, 1280: 1.02129e+07 > 54020
speed: 35.983 | 128, 2944, 1280: 1.02129e+07 > 54020
speed: 38.659 | 128, 2944, 3328: 1.9519e+07 > 54020
speed: 38.659 | 128, 2944, 3328: 1.9519e+07 > 54020
speed: 28.004 | 128, 3584, 256: 1.28809e+07 > 54020
speed: 28.004 | 128, 3584, 256: 1.28809e+07 > 54020
speed: 43.403 | 128, 3584, 1280: 1.43882e+07 > 54020
speed: 43.403 | 128, 3584, 1280: 1.43882e+07 > 54020
speed: 47.245 | 128, 3584, 3328: 2.36943e+07 > 54020
speed: 47.245 | 128, 3584, 3328: 2.36943e+07 > 54020
speed: 33.157 | 128, 4288, 256: 1.842e+07 > 54020
speed: 33.157 | 128, 4288, 256: 1.842e+07 > 54020
speed: 50.058 | 128, 4288, 1280: 1.99273e+07 > 54020
speed: 50.058 | 128, 4288, 1280: 1.99273e+07 > 54020
speed: 55.806 | 128, 4288, 3328: 2.92334e+07 > 54020
speed: 55.806 | 128, 4288, 3328: 2.92334e+07 > 54020
speed: 33.31 | 128, 5056, 256: 2.55931e+07 > 54020
speed: 33.31 | 128, 5056, 256: 2.55931e+07 > 54020
speed: 44.366 | 128, 5056, 1280: 2.71004e+07 > 54020
speed: 44.366 | 128, 5056, 1280: 2.71004e+07 > 54020
speed: 46.466 | 128, 5056, 3328: 3.64065e+07 > 54020
speed: 46.466 | 128, 5056, 3328: 3.64065e+07 > 54020
speed: 38.449 | 128, 5888, 256: 3.46952e+07 > 54020
speed: 38.449 | 128, 5888, 256: 3.46952e+07 > 54020
speed: 51.398 | 128, 5888, 1280: 3.62025e+07 > 54020
speed: 51.398 | 128, 5888, 1280: 3.62025e+07 > 54020
speed: 53.611 | 128, 5888, 3328: 4.55086e+07 > 54020
speed: 53.611 | 128, 5888, 3328: 4.55086e+07 > 54020
speed: 39.487 | 128, 6784, 256: 4.60457e+07 > 54020
speed: 39.487 | 128, 6784, 256: 4.60457e+07 > 54020
speed: 50.901 | 128, 6784, 1280: 4.7553e+07 > 54020
speed: 50.901 | 128, 6784, 1280: 4.7553e+07 > 54020
speed: 53.044 | 128, 6784, 3328: 5.68591e+07 > 54020
speed: 53.044 | 128, 6784, 3328: 5.68591e+07 > 54020
speed: 2.452 | 256, 64, 256: 70404 > 54020
speed: 2.452 | 256, 64, 256: 70404 > 54020
speed: 3.803 | 256, 64, 1280: 1.57773e+06 > 54020
speed: 3.803 | 256, 64, 1280: 1.57773e+06 > 54020
speed: 4.14 | 256, 64, 3136: 9.65504e+06 > 54020
speed: 4.14 | 256, 64, 3136: 9.65504e+06 > 54020
speed: 4.135 | 256, 64, 3328: 1.08838e+07 > 54020
speed: 4.135 | 256, 64, 3328: 1.08838e+07 > 54020
speed: 4.755 | 256, 128, 256: 82436 > 54020
speed: 4.755 | 256, 128, 256: 82436 > 54020
speed: 7.05 | 256, 128, 1280: 1.58976e+06 > 54020
speed: 7.05 | 256, 128, 1280: 1.58976e+06 > 54020
speed: 7.688 | 256, 128, 3328: 1.08959e+07 > 54020
speed: 7.688 | 256, 128, 3328: 1.08959e+07 > 54020
speed: 8.926 | 256, 256, 256: 131076 > 54020
speed: 8.926 | 256, 256, 256: 131076 > 54020
speed: 12.13 | 256, 256, 1280: 1.6384e+06 > 54020
speed: 12.13 | 256, 256, 1280: 1.6384e+06 > 54020
speed: 12.493 | 256, 256, 3328: 1.09445e+07 > 54020
speed: 12.493 | 256, 256, 3328: 1.09445e+07 > 54020
speed: 12.13 | 256, 448, 256: 265476 > 54020
speed: 12.13 | 256, 448, 256: 265476 > 54020
speed: 15.829 | 256, 448, 1280: 1.7728e+06 > 54020
speed: 15.829 | 256, 448, 1280: 1.7728e+06 > 54020
speed: 17.968 | 256, 448, 3328: 1.10789e+07 > 54020
speed: 17.968 | 256, 448, 3328: 1.10789e+07 > 54020
speed: 16.364 | 256, 704, 256: 559364 > 54020
speed: 16.364 | 256, 704, 256: 559364 > 54020
speed: 19.563 | 256, 704, 1280: 2.06669e+06 > 54020
speed: 19.563 | 256, 704, 1280: 2.06669e+06 > 54020
speed: 21.347 | 256, 704, 3328: 1.13728e+07 > 54020
speed: 21.347 | 256, 704, 3328: 1.13728e+07 > 54020
speed: 19.579 | 256, 1024, 196: 1.08776e+06 > 54020
speed: 19.579 | 256, 1024, 196: 1.08776e+06 > 54020
speed: 21.444 | 256, 1024, 256: 1.11104e+06 > 54020
speed: 21.444 | 256, 1024, 256: 1.11104e+06 > 54020
speed: 26.615 | 256, 1024, 1280: 2.61837e+06 > 54020
speed: 26.615 | 256, 1024, 1280: 2.61837e+06 > 54020
speed: 27.154 | 256, 1024, 3328: 1.19245e+07 > 54020
speed: 27.154 | 256, 1024, 3328: 1.19245e+07 > 54020
speed: 23.601 | 256, 1408, 256: 2.0434e+06 > 54020
speed: 23.601 | 256, 1408, 256: 2.0434e+06 > 54020
speed: 34.533 | 256, 1408, 1280: 3.55072e+06 > 54020
speed: 34.533 | 256, 1408, 1280: 3.55072e+06 > 54020
speed: 37.51 | 256, 1408, 3328: 1.28568e+07 > 54020
speed: 37.51 | 256, 1408, 3328: 1.28568e+07 > 54020
speed: 30.488 | 256, 1856, 256: 3.50388e+06 > 54020
speed: 30.488 | 256, 1856, 256: 3.50388e+06 > 54020
speed: 43.527 | 256, 1856, 1280: 5.0112e+06 > 54020
speed: 43.527 | 256, 1856, 1280: 5.0112e+06 > 54020
speed: 49.667 | 256, 1856, 3328: 1.43173e+07 > 54020
speed: 49.667 | 256, 1856, 3328: 1.43173e+07 > 54020
speed: 31.426 | 256, 2368, 256: 5.66452e+06 > 54020
speed: 31.426 | 256, 2368, 256: 5.66452e+06 > 54020
speed: 41.677 | 256, 2368, 1280: 7.17184e+06 > 54020
speed: 41.677 | 256, 2368, 1280: 7.17184e+06 > 54020
speed: 43.82 | 256, 2368, 3328: 1.6478e+07 > 54020
speed: 43.82 | 256, 2368, 3328: 1.6478e+07 > 54020
speed: 38.792 | 256, 2944, 256: 8.72192e+06 > 54020
speed: 38.792 | 256, 2944, 256: 8.72192e+06 > 54020
speed: 51.301 | 256, 2944, 1280: 1.02293e+07 > 54020
speed: 51.301 | 256, 2944, 1280: 1.02293e+07 > 54020
speed: 53.55 | 256, 2944, 3328: 1.95354e+07 > 54020
speed: 53.55 | 256, 2944, 3328: 1.95354e+07 > 54020
speed: 41.654 | 256, 3584, 256: 1.28973e+07 > 54020
speed: 41.654 | 256, 3584, 256: 1.28973e+07 > 54020
speed: 53.759 | 256, 3584, 1280: 1.44046e+07 > 54020
speed: 53.759 | 256, 3584, 1280: 1.44046e+07 > 54020
speed: 55.912 | 256, 3584, 3328: 2.37107e+07 > 54020
speed: 55.912 | 256, 3584, 3328: 2.37107e+07 > 54020
speed: 47.31 | 256, 4288, 256: 1.84364e+07 > 54020
speed: 47.31 | 256, 4288, 256: 1.84364e+07 > 54020
speed: 61.452 | 256, 4288, 1280: 1.99437e+07 > 54020
speed: 61.452 | 256, 4288, 1280: 1.99437e+07 > 54020
speed: 65.919 | 256, 4288, 3328: 2.92498e+07 > 54020
speed: 65.919 | 256, 4288, 3328: 2.92498e+07 > 54020
speed: 51.127 | 256, 5056, 256: 2.56095e+07 > 54020
speed: 51.127 | 256, 5056, 256: 2.56095e+07 > 54020
speed: 57.676 | 256, 5056, 1280: 2.71168e+07 > 54020
speed: 57.676 | 256, 5056, 1280: 2.71168e+07 > 54020
speed: 56.189 | 256, 5056, 3328: 3.64229e+07 > 54020
speed: 56.189 | 256, 5056, 3328: 3.64229e+07 > 54020
speed: 54.885 | 256, 5888, 256: 3.47116e+07 > 54020
speed: 54.885 | 256, 5888, 256: 3.47116e+07 > 54020
speed: 61.772 | 256, 5888, 1280: 3.62189e+07 > 54020
speed: 61.772 | 256, 5888, 1280: 3.62189e+07 > 54020
speed: 66.076 | 256, 5888, 3328: 4.5525e+07 > 54020
speed: 66.076 | 256, 5888, 3328: 4.5525e+07 > 54020
speed: 62.92 | 256, 6784, 256: 4.60621e+07 > 54020
speed: 62.92 | 256, 6784, 256: 4.60621e+07 > 54020
speed: 70.89 | 256, 6784, 1280: 4.75694e+07 > 54020
speed: 70.89 | 256, 6784, 1280: 4.75694e+07 > 54020
speed: 72.458 | 256, 6784, 3328: 5.68755e+07 > 54020
speed: 72.458 | 256, 6784, 3328: 5.68755e+07 > 54020
448, 64, 256: Stopping rightward search early.
Leftward search...
64, 6784, 3328: 5.68632e+07 > 54020
64, 6784, 3328: 5.68632e+07 > 54020
64, 6784, 1280: 4.75571e+07 > 54020
64, 6784, 1280: 4.75571e+07 > 54020
64, 6784, 256: 4.60498e+07 > 54020
64, 6784, 256: 4.60498e+07 > 54020
64, 5888, 3328: 4.55127e+07 > 54020
64, 5888, 3328: 4.55127e+07 > 54020
64, 5888, 1280: 3.62066e+07 > 54020
64, 5888, 1280: 3.62066e+07 > 54020
64, 5888, 256: 3.46993e+07 > 54020
64, 5888, 256: 3.46993e+07 > 54020
64, 5056, 3328: 3.64106e+07 > 54020
64, 5056, 3328: 3.64106e+07 > 54020
64, 5056, 1280: 2.71045e+07 > 54020
64, 5056, 1280: 2.71045e+07 > 54020
64, 5056, 256: 2.55972e+07 > 54020
64, 5056, 256: 2.55972e+07 > 54020
64, 4288, 3328: 2.92375e+07 > 54020
64, 4288, 3328: 2.92375e+07 > 54020
64, 4288, 1280: 1.99314e+07 > 54020
64, 4288, 1280: 1.99314e+07 > 54020
64, 4288, 256: 1.84241e+07 > 54020
64, 4288, 256: 1.84241e+07 > 54020
64, 3584, 3328: 2.36984e+07 > 54020
64, 3584, 3328: 2.36984e+07 > 54020
64, 3584, 1280: 1.43923e+07 > 54020
64, 3584, 1280: 1.43923e+07 > 54020
64, 3584, 256: 1.2885e+07 > 54020
64, 3584, 256: 1.2885e+07 > 54020
64, 2944, 3328: 1.95231e+07 > 54020
64, 2944, 3328: 1.95231e+07 > 54020
64, 2944, 1280: 1.0217e+07 > 54020
64, 2944, 1280: 1.0217e+07 > 54020
64, 2944, 256: 8.70964e+06 > 54020
64, 2944, 256: 8.70964e+06 > 54020
64, 2368, 3328: 1.64657e+07 > 54020
64, 2368, 3328: 1.64657e+07 > 54020
64, 2368, 1280: 7.15956e+06 > 54020
64, 2368, 1280: 7.15956e+06 > 54020
64, 2368, 256: 5.65223e+06 > 54020
64, 2368, 256: 5.65223e+06 > 54020
64, 1856, 3328: 1.4305e+07 > 54020
64, 1856, 3328: 1.4305e+07 > 54020
64, 1856, 1280: 4.99892e+06 > 54020
64, 1856, 1280: 4.99892e+06 > 54020
64, 1856, 256: 3.49159e+06 > 54020
64, 1856, 256: 3.49159e+06 > 54020
64, 1408, 3328: 1.28445e+07 > 54020
64, 1408, 3328: 1.28445e+07 > 54020
64, 1408, 1280: 3.53844e+06 > 54020
64, 1408, 1280: 3.53844e+06 > 54020
64, 1408, 256: 2.03111e+06 > 54020
64, 1408, 256: 2.03111e+06 > 54020
64, 1024, 3328: 1.19122e+07 > 54020
64, 1024, 3328: 1.19122e+07 > 54020
64, 1024, 1280: 2.60608e+06 > 54020
64, 1024, 1280: 2.60608e+06 > 54020
64, 1024, 256: 1.09876e+06 > 54020
64, 1024, 256: 1.09876e+06 > 54020
64, 704, 3328: 1.13605e+07 > 54020
64, 704, 3328: 1.13605e+07 > 54020
64, 704, 1280: 2.0544e+06 > 54020
64, 704, 1280: 2.0544e+06 > 54020
64, 704, 256: 547076 > 54020
64, 704, 256: 547076 > 54020
64, 448, 3328: 1.10666e+07 > 54020
64, 448, 3328: 1.10666e+07 > 54020
64, 448, 1280: 1.76052e+06 > 54020
64, 448, 1280: 1.76052e+06 > 54020
64, 448, 256: 253188 > 54020
64, 448, 256: 253188 > 54020
64, 256, 3328: 1.09322e+07 > 54020
64, 256, 3328: 1.09322e+07 > 54020
64, 256, 3136: 9.70343e+06 > 54020
64, 256, 3136: 9.70343e+06 > 54020
64, 256, 1280: 1.62612e+06 > 54020
64, 256, 1280: 1.62612e+06 > 54020
64, 256, 256: 118788 > 54020
64, 256, 256: 118788 > 54020
64, 128, 3328: 1.08836e+07 > 54020
64, 128, 3328: 1.08836e+07 > 54020
64, 128, 1280: 1.57748e+06 > 54020
64, 128, 1280: 1.57748e+06 > 54020
64, 128, 256: 70148 > 54020
64, 128, 256: 70148 > 54020
64, 64, 3328: 1.08716e+07 > 54020
64, 64, 3328: 1.08716e+07 > 54020
64, 64, 3136: 9.64276e+06 > 54020
64, 64, 3136: 9.64276e+06 > 54020
64, 64, 1280: 1.56544e+06 > 54020
64, 64, 1280: 1.56544e+06 > 54020
64, 64, 256: 58116 > 54020
64, 64, 256: 58116 > 54020
Considered 17.5329% of entries.
Solution index selected: 22645
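
The rightward/leftward search above is consistent with a squared-Euclidean distance in (M, N, K) between the object key 128, 2, 32 and each table entry: the starting point 128, 64, 256 scores (2-64)^2 + (32-256)^2 = 54020, the baseline that every later candidate is compared against. A short check under that assumption (an inference from the printed numbers, not a statement of Tensile's documented metric):

# Squared-Euclidean distance between the problem key and table entries,
# reproducing the values printed in the search above.
def dist2(key, entry):
    return sum((a - b) ** 2 for a, b in zip(key, entry))

key = (128, 2, 32)
print(dist2(key, (128, 64, 256)))   # 54020  (starting point)
print(dist2(key, (128, 128, 256)))  # 66052
print(dist2(key, (64, 64, 256)))    # 58116  (last leftward entry)
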
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
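
In the HB kernel's argument dump, alpha and beta are half precision and occupy two bytes each: "00 3c" is 0x3C00, the IEEE-754 half encoding of 1.0, where the HSS kernel above passed alpha as a four-byte fp32. Decoding the bytes shown:

import struct

# fp16 alpha from the HB kernel arguments: "00 3c" = 0x3C00 = 1.0 in IEEE-754 half.
print(struct.unpack('<e', bytes.fromhex('003c'))[0])  # 1.0
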
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
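
In the TensorDescriptor lines, totalLogicalElements is the product of the sizes, while totalAllocatedElem is the addressed span implied by the strides; the printed values are consistent with 1 + sum((size_i - 1) * stride_i). A check against two of the descriptors above (an observation about the log, not a quote of the Tensile source):

from math import prod

def logical_and_allocated(sizes, strides):
    logical = prod(sizes)
    allocated = 1 + sum((s - 1) * st for s, st in zip(sizes, strides))
    return logical, allocated

print(logical_and_allocated((32, 128, 32), (1, 2048, 262144)))  # (131072, 8386592)
print(logical_and_allocated((128, 2, 32), (1, 4096, 8192)))     # (8192, 258176)
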
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
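
The HB kernel arguments carry wgmRemainder1 = 1 together with magicNumberWgmRemainder1 = 2147483649 (0x80000001), which matches the usual magic-number trick for turning an integer division into a multiply and shift: magic = 2^31 // d + 1 and q = (x * magic) >> 31. For d = 1 that magic value is exactly 2^31 + 1, the number printed above. A small check under that assumption (the shift width is assumed; the log prints only the magic value):

# Magic-number division consistent with magicNumberWgmRemainder1 = 2147483649.
d = 1
magic = 2**31 // d + 1
assert magic == 2147483649
for x in (0, 7, 1000, 2**30):
    assert (x * magic) >> 31 == x // d
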
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
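Note on reading the dumps above: each kernel-argument field is printed as raw little-endian bytes with the decoded value in parentheses, and the "l(...) x g(...) = (...)" line appears to be workgroup size times grid size, i.e. the total launch dimensions (64*8 = 512 for the MT16x16x16 kernel, 256*1 = 256 for the MT64x32x8 kernel). As a quick cross-check of the decodings, here is a minimal Python sketch; it is a standalone helper written for this log, not part of rocBLAS or Tensile, and the byte strings are copied from the first pair of launches above.

import struct

# 32-bit strides/sizes are little-endian unsigned ints
assert struct.unpack("<I", bytes.fromhex("80000000"))[0] == 128      # strideD1 of the MT16x16x16 launch
assert struct.unpack("<I", bytes.fromhex("00000400"))[0] == 262144   # strideA2 of the MT16x16x16 launch

# device pointers are 64-bit little-endian values, e.g. batchA of the MT16x16x16 launch
assert struct.unpack("<Q", bytes.fromhex("00802088e17f0000"))[0] == 0x7fe188208000

# alpha of the MT16x16x16 launch is a 2-byte half-precision 1.0 ("00 3c")
assert struct.unpack("<e", bytes.fromhex("003c"))[0] == 1.0

# alpha of the MT64x32x8 launch is a 4-byte single-precision 1.0 ("00 00 80 3f")
assert struct.unpack("<f", bytes.fromhex("0000803f"))[0] == 1.0

print("argument decodings match the values printed in the log")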
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
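The launch lines multiply the workgroup (local) size by the grid to give the total dispatched work-items; the grid's z extent of 32 appears to correspond to the 32 batches seen as the third size in the tensor descriptors. A small check, added here rather than taken from the trace:

local = (64, 1, 1)   # HB kernel; the HSS kernel uses (256, 1, 1)
grid = (8, 1, 32)    # HB kernel; the HSS kernel uses (1, 1, 32)
print(tuple(l * g for l, g in zip(local, grid)))  # (512, 1, 32)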
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 4096, 8192), offset(0))totalLogicalElements=8192 totalAllocatedElem=258176
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: 40 00 00 00 (64)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: 40 00 00 00 (64)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 20 00 00 (8192)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 02 00 00 00 (2)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 2, 32), strides(1, 32, 64), offset(0))totalLogicalElements=2048 totalAllocatedElem=2048
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 2, 32), strides(1, 128, 256), offset(0))totalLogicalElements=8192 totalAllocatedElem=8192
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 20 00 00 00 00 00 00 (8192)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: 40 00 00 00 00 00 00 00 (64)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 00 01 00 00 (256)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 00 01 00 00 (256)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: 40 00 00 00 (64)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 02 00 00 00 (2)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
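The descriptors above show the batched-GEMM shape changing from n=2 to n=5 (size_1 and the B/C/D batch strides grow accordingly), which is why a new solution-selection trace (hardware match, ProblemMap lookup, distance search) follows for this shape. The new per-batch strides are simply the leading dimensions scaled by n; a quick sanity check with values copied from the descriptors above and the kernel-argument dumps below (not library code):

# Stride arithmetic for the n=5 problem; ldc/ldb are the values printed in the dumps.
n, ldc, ldb = 5, 32, 4096
assert n * ldc == 160      # strideC2 / strideD2 in the HSS dump
assert n * ldb == 20480    # strideB2 in the HSS dump
print("per-batch strides:", n * ldc, n * ldb)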
AMDGPU(matches: And(Processor(gfx90a): 0, CUCount(104): 0): 0): 0
AMDGPU(matches: Processor(gfx1030): 1): 1
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows)
TruePred: 1
TruePred: 1
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Float == Float&& d:Float == Float): 0, HighPrecisionAccumulate(0): 0): 0
And(TypesEqual(a:Half == Half&& b:Half == Half&& c:Float == Half&& d:Float == Half): 0, HighPrecisionAccumulate(0): 0): 0
And(TypesEqual(a:Half == Half&& b:Half == Half&& c:Float == Half&& d:Float == Half): 0): 0
And(TypesEqual(a:Half == Int8&& b:Half == Int8&& c:Float == Int32&& d:Float == Int32): 0): 0
AMDGPU(matches: Processor(gfx1100): 0): 0
AMDGPU(matches: Processor(gfx1101): 0): 0
AMDGPU(matches: Processor(gfx1102): 0): 0
AMDGPU(matches: Processor(gfx803): 0): 0
AMDGPU(matches: Processor(gfx900): 0): 0
AMDGPU(matches: Processor(gfx906): 0): 0
AMDGPU(matches: Processor(gfx908): 0): 0
AMDGPU(matches: Processor(gfx90a): 0): 0
TruePred: 1
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows)
TruePred: 1
TruePred: 1
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Float == Float&& d:Float == Float): 0, HighPrecisionAccumulate(0): 0): 0
And(TypesEqual(a:Half == Double&& b:Half == Double&& c:Float == Double&& d:Float == Double): 0, HighPrecisionAccumulate(0): 0): 0
And(TypesEqual(a:Half == ComplexFloat&& b:Half == ComplexFloat&& c:Float == ComplexFloat&& d:Float == ComplexFloat): 0, HighPrecisionAccumulate(0): 0): 0
And(TypesEqual(a:Half == ComplexDouble&& b:Half == ComplexDouble&& c:Float == ComplexDouble&& d:Float == ComplexDouble): 0, HighPrecisionAccumulate(0): 0): 0
And(): 1
TruePred: 1
Object key: 32, 5, 128
Key: 32, 5, 128
Starting point: 127, 1, 63
Rightward search...
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 63: 13266 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 63: 13266 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 64: 13137 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 64: 13137 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 65: 13010 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 1, 65: 13010 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 127, 1: 40038 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 127, 127, 1: 40038 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 208 | 127, 127, 63: 28134 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 208 | 127, 127, 63: 28134 < 1.79769e+308 <-- Best distance, but no matching solution
TruePred: 1And(): 1
speed: 179 | 127, 127, 64: 28005 == 28005
speed: 179 | 127, 127, 64: 28005 == 28005
TruePred: 1And(): 1
speed: 213 | 127, 127, 65: 27878 == 27878
speed: 213 | 127, 127, 65: 27878 == 27878
speed: 3 | 127, 128, 1: 40283 > 27878
speed: 3 | 127, 128, 1: 40283 > 27878
speed: 3 | 127, 129, 1: 40530 > 27878
speed: 3 | 127, 129, 1: 40530 > 27878
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 63: 13457 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 63: 13457 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 64: 13328 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 64: 13328 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 65: 13201 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 128, 1, 65: 13201 < 27878 <-- Best distance, but no matching solution
speed: 3 | 128, 127, 1: 40229 > 27878
speed: 3 | 128, 127, 1: 40229 > 27878
speed: 3 | 128, 128, 1: 40474 > 27878
speed: 3 | 128, 128, 1: 40474 > 27878
speed: 200 | 128, 128, 63: 28570 > 27878
speed: 200 | 128, 128, 63: 28570 > 27878
speed: 176 | 128, 128, 64: 28441 > 27878
speed: 176 | 128, 128, 64: 28441 > 27878
speed: 220 | 128, 128, 65: 28314 > 27878
speed: 220 | 128, 128, 65: 28314 > 27878
speed: 3 | 128, 129, 1: 40721 > 27878
speed: 3 | 128, 129, 1: 40721 > 27878
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 63: 13650 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 63: 13650 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 64: 13521 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 64: 13521 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 65: 13394 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
speed: 3 | 129, 1, 65: 13394 < 27878 <-- Best distance, but no matching solution
speed: 3 | 129, 127, 1: 40422 > 27878
speed: 3 | 129, 127, 1: 40422 > 27878
speed: 3 | 129, 128, 1: 40667 > 27878
speed: 3 | 129, 128, 1: 40667 > 27878
speed: 3 | 129, 129, 1: 40914 > 27878
speed: 3 | 129, 129, 1: 40914 > 27878
speed: 176 | 129, 129, 63: 29010 > 27878
speed: 176 | 129, 129, 63: 29010 > 27878
speed: 194 | 129, 129, 64: 28881 > 27878
speed: 194 | 129, 129, 64: 28881 > 27878
speed: 218 | 129, 129, 65: 28754 > 27878
speed: 218 | 129, 129, 65: 28754 > 27878
Leftward search...
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 65: 20306 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 65: 20306 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 64: 20433 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 64: 20433 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 63: 20562 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 129, 63: 20562 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 65: 20059 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 65: 20059 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 64: 20186 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 64: 20186 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 63: 20315 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 128, 63: 20315 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 65: 19814 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 65: 19814 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 64: 19941 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 64: 19941 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 63: 20070 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 127, 63: 20070 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 1, 1: 17106 < 27878 <-- Best distance, but no matching solution
TruePred: 1And(StridedBatched(1): 0, GlobalSplitUCheckMinK(value:768): (128 >= 768) == 0): 0
1, 1, 1: 17106 < 27878 <-- Best distance, but no matching solution
Considered 100% of entries.
Solution index selected: 26093
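The selection trace above shows the pattern: the problem is reduced to a key (here 32, 5, 128), the library walks outward through its table of tuned sizes, computes a distance to each entry, and skips entries whose predicates fail (the "Best distance, but no matching solution" lines, e.g. GlobalSplitUCheckMinK requiring K >= 768 while K is 128). The printed distances match a squared Euclidean distance over the key, e.g. (127-32)^2 + (1-5)^2 + (63-128)^2 = 13266, exactly as logged. Below is a minimal sketch of that pattern, not Tensile's actual code; the table entries are illustrative (solution index 26093 and the distances come from the trace, index 25000 is made up):

def sq_distance(key, entry):
    return sum((a - b) ** 2 for a, b in zip(key, entry))

def select_solution(key, table):
    """table: list of (entry_key, solution_index, predicate) tuples (illustrative names)."""
    best_index, best_dist = None, float("inf")   # cf. the 1.79769e+308 sentinel above
    for entry_key, index, predicate in table:
        dist = sq_distance(key, entry_key)
        if dist >= best_dist:
            continue
        if not predicate(key):
            # corresponds to "Best distance, but no matching solution" in the trace
            continue
        best_index, best_dist = index, dist
    return best_index

# The nearer entry (distance 13266) fails the min-K predicate (128 >= 768 is false),
# so the farther entry at distance 27878 is selected, as in the trace.
table = [
    ((127, 1, 63),   25000, lambda k: k[2] >= 768),  # hypothetical GSU-style entry
    ((127, 127, 65), 26093, lambda k: True),
]
print(select_solution((32, 5, 128), table))          # -> 26093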
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
AMDGPU(matches: And(Processor(gfx90a): 0, CUCount(104): 0): 0): 0
AMDGPU(matches: Processor(gfx1030): 1): 1
ProblemMap Searching for Contraction_l_Alik_Bljk_Cijk_Dijk found Problem library (1 rows)
TruePred: 1
TruePred: 1
And(TypesEqual(a:Half == Float&& b:Half == Float&& c:Half == Float&& d:Half == Float): 0): 0
And(): 1
TruePred: 1
Object key: 128, 5, 32
Key: 128, 5, 32
Starting point: 128, 64, 256
Rightward search...
TruePred: 1And(): 1
speed: 1.245 | 128, 64, 256: 53657 == 53657
speed: 1.245 | 128, 64, 256: 53657 == 53657
speed: 1.96 | 128, 64, 1280: 1.56098e+06 > 53657
speed: 1.96 | 128, 64, 1280: 1.56098e+06 > 53657
speed: 2.184 | 128, 64, 3328: 1.08671e+07 > 53657
speed: 2.184 | 128, 64, 3328: 1.08671e+07 > 53657
speed: 2.463 | 128, 128, 256: 65305 > 53657
speed: 2.463 | 128, 128, 256: 65305 > 53657
speed: 3.797 | 128, 128, 1280: 1.57263e+06 > 53657
speed: 3.797 | 128, 128, 1280: 1.57263e+06 > 53657
speed: 4.133 | 128, 128, 3328: 1.08787e+07 > 53657
speed: 4.133 | 128, 128, 3328: 1.08787e+07 > 53657
speed: 4.778 | 128, 256, 256: 113177 > 53657
speed: 4.778 | 128, 256, 256: 113177 > 53657
speed: 7.072 | 128, 256, 1280: 1.6205e+06 > 53657
speed: 7.072 | 128, 256, 1280: 1.6205e+06 > 53657
speed: 7.679 | 128, 256, 3328: 1.09266e+07 > 53657
speed: 7.679 | 128, 256, 3328: 1.09266e+07 > 53657
speed: 7.81 | 128, 448, 256: 246425 > 53657
speed: 7.81 | 128, 448, 256: 246425 > 53657
speed: 10.765 | 128, 448, 1280: 1.75375e+06 > 53657
speed: 10.765 | 128, 448, 1280: 1.75375e+06 > 53657
speed: 11.536 | 128, 448, 3328: 1.10599e+07 > 53657
speed: 11.536 | 128, 448, 3328: 1.10599e+07 > 53657
speed: 10.575 | 128, 512, 784: 822553 > 53657
speed: 10.575 | 128, 512, 784: 822553 > 53657
speed: 10.842 | 128, 704, 256: 538777 > 53657
speed: 10.842 | 128, 704, 256: 538777 > 53657
speed: 13.551 | 128, 704, 1280: 2.0461e+06 > 53657
speed: 13.551 | 128, 704, 1280: 2.0461e+06 > 53657
speed: 14.201 | 128, 704, 3328: 1.13522e+07 > 53657
speed: 14.201 | 128, 704, 3328: 1.13522e+07 > 53657
speed: 13.713 | 128, 1024, 256: 1.08854e+06 > 53657
speed: 13.713 | 128, 1024, 256: 1.08854e+06 > 53657
speed: 18.196 | 128, 1024, 1280: 2.59586e+06 > 53657
speed: 18.196 | 128, 1024, 1280: 2.59586e+06 > 53657
speed: 20.197 | 128, 1024, 3328: 1.1902e+07 > 53657
speed: 20.197 | 128, 1024, 3328: 1.1902e+07 > 53657
speed: 16.111 | 128, 1408, 256: 2.01858e+06 > 53657
speed: 16.111 | 128, 1408, 256: 2.01858e+06 > 53657
speed: 20.093 | 128, 1408, 1280: 3.52591e+06 > 53657
speed: 20.093 | 128, 1408, 1280: 3.52591e+06 > 53657
speed: 21.3 | 128, 1408, 3328: 1.2832e+07 > 53657
speed: 21.3 | 128, 1408, 3328: 1.2832e+07 > 53657
speed: 19.543 | 128, 1856, 256: 3.47638e+06 > 53657
speed: 19.543 | 128, 1856, 256: 3.47638e+06 > 53657
speed: 24.172 | 128, 1856, 1280: 4.9837e+06 > 53657
speed: 24.172 | 128, 1856, 1280: 4.9837e+06 > 53657
speed: 25.184 | 128, 1856, 3328: 1.42898e+07 > 53657
speed: 25.184 | 128, 1856, 3328: 1.42898e+07 > 53657
speed: 19.109 | 128, 2368, 256: 5.63394e+06 > 53657
speed: 19.109 | 128, 2368, 256: 5.63394e+06 > 53657
speed: 28.546 | 128, 2368, 1280: 7.14127e+06 > 53657
speed: 28.546 | 128, 2368, 1280: 7.14127e+06 > 53657
speed: 31.934 | 128, 2368, 3328: 1.64474e+07 > 53657
speed: 31.934 | 128, 2368, 3328: 1.64474e+07 > 53657
speed: 23.005 | 128, 2944, 256: 8.6879e+06 > 53657
speed: 23.005 | 128, 2944, 256: 8.6879e+06 > 53657
speed: 35.983 | 128, 2944, 1280: 1.01952e+07 > 53657
speed: 35.983 | 128, 2944, 1280: 1.01952e+07 > 53657
speed: 38.659 | 128, 2944, 3328: 1.95013e+07 > 53657
speed: 38.659 | 128, 2944, 3328: 1.95013e+07 > 53657
speed: 28.004 | 128, 3584, 256: 1.28594e+07 > 53657
speed: 28.004 | 128, 3584, 256: 1.28594e+07 > 53657
speed: 43.403 | 128, 3584, 1280: 1.43667e+07 > 53657
speed: 43.403 | 128, 3584, 1280: 1.43667e+07 > 53657
speed: 47.245 | 128, 3584, 3328: 2.36729e+07 > 53657
speed: 47.245 | 128, 3584, 3328: 2.36729e+07 > 53657
speed: 33.157 | 128, 4288, 256: 1.83943e+07 > 53657
speed: 33.157 | 128, 4288, 256: 1.83943e+07 > 53657
speed: 50.058 | 128, 4288, 1280: 1.99016e+07 > 53657
speed: 50.058 | 128, 4288, 1280: 1.99016e+07 > 53657
speed: 55.806 | 128, 4288, 3328: 2.92077e+07 > 53657
speed: 55.806 | 128, 4288, 3328: 2.92077e+07 > 53657
speed: 33.31 | 128, 5056, 256: 2.55628e+07 > 53657
speed: 33.31 | 128, 5056, 256: 2.55628e+07 > 53657
speed: 44.366 | 128, 5056, 1280: 2.70701e+07 > 53657
speed: 44.366 | 128, 5056, 1280: 2.70701e+07 > 53657
speed: 46.466 | 128, 5056, 3328: 3.63762e+07 > 53657
speed: 46.466 | 128, 5056, 3328: 3.63762e+07 > 53657
speed: 38.449 | 128, 5888, 256: 3.46599e+07 > 53657
speed: 38.449 | 128, 5888, 256: 3.46599e+07 > 53657
speed: 51.398 | 128, 5888, 1280: 3.61672e+07 > 53657
speed: 51.398 | 128, 5888, 1280: 3.61672e+07 > 53657
speed: 53.611 | 128, 5888, 3328: 4.54733e+07 > 53657
speed: 53.611 | 128, 5888, 3328: 4.54733e+07 > 53657
speed: 39.487 | 128, 6784, 256: 4.6005e+07 > 53657
speed: 39.487 | 128, 6784, 256: 4.6005e+07 > 53657
speed: 50.901 | 128, 6784, 1280: 4.75123e+07 > 53657
speed: 50.901 | 128, 6784, 1280: 4.75123e+07 > 53657
speed: 53.044 | 128, 6784, 3328: 5.68185e+07 > 53657
speed: 53.044 | 128, 6784, 3328: 5.68185e+07 > 53657
speed: 2.452 | 256, 64, 256: 70041 > 53657
speed: 2.452 | 256, 64, 256: 70041 > 53657
speed: 3.803 | 256, 64, 1280: 1.57737e+06 > 53657
speed: 3.803 | 256, 64, 1280: 1.57737e+06 > 53657
speed: 4.14 | 256, 64, 3136: 9.65468e+06 > 53657
speed: 4.14 | 256, 64, 3136: 9.65468e+06 > 53657
speed: 4.135 | 256, 64, 3328: 1.08835e+07 > 53657
speed: 4.135 | 256, 64, 3328: 1.08835e+07 > 53657
speed: 4.755 | 256, 128, 256: 81689 > 53657
speed: 4.755 | 256, 128, 256: 81689 > 53657
speed: 7.05 | 256, 128, 1280: 1.58902e+06 > 53657
speed: 7.05 | 256, 128, 1280: 1.58902e+06 > 53657
speed: 7.688 | 256, 128, 3328: 1.08951e+07 > 53657
speed: 7.688 | 256, 128, 3328: 1.08951e+07 > 53657
speed: 8.926 | 256, 256, 256: 129561 > 53657
speed: 8.926 | 256, 256, 256: 129561 > 53657
speed: 12.13 | 256, 256, 1280: 1.63689e+06 > 53657
speed: 12.13 | 256, 256, 1280: 1.63689e+06 > 53657
speed: 12.493 | 256, 256, 3328: 1.0943e+07 > 53657
speed: 12.493 | 256, 256, 3328: 1.0943e+07 > 53657
speed: 12.13 | 256, 448, 256: 262809 > 53657
speed: 12.13 | 256, 448, 256: 262809 > 53657
speed: 15.829 | 256, 448, 1280: 1.77014e+06 > 53657
speed: 15.829 | 256, 448, 1280: 1.77014e+06 > 53657
speed: 17.968 | 256, 448, 3328: 1.10762e+07 > 53657
speed: 17.968 | 256, 448, 3328: 1.10762e+07 > 53657
speed: 16.364 | 256, 704, 256: 555161 > 53657
speed: 16.364 | 256, 704, 256: 555161 > 53657
speed: 19.563 | 256, 704, 1280: 2.06249e+06 > 53657
speed: 19.563 | 256, 704, 1280: 2.06249e+06 > 53657
speed: 21.347 | 256, 704, 3328: 1.13686e+07 > 53657
speed: 21.347 | 256, 704, 3328: 1.13686e+07 > 53657
speed: 19.579 | 256, 1024, 196: 1.08164e+06 > 53657
speed: 19.579 | 256, 1024, 196: 1.08164e+06 > 53657
speed: 21.444 | 256, 1024, 256: 1.10492e+06 > 53657
speed: 21.444 | 256, 1024, 256: 1.10492e+06 > 53657
speed: 26.615 | 256, 1024, 1280: 2.61225e+06 > 53657
speed: 26.615 | 256, 1024, 1280: 2.61225e+06 > 53657
speed: 27.154 | 256, 1024, 3328: 1.19184e+07 > 53657
speed: 27.154 | 256, 1024, 3328: 1.19184e+07 > 53657
speed: 23.601 | 256, 1408, 256: 2.03497e+06 > 53657
speed: 23.601 | 256, 1408, 256: 2.03497e+06 > 53657
speed: 34.533 | 256, 1408, 1280: 3.5423e+06 > 53657
speed: 34.533 | 256, 1408, 1280: 3.5423e+06 > 53657
speed: 37.51 | 256, 1408, 3328: 1.28484e+07 > 53657
speed: 37.51 | 256, 1408, 3328: 1.28484e+07 > 53657
speed: 30.488 | 256, 1856, 256: 3.49276e+06 > 53657
speed: 30.488 | 256, 1856, 256: 3.49276e+06 > 53657
speed: 43.527 | 256, 1856, 1280: 5.00009e+06 > 53657
speed: 43.527 | 256, 1856, 1280: 5.00009e+06 > 53657
speed: 49.667 | 256, 1856, 3328: 1.43062e+07 > 53657
speed: 49.667 | 256, 1856, 3328: 1.43062e+07 > 53657
speed: 31.426 | 256, 2368, 256: 5.65033e+06 > 53657
speed: 31.426 | 256, 2368, 256: 5.65033e+06 > 53657
speed: 41.677 | 256, 2368, 1280: 7.15766e+06 > 53657
speed: 41.677 | 256, 2368, 1280: 7.15766e+06 > 53657
speed: 43.82 | 256, 2368, 3328: 1.64638e+07 > 53657
speed: 43.82 | 256, 2368, 3328: 1.64638e+07 > 53657
speed: 38.792 | 256, 2944, 256: 8.70428e+06 > 53657
speed: 38.792 | 256, 2944, 256: 8.70428e+06 > 53657
speed: 51.301 | 256, 2944, 1280: 1.02116e+07 > 53657
speed: 51.301 | 256, 2944, 1280: 1.02116e+07 > 53657
speed: 53.55 | 256, 2944, 3328: 1.95177e+07 > 53657
speed: 53.55 | 256, 2944, 3328: 1.95177e+07 > 53657
speed: 41.654 | 256, 3584, 256: 1.28758e+07 > 53657
speed: 41.654 | 256, 3584, 256: 1.28758e+07 > 53657
speed: 53.759 | 256, 3584, 1280: 1.43831e+07 > 53657
speed: 53.759 | 256, 3584, 1280: 1.43831e+07 > 53657
speed: 55.912 | 256, 3584, 3328: 2.36892e+07 > 53657
speed: 55.912 | 256, 3584, 3328: 2.36892e+07 > 53657
speed: 47.31 | 256, 4288, 256: 1.84106e+07 > 53657
speed: 47.31 | 256, 4288, 256: 1.84106e+07 > 53657
speed: 61.452 | 256, 4288, 1280: 1.9918e+07 > 53657
speed: 61.452 | 256, 4288, 1280: 1.9918e+07 > 53657
speed: 65.919 | 256, 4288, 3328: 2.92241e+07 > 53657
speed: 65.919 | 256, 4288, 3328: 2.92241e+07 > 53657
speed: 51.127 | 256, 5056, 256: 2.55792e+07 > 53657
speed: 51.127 | 256, 5056, 256: 2.55792e+07 > 53657
speed: 57.676 | 256, 5056, 1280: 2.70865e+07 > 53657
speed: 57.676 | 256, 5056, 1280: 2.70865e+07 > 53657
speed: 56.189 | 256, 5056, 3328: 3.63926e+07 > 53657
speed: 56.189 | 256, 5056, 3328: 3.63926e+07 > 53657
speed: 54.885 | 256, 5888, 256: 3.46762e+07 > 53657
speed: 54.885 | 256, 5888, 256: 3.46762e+07 > 53657
speed: 61.772 | 256, 5888, 1280: 3.61836e+07 > 53657
speed: 61.772 | 256, 5888, 1280: 3.61836e+07 > 53657
speed: 66.076 | 256, 5888, 3328: 4.54897e+07 > 53657
speed: 66.076 | 256, 5888, 3328: 4.54897e+07 > 53657
speed: 62.92 | 256, 6784, 256: 4.60214e+07 > 53657
speed: 62.92 | 256, 6784, 256: 4.60214e+07 > 53657
speed: 70.89 | 256, 6784, 1280: 4.75287e+07 > 53657
speed: 70.89 | 256, 6784, 1280: 4.75287e+07 > 53657
speed: 72.458 | 256, 6784, 3328: 5.68348e+07 > 53657
speed: 72.458 | 256, 6784, 3328: 5.68348e+07 > 53657
448, 64, 256: Stopping rightward search early.
Leftward search...
64, 6784, 3328: 5.68226e+07 > 53657
64, 6784, 3328: 5.68226e+07 > 53657
64, 6784, 1280: 4.75164e+07 > 53657
64, 6784, 1280: 4.75164e+07 > 53657
64, 6784, 256: 4.60091e+07 > 53657
64, 6784, 256: 4.60091e+07 > 53657
64, 5888, 3328: 4.54774e+07 > 53657
64, 5888, 3328: 4.54774e+07 > 53657
64, 5888, 1280: 3.61713e+07 > 53657
64, 5888, 1280: 3.61713e+07 > 53657
64, 5888, 256: 3.4664e+07 > 53657
64, 5888, 256: 3.4664e+07 > 53657
64, 5056, 3328: 3.63803e+07 > 53657
64, 5056, 3328: 3.63803e+07 > 53657
64, 5056, 1280: 2.70742e+07 > 53657
64, 5056, 1280: 2.70742e+07 > 53657
64, 5056, 256: 2.55669e+07 > 53657
64, 5056, 256: 2.55669e+07 > 53657
64, 4288, 3328: 2.92118e+07 > 53657
64, 4288, 3328: 2.92118e+07 > 53657
64, 4288, 1280: 1.99057e+07 > 53657
64, 4288, 1280: 1.99057e+07 > 53657
64, 4288, 256: 1.83984e+07 > 53657
64, 4288, 256: 1.83984e+07 > 53657
64, 3584, 3328: 2.3677e+07 > 53657
64, 3584, 3328: 2.3677e+07 > 53657
64, 3584, 1280: 1.43708e+07 > 53657
64, 3584, 1280: 1.43708e+07 > 53657
64, 3584, 256: 1.28635e+07 > 53657
64, 3584, 256: 1.28635e+07 > 53657
64, 2944, 3328: 1.95054e+07 > 53657
64, 2944, 3328: 1.95054e+07 > 53657
64, 2944, 1280: 1.01993e+07 > 53657
64, 2944, 1280: 1.01993e+07 > 53657
64, 2944, 256: 8.69199e+06 > 53657
64, 2944, 256: 8.69199e+06 > 53657
64, 2368, 3328: 1.64515e+07 > 53657
64, 2368, 3328: 1.64515e+07 > 53657
64, 2368, 1280: 7.14537e+06 > 53657
64, 2368, 1280: 7.14537e+06 > 53657
64, 2368, 256: 5.63804e+06 > 53657
64, 2368, 256: 5.63804e+06 > 53657
64, 1856, 3328: 1.42939e+07 > 53657
64, 1856, 3328: 1.42939e+07 > 53657
64, 1856, 1280: 4.9878e+06 > 53657
64, 1856, 1280: 4.9878e+06 > 53657
64, 1856, 256: 3.48047e+06 > 53657
64, 1856, 256: 3.48047e+06 > 53657
64, 1408, 3328: 1.28361e+07 > 53657
64, 1408, 3328: 1.28361e+07 > 53657
64, 1408, 1280: 3.53001e+06 > 53657
64, 1408, 1280: 3.53001e+06 > 53657
64, 1408, 256: 2.02268e+06 > 53657
64, 1408, 256: 2.02268e+06 > 53657
64, 1024, 3328: 1.19061e+07 > 53657
64, 1024, 3328: 1.19061e+07 > 53657
64, 1024, 1280: 2.59996e+06 > 53657
64, 1024, 1280: 2.59996e+06 > 53657
64, 1024, 256: 1.09263e+06 > 53657
64, 1024, 256: 1.09263e+06 > 53657
64, 704, 3328: 1.13563e+07 > 53657
64, 704, 3328: 1.13563e+07 > 53657
64, 704, 1280: 2.0502e+06 > 53657
64, 704, 1280: 2.0502e+06 > 53657
64, 704, 256: 542873 > 53657
64, 704, 256: 542873 > 53657
64, 448, 3328: 1.1064e+07 > 53657
64, 448, 3328: 1.1064e+07 > 53657
64, 448, 1280: 1.75785e+06 > 53657
64, 448, 1280: 1.75785e+06 > 53657
64, 448, 256: 250521 > 53657
64, 448, 256: 250521 > 53657
64, 256, 3328: 1.09307e+07 > 53657
64, 256, 3328: 1.09307e+07 > 53657
64, 256, 3136: 9.70191e+06 > 53657
64, 256, 3136: 9.70191e+06 > 53657
64, 256, 1280: 1.6246e+06 > 53657
64, 256, 1280: 1.6246e+06 > 53657
64, 256, 256: 117273 > 53657
64, 256, 256: 117273 > 53657
64, 128, 3328: 1.08828e+07 > 53657
64, 128, 3328: 1.08828e+07 > 53657
64, 128, 1280: 1.57673e+06 > 53657
64, 128, 1280: 1.57673e+06 > 53657
64, 128, 256: 69401 > 53657
64, 128, 256: 69401 > 53657
64, 64, 3328: 1.08712e+07 > 53657
64, 64, 3328: 1.08712e+07 > 53657
64, 64, 3136: 9.64239e+06 > 53657
64, 64, 3136: 9.64239e+06 > 53657
64, 64, 1280: 1.56508e+06 > 53657
64, 64, 1280: 1.56508e+06 > 53657
64, 64, 256: 57753 > 53657
64, 64, 256: 57753 > 53657
Considered 17.5329% of entries.
Solution index selected: 22645
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
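
Note that the two kernels pack their scalars differently: the MT64x32x8 kernel above carries a 4-byte f32 alpha ("00 00 80 3f"), while the MT16x16x16 kernel's argument block uses 2-byte half-float alpha/beta values ("00 3c"). A quick sketch (Python assumed) confirming that 0x3C00 decodes to 1.0 in IEEE half precision:

    import struct

    # "00 3c" from the MT16x16x16 dump above, little-endian 16-bit float.
    alpha, = struct.unpack("<e", bytes([0x00, 0x3C]))
    print(alpha)  # 1.0
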
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
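
The "l(...) x g(...) = (...)" lines describe the launch geometry; the products are consistent with l being the workgroup (local) size and g the workgroup count per dimension, so their per-dimension product is the global launch size. A small sketch (Python assumed) reproducing the two shapes seen in this log:

    # Workgroup size per dimension times workgroup count gives the global launch size.
    def launch(local, grid):
        return tuple(l * g for l, g in zip(local, grid))

    print(launch((256, 1, 1), (1, 1, 32)))  # (256, 1, 32)  -- MT64x32x8 kernel
    print(launch((64, 1, 1), (8, 1, 32)))   # (512, 1, 32)  -- MT16x16x16 kernel
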
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
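
Note (editor): each kernel argument above is dumped as raw little-endian bytes followed by the decoded value in parentheses, so the dumps can be decoded independently of the log: the fp16 alpha "00 3c" is 0x3C00 = 1.0 for the HB kernel, while the fp32 alpha "00 00 80 3f" is 1.0 for the HSS kernel. A small illustrative decoding sketch follows (not rocBLAS code; the function name is made up for this example).

    import struct

    def decode(hex_bytes, kind):
        # Bytes are printed little-endian, exactly as they appear in the dump
        raw = bytes(int(b, 16) for b in hex_bytes.split())
        if kind == "half":   # 2-byte IEEE fp16 (alpha/beta of the HB kernel)
            return struct.unpack("<e", raw)[0]
        if kind == "float":  # 4-byte IEEE fp32 (alpha/beta of the HSS kernel)
            return struct.unpack("<f", raw)[0]
        if kind == "u32":    # 4-byte sizes, strides and magic numbers
            return struct.unpack("<I", raw)[0]
        if kind == "u64":    # 8-byte tensor sizes and pointers
            return struct.unpack("<Q", raw)[0]
        raise ValueError(kind)

    assert decode("00 3c", "half") == 1.0                     # fp16 alpha
    assert decode("00 00 80 3f", "float") == 1.0              # fp32 alpha
    assert decode("01 00 00 80", "u32") == 2147483649         # magicNumberWgmRemainder1
    assert decode("00 50 00 00 00 00 00 00", "u64") == 20480  # tensor2dSizeC
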
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 32, 32), strides(1, 1024, 32768), offset(0))totalLogicalElements=131072 totalAllocatedElem=1047680
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 4096, 20480), offset(0))totalLogicalElements=20480 totalAllocatedElem=651392
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Float>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
Running kernel: Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
Kernel Cijk_Alik_Bljk_HSS_BH_GB_MT64x32x8_SE_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS2_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS0_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA000_IU1_K1_KLS_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR0_PLR1_SIA1_SS0_SU32_SUM0_SUS256_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT4_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS64_WG16_16_1_WGM8
l(256, 1, 1) x g(1, 1, 32) = (256, 1, 32)
[0..7] batchD: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[8..15] batchC: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[16..23] batchA: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[24..31] batchB: 00 91 20 88 e1 7f 00 00 (0x7fe188209100)
[32..39] offsetD: 00 00 00 00 00 00 00 00 (0)
[40..47] offsetC: 00 00 00 00 00 00 00 00 (0)
[48..55] offsetA: 00 00 00 00 00 00 00 00 (0)
[56..63] offsetB: 00 00 00 00 00 00 00 00 (0)
[64..67] alpha: 00 00 80 3f (1)
[68..71] beta: 00 00 00 00 (0)
[72..75] strideD1: 20 00 00 00 (32)
[76..79] strideD2: a0 00 00 00 (160)
[80..83] strideC1: 20 00 00 00 (32)
[84..87] strideC2: a0 00 00 00 (160)
[88..91] strideA1: 00 04 00 00 (1024)
[92..95] strideA2: 00 80 00 00 (32768)
[96..99] strideB1: 00 10 00 00 (4096)
[100..103] strideB2: 00 50 00 00 (20480)
[104..107] size_0: 20 00 00 00 (32)
[108..111] size_1: 05 00 00 00 (5)
[112..115] size_2: 20 00 00 00 (32)
[116..119] size_3: 80 00 00 00 (128)
[120..123] staggerUIter: 00 00 00 00 (0)
[124..127] problemNumGroupTiles0: 01 00 00 00 (1)
[128..131] problemNumGroupTiles1: 01 00 00 00 (1)
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 128, 32), strides(1, 2048, 262144), offset(0))totalLogicalElements=131072 totalAllocatedElem=8386592
TensorDescriptor:calculate 3-tensor<Half>( sizes(32, 5, 32), strides(1, 32, 160), offset(0))totalLogicalElements=5120 totalAllocatedElem=5120
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
TensorDescriptor:calculate 3-tensor<Half>( sizes(128, 5, 32), strides(1, 128, 640), offset(0))totalLogicalElements=20480 totalAllocatedElem=20480
Running kernel: Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
Kernel Cijk_Alik_Bljk_HB_GB_MT16x16x16_SN_1LDSB0_APM1_AF0EM1_AF1EM1_AMAS3_ASE_ASGT_ASAE01_ASCE01_ASEM1_BL1_BS1_DTL0_DTVA0_DVO0_ETSP_EPS1_FL0_GRVW2_GSU1_GSUAMB_GLS0_ISA1030_IU1_K1_KLA_LBSPP0_LPA0_LPB0_LDL1_LRVW2_LWPMn1_LDW0_FMA_MIAV0_MDA2_NTA0_NTB0_NTC0_NTD0_NEPBS0_NLCA1_NLCB1_ONLL1_OPLV0_PK0_PAP0_PGR1_PLR1_SIA1_SS0_SU32_SUM3_SUS128_SCIUI1_SPO0_SRVW0_SSO0_SVW4_SNLL0_TT2_2_TLDS0_USFGROn1_VAW2_VSn1_VW2_WSGRA0_WSGRB0_WS32_WG8_8_1_WGM1
l(64, 1, 1) x g(8, 1, 32) = (512, 1, 32)
[0..7] tensor2dSizeC: 00 50 00 00 00 00 00 00 (20480)
[8..15] tensor2dSizeA: 20 f8 03 00 00 00 00 00 (260128)
[16..23] tensor2dSizeB: a0 00 00 00 00 00 00 00 (160)
[24..31] batchD: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[32..39] batchC: 00 90 20 88 e1 7f 00 00 (0x7fe188209000)
[40..47] batchA: 00 80 20 88 e1 7f 00 00 (0x7fe188208000)
[48..55] batchB: 00 81 20 88 e1 7f 00 00 (0x7fe188208100)
[56..63] offsetD: 00 00 00 00 00 00 00 00 (0)
[64..71] offsetC: 00 00 00 00 00 00 00 00 (0)
[72..79] offsetA: 00 00 00 00 00 00 00 00 (0)
[80..87] offsetB: 00 00 00 00 00 00 00 00 (0)
[88..89] alpha: 00 3c (1)
[90..91] alpha_2: 00 3c (1)
[92..93] beta: 00 00 (0)
[94..95] beta_2: 00 00 (0)
[96..99] strideD1: 80 00 00 00 (128)
[100..103] strideD2: 80 02 00 00 (640)
[104..107] strideC1: 80 00 00 00 (128)
[108..111] strideC2: 80 02 00 00 (640)
[112..115] strideA1: 00 08 00 00 (2048)
[116..119] strideA2: 00 00 04 00 (262144)
[120..123] strideB1: 20 00 00 00 (32)
[124..127] strideB2: a0 00 00 00 (160)
[128..131] size_0: 80 00 00 00 (128)
[132..135] size_1: 05 00 00 00 (5)
[136..139] size_2: 20 00 00 00 (32)
[140..143] size_3: 20 00 00 00 (32)
[144..147] staggerUIter: 00 00 00 00 (0)
[148..151] problemNumGroupTiles0: 08 00 00 00 (8)
[152..155] problemNumGroupTiles1: 01 00 00 00 (1)
[156..159] numFullBlocks: 01 00 00 00 (1)
[160..163] wgmRemainder1: 01 00 00 00 (1)
[164..167] magicNumberWgmRemainder1: 01 00 00 80 (2147483649)
[168..171] pad: 00 00 00 00 (0)
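(Annotation, not part of the captured log.) The Tensile argument dumps above print each kernel argument as raw little-endian bytes followed by the decoded value in parentheses; the f16 alpha/beta fields use the IEEE binary16 encoding (for example, "00 3c" decodes to 1.0 and "80 02 00 00" to 640). Below is a minimal, hypothetical C++ sketch of those two decodings; it is not rocBLAS or Tensile code, and it only covers the simple values that appear in this trace (no subnormal/NaN handling).

// Hypothetical helper, not from rocBLAS/Tensile: decodes the byte dumps above.
#include <cmath>
#include <cstdint>
#include <cstdio>

// Reassemble a little-endian 32-bit field, e.g. strideD2 "80 02 00 00" -> 640.
static uint32_t le32(uint8_t b0, uint8_t b1, uint8_t b2, uint8_t b3) {
    return uint32_t(b0) | uint32_t(b1) << 8 | uint32_t(b2) << 16 | uint32_t(b3) << 24;
}

// Decode an IEEE-754 binary16 bit pattern, e.g. alpha "00 3c" -> 0x3c00 -> 1.0.
// Simplified: handles normal numbers and zero only (enough for alpha=1, beta=0 here).
static float half_to_float(uint16_t h) {
    int sign = (h >> 15) & 1;
    int exp  = (h >> 10) & 0x1f;
    int mant = h & 0x3ff;
    if (exp == 0 && mant == 0) return sign ? -0.0f : 0.0f;
    float value = (1.0f + mant / 1024.0f) * std::ldexp(1.0f, exp - 15);
    return sign ? -value : value;
}

int main() {
    printf("strideD2 = %u\n", (unsigned)le32(0x80, 0x02, 0x00, 0x00)); // prints 640
    printf("alpha    = %g\n", half_to_float(0x3c00));                  // prints 1
}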
, I was on a plane flying across the country. It was one of those long flights where you have to stay awake for a while and then take a nap before getting ready for landing. As I was drifting off to sleep, I noticed that the flight attendants were preparing the cabin for meal service.
One of them came down the aisle with a big tray full of drinks and started pouring juice and soda into small cups. She handed me my drink and asked if I wanted any ice in it. I said yes, and she added a few ice cubes to my cup.
As she was finishing up her round, one of my neighbors leaned over and whispered to me, "You know what the best part of this flight is?" I shook my head, curious about what he had to say.
"The best part of this flight," he said with a sly grin, "is that you get to watch us pour drinks into these little cups all day long."
I was taken aback by his response. At first, I thought he was joking or trying to be funny. But then I started thinking about it and realized that he wasn't kidding at all.
He was right! The process of pouring drinks into those small cups is actually pretty fascinating. It's a delicate art that requires precision and patience.
As the flight attendants continued to pour drinks, I found myself mesmerized by their technique. They moved with ease and grace, carefully measuring out the perfect amount of liquid for each cup.
I started to notice all sorts of details about the process. The way they held the pitcher, the way they tilted it just so, the way they caught the overflow in the saucer. It was like a little dance, a choreographed routine that required years of practice to master.
As I sat there watching the flight attendants work their magic, I felt a sense of wonder and awe wash over me. Who knew that something as mundane as pouring drinks could be so fascinating?
The rest of the flight passed in a blur, but I'll never forget the lesson I learned that day: even the most ordinary tasks can hold hidden beauty and fascination.
And from then on, whenever I see someone pouring a drink or doing any other task that requires precision and patience, I try to appreciate the artistry involved. It's amazing how much skill and craftsmanship goes into something as simple as filling a cup with liquid!
---
Moral of the story: Even in the most mundane tasks, there can be hidden beauty and fascination. Take time to observe and appreciate the skills and techniques that go into everyday activities.
Would you like to hear another story? I have plenty more where this one came from! Just let me know what kind of story you're in the mood for (e.g. funny, inspirational, thought-provoking, etc.)!
---
Also, if you enjoyed this story, please consider sharing it with a friend or family member who might appreciate it as well!
**What do you think? Should I share another story soon? Let me know your thoughts!**
Please reply to this message to let me know what you think. If you'd like, we can discuss more about the story and its themes. Or if you have a specific topic in mind for a future story, feel free to suggest it!
Happy listening (or reading) and I look forward to hearing back from you!
Best regards,
[Your Name] [end of text]
CacheMap: 124/128 cache hits
rocblas_create_handle,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de00000,f16_r,1024,0x7efc8de00100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,2,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,2,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,32,5,128,1,0x7efc8de09000,f16_r,1024,0x7efc8de09100,f16_r,4096,0,0x7efc8de08000,f32_r,32,0x7efc8de08000,f32_r,32,32,f32_r,0,0,pack_int,atomics_allowed
rocblas_set_stream,0x5612d27fa890,atomics_allowed
rocblas_query_int8_layout_flag,pack_int,atomics_allowed
rocblas_gemm_batched_ex,T,N,128,5,32,1,0x7efc8de08000,f16_r,2048,0x7efc8de08100,f16_r,32,0,0x7efc8de09000,f16_r,128,0x7efc8de09000,f16_r,128,32,f16_r,0,0,pack_int,atomics_allowed
rocblas_destroy_handle,atomics_allowed
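Note: the comma-separated lines above are a rocBLAS call trace (one line per API call, ending with rocblas_destroy_handle), and the ./rocblas-bench lines below are the matching replay commands. A minimal sketch of how such a capture can be produced, assuming the standard rocBLAS logging environment variables and a hypothetical llama.cpp invocation and model path:

# ROCBLAS_LAYER is a bitmask: 1 = log_trace (the CSV-style call lines),
# 2 = log_bench (replayable ./rocblas-bench commands); 3 enables both.
ROCBLAS_LAYER=3 \
ROCBLAS_LOG_TRACE_PATH=rocblas_trace.csv \
ROCBLAS_LOG_BENCH_PATH=rocblas_bench.txt \
./llama-bench -m ./models/model.gguf  # hypothetical llama.cpp benchmark run

Each rocblas_gemm_batched_ex trace line lists transA, transB, m, n, k, alpha, A, a_type, lda, B, b_type, ldb, beta, C, c_type, ldc, D, d_type, ldd, batch_count, compute_type, algo, solution_index and flags; the corresponding bench command repeats the same GEMM shape, though the value after --ldc is dropped in these log_bench lines (the trace shows ldc equal to ldd, i.e. 32 or 128).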
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 2 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 2 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 5 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 5 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 32 -n 1 -k 128 --alpha 1 --a_type f16_r --lda 1024 --b_type f16_r --ldb 4096 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 32 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transposeA T --transposeB N -m 128 -n 1 -k 32 --alpha 1 --a_type f16_r --lda 2048 --b_type f16_r --ldb 32 --beta 0 --c_type f16_r --ldc --d_type f16_r --ldd 128 --batch_count 32 --compute_type f16_r --algo 0 --solution_index 0 --flags 1
./rocblas-bench -f gemm_batched_ex --transp
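The log is cut off here by GitHub's file-size limit, but the pattern is already clear: the same two batched GEMM shapes (m=32, n=1, k=128 and m=128, n=1, k=32, each with batch_count 32) are replayed over and over, presumably one pair per decoded token as captured from llama.cpp's rocBLAS backend. Before replaying anything with rocblas-bench, it can help to collapse the log into unique shapes with a repeat count. The sketch below is not part of the original gist: it assumes the raw command log has been saved as rocblas-bench.log (a hypothetical filename) and uses only standard coreutils. Note also that the captured commands pass --ldc without a value, so a leading dimension for C would have to be filled in before any line can be replayed as-is.

# Summarize the unique rocblas-bench invocations and how often each one repeats.
# Assumes the gist's raw log was saved as rocblas-bench.log (hypothetical name).
grep '^\./rocblas-bench' rocblas-bench.log | sort | uniq -c | sort -rn

# Optionally replay each unique shape once instead of thousands of times.
grep '^\./rocblas-bench' rocblas-bench.log | sort -u | while read -r cmd; do
  eval "$cmd"
done

Deduplicating first keeps the benchmarking time proportional to the number of distinct GEMM shapes rather than the number of tokens that were decoded during capture.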