@vkuzo
Created May 14, 2024 16:01
> python benchmarks/bench_qdq.py
elem_dtype           use_fp4_custom_triton_dequant_kernel      q_time_us    q_mem_bw_tb_s    dq_time_us    dq_mem_bw_tb_s
-------------------  --------------------------------------  -----------  ---------------  ------------  ----------------
torch.float8_e4m3fn  False                                         282.21             0.48         99.57              1.37
torch.float8_e5m2    False                                         300.86             0.45         99.20              1.38
fp6_e2m3             False                                         315.23             0.43        134.44              1.02
fp6_e3m2             False                                         315.79             0.43        124.67              1.10
fp4_e2m1             False                                         452.60             0.25        210.79              0.54
fp4_e2m1             True                                         3998.35             0.03        108.62              1.05
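
For reference, the q_mem_bw_tb_s / dq_mem_bw_tb_s columns are achieved memory bandwidth, i.e. bytes moved divided by elapsed time. A minimal sketch of that arithmetic follows; the tensor size and the 2-byte-read / 1-byte-write traffic model are illustrative assumptions, not values taken from bench_qdq.py.

# Sketch of the bandwidth arithmetic behind the *_mem_bw_tb_s columns.
# Assumed traffic model: quantizing bfloat16 -> fp8 reads 2 bytes and
# writes 1 byte per element (scales ignored); the tensor size below is
# hypothetical, so the printed number is not expected to match the table.

def mem_bw_tb_per_s(bytes_moved: int, time_us: float) -> float:
    """Convert bytes moved in time_us microseconds to TB/s."""
    return bytes_moved / (time_us * 1e-6) / 1e12

numel = 64 * 1024 * 1024                           # hypothetical tensor size
print(mem_bw_tb_per_s(numel * (2 + 1), 282.21))    # ~0.71 TB/s for this size
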
> python benchmarks/bench_llama_train.py
experiment             time_sec    speedup    mem_gb
-------------------  ----------  ---------  --------
eager                    0.3166     1.0000   67.1042
compile                  0.2596     1.2198   67.8703
mx_fp8_compile           0.4313     0.7341   67.8703
mx_fp6_e2m3_compile      0.4213     0.7516   67.8703
mx_fp6_e3m2_compile      0.3969     0.7979   67.4633
mx_fp4_compile           0.4240     0.7468   67.8703
microxcaling             2.2622     0.1400   65.3955
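
The speedup column is eager_time / experiment_time, with eager as the 1.0000 baseline. It can be recomputed from the rounded time_sec values above, matching the table to within rounding:

# Recompute the speedup column from the time_sec values in the table.
eager_time = 0.3166
times = {
    "compile": 0.2596,
    "mx_fp8_compile": 0.4313,
    "mx_fp6_e2m3_compile": 0.4213,
    "mx_fp6_e3m2_compile": 0.3969,
    "mx_fp4_compile": 0.4240,
    "microxcaling": 2.2622,
}
for name, t in times.items():
    print(f"{name:20s} {eager_time / t:.4f}")
# e.g. compile: 0.3166 / 0.2596 = 1.2196, vs. 1.2198 in the table; the
# small gap comes from the table being computed on unrounded timings.
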
> python benchmarks/bench_llama.py
model_type    dtype           experiment            time_sec    speedup    mem_gb
------------  --------------  ------------------  ----------  ---------  --------
whole_model   torch.bfloat16  eager                   0.0174              13.5153
whole_model   torch.bfloat16  compile                 0.0078     2.2349   13.7712
whole_model   torch.bfloat16  mxfp8_compile           0.0191     0.9089   14.0389
whole_model   torch.bfloat16  mxfp6_e2m3_compile      0.0243     0.7133   17.1182
whole_model   torch.bfloat16  mxfp6_e3m2_compile      0.0243     0.7134   22.0079
whole_model   torch.bfloat16  mxfp4                   0.0441     0.3939   29.1249
whole_model   torch.bfloat16  mxfp4_compile           0.0326     0.5327   18.5776
whole_model   torch.float32   microxcaling            0.0773     0.2245   41.7501
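
For context, the usual pattern behind a time_sec column like the one above is a warmup phase followed by CUDA-event timing around the forward pass. Below is a minimal, hypothetical sketch of that pattern, not bench_llama.py's actual harness: the Linear layer is a stand-in for the real model, and a CUDA device is required.

import torch

def time_fn(fn, *args, n_warmup=5, n_iter=20):
    """Average seconds per call, timed with CUDA events after warmup."""
    # warm up so compilation / autotuning is excluded from the timing
    for _ in range(n_warmup):
        fn(*args)
    torch.cuda.synchronize()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    for _ in range(n_iter):
        fn(*args)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / 1e3 / n_iter  # elapsed_time is in ms

model = torch.nn.Linear(4096, 4096, device="cuda", dtype=torch.bfloat16)
x = torch.randn(8, 4096, device="cuda", dtype=torch.bfloat16)
t_eager = time_fn(model, x)
t_compiled = time_fn(torch.compile(model), x)
print(f"eager {t_eager:.4f}s  compile {t_compiled:.4f}s  "
      f"speedup {t_eager / t_compiled:.4f}")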