diff --git a/aiter/configs/model_configs/qwen3_5_397b_bf16_tuned_gemm.csv b/aiter/configs/model_configs/qwen3_5_397b_bf16_tuned_gemm.csv new file mode 100644 index 0000000000..cffbcf230d --- /dev/null +++ b/aiter/configs/model_configs/qwen3_5_397b_bf16_tuned_gemm.csv @@ -0,0 +1,188 @@ +gfx,cu_num,M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle,libtype,solidx,splitK,us,kernelName,err_ratio,tflops,bw +gfx950,256,1,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,skinny,2,0,2.879,sol2,0.0,0.18,185.0 +gfx950,256,2,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,skinny,2,0,3.9881,sol2,0.0,0.26,135.64 +gfx950,256,4,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,skinny,2,0,4.0827,sol2,0.0,0.51,136.57 +gfx950,256,8,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1827,4,4.4966,flydsl_gemm7_abf16_wbf16_bf16_t16x64x128_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0293,0.93,131.4 +gfx950,256,16,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1827,4,4.5962,flydsl_gemm7_abf16_wbf16_bf16_t16x64x128_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0225,1.83,143.03 +gfx950,256,32,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,1,16,4.8638,_ZN5aiter39bf16gemm_fp32bf16_tn_32x64_splitk_cleanE,0.0322,3.45,162.53 +gfx950,256,48,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1619,8,4.6078,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0225,5.46,200.45 +gfx950,256,64,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2143,8,4.989,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0217,6.73,211.82 +gfx950,256,96,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2143,8,5.0148,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0186,10.04,263.82 +gfx950,256,128,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2139,8,5.0507,flydsl_gemm5_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0182,13.29,314.66 +gfx950,256,192,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2508,8,4.7517,flydsl_gemm6_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0199,21.18,446.52 +gfx950,256,256,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2439,8,5.6402,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,23.8,470.59 +gfx950,256,384,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2136,8,6.5038,flydsl_gemm5_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0192,30.96,571.85 +gfx950,256,512,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2899,8,7.0391,flydsl_gemm8_abf16_wbf16_bf16_t48x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0186,38.13,679.65 +gfx950,256,768,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2899,8,8.0595,flydsl_gemm8_abf16_wbf16_bf16_t48x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0192,49.96,857.88 +gfx950,256,1024,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1027,4,8.0735,flydsl_gemm3_abf16_wbf16_bf16_t16x64x256_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.019,66.5,1120.2 +gfx950,256,2048,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1728,4,10.5504,flydsl_gemm4_abf16_wbf16_bf16_t48x64x128_split_k4_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0121,101.77,1664.74 +gfx950,256,4096,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2417,1,13.2809,flydsl_gemm6_abf16_wbf16_bf16_t16x64x128_split_k1_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0047,161.7,2605.47 +gfx950,256,8192,64,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2890,1,19.6826,flydsl_gemm8_abf16_wbf16_bf16_t48x64x64_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,218.21,3489.46 +gfx950,256,1,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,skinny,2,0,3.6718,sol2,0.0,0.57,573.52 +gfx950,256,2,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,skinny,2,0,4.0261,sol2,0.0,1.04,525.21 +gfx950,256,4,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4022,8,4.8663,flydsl_gemm7_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0273,1.72,438.11 +gfx950,256,8,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2455,8,4.7958,flydsl_gemm4_abf16_wbf16_bf16_t16x64x128_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0239,3.5,451.81 +gfx950,256,16,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3659,8,4.9698,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0222,6.75,450.0 +gfx950,256,32,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2640,8,4.8663,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0198,13.79,488.19 +gfx950,256,48,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3659,8,5.0669,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,19.87,496.35 +gfx950,256,64,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4884,8,5.7062,flydsl_gemm6_abf16_wbf16_bf16_t32x64x64_split_k8_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0192,23.52,465.14 +gfx950,256,96,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3532,8,5.8774,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.019,34.25,498.99 +gfx950,256,128,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3719,4,6.4423,flydsl_gemm4_abf16_wbf16_bf16_t16x64x256_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0183,41.67,498.47 +gfx950,256,192,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4647,4,6.9178,flydsl_gemm5_abf16_wbf16_bf16_t16x64x128_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0186,58.21,544.73 +gfx950,256,256,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3707,4,7.4795,flydsl_gemm4_abf16_wbf16_bf16_t16x64x128_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0188,71.78,578.3 +gfx950,256,384,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4723,4,8.7291,flydsl_gemm5_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0116,92.26,623.14 +gfx950,256,512,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3704,2,9.9331,flydsl_gemm4_abf16_wbf16_bf16_t16x64x128_split_k2_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0124,108.1,659.77 +gfx950,256,768,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,5,5,11.6811,_ZN5aiter39bf16gemm_fp32bf16_tn_64x64_splitk_cleanE,0.0141,137.88,751.8 +gfx950,256,1024,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,5189,1,12.636,flydsl_gemm6_abf16_wbf16_bf16_t16x64x128_split_k1_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0048,169.95,871.32 +gfx950,256,2048,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,17.1703,native,0.0,250.14,1160.31 +gfx950,256,4096,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,20.8374,native,0.0,412.24,1811.59 +gfx950,256,8192,256,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4367,1,35.5559,flydsl_gemm4_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,483.18,2064.36 +gfx950,256,1,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,skinny,2,0,5.1911,sol2,0.0,0.81,809.76 +gfx950,256,2,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4271,8,5.3191,flydsl_gemm8_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0234,1.58,792.0 +gfx950,256,4,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3659,8,5.3058,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.022,3.16,797.46 +gfx950,256,8,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4023,8,5.3834,flydsl_gemm7_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0181,6.23,792.81 +gfx950,256,16,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2455,8,5.8626,flydsl_gemm4_abf16_wbf16_bf16_t16x64x128_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0188,11.45,740.59 +gfx950,256,32,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3659,8,5.8419,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0204,22.98,768.45 +gfx950,256,48,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3659,8,6.3326,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0195,31.79,732.19 +gfx950,256,64,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3532,8,6.7352,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k8_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0197,39.86,710.32 +gfx950,256,96,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3343,4,7.2111,flydsl_gemm4_abf16_wbf16_bf16_t16x64x128_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0202,55.84,704.34 +gfx950,256,128,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4647,4,7.7087,flydsl_gemm5_abf16_wbf16_bf16_t16x64x128_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0193,69.64,697.13 +gfx950,256,192,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4723,4,8.8415,flydsl_gemm5_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0119,91.08,674.52 +gfx950,256,256,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4723,4,9.9387,flydsl_gemm5_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.012,108.04,659.4 +gfx950,256,384,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3946,2,11.4895,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k2_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.005,140.18,673.07 +gfx950,256,512,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4711,2,12.1177,flydsl_gemm5_abf16_wbf16_bf16_t32x64x128_split_k2_block_m_warp1_block_n_warp1_block_k_warp4_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0201,177.22,735.53 +gfx950,256,768,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,7,3,14.4106,_ZN5aiter39bf16gemm_fp32bf16_tn_80x64_splitk_cleanE,0.0089,223.53,782.22 +gfx950,256,1024,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3939,1,16.2506,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,264.3,838.83 +gfx950,256,2048,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,20.1511,native,0.0,426.28,1144.78 +gfx950,256,4096,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2923,1,29.9834,flydsl_gemm3_abf16_wbf16_bf16_t64x128x128_split_k1_block_m_warp1_block_n_warp4_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0048,572.98,1398.88 +gfx950,256,8192,512,4096,False,torch.bfloat16,torch.bfloat16,False,False,opus,1403,0,50.679,opus_gemm_mono_tile_512x128x128x64_2x4_16x16x32_0x0x0_nooob,0.0,677.99,1572.48 +gfx950,256,1,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3659,8,5.8023,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0283,1.45,1447.5 +gfx950,256,2,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4023,8,6.0322,flydsl_gemm7_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0186,2.78,1394.03 +gfx950,256,4,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4023,8,5.9724,flydsl_gemm7_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0164,5.62,1411.42 +gfx950,256,8,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4023,8,6.0026,flydsl_gemm7_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.019,11.18,1411.14 +gfx950,256,16,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4271,8,6.137,flydsl_gemm8_abf16_wbf16_bf16_t16x64x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.019,21.87,1393.59 +gfx950,256,32,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2463,4,6.9623,flydsl_gemm4_abf16_wbf16_bf16_t16x64x256_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0187,38.56,1251.93 +gfx950,256,48,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2463,4,7.4362,flydsl_gemm4_abf16_wbf16_bf16_t16x64x256_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0185,54.15,1194.18 +gfx950,256,64,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,5127,4,7.8047,flydsl_gemm7_abf16_wbf16_bf16_t16x64x128_split_k4_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0194,68.79,1158.78 +gfx950,256,96,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3523,4,8.8375,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0122,91.12,1060.44 +gfx950,256,128,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3955,4,9.8047,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k4_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0117,109.51,989.25 +gfx950,256,192,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3948,2,11.648,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k2_block_m_warp2_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0122,138.27,888.97 +gfx950,256,256,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3949,2,12.2843,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k2_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,174.82,896.27 +gfx950,256,384,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,13.8782,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0118,232.11,887.78 +gfx950,256,512,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3940,1,16.0743,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,267.19,848.03 +gfx950,256,768,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4888,1,18.1206,flydsl_gemm5_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.005,355.53,896.93 +gfx950,256,1024,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4888,1,18.6951,flydsl_gemm5_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.005,459.48,1009.59 +gfx950,256,2048,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3597,1,29.2948,flydsl_gemm4_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,586.45,1002.23 +gfx950,256,4096,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2089,1,47.4466,flydsl_gemm3_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp4_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,724.18,1060.81 +gfx950,256,8192,1024,4096,False,torch.bfloat16,torch.bfloat16,False,False,opus,1401,0,65.7983,opus_gemm_mono_tile_512x128x256x64_2x4_16x16x32_0x0x0_nooob,0.0,1044.4,1402.39 +gfx950,256,1,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,3.8969,auto,0.0,1.08,1078.68 +gfx950,256,1,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4018,4,8.5085,flydsl_gemm7_abf16_wbf16_bf16_t16x64x64_split_k4_block_m_warp1_block_n_warp1_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0134,3.94,3945.56 +gfx950,256,1,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1463,8,13.6938,flydsl_gemm3_abf16_wbf16_bf16_t16x128x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0208,4.9,4902.47 +gfx950,256,2,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,3.9258,auto,0.0,2.14,1073.09 +gfx950,256,2,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2452,4,8.7173,flydsl_gemm4_abf16_wbf16_bf16_t16x64x128_split_k4_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0131,7.7,3852.94 +gfx950,256,2,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2419,8,13.9909,flydsl_gemm4_abf16_wbf16_bf16_t16x128x128_split_k8_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.02,9.59,4800.12 +gfx950,256,4,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,4.447,auto,0.0,3.77,951.47 +gfx950,256,4,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1484,4,8.9727,flydsl_gemm3_abf16_wbf16_bf16_t16x64x128_split_k4_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0117,14.96,3746.92 +gfx950,256,4,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2419,8,14.8624,flydsl_gemm4_abf16_wbf16_bf16_t16x128x128_split_k8_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0197,18.06,4521.96 +gfx950,256,8,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,3.9857,auto,0.0,8.42,1070.84 +gfx950,256,8,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3656,4,9.1861,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k4_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0123,29.22,3667.01 +gfx950,256,8,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3999,8,15.2037,flydsl_gemm7_abf16_wbf16_bf16_t16x128x64_split_k8_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0218,35.31,4426.91 +gfx950,256,16,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,3.8083,auto,0.0,17.62,1140.08 +gfx950,256,16,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3164,4,9.4768,flydsl_gemm5_abf16_wbf16_bf16_t16x64x128_split_k4_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0122,56.65,3568.35 +gfx950,256,16,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1451,8,16.2404,flydsl_gemm3_abf16_wbf16_bf16_t16x128x128_split_k8_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0203,66.12,4156.43 +gfx950,256,32,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,3.9702,auto,0.0,33.81,1130.73 +gfx950,256,32,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2460,2,11.0315,flydsl_gemm4_abf16_wbf16_bf16_t16x64x256_split_k2_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0123,97.33,3089.22 +gfx950,256,32,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2460,2,17.3869,flydsl_gemm4_abf16_wbf16_bf16_t16x64x256_split_k2_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0145,123.51,3904.97 +gfx950,256,48,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3484,1,4.2141,flydsl_gemm7_abf16_wbf16_bf16_t16x64x64_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,47.77,1100.28 +gfx950,256,48,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2622,2,12.4017,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k2_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,129.87,2769.04 +gfx950,256,48,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2625,2,18.56,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k2_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0062,173.56,3679.34 +gfx950,256,64,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4183,1,4.7372,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,56.67,1009.91 +gfx950,256,64,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3511,2,12.7077,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k2_block_m_warp1_block_n_warp1_block_k_warp4_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0198,168.99,2723.0 +gfx950,256,64,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3517,2,20.0442,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k2_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0061,214.27,3426.51 +gfx950,256,96,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4183,1,4.8706,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,82.67,1042.8 +gfx950,256,96,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,8,4,14.9718,_ZN5aiter39bf16gemm_fp32bf16_tn_96x64_splitk_cleanE,0.0116,215.15,2346.23 +gfx950,256,96,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3786,2,22.6192,flydsl_gemm4_abf16_wbf16_bf16_t64x64x128_split_k2_block_m_warp2_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0061,284.82,3071.2 +gfx950,256,128,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4995,1,4.4062,flydsl_gemm7_abf16_wbf16_bf16_t32x64x64_split_k1_block_m_warp1_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,121.84,1219.63 +gfx950,256,128,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,5252,1,15.4805,flydsl_gemm6_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp1_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,277.44,2303.0 +gfx950,256,128,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4217,2,26.1951,flydsl_gemm4_abf16_wbf16_bf16_t64x64x128_split_k2_block_m_warp2_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0145,327.92,2681.97 +gfx950,256,192,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4152,1,5.0455,flydsl_gemm5_abf16_wbf16_bf16_t32x64x64_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,159.61,1182.0 +gfx950,256,192,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,18.7636,native,0.0,343.35,1955.92 +gfx950,256,192,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,32.089,native,0.0,401.54,2238.38 +gfx950,256,256,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3156,1,19.4809,flydsl_gemm3_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,440.94,1937.73 +gfx950,256,256,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4886,1,36.818,flydsl_gemm5_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp1_block_k_warp4_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.014,466.62,1993.6 +gfx950,256,384,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3865,1,6.0054,flydsl_gemm4_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,268.19,1287.72 +gfx950,256,384,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4403,1,24.2638,flydsl_gemm4_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,531.03,1642.19 +gfx950,256,384,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4400,1,46.4997,flydsl_gemm4_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp1_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0062,554.19,1646.16 +gfx950,256,512,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3095,1,7.2124,flydsl_gemm4_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp4_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,297.75,1235.77 +gfx950,256,512,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3600,1,28.3466,flydsl_gemm4_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp4_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,606.06,1479.65 +gfx950,256,512,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,50.0109,auto,0.0,687.04,1593.49 +gfx950,256,768,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1636,1,8.6303,flydsl_gemm2_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,373.25,1306.12 +gfx950,256,768,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2090,1,37.4367,flydsl_gemm3_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp4_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,688.36,1232.41 +gfx950,256,768,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3508,1,73.1559,flydsl_gemm4_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp4_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0063,704.52,1175.34 +gfx950,256,1024,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3508,1,42.8466,flydsl_gemm4_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp4_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,801.92,1174.69 +gfx950,256,1024,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3508,1,80.4182,flydsl_gemm4_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp4_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0063,854.53,1147.44 +gfx950,256,2048,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,81,1,13.0062,flydsl_gemm2_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,660.45,1773.67 +gfx950,256,2048,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,opus,6401,0,70.5756,opus_gemm_mono_tile_512x128x256x64_2x4_16x16x32_0x0x0_nooob_4g_safe,0.0,973.7,950.88 +gfx950,256,2048,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,opus,6401,0,128.3684,opus_gemm_mono_tile_512x128x256x64_2x4_16x16x32_0x0x0_nooob_4g_safe,0.0,1070.66,914.87 +gfx950,256,4096,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,109.6244,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1253.73,918.26 +gfx950,256,4096,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,188.43,native,0.0,1458.78,890.37 +gfx950,256,8192,4096,512,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,42.699,native,0.0,804.7,1866.36 +gfx950,256,8192,4096,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,196.0327,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1402.2,855.84 +gfx950,256,8192,4096,8192,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,363.3926,native,0.0,1512.84,738.69 +gfx950,256,1,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,13.2664,auto,0.0,5.06,5060.41 +gfx950,256,2,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,13.2719,auto,0.0,10.11,5060.17 +gfx950,256,4,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,13.1195,auto,0.0,20.46,5122.69 +gfx950,256,8,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,13.2491,auto,0.0,40.52,5080.0 +gfx950,256,16,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,13.2411,auto,0.0,81.09,5097.92 +gfx950,256,32,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,13.6744,auto,0.0,157.04,4965.14 +gfx950,256,48,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2612,1,16.9464,flydsl_gemm4_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp1_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,190.08,4029.68 +gfx950,256,64,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,triton,0,0,17.0494,auto,0.0,251.91,4028.4 +gfx950,256,96,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3772,1,19.5099,flydsl_gemm4_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp4_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,330.21,3560.66 +gfx950,256,128,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4888,1,21.4286,flydsl_gemm5_abf16_wbf16_bf16_t64x64x128_split_k1_block_m_warp2_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,400.86,3278.54 +gfx950,256,192,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4404,1,26.2318,flydsl_gemm4_abf16_wbf16_bf16_t96x64x128_split_k1_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,491.19,2738.18 +gfx950,256,256,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2249,1,29.3346,flydsl_gemm3_abf16_wbf16_bf16_t128x64x128_split_k1_block_m_warp4_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,585.65,2502.18 +gfx950,256,384,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4368,1,39.9965,flydsl_gemm4_abf16_wbf16_bf16_t96x128x64_split_k1_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,644.3,1913.82 +gfx950,256,512,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3508,1,43.8229,flydsl_gemm4_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp4_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,784.06,1818.5 +gfx950,256,768,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2171,1,64.1459,flydsl_gemm3_abf16_wbf16_bf16_t128x256x64_split_k1_block_m_warp2_block_n_warp4_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,803.47,1340.43 +gfx950,256,1024,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2175,1,72.4367,flydsl_gemm3_abf16_wbf16_bf16_t128x256x64_split_k1_block_m_warp4_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,948.68,1273.87 +gfx950,256,2048,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,105.0997,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1307.7,1117.42 +gfx950,256,4096,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,193.5792,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1419.98,866.68 +gfx950,256,8192,8192,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,373.5705,native,0.0,1471.63,718.57 +gfx950,256,1,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4249,1,15.2086,flydsl_gemm8_abf16_wbf16_bf16_t16x64x128_split_k1_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0051,4.69,4690.03 +gfx950,256,2,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2458,1,16.0604,flydsl_gemm4_abf16_wbf16_bf16_t16x64x256_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,8.88,4442.88 +gfx950,256,4,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2446,1,15.8821,flydsl_gemm4_abf16_wbf16_bf16_t16x64x128_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,17.96,4495.98 +gfx950,256,8,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1489,1,15.3826,flydsl_gemm3_abf16_wbf16_bf16_t16x64x256_split_k1_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0048,37.08,4648.63 +gfx950,256,16,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2457,1,15.9002,flydsl_gemm4_abf16_wbf16_bf16_t16x64x256_split_k1_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0048,71.75,4510.18 +gfx950,256,32,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4314,1,18.9861,flydsl_gemm8_abf16_wbf16_bf16_t32x64x64_split_k1_block_m_warp2_block_n_warp1_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,120.18,3798.69 +gfx950,256,48,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,20.9618,native,0.0,163.28,3460.2 +gfx950,256,64,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4307,1,21.5259,flydsl_gemm5_abf16_wbf16_bf16_t32x64x64_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,212.0,3388.55 +gfx950,256,96,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3610,1,22.5829,flydsl_gemm4_abf16_wbf16_bf16_t48x64x64_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,303.11,3266.22 +gfx950,256,128,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4259,1,24.0271,flydsl_gemm4_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,379.85,3103.99 +gfx950,256,192,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4435,1,30.2384,flydsl_gemm4_abf16_wbf16_bf16_t96x64x64_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,452.74,2520.58 +gfx950,256,256,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2331,1,36.3919,flydsl_gemm3_abf16_wbf16_bf16_t128x64x64_split_k1_block_m_warp2_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,501.58,2139.4 +gfx950,256,384,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3508,1,42.6318,flydsl_gemm4_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp4_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,642.25,1903.12 +gfx950,256,512,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,50.8741,native,0.0,717.6,1659.2 +gfx950,256,768,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2488,1,67.4099,flydsl_gemm3_abf16_wbf16_bf16_t256x128x64_split_k1_block_m_warp4_block_n_warp4_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,812.36,1349.42 +gfx950,256,1024,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,76.7552,native,0.0,951.26,1270.5 +gfx950,256,2048,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,159.7258,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,914.25,774.65 +gfx950,256,4096,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,249.7709,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1169.3,705.29 +gfx950,256,8192,8704,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,438.6263,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1331.69,640.68 +gfx950,256,1,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2458,1,17.3442,flydsl_gemm4_abf16_wbf16_bf16_t16x64x256_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,4.84,4838.2 +gfx950,256,2,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2446,1,16.9496,flydsl_gemm4_abf16_wbf16_bf16_t16x64x128_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,9.9,4952.53 +gfx950,256,4,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3649,1,17.0573,flydsl_gemm6_abf16_wbf16_bf16_t16x64x64_split_k1_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0051,19.67,4924.62 +gfx950,256,8,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2457,1,17.4084,flydsl_gemm4_abf16_wbf16_bf16_t16x64x256_split_k1_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,38.55,4831.89 +gfx950,256,16,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2457,1,16.7325,flydsl_gemm4_abf16_wbf16_bf16_t16x64x256_split_k1_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,80.21,5040.78 +gfx950,256,32,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1683,1,18.5012,flydsl_gemm3_abf16_wbf16_bf16_t32x64x256_split_k1_block_m_warp2_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0048,145.09,4583.68 +gfx950,256,48,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,1643,1,20.2528,flydsl_gemm3_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,198.81,4209.9 +gfx950,256,64,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2373,1,20.8594,flydsl_gemm3_abf16_wbf16_bf16_t32x64x128_split_k1_block_m_warp1_block_n_warp1_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0048,257.38,4109.47 +gfx950,256,96,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3610,1,23.3365,flydsl_gemm4_abf16_wbf16_bf16_t48x64x64_split_k1_block_m_warp1_block_n_warp2_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,345.08,3712.58 +gfx950,256,128,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,4264,1,26.3285,flydsl_gemm4_abf16_wbf16_bf16_t64x64x64_split_k1_block_m_warp2_block_n_warp1_block_k_warp1_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0,407.82,3325.53 +gfx950,256,192,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2932,1,30.4599,flydsl_gemm3_abf16_wbf16_bf16_t64x128x128_split_k1_block_m_warp2_block_n_warp4_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0048,528.76,2934.71 +gfx950,256,256,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,34.369,native,0.0,624.83,2654.31 +gfx950,256,384,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,3508,1,43.1327,flydsl_gemm4_abf16_wbf16_bf16_t128x128x64_split_k1_block_m_warp2_block_n_warp4_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,746.82,2200.1 +gfx950,256,512,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,53.757,native,0.0,798.96,1833.55 +gfx950,256,768,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,flydsl,2487,1,70.7595,flydsl_gemm3_abf16_wbf16_bf16_t256x128x64_split_k1_block_m_warp4_block_n_warp2_block_k_warp2_async_copyTrue_b_to_ldsTrue_b_preshuffleFalse_c_to_ldsFalse_gfx950,0.0049,910.47,1496.71 +gfx950,256,1024,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,82.1786,native,0.0,1045.28,1378.05 +gfx950,256,2048,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,154.041,native,0.0,1115.28,925.77 +gfx950,256,4096,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,asm,10,1,264.5707,_ZN5aiter24bf16gemm_bf16_tn_256x256E,0.0,1298.7,760.96 +gfx950,256,8192,10240,4096,False,torch.bfloat16,torch.bfloat16,False,False,torch,0,0,464.3005,native,0.0,1480.06,686.55 diff --git a/aiter/configs/model_configs/qwen3_5_397b_untuned_gemm.csv b/aiter/configs/model_configs/qwen3_5_397b_untuned_gemm.csv new file mode 100644 index 0000000000..2dab6fb740 --- /dev/null +++ b/aiter/configs/model_configs/qwen3_5_397b_untuned_gemm.csv @@ -0,0 +1,189 @@ +M,N,K,bias,dtype,outdtype,scaleAB,bpreshuffle +1,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +16,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +32,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +48,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +64,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +96,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +128,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +192,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +256,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +384,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +512,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +768,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1024,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2048,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4096,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8192,10240,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +16,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +32,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +48,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +64,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +96,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +128,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +192,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +256,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +384,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +512,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +768,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1024,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2048,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4096,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8192,8704,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +16,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +32,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +48,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +64,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +96,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +128,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +192,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +256,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +384,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +512,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +768,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1024,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2048,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4096,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8192,4096,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +16,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +32,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +48,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +64,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +96,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +128,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +192,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +256,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +384,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +512,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +768,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2048,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8192,4096,512,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +16,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +32,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +48,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +64,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +96,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +128,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +192,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +256,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +384,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +512,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +768,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1024,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2048,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4096,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8192,1024,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +16,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +32,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +48,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +64,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +96,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +128,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +192,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +256,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +384,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +512,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +768,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1024,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2048,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4096,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8192,512,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +16,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +32,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +48,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +64,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +96,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +128,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +192,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +256,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +384,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +512,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +768,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1024,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2048,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4096,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8192,64,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +16,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +32,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +48,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +64,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +96,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +128,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +192,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +256,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +384,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +512,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +768,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1024,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2048,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4096,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8192,8192,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +16,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +32,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +48,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +64,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +96,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +128,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +192,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +256,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +384,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +512,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +768,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1024,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2048,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4096,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8192,4096,8192,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +16,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +32,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +48,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +64,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +96,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +128,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +192,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +256,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +384,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +512,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +768,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +1024,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +2048,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +4096,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE +8192,256,4096,FALSE,torch.bfloat16,torch.bfloat16,FALSE,FALSE