diff --git a/aiter/configs/model_configs/dsv4_fp8fp4_tuned_fmoe.csv b/aiter/configs/model_configs/dsv4_fp8fp4_tuned_fmoe.csv index dd9ea8897a..bd4e723ad9 100644 --- a/aiter/configs/model_configs/dsv4_fp8fp4_tuned_fmoe.csv +++ b/aiter/configs/model_configs/dsv4_fp8fp4_tuned_fmoe.csv @@ -213,3 +213,35 @@ cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w, 256,16384,4096,256,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,329.48530000000005,cktile_a8w4_bm64,0.0,639.364,cktile_a8w4_bm64,0.0,968.8493,0,0,0.0,0.0,flydsl_fallback 256,32768,4096,256,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,614.7645,cktile_a8w4_bm64,0.0,1269.2433,cktile_a8w4_bm64,0.0,1884.0078,0,0,0.0,0.0,flydsl_fallback 256,131072,4096,256,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,2501.0829,cktile_a8w4_bm64,0.0,5285.1745,cktile_a8w4_bm64,0.0,7786.2574,0,0,0.0,0.0,flydsl_fallback +256,1,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,13.1954,flydsl_moe1_afp8_wfp4_bf16_t32x64x256_w2_gui_kw2_fp8,0.0%,11.3628,flydsl_moe2_afp8_wfp4_bf16_t32x128x256_reduce_bnt2,0.0%,24.5582,0,0,12.3,262334.5, +256,2,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,20.1426,flydsl_moe1_afp8_wfp4_bf16_t32x64x256_w3_gui_fp8,0.0%,14.7358,flydsl_moe2_afp8_wfp4_bf16_t32x128x256_reduce_persist,0.0%,34.8784,0,0,17.32,184712.47, +256,4,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,35.6444,flydsl_moe1_afp8_wfp4_bf16_t32x64x256_w3_gui_fp8,0.0%,22.9261,flydsl_moe2_afp8_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.0%,58.5705,0,0,20.62,109995.65, +256,8,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,66.3011,flydsl_moe1_afp8_wfp4_bf16_t32x64x256_w4_gui_fp8,0.0%,39.381,flydsl_moe2_afp8_wfp4_bf16_t32x128x256_reduce_persist,0.0%,105.6821,0,0,22.86,60961.59, +256,16,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,124.9989,flydsl_moe1_afp8_wfp4_bf16_t32x64x256_w3_gui_kw2_fp8,0.0%,72.1626,cktile_a8w4_bm32,0.0%,197.1615,0,0,24.51,32677.01, +256,32,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,201.2978,flydsl_moe1_afp8_wfp4_bf16_t32x64x256_w3_gui_kw2_fp8,0.0%,110.8525,flydsl_moe2_afp8_wfp4_bf16_t32x128x128_reduce_bnt2_persist,0.0%,312.1503,0,0,30.96,20640.2, +256,64,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,274.0652,flydsl_moe1_afp8_wfp4_bf16_t32x64x256_w3_gui_kw2_fp8,0.0%,151.599,flydsl_moe2_afp8_wfp4_bf16_t32x128x256_atomic_persist,0.0%,425.6642,0,0,45.41,15136.9, +256,128,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,321.1872,flydsl_moe1_afp8_wfp4_bf16_t32x64x256_w4_gui_kw2_fp8,0.0%,178.1048,cktile_a8w4_bm32,0.0%,499.292,0,0,77.42,12906.32, +256,256,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,345.4854,flydsl_moe1_afp8_wfp4_bf16_t32x64x256_w4_gui_kw2_fp8,0.0%,193.9367,flydsl_moe2_afp8_wfp4_bf16_t32x128x128_atomic_bnt2_persist,0.0%,539.4221,0,0,143.32,11949.08, +256,512,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,356.8284,flydsl_moe1_afp8_wfp4_bf16_t32x64x256_w4_gui_kw2_fp8,0.0%,197.886,flydsl_moe2_afp8_wfp4_bf16_t32x128x256_atomic,0.0%,554.7144,0,0,278.74,11625.34, +256,1024,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,392.833,flydsl_moe1_afp8_wfp4_bf16_t32x128x256_w3_gui_fp8,0.0%,210.1492,flydsl_moe2_afp8_wfp4_bf16_t32x128x256_atomic,0.0%,602.9822,0,0,512.85,10705.18, +256,2048,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,419.4653,flydsl_moe1_afp8_wfp4_bf16_t64x128x256_w3_gui_fp8,0.0%,246.9579,flydsl_moe2_afp8_wfp4_bf16_t64x128x256_atomic_bnt2_persist,0.0%,666.4232,0,0,928.05,9704.97, +256,4096,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,563.2488,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_w4_gui_fp8,0.0%,356.8566,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_reduce_bnt2_persist_sbm128,0.0%,920.1054,0,0,1344.36,7056.56, +256,8192,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1000.4671,flydsl_moe1_afp8_wfp4_bf16_t64x256x256_w4_bnt0_gui,0.0%,573.6539,flydsl_moe2_afp8_wfp4_bf16_t64x256x256_reduce_bnt2,0.0%,1574.121,0,0,1571.61,4156.68, +256,16384,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1590.7982,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_bnt0_gui_fp8,0.0%,1045.6778,flydsl_moe2_afp8_wfp4_bf16_t64x256x128_atomic_sbm128,0.0%,2636.476,0,0,1876.67,2519.95, +256,32768,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,2915.7859,flydsl_moe1_afp8_wfp4_bf16_t128x256x256_w2_bnt0_gui_fp8,0.0%,1991.2871,flydsl_moe2_afp8_wfp4_bf16_t64x256x128_atomic_bnt2_sbm128,0.0%,4907.073,0,0,2016.6,1394.95, +256,1,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,26.0098,cktile_a8w4_bm32,0.0,15.5499,cktile_a8w4_bm32,0.0,41.5597,0,0,0.0,0.0,flydsl_fallback +256,2,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,29.9158,cktile_a8w4_bm32,0.0,17.2712,cktile_a8w4_bm32,0.0,47.187,0,0,0.0,0.0,flydsl_fallback +256,4,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,46.4114,cktile_a8w4_bm32,0.0,25.4266,cktile_a8w4_bm32,0.0,71.838,0,0,0.0,0.0,flydsl_fallback +256,8,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,81.1698,cktile_a8w4_bm32,0.0,42.8618,cktile_a8w4_bm32,0.0,124.0316,0,0,0.0,0.0,flydsl_fallback +256,16,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,136.22549999999998,cktile_a8w4_bm32,0.0,72.1626,cktile_a8w4_bm32,0.0,208.3881,0,0,0.0,0.0,flydsl_fallback +256,32,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,216.1381,cktile_a8w4_bm32,0.0,113.7684,cktile_a8w4_bm32,0.0,329.9065,0,0,0.0,0.0,flydsl_fallback +256,64,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,286.9568,cktile_a8w4_bm32,0.0,153.4274,cktile_a8w4_bm32,0.0,440.3842,0,0,0.0,0.0,flydsl_fallback +256,128,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,337.0177,cktile_a8w4_bm32,0.0,178.1048,cktile_a8w4_bm32,0.0,515.1225,0,0,0.0,0.0,flydsl_fallback +256,256,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,361.5862,cktile_a8w4_bm32,0.0,194.9629,cktile_a8w4_bm32,0.0,556.5491,0,0,0.0,0.0,flydsl_fallback +256,512,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,396.1095,cktile_a8w4_bm32,0.0,215.6017,cktile_a8w4_bm32,0.0,611.7112,0,0,0.0,0.0,flydsl_fallback +256,1024,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,418.9211,cktile_a8w4_bm32,0.0,218.5121,cktile_a8w4_bm32,0.0,637.4332,0,0,0.0,0.0,flydsl_fallback +256,2048,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,483.1896,cktile_a8w4_bm64,0.0,268.486,cktile_a8w4_bm64,0.0,751.6756,0,0,0.0,0.0,flydsl_fallback +256,4096,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,730.8670999999999,cktile_a8w4_bm64,0.0,426.8268,cktile_a8w4_bm64,0.0,1157.6939,0,0,0.0,0.0,flydsl_fallback +256,8192,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1288.683,cktile_a8w4_bm64,0.0,746.6747,cktile_a8w4_bm64,0.0,2035.3577,0,0,0.0,0.0,flydsl_fallback +256,16384,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,2307.6415,cktile_a8w4_bm64,0.0,1423.8539,cktile_a8w4_bm64,0.0,3731.4954,0,0,0.0,0.0,flydsl_fallback +256,32768,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,4420.6118,cktile_a8w4_bm64,0.0,2684.5026,cktile_a8w4_bm64,0.0,7105.1144,0,0,0.0,0.0,flydsl_fallback diff --git a/aiter/configs/model_configs/dsv4_fp8fp4_untuned_fmoe.csv b/aiter/configs/model_configs/dsv4_fp8fp4_untuned_fmoe.csv index 60eaf47a81..554f48834e 100644 --- a/aiter/configs/model_configs/dsv4_fp8fp4_untuned_fmoe.csv +++ b/aiter/configs/model_configs/dsv4_fp8fp4_untuned_fmoe.csv @@ -47,3 +47,19 @@ token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type, 8192,7168,1536,385,7,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 16384,7168,1536,385,7,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 32768,7168,1536,385,7,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +64,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +128,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +256,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +512,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1024,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2048,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4096,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8192,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16384,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32768,4096,2048,256,6,ActivationType.Silu,torch.bfloat16,torch.float8_e4m3fn,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 diff --git a/aiter/configs/model_configs/glm5_fp4_tuned_fmoe.csv b/aiter/configs/model_configs/glm5_fp4_tuned_fmoe.csv new file mode 100644 index 0000000000..4fbf968289 --- /dev/null +++ b/aiter/configs/model_configs/glm5_fp4_tuned_fmoe.csv @@ -0,0 +1,129 @@ +gfx,cu_num,token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1,block_m,ksplit,us1,kernelName1,err1,us2,kernelName2,err2,us,run_1stage,xbf16,flat,tflops,bw,_tag +gfx950,256,1,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,23.1763,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.9%,16.8433,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,1.4%,40.0196,0,0,0,16.98,242417.31, +gfx950,256,2,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,42.6271,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3_xcd4_kw4_fp4,0.9%,26.848,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce,1.5%,69.4751,0,0,0,19.56,139639.41, +gfx950,256,4,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,73.3988,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3_kw2_fp4,0.8%,44.5249,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2,1.6%,117.9237,0,0,0,23.05,82269.29, +gfx950,256,8,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,134.2371,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3,0.0%,75.9134,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2,1.5%,210.1505,0,0,0,25.87,46164.88, +gfx950,256,16,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,233.4281,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3,0.0%,127.2874,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_persist,6.3%,360.7155,0,0,0,30.14,26895.77, +gfx950,256,32,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,351.1481,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.8%,193.5721,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_persist,1.5%,544.7202,0,0,0,39.92,17811.01, +gfx950,256,64,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,458.17,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w2_kw4,0.0%,249.3472,flydsl_moe2_afp4_wfp4_bf16_t32x128x128_atomic_persist,6.2%,707.5172,0,0,0,61.46,13713.6, +gfx950,256,128,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,513.8856000000001,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w2,0.0%,276.6438,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic,6.2%,790.5294,0,0,0,110.02,12275.05, +gfx950,256,256,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,522.8823,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_fp4,0.8%,287.7902,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2_persist,6.3%,810.6725,0,0,0,214.57,11972.95, +gfx950,256,512,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,531.5599,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_fp4,0.8%,291.8145,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2_persist,6.3%,823.3744,0,0,0,422.52,11793.98, +gfx950,256,1024,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,544.8517,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_fp4,0.8%,320.5397,flydsl_moe2_afp4_wfp4_bf16_t64x128x128_atomic_bnt2_persist,6.3%,865.3914,0,0,0,804.01,11232.26, +gfx950,256,2048,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,665.9407,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4,0.0%,439.9808,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic_bnt2_persist,6.3%,1105.9215,0,0,0,1258.29,8806.39, +gfx950,256,4096,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,978.5945,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0,0.0%,784.0105,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,6.3%,1762.605,0,0,0,1578.99,5546.86, +gfx950,256,8192,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1505.6979,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0_xcd4,0.0%,1262.4143,flydsl_moe2_afp4_wfp4_bf16_t128x128x128_atomic_bnt2,6.3%,2768.1122,0,0,0,2010.86,3559.26, +gfx950,256,16384,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,2394.5774,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0_xcd4,0.0%,2118.576,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic_bnt2_persist,6.3%,4513.1534,0,0,0,2466.69,2216.5, +gfx950,256,32768,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,4390.8457,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0_xcd4,0.0%,4142.7559,flydsl_moe2_afp4_wfp4_bf16_t128x128x128_atomic,6.3%,8533.6016,0,0,0,2609.11,1207.63, +gfx950,256,1,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,18.5303,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w2_kw2,0.0%,11.8251,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_xcd4,0.4%,30.3554,0,0,0,11.19,159797.96, +gfx950,256,2,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,26.0253,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4,0.0%,16.8432,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce,0.4%,42.8685,0,0,0,15.85,113154.17, +gfx950,256,4,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,45.770500000000006,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4_xcd4,0.0%,24.9613,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2,0.5%,70.7318,0,0,0,19.21,68579.99, +gfx950,256,8,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,70.93090000000001,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2,0.0%,41.4835,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce,0.5%,112.4144,0,0,0,24.18,43151.59, +gfx950,256,16,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,120.0689,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.9%,67.7614,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_reduce_bnt2,0.6%,187.8303,0,0,0,28.94,25826.54, +gfx950,256,32,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,186.3168,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4,0.0%,101.8728,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_persist,0.6%,288.1896,0,0,0,37.72,16833.72, +gfx950,256,64,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,238.3794,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_fp4,0.8%,133.0382,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_persist,3.9%,371.4176,0,0,0,58.54,13063.17, +gfx950,256,128,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,255.7691,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_fp4,0.8%,145.4351,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_bnt2_persist,4.0%,401.2042,0,0,0,108.39,12096.26, +gfx950,256,256,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,268.7419,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4_fp4,0.8%,151.0036,flydsl_moe2_afp4_wfp4_bf16_t32x128x128_atomic,4.0%,419.7455,0,0,0,207.2,11567.56, +gfx950,256,512,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,273.4661,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4_fp4,0.8%,161.6771,flydsl_moe2_afp4_wfp4_bf16_t32x256x128_atomic,4.0%,435.1432,0,0,0,399.74,11169.08, +gfx950,256,1024,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,297.8267,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3,0.0%,179.8705,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2,4.0%,477.6972,0,0,0,728.27,10193.88, +gfx950,256,2048,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,359.0518,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w4,0.0%,281.6555,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2_sbm128,4.0%,640.7073,0,0,0,1085.96,7629.79, +gfx950,256,4096,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,500.0051,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0,0.0%,460.2597,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_bnt2,4.0%,960.2648,0,0,0,1449.15,5130.05, +gfx950,256,8192,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,750.913,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0%,850.9769,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist_sbm128,4.0%,1601.8899,0,0,0,1737.41,3122.38, +gfx950,256,16384,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1295.9968,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0%,1598.6393,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_atomic_xcd4_persist,4.0%,2894.6361,0,0,0,1922.96,1780.09, +gfx950,256,32768,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,2423.733,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0%,3023.9599,flydsl_moe2_afp4_wfp4_bf16_t128x128x256_reduce_bnt2_xcd4_persist,0.6%,5447.6929,0,0,0,2043.54,1001.28, +gfx950,256,1,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,14.3775,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w4_kw4,0.0%,8.799,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.6%,23.1765,0,0,0,7.33,104648.02, +gfx950,256,2,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,16.4623,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w4_bnt0_kw2_fp4,1.1%,12.9256,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce,0.1%,29.3879,0,0,0,11.56,82530.33, +gfx950,256,4,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.643,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4_bnt0,0.0%,17.9639,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce,0.1%,46.6069,0,0,0,14.58,52040.15, +gfx950,256,8,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,42.8562,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4_fp4,0.9%,26.1469,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce,0.1%,69.0031,0,0,0,19.69,35150.65, +gfx950,256,16,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,61.5569,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w2_fp4,0.8%,39.254,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,2.0%,100.8109,0,0,0,26.96,24061.4, +gfx950,256,32,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,93.0181,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3,0.0%,52.6628,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,2.1%,145.6809,0,0,0,37.31,16652.47, +gfx950,256,64,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,124.6707,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3,0.0%,71.5594,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,2.1%,196.2301,0,0,0,55.4,12365.77, +gfx950,256,128,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,132.5044,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w3_fp4,0.8%,77.995,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_persist,0.1%,210.4994,0,0,0,103.29,11533.12, +gfx950,256,256,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,142.2958,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3_fp4,0.8%,84.6001,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2_sbm64,0.1%,226.8959,0,0,0,191.66,10710.09, +gfx950,256,512,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,143.7898,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_fp4,0.8%,89.9336,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_atomic_persist,2.1%,233.7234,0,0,0,372.12,10417.41, +gfx950,256,1024,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,159.80810000000002,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w3,0.0%,114.103,flydsl_moe2_afp4_wfp4_bf16_t64x128x256_atomic_persist,2.1%,273.9111,0,0,0,635.05,8923.45, +gfx950,256,2048,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,196.5692,flydsl_moe1_afp4_wfp4_bf16_t128x128x256,0.0%,195.9419,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2_persist_sbm128,0.1%,392.5111,0,0,0,886.32,6275.25, +gfx950,256,4096,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,279.3132,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w4_bnt0_xcd4,0.0%,337.0306,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2_persist,0.1%,616.3438,0,0,0,1128.89,4057.56, +gfx950,256,8192,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,422.7372,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0_xcd4,0.0%,592.4468,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2_xcd4,0.1%,1015.184,0,0,0,1370.76,2537.82, +gfx950,256,16384,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,695.0513,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0%,1124.9939,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_sbm128,0.1%,1820.0452,0,0,0,1529.16,1498.5, +gfx950,256,32768,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1249.3197,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w3_bnt0_xcd4,0.0%,2168.6555,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2_xcd4_persist_sbm128,0.1%,3417.9752,0,0,0,1628.53,886.3, +gfx950,256,1,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,13.933,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w2_kw4,0.0%,7.8352,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.9%,21.7682,0,0,0,3.9,55709.55, +gfx950,256,2,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,14.5649,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w2_bnt0_kw4,0.0%,9.789,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,1.2%,24.3539,0,0,0,6.98,49795.52, +gfx950,256,4,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,16.1328,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w4_bnt0_kw2_fp4,0.4%,13.1423,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_bnt2,0.0%,29.2751,0,0,0,11.61,41426.05, +gfx950,256,8,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,24.2983,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w4_kw2_fp4,0.5%,17.294,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce,0.0%,41.5923,0,0,0,16.34,29159.86, +gfx950,256,16,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,32.9941,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_fp4,0.9%,24.2111,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.0%,57.2052,0,0,0,23.76,21203.9, +gfx950,256,32,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,46.9406,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w4_kw2_fp4,0.8%,31.5012,flydsl_moe2_afp4_wfp4_bf16_t32x256x256_reduce_persist,0.0%,78.4418,0,0,0,34.65,15467.11, +gfx950,256,64,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,66.2383,flydsl_moe1_afp4_wfp4_bf16_t32x128x256_w4,0.0%,40.3127,flydsl_moe2_afp4_wfp4_bf16_t32x128x256_reduce_persist,0.0%,106.551,0,0,0,51.02,11392.27, +gfx950,256,128,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,67.3417,flydsl_moe1_afp4_wfp4_bf16_t32x64x256_w3_kw2_fp4,0.8%,44.5853,flydsl_moe2_afp4_wfp4_bf16_t16x128x256_atomic_sbm32,0.8%,111.927,0,0,0,97.13,10855.62, +gfx950,256,256,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,73.0874,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3_kw2_fp4,0.8%,47.7846,flydsl_moe2_afp4_wfp4_bf16_t16x128x256_atomic_bnt2_persist_sbm32,0.8%,120.872,0,0,0,179.89,10071.78, +gfx950,256,512,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,81.41470000000001,flydsl_moe1_afp4_wfp4_bf16_t32x32x256_w3_kw2,0.0%,58.3181,flydsl_moe2_afp4_wfp4_bf16_t32x128x128_atomic_persist,0.8%,139.7328,0,0,0,311.21,8746.09, +gfx950,256,1024,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,91.8469,flydsl_moe1_afp4_wfp4_bf16_t64x64x256_w4,0.0%,84.9999,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist,0.0%,176.8468,0,0,0,491.8,6963.95, +gfx950,256,2048,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,114.6878,flydsl_moe1_afp4_wfp4_bf16_t128x128x256_w2,0.0%,140.4587,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2_sbm128,0.0%,255.1465,0,0,0,681.75,4900.82, +gfx950,256,4096,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,149.773,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,243.432,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_bnt2,0.0%,393.205,0,0,0,884.76,3276.09, +gfx950,256,8192,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,222.9684,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w2_bnt0_xcd4,0.0%,452.3736,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_persist,0.0%,675.342,0,0,0,1030.27,2019.23, +gfx950,256,16384,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,369.8823,flydsl_moe1_afp4_wfp4_bf16_t64x128x256_w3_bnt0_xcd4,0.0%,928.2411,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce,0.0%,1298.1234,0,0,0,1071.99,1166.81, +gfx950,256,32768,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,634.6348,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0%,1818.9628,flydsl_moe2_afp4_wfp4_bf16_t64x256x256_reduce_xcd4_sbm128,0.0%,2453.5976,0,0,0,1134.31,740.41, +gfx950,256,1,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.3763,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,17.5043,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0575,45.8806,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,2,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,48.718900000000005,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,28.366,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0591,77.0849,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,4,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,87.1101,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,46.4638,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0613,133.5739,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,8,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,150.69209999999998,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,80.0445,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0624,230.7366,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,16,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,266.2115,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,130.7334,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0634,396.9449,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,32,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,393.2752,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,201.0956,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0627,594.3708,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,64,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,516.4786,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,255.4396,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.062,771.9182,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,128,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,574.6139999999999,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,285.857,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0626,860.471,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,256,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,594.5266,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,298.7479,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0627,893.2745,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,512,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,613.6784,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,312.1556,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.063,925.834,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,1024,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,644.7756999999999,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,345.1159,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0626,989.8916,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,2048,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,727.2141,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,451.1975,moe_ck2stages_gemm2_256x128x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.063,1178.4116,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,4096,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,997.0191,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,795.3814,moe_ck2stages_gemm2_256x128x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0627,1792.4005,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,8192,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1505.6979,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1509.7601,moe_ck2stages_gemm2_256x128x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0628,3015.458,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,16384,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,2394.5774,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,2971.6274,moe_ck2stages_gemm2_256x128x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0626,5366.2048,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,32768,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,4390.8457,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,5940.4268,moe_ck2stages_gemm2_256x128x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0627,10331.2725,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,1,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,25.4216,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,12.6329,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0391,38.0545,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,2,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.983,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,18.066,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0352,47.049,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,4,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,48.4515,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,27.5553,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0351,76.0068,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,8,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,79.7915,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,43.3279,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.037,123.1194,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,16,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,138.8509,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,69.0591,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0393,207.91,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,32,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,214.0619,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,104.3364,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0387,318.3983,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,64,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,281.9063,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,134.655,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0391,416.5613,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,128,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,299.0999,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,147.2183,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0395,446.3182,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,256,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,311.5726,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,160.2567,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0397,471.8293,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,512,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,316.6857,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,174.0669,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0401,490.7526,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,1024,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,340.5174,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,215.7797,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0395,556.2971,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,2048,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,380.232,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,391.9694,moe_ck2stages_gemm2_256x128x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0397,772.2014,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,4096,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,529.684,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,746.3369,moe_ck2stages_gemm2_256x64x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0398,1276.0209,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,8192,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,750.913,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1474.2985,moe_ck2stages_gemm2_256x128x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0398,2225.2115,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,16384,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,1295.9968,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,2905.6642,moe_ck2stages_gemm2_256x128x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0398,4201.661,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,32768,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,2423.733,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,5806.61,moe_ck2stages_gemm2_256x128x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0398,8230.343,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,1,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,23.9867,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,8.799,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0163,32.7857,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,2,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,25.2882,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,15.5011,moe_ck2stages_gemm2_256x32x128x128_1x4_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0198,40.7893,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,4,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.8126,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,18.5661,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0184,47.3787,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,8,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,48.1134,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,26.7195,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0197,74.8329,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,16,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,73.1592,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,39.254,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0201,112.4132,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,32,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,104.8819,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,52.6628,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.021,157.5447,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,64,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,141.0362,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,71.5594,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0207,212.5956,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,128,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,151.9778,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,81.5974,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0208,233.5752,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,256,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,163.79379999999998,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,85.0243,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0206,248.8181,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,512,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,166.28410000000002,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,94.4218,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.021,260.7059,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,1024,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,190.1867,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,138.7019,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0205,328.8886,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,2048,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,259.3236,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,237.221,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0207,496.5446,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,4096,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,349.384,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,452.3463,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0208,801.7303,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,8192,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,451.7225,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,902.509,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0207,1354.2315,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,16384,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,801.4884,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1755.2507,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0207,2556.7391,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,32768,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,64,0,1444.5641,moe_ck2stages_gemm1_256x64x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,3616.0041,moe_ck2stages_gemm2_64x64x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0207,5060.5682,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,1,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,22.866300000000003,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,7.8352,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0093,30.7015,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,2,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,23.1781,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,9.789,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0116,32.9671,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,4,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,25.0505,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,15.1775,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.01,40.228,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,8,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,28.6518,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,18.0363,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0084,46.6881,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,16,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,38.6635,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,23.8459,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0084,62.5094,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,32,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,57.7881,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,32.371,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0082,90.1591,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,64,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,74.4751,moe_ck2stages_gemm1_256x32x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,42.8282,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0084,117.3033,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,128,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,79.32520000000001,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,47.095,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0083,126.4202,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,256,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,88.8383,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,52.5251,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0084,141.3634,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,512,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,90.127,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,60.2166,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0082,150.3436,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,1024,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,107.5767,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,105.3068,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0083,212.8835,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,2048,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,32,0,138.7609,moe_ck2stages_gemm1_64x32x32x128_1x1_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,197.3381,moe_ck2stages_gemm2_64x32x32x128_1x1_MulABScaleExpertWeightShuffled_v1_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0082,336.099,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,4096,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,167.3846,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,391.3202,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0082,558.7048,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,8192,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,272.9605,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,720.8387,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0082,993.7992,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,16384,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,426.4577000000001,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,1416.7869,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0082,1843.2446,0,0,0,0.0,0.0,flydsl_fallback +gfx950,256,32768,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0,128,0,634.6348,moe_ck2stages_gemm1_256x128x128x128_1x4_MulABScaleShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight0_silu_FP4X2_FP4X2_B16,0.0,2858.5569,moe_ck2stages_gemm2_64x128x128x128_1x1_MulABScaleExpertWeightShuffled_v3_Nswizzle0_Quant3_MulRoutedWeight1_FP4X2_FP4X2_B16,0.0082,3493.1917,0,0,0,0.0,0.0,flydsl_fallback diff --git a/aiter/configs/model_configs/glm5_fp4_untuned_fmoe.csv b/aiter/configs/model_configs/glm5_fp4_untuned_fmoe.csv new file mode 100644 index 0000000000..d626353901 --- /dev/null +++ b/aiter/configs/model_configs/glm5_fp4_untuned_fmoe.csv @@ -0,0 +1,65 @@ +token,model_dim,inter_dim,expert,topk,act_type,dtype,q_dtype_a,q_dtype_w,q_type,use_g1u1,doweight_stage1 +1,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +64,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +128,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +256,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +512,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1024,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2048,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4096,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8192,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16384,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32768,6144,2048,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +64,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +128,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +256,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +512,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1024,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2048,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4096,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8192,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16384,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32768,6144,1024,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +64,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +128,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +256,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +512,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1024,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2048,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4096,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8192,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16384,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32768,6144,512,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +64,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +128,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +256,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +512,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +1024,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +2048,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +4096,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +8192,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +16384,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0 +32768,6144,256,257,9,ActivationType.Silu,torch.bfloat16,torch.float4_e2m1fn_x2,torch.float4_e2m1fn_x2,QuantType.per_1x32,1,0