From 69d7daca314eddcd394d6925ba75b73ceac15ed7 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Tue, 17 Sep 2024 09:40:03 -0700 Subject: [PATCH] Tuned fp8 gemm for LDM cases Summary: Switch the kernel selected for the {8192, 3584, 9728} shape (matching the benchmark's --M 8192 --N 3584 --K 9728) from fp8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5 to fp8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3. Benchmark command and its reported output: ``` buck2 run @//mode/opt-amd-gpu -c fbcode.rocm_arch=mi300 --modifier ovr_config//third-party/rocm/constraints:6.0.1 //deeplearning/fbgemm/fbgemm_gpu/experimental/gen_ai/bench:quantize_bench -- --enable_amd_env_vars --kernels=ck_rowwise --N 3584 --M 8192 --K 9728 --use_rotating_buffer_bench ck_rowwise sim: 13.812. ck_rowwise ms: 0.558. ck_rowwise TFLOPS: 1022.833. ck_rowwise GB/s: 310.266. ``` Differential Revision: D62776861 --- .../gen_ai/src/quantize/ck_extensions/fp8_rowwise_gemm.hip | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_gemm.hip b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_gemm.hip index 3450d2cad..d89db6675 100644 --- a/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_gemm.hip +++ b/fbgemm_gpu/experimental/gen_ai/src/quantize/ck_extensions/fp8_rowwise_gemm.hip @@ -130,7 +130,7 @@ static const std::unordered_map< {{8192, 9728, 3584}, fp8_rowwise_256x256x256x128_16x16_8x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3}, {{8192, 3584, 9728}, - fp8_rowwise_256x128x128x128_32x32_2x2_8x32x1_8x32x1_1x32x1x8_8x8x1_1x1_intrawave_v5}, + fp8_rowwise_256x224x256x128_16x16_7x8_8x32x1_8x32x1_1x32x1x8_8x8x1_1x2_intrawave_v3}, {{8192, 3584, 3584}, fp8_rowwise_256x256x224x128_16x16_8x7_8x32x1_8x32x1_1x64x1x4_8x8x1_2x1_intrawave_v3}, {{4096, 3584, 3584},