Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
07fea75
[CK_TILE] Add FP8 per-tensor quantization support for FMHA V3 pipeline
poyenc Feb 9, 2026
df30860
[CK] chore: Remove dead code and unused variables from V3 pipeline
poyenc Feb 12, 2026
6cf6cbd
fix(fmha_v3): remove dead P buffer from LDS size calculation
poyenc Feb 24, 2026
2d565ff
refactor(fmha_v3): sync 4-buffer LDS architecture from CK submodule
poyenc Feb 24, 2026
35d4f30
fix(codegen): add no_scale qscale guard for bf16/fp16 v3 dispatch
poyenc Feb 24, 2026
9620f60
fix(fmha_v3): revert V tile distribution and LDS descriptor swap
poyenc Feb 24, 2026
b9a1db3
refactor(fmha_fwd): simplify V3 dispatch to rely on trait matching
poyenc Feb 28, 2026
9211167
fix(warp_gemm): remove duplicate type aliases from bad conflict resol…
poyenc Mar 13, 2026
4e5fd8e
fix(fmha_v3): remove unused smem_ptr and suppress unreachable-code wa…
poyenc Mar 13, 2026
cd88039
fix(codegen): add missing return in check_hdim compatibility rule
poyenc Mar 13, 2026
c0b41a4
feat: add kernel_attr_for composable arch+attribute template
poyenc Mar 14, 2026
9fb34a1
style: format kernel_launch.hpp with clang-format
poyenc Mar 16, 2026
0734917
refactor(batch_prefill): align codegen architecture with fmha_fwd.py
poyenc Feb 26, 2026
d38ddb5
feat(batch_prefill): add V3 pipeline for paged KV attention
poyenc Feb 28, 2026
2529790
refactor(batch_prefill): remove bf16/fp16 V3 tile and pipeline from c…
poyenc Mar 2, 2026
2d4cd3e
fix(batch_prefill): guard V3 scatter-gather auto-advance past seqlen_k
poyenc Mar 3, 2026
b58168a
perf(batch_prefill): move V3 page offset advance out of load lambdas
poyenc Mar 3, 2026
1cab14c
perf(batch_prefill): branchless page_id clamping in load_physical_pages
poyenc Mar 3, 2026
ceac7d9
perf(batch_prefill): split page advance into issue/consume for vmcnt …
poyenc Mar 4, 2026
50cd5b5
perf(batch_prefill): add s_nop 3 for FP8 compute phases to reduce SIM…
poyenc Mar 5, 2026
6a5c81c
feat: add KV_BLOCKSCALE support to V3 batch prefill
poyenc Mar 8, 2026
f9adb94
perf(batch_prefill): optimize KV_BLOCKSCALE v_descale and k_descale p…
poyenc Mar 9, 2026
54ea4a1
feat(batch_prefill): add dedicated V3 policy and SRD rebasing for dev…
poyenc Mar 13, 2026
b830920
perf(batch_prefill): align V3 scheduling with feature branch
poyenc Mar 14, 2026
23fd2e9
perf(batch_prefill): add no-packed-fp32-ops attribute for V3 kernels
poyenc Mar 14, 2026
ce3248c
refactor(fmha_v3): extract shared helpers and clean up V3 pipeline he…
poyenc Mar 15, 2026
242d03c
docs(batch_prefill): document V3 LINEAR-only layout design decision
poyenc Mar 15, 2026
5a66eaf
docs: add V3 batch prefill to CHANGELOG
poyenc Mar 15, 2026
7d69723
style: format batch_prefill pipeline with clang-format
poyenc Mar 16, 2026
bb8e87b
revert: remove no-packed-fp32-ops kernel_attr from V3 batch prefill
poyenc Mar 16, 2026
72e3d84
fix: remove unused V_KLanes variable and smem_ptr parameter in batch …
poyenc Mar 16, 2026
4bc1e01
fix: resolve compilation errors in V3 pipeline non-gfx950 stubs
poyenc Mar 16, 2026
5c16109
Merge branch 'develop' into users/poyenc/batch-prefill-v3
poyenc Mar 17, 2026
347ac33
Merge branch 'develop' into users/poyenc/batch-prefill-v3
poyenc Mar 20, 2026
1061b67
Merge branch 'develop' into users/poyenc/batch-prefill-v3
poyenc Mar 20, 2026
5a7befb
Merge branch 'develop' into users/poyenc/batch-prefill-v3
poyenc Mar 20, 2026
f4447cf
Merge branch 'develop' into users/poyenc/batch-prefill-v3
poyenc Mar 23, 2026
f487b66
Merge branch 'develop' into users/poyenc/batch-prefill-v3
poyenc Mar 26, 2026
715b3d8
Merge branch 'develop' into users/poyenc/batch-prefill-v3
poyenc Mar 31, 2026
7d022d1
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Mar 31, 2026
584d6ad
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 1, 2026
0e83fe7
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 2, 2026
bec4af8
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 3, 2026
c54c74d
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 4, 2026
3c0e87c
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 7, 2026
415ed0c
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 11, 2026
a09b751
style: clang-format-18 batch prefill v3 pipeline
poyenc Apr 12, 2026
29a6e52
fix: remove duplicate fp8 CTransposed warp gemm dispatcher entries
poyenc Apr 13, 2026
b501a46
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 14, 2026
09becba
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 15, 2026
690df62
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 16, 2026
a2fbf47
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 17, 2026
f993d00
fix(warp_gemm): restore missing fp8 non-transposed dispatcher entry
poyenc Apr 17, 2026
15e4fcc
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 17, 2026
6959f8d
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 20, 2026
41dd838
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 22, 2026
515cb8f
fix(batch_prefill): restore group-mode-only guard in codegen rules
poyenc Apr 22, 2026
a652c87
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 23, 2026
0a6d865
fix(codegen): suppress unreachable-code warning in batch_prefill v3 d…
poyenc Apr 23, 2026
677cd96
fix(codegen): handle unsupported targets gracefully in batch_prefill
poyenc Apr 23, 2026
56ff8a3
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 24, 2026
0bb635e
Merge branch 'develop' into users/poyenc/ck/batch-prefill-v3
poyenc Apr 24, 2026
70ed05b
[CK] Remove unused smem_epilogue_buf in batch prefill v3 kernel
poyenc Apr 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions projects/composablekernel/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
* Added gfx11 support for FMHA.
* Added microscaling (MX) FP8/FP4 support on gfx950 for FMHA forward kernel ("qr" pipeline only).
* Added FP8 per-tensor quantization support for FMHA forward V3 pipeline on gfx950.
* Added new FMHA batch prefill kernel on gfx950 with FP8 per-tensor and per-block KV quantization support.

### Changed

Expand Down

Large diffs are not rendered by default.

104 changes: 104 additions & 0 deletions projects/composablekernel/example/ck_tile/01_fmha/fmha_fwd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1403,6 +1403,110 @@ auto fmha_batch_prefill_create_kargs_and_grids(fmha_batch_prefill_args args)
return ck_tile::make_tuple(kargs, grids);
}

// Host-side helper for the V3 batch-prefill kernel.
//
// Packs the fields of `args` into the kernel-argument object produced by
// FmhaKernel::MakeKargs (group-mode or batch-mode overload, selected at compile
// time by FmhaKernel::kIsGroupMode) and computes the launch grid through
// FmhaKernel::GridSize.
//
// Returns ck_tile::make_tuple(kargs, grids).
template <typename FmhaKernel>
auto fmha_batch_prefill_v3_create_kargs_and_grids(fmha_batch_prefill_args args)
{
    assert(args.nhead_q % args.nhead_k == 0);

    using PageTableKargs = typename FmhaKernel::PageBlockTableKargs;

    // Which fields of `args` feed the page-table kargs depends on the kernel's
    // KV-cache lookup-table flavour.
    constexpr bool kUsesSglang1DPageTable =
        FmhaKernel::kKVLookupTable ==
        ck_tile::BlockAttentionKVCacheLookupTableEnum::SGLANG_PAGE_TABLE_1D;

    const PageTableKargs page_table = [&]() -> PageTableKargs {
        if constexpr(kUsesSglang1DPageTable)
        {
            const auto* indptr         = reinterpret_cast<const int32_t*>(args.kv_indptr);
            const auto* page_indices   = reinterpret_cast<const int32_t*>(args.kv_page_indices);
            const auto* last_page_lens = reinterpret_cast<const int32_t*>(args.kv_last_page_lens);
            return PageTableKargs{indptr, page_indices, last_page_lens};
        }
        else
        {
            const auto* block_table = reinterpret_cast<const int32_t*>(args.kv_page_indices);
            const auto* seqlen_k    = reinterpret_cast<const int32_t*>(args.seqlen_k_ptr);
            return PageTableKargs{block_table, args.batch_stride_block_table, seqlen_k};
        }
    }();

    // Number of query heads per key/value head (divisibility is asserted above).
    const auto nhead_ratio_qk = args.nhead_q / args.nhead_k;

    auto kargs = [&] {
        if constexpr(!FmhaKernel::kIsGroupMode)
        {
            // Batch mode: passes seqlen_q and the full set of batch strides
            // (q, k, v, lse, o).
            return FmhaKernel::MakeKargs(
                // tensor pointers
                args.q_ptr, args.k_ptr, args.v_ptr,
                args.q_descale_ptr, args.k_descale_ptr, args.v_descale_ptr,
                args.lse_ptr, args.o_ptr,
                // problem sizes
                args.seqlen_q, args.hdim_q, args.hdim_v,
                args.nhead_q, nhead_ratio_qk,
                // paged KV description
                args.num_total_pages, args.page_block_size, page_table,
                // softmax parameters
                args.scale_s, args.logits_soft_cap,
                // row strides
                args.stride_q, args.stride_k, args.stride_v, args.stride_o,
                // head strides
                args.nhead_stride_q, args.nhead_stride_k, args.nhead_stride_v,
                args.nhead_stride_lse, args.nhead_stride_o,
                // batch strides
                args.batch_stride_q, args.batch_stride_k, args.batch_stride_v,
                args.batch_stride_lse, args.batch_stride_o,
                // masking
                args.window_size_left, args.window_size_right, args.mask_type,
                // KV block-descale strides
                args.nblock_stride_kv_block_descale, args.nhead_stride_kv_block_descale);
        }
        else
        {
            // Group mode: passes seqstart_q_ptr instead of seqlen_q, and only
            // the K/V batch strides.
            return FmhaKernel::MakeKargs(
                // tensor pointers
                args.q_ptr, args.k_ptr, args.v_ptr,
                args.q_descale_ptr, args.k_descale_ptr, args.v_descale_ptr,
                args.lse_ptr, args.o_ptr,
                // problem sizes
                args.seqstart_q_ptr, args.hdim_q, args.hdim_v,
                args.nhead_q, nhead_ratio_qk,
                // paged KV description
                args.num_total_pages, args.page_block_size, page_table,
                // softmax parameters
                args.scale_s, args.logits_soft_cap,
                // row strides
                args.stride_q, args.stride_k, args.stride_v, args.stride_o,
                // head strides
                args.nhead_stride_q, args.nhead_stride_k, args.nhead_stride_v,
                args.nhead_stride_lse, args.nhead_stride_o,
                // batch strides
                args.batch_stride_k, args.batch_stride_v,
                // masking
                args.window_size_left, args.window_size_right, args.mask_type,
                // KV block-descale strides
                args.nblock_stride_kv_block_descale, args.nhead_stride_kv_block_descale);
        }
    }();

    // Launch grid is derived from batch, query-head count, max query length and V head dim.
    dim3 grids = FmhaKernel::GridSize(args.batch, args.nhead_q, args.max_seqlen_q, args.hdim_v);

    return ck_tile::make_tuple(kargs, grids);
}

// this is used to pattern-match internal kernel implementation, not to instantiate kernel
template <ck_tile::index_t HDim_,
typename DataType_,
Expand Down
2 changes: 2 additions & 0 deletions projects/composablekernel/include/ck_tile/ops/fmha.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "ck_tile/ops/fmha/block/page_block_navigator.hpp"
#include "ck_tile/ops/fmha/block/variants.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_batch_prefill_v3_kernel.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_bwd_kernel.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp"
#include "ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_tile_partitioner.hpp"
Expand All @@ -24,6 +25,7 @@
#include "ck_tile/ops/fmha/kernel/fmha_fwd_v3_kernel.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_pipeline_qr_ks_vs_async_default_policy.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_batch_prefill_v3_pipeline.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_convert_dq.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dot_do_o.hpp"
#include "ck_tile/ops/fmha/pipeline/block_fmha_bwd_dq_dk_dv_pipeline_kr_ktr_vr.hpp"
Expand Down
Loading
Loading