Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions projects/composablekernel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ option(FORCE_DISABLE_WMMA "Skip compiling WMMA specific instances (even if suppo
option(BUILD_CK_TILE_ENGINE "Build the tile_engine subdirectory" ON)
option(BUILD_CK_EXAMPLES "Build the example subdirectory" ON)
option(BUILD_CK_TUTORIALS "Build the tutorial subdirectory" ON)
option(CK_ENABLE_ROCM_CK "Build rocm_ck API" OFF)

if(CK_EXPERIMENTAL_BUILDER)
add_definitions(-DCK_EXPERIMENTAL_BUILDER)
Expand Down Expand Up @@ -754,6 +755,12 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY AND NOT HIPTE
if(BUILD_CK_TILE_ENGINE)
add_subdirectory(tile_engine)
endif()
if(CK_ENABLE_ROCM_CK)
add_subdirectory(rocm_ck)
if(TARGET check)
add_dependencies(check build-smoke-rocm-ck)
endif()
endif()
if(BUILD_TESTING)
rocm_package_setup_component(tests
LIBRARY_NAME composablekernel
Expand Down
13 changes: 13 additions & 0 deletions projects/composablekernel/Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,9 @@ def cmake_build(Map conf=[:]){
if (params.RUN_BUILDER_TESTS && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) {
setup_args = " -D CK_EXPERIMENTAL_BUILDER=ON " + setup_args
}
if (params.RUN_ROCM_CK_TESTS) {
setup_args = " -D CK_ENABLE_ROCM_CK=ON " + setup_args
}
setup_cmd = conf.get(
"setup_cmd",
"""${cmake_envs} cmake -G Ninja ${setup_args} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_CXX_FLAGS=" -O3 " .. """
Expand Down Expand Up @@ -837,6 +840,9 @@ def cmake_build(Map conf=[:]){
if (params.RUN_BUILDER_TESTS && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) {
sh 'ninja check-builder'
}
if (params.RUN_ROCM_CK_TESTS) {
sh 'ninja check-rocm-ck'
}
if(params.BUILD_PACKAGES){
echo "Build ckProfiler packages"
sh 'ninja -j64 package'
Expand Down Expand Up @@ -876,6 +882,9 @@ def cmake_build(Map conf=[:]){
if (params.RUN_BUILDER_TESTS && !setup_args.contains("-DCK_CXX_STANDARD=") && !setup_args.contains("gfx10") && !setup_args.contains("gfx11")) {
sh 'ninja check-builder'
}
if (params.RUN_ROCM_CK_TESTS) {
sh 'ninja check-rocm-ck'
}
if(params.BUILD_PACKAGES){
echo "Build ckProfiler packages"
sh 'ninja -j64 package'
Expand Down Expand Up @@ -1425,6 +1434,10 @@ pipeline {
name: "RUN_BUILDER_TESTS",
defaultValue: false,
description: "Run CK_BUILDER tests (default: OFF)")
booleanParam(
name: "RUN_ROCM_CK_TESTS",
defaultValue: true,
description: "Run rocm_ck tests (default: ON)")
booleanParam(
name: "RUN_ALL_UNIT_TESTS",
defaultValue: false,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,7 @@ bool run_grouped_conv_fwd(bool do_verification,

ref_invoker.Run(ref_argument);

out_host.ForEach([&](auto&, auto idx)
{
out_element_op(out_host(idx), c_host(idx));
});
out_host.ForEach([&](auto&, auto idx) { out_element_op(out_host(idx), c_host(idx)); });

out_device_buf.FromDevice(out_device.mData.data());

Expand Down
29 changes: 29 additions & 0 deletions projects/composablekernel/rocm_ck/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
#
# rocm_ck — constexpr schema API over CK Tile device kernels.
#
# Build from CK root:
# cmake -B build -S . -G Ninja -DCK_ENABLE_ROCM_CK=ON
# ninja -C build smoke-rocm-ck

# rocm_ck — header-only INTERFACE library
add_library(rocm_ck INTERFACE)
target_include_directories(rocm_ck INTERFACE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
)
# rocm_ck requires C++20, but the CK library is still migrating from c++17
# We should remove this once the library fully migrates to c++20.
target_compile_features(rocm_ck INTERFACE cxx_std_20)
target_compile_options(rocm_ck INTERFACE -Wno-c++20-compat)

# CK Tile headers — required for device code compilation
if(NOT TARGET ck_tile_headers)
add_library(ck_tile_headers INTERFACE)
target_include_directories(ck_tile_headers INTERFACE
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
)
endif()

enable_testing()
add_subdirectory(tests)
113 changes: 113 additions & 0 deletions projects/composablekernel/rocm_ck/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
# rocm_ck

A C++20 constexpr API for configuring and distributing
[CK Tile](../include/ck_tile/) GPU kernels across multiple architectures.

> **Status**: Early development. The current code establishes the directory
> structure, build integration, and CI pipeline. A single unit test verifies
> that the build and test infrastructure works end-to-end in Jenkins.
> The schema types, device bridge, and kernel tests described below are
> under active development.

## Why rocm_ck exists

CK Tile kernels are C++ templates. A GEMM kernel's tile size, pipeline
strategy, data types, and epilogue are all template parameters — fixed at
compile time. This is excellent for performance (zero-overhead abstraction,
full inlining), but it creates a problem for multi-architecture distribution:
the host program must be compiled separately from device code, and the host
compiler must never see CK Tile headers.

rocm_ck solves this by introducing a **host-device boundary** built on
constexpr data rather than template parameters:

1. **On the host side**, kernel configurations are plain C++20 structs
(`Signature`, `Algorithm`, `GemmSpec`). These are constexpr data —
they describe *what* to compute and *how*, without instantiating any
templates. Host code reasons about kernels using values, not types.

2. **On the device side**, a thin bridge layer lowers these constexpr
descriptions into CK Tile template instantiations. Each `GemmSpec`
maps to exactly one `ck_tile::GemmPipeline<...>` specialization.

3. **At the boundary**, pre-compiled kernels are packaged into
[kpack archives](https://github.com/ROCm/TheRock/blob/main/docs/rfcs/RFC0008-Multi-Arch-Packaging.md) —
self-describing, compressed, multi-architecture bundles. The host loads kernels at runtime
by matching a `GemmSpec` against the kpack table of contents. No
recompilation, no template instantiation on the host.

This separation is what makes CK Tile viable in
[TheRock](https://github.com/ROCm/TheRock)'s multi-arch build system,
where a single host binary must work with device code compiled for
many GPU targets (e.g. gfx90a, gfx942, gfx1151).

## The constexpr schema model

Traditional GPU kernel libraries select kernels through template
parameters or runtime enums. rocm_ck uses a third approach: **constexpr
structs that are validated at compile time and lowered to templates on
the device side.**

A kernel configuration has two axes:

- **Signature** — *what* the kernel computes: a directed graph of
operators (`GemmOp`, `AddOp`, `ReluOp`, ...) connecting named tensor
slots. Data types, layouts, and batch dimensions are part of the
signature.

- **Algorithm** — *how* the kernel computes it: tile geometry, pipeline
strategy, warp layout, padding, and scheduling. These are tuning
parameters that don't change the mathematical result.

The `Signature` and `Algorithm` are plain aggregate structs with
designated initializers — no constructors, no inheritance, no runtime
polymorphism. Validation happens in `consteval` functions: invalid
configurations (unsupported tile size, incompatible data types, missing
tensor slots) fail at compile time with actionable error messages.

Here is a preview of the API direction (not yet implemented):

```cpp
// Host side — pure constexpr, any C++20 compiler, no CK headers
constexpr Signature sig = {
.dtype = DataType::FP16,
.ops = {
GemmOp{.lhs = "A", .rhs = "B", .out = "C"},
AddOp{.lhs = "C", .rhs = "bias", .out = "D"},
ReluOp{.in = "D", .out = "E"},
},
};

// Device side — make_kernel lowers to a CK Tile template instantiation.
// Compiled separately per architecture, packaged into .kpack archives.
```

## Directory layout

```text
rocm_ck/
├── CMakeLists.txt # INTERFACE library, C++20, ck_tile_headers target
├── include/rocm_ck/ # Public headers — host-safe, no CK/HIP deps
├── src/ # (planned) Device bridge, kpack loading
└── tests/
├── CMakeLists.txt # Test tiers: ROCM_CK_SMOKE, ROCM_CK_KERNEL
├── unit/ # Fast host-only tests (< 1s, no GPU)
└── kernel/ # (planned) GPU kernel tests
```

## Build

rocm_ck is a CK feature, gated by `CK_ENABLE_ROCM_CK`:

```bash
cd composablekernel
cmake -B build -S . -G Ninja \
-DCK_ENABLE_ROCM_CK=ON \
-DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++

ninja -C build smoke-rocm-ck # host-only smoke tests
ninja -C build check-rocm-ck # all rocm_ck tests
ctest --test-dir build -L ROCM_CK_SMOKE --output-on-failure
```

Default CK builds (`CK_ENABLE_ROCM_CK=OFF`) are unaffected.
17 changes: 17 additions & 0 deletions projects/composablekernel/rocm_ck/include/rocm_ck/index_t.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
// Role: types — index_t, long_index_t. No runtime, no CK deps.

#pragma once

#include <cstdint>

namespace rocm_ck {

// Matches ck_tile::index_t without pulling in CK Tile headers.
using index_t = std::int32_t;

// batch_stride * nhead can exceed int32. Matches ck_tile::long_index_t.
using long_index_t = std::int64_t;

} // namespace rocm_ck
67 changes: 67 additions & 0 deletions projects/composablekernel/rocm_ck/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT
#
# rocm_ck tests
#
# Test tiers:
# ROCM_CK_SMOKE — Fast host-only tests (< 1s total). No GPU, no HIP.
# ROCM_CK_KERNEL — GPU kernel tests. Require HIP and a GPU.
#
# Usage:
# ninja smoke-rocm-ck # build + run smoke tests
# ninja build-smoke-rocm-ck # build only (no run)
# ninja check-rocm-ck # run all rocm_ck tests
#
# ctest -L ROCM_CK_SMOKE --output-on-failure

# Google Test (via CK's FetchContent wrapper)
include(${CMAKE_CURRENT_SOURCE_DIR}/../../cmake/gtest.cmake)

# ---------------------------------------------------------------------------
# Helper function — reusable per-test setup
# ---------------------------------------------------------------------------
function(add_rocm_ck_test test_name)
add_executable(${test_name} ${ARGN})
target_link_libraries(${test_name} PRIVATE rocm_ck GTest::gtest_main)
target_compile_options(${test_name} PRIVATE
-Wno-global-constructors # GTest registration macros
-Wno-undef # GTest internal headers
)
endfunction()

# ---------------------------------------------------------------------------
# Smoke tests (fast, host-only, no GPU)
# ---------------------------------------------------------------------------
set(ROCM_CK_SMOKE_TESTS
unit/unit_index_t.cpp
)

set(ROCM_CK_SMOKE_TARGETS)
foreach(test_source ${ROCM_CK_SMOKE_TESTS})
get_filename_component(test_name ${test_source} NAME_WLE)
set(target_name "rocm_ck_${test_name}")
add_rocm_ck_test(${target_name} ${test_source})
add_test(NAME ${target_name} COMMAND ${target_name})
set_tests_properties(${target_name} PROPERTIES LABELS "ROCM_CK_SMOKE")
list(APPEND ROCM_CK_SMOKE_TARGETS ${target_name})
endforeach()

# rocm_ck_unit_index_t verifies rocm_ck index types match ck_tile
target_include_directories(rocm_ck_unit_index_t PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)

# ---------------------------------------------------------------------------
# Convenience targets
# ---------------------------------------------------------------------------
add_custom_target(build-smoke-rocm-ck DEPENDS ${ROCM_CK_SMOKE_TARGETS})

add_custom_target(smoke-rocm-ck
COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -L "ROCM_CK_SMOKE"
DEPENDS build-smoke-rocm-ck
USES_TERMINAL
COMMENT "Running rocm_ck smoke tests...")

add_custom_target(check-rocm-ck
COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -L "ROCM_CK"
DEPENDS build-smoke-rocm-ck
USES_TERMINAL
COMMENT "Running all rocm_ck tests...")
33 changes: 33 additions & 0 deletions projects/composablekernel/rocm_ck/tests/unit/unit_index_t.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT

#include <rocm_ck/index_t.hpp>

#include <ck_tile/core/numeric/integer.hpp>

#include <gtest/gtest.h>

using ::rocm_ck::index_t;
using ::rocm_ck::long_index_t;

namespace {

TEST(IndexTypes, IndexTypeIs32Bit) { EXPECT_EQ(sizeof(index_t), 4); }

TEST(IndexTypes, LongIndexTypeIs64Bit) { EXPECT_EQ(sizeof(long_index_t), 8); }

TEST(IndexTypes, IndexTypeIsSigned) { EXPECT_TRUE(index_t(-1) < 0); }

TEST(IndexTypes, LongIndexTypeIsSigned) { EXPECT_TRUE(long_index_t(-1) < 0); }

TEST(IndexTypes, MatchesCkTileIndexType)
{
EXPECT_TRUE((std::is_same_v<index_t, ck_tile::index_t>));
}

TEST(IndexTypes, MatchesCkTileLongIndexType)
{
EXPECT_TRUE((std::is_same_v<long_index_t, ck_tile::long_index_t>));
}

} // namespace
Loading