From 34a8a8cce8c9c1a8772650e9193e36f47b6030db Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Wed, 24 Jun 2026 18:41:42 -0700 Subject: [PATCH 1/9] Add multi-gpu CI for cudax --- ci/matrix.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 3653930095e..b47bbabbcd4 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -61,6 +61,7 @@ workflows: - {jobs: ['test'], project: ['libcudacxx', 'cudax'], std: 'max', gpu: 'h100' } # Multi-GPU coverage: - {jobs: ['test'], project: 'libcudacxx', std: 'max', gpu: 'h100_2gpu', sm: 'gpu'} + - {jobs: ['test'], project: 'cudax', std: 'max', gpu: 'h100_2gpu', sm: 'gpu'} # RTX PRO 6000 coverage (limited due to small number of runners): - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'rtxpro6000'} # Misc: From b95276ba41688deaec9a8265c037205952267144 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Thu, 25 Jun 2026 07:58:11 +0200 Subject: [PATCH 2/9] cudax: run green context graph test on all devices --- cudax/test/stf/green_context/cuda_graph.cu | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/cudax/test/stf/green_context/cuda_graph.cu b/cudax/test/stf/green_context/cuda_graph.cu index 6e13261bfc9..9676c33e4c5 100644 --- a/cudax/test/stf/green_context/cuda_graph.cu +++ b/cudax/test/stf/green_context/cuda_graph.cu @@ -11,6 +11,8 @@ #include #include +#include + using namespace cuda::experimental::stf; // Green contexts are only supported since CUDA 12.4 @@ -56,15 +58,23 @@ int main() auto handle_Y = ctx.logical_data(make_slice(&Y[0], n)); // The green_context_helper class automates the creation of green context views - green_context_helper gc(num_sms); + std::vector gc(ndevs); + for (int devid = 0; devid < ndevs; devid++) + { + gc[devid] = green_context_helper(num_sms, devid); + } for (int iter = 0; iter < NITER; iter++) { - auto cnt = gc.get_count(); - ctx.task(exec_place::green_ctx(gc.get_view(iter % cnt)), handle_X.read(), handle_Y.rw()) - ->*[&](cudaStream_t stream, auto dX, auto dY) { - axpy<<<16, 16, 0, stream>>>(alpha, dX, dY); - }; + for (int devid = 0; devid < ndevs; devid++) + { + auto& g_ctx = gc[devid]; + auto cnt = g_ctx.get_count(); + ctx.task(exec_place::green_ctx(g_ctx.get_view(iter % cnt)), handle_X.read(), handle_Y.rw()) + ->*[&](cudaStream_t stream, auto dX, auto dY) { + axpy<<<16, 16, 0, stream>>>(alpha, dX, dY); + }; + } } ctx.host_launch(handle_X.read(), handle_Y.read())->*[&](auto hX, auto hY) { From 9ea42e29d169a607e4172aef76a58c917863f794 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Thu, 25 Jun 2026 09:23:37 +0200 Subject: [PATCH 3/9] cudax: use per-place streams for grid parallel_for --- .../__stf/internal/parallel_for_scope.cuh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index cf7b79654ae..173a30038cf 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -687,7 +687,7 @@ public: { auto active = t.activate_place(i); const auto sub_shape = partitioner_t::apply(shape, pos4(i), e_place.get_dims()); - do_parallel_for(f, active.place(), sub_shape, t); + do_parallel_for(f, active.place(), sub_shape, t, i); } } } @@ -897,7 +897,11 @@ public: // Executes the loop on a device, or use the host implementation template void do_parallel_for( - Fun&& f, const exec_place& sub_exec_place, const sub_shape_t& sub_shape, typename context::task_type& t) + Fun&& f, + const exec_place& sub_exec_place, + const sub_shape_t& sub_shape, + typename context::task_type& t, + size_t place_index = 0) { // parallel_for never calls this function with a host. _CCCL_ASSERT(sub_exec_place != exec_place::host(), "Internal CUDASTF error."); @@ -905,7 +909,7 @@ public: if (sub_exec_place == exec_place::device_auto()) { // We have all latitude - recurse with the current device. - return do_parallel_for(::std::forward(f), exec_place::current_device(), sub_shape, t); + return do_parallel_for(::std::forward(f), exec_place::current_device(), sub_shape, t, place_index); } using Fun_no_ref = ::std::remove_reference_t; @@ -941,7 +945,7 @@ public: if constexpr (::std::is_same_v) { reserved::loop - <<(blocks), static_cast(block_size), 0, t.get_stream()>>>( + <<(blocks), static_cast(block_size), 0, t.get_stream(place_index)>>>( static_cast(n), sub_shape, mv(f), arg_instances); } else if constexpr (::std::is_same_v) From 9f1bffa4d8d67c8224c60b4e2d4a45b051ac8bab Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Thu, 25 Jun 2026 09:29:42 +0200 Subject: [PATCH 4/9] cudax: format grid parallel_for stream fix --- .../__stf/internal/parallel_for_scope.cuh | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index 173a30038cf..c3ba1741b7e 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -896,12 +896,11 @@ public: // Executes the loop on a device, or use the host implementation template - void do_parallel_for( - Fun&& f, - const exec_place& sub_exec_place, - const sub_shape_t& sub_shape, - typename context::task_type& t, - size_t place_index = 0) + void do_parallel_for(Fun&& f, + const exec_place& sub_exec_place, + const sub_shape_t& sub_shape, + typename context::task_type& t, + size_t place_index = 0) { // parallel_for never calls this function with a host. _CCCL_ASSERT(sub_exec_place != exec_place::host(), "Internal CUDASTF error."); From 107f745a663aa7d329a19dedbdc4c46005a5e0c4 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Thu, 25 Jun 2026 10:40:37 +0200 Subject: [PATCH 5/9] cudax: keep gc_grid data placement managed --- cudax/test/stf/green_context/gc_grid.cu | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cudax/test/stf/green_context/gc_grid.cu b/cudax/test/stf/green_context/gc_grid.cu index 7ae77b1c152..89fd5450d62 100644 --- a/cudax/test/stf/green_context/gc_grid.cu +++ b/cudax/test/stf/green_context/gc_grid.cu @@ -99,7 +99,14 @@ int main() for (int iter = 0; iter < NITER; iter++) { - ctx.parallel_for(blocked_partition(), where, handle_X.shape(), handle_X.rw(), handle_Y.read()) + // This test targets green-context grid execution. Keep the data placement + // independent from the grid to avoid also testing localized VMM mappings. + ctx.parallel_for( + blocked_partition(), + where, + handle_X.shape(), + handle_X.rw(data_place::managed()), + handle_Y.read(data_place::managed())) ->*[] __device__(size_t i, auto x, auto y) { x(i) += y(i); }; From 13a8057d9bf97f349c9480f8e6c182806457e6b3 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Thu, 25 Jun 2026 10:54:11 +0200 Subject: [PATCH 6/9] cudax: add managed green-context grid probe --- cudax/test/stf/CMakeLists.txt | 1 + cudax/test/stf/green_context/gc_grid.cu | 9 +- .../test/stf/green_context/gc_grid_managed.cu | 116 ++++++++++++++++++ 3 files changed, 118 insertions(+), 8 deletions(-) create mode 100644 cudax/test/stf/green_context/gc_grid_managed.cu diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index 6405723f09d..eca64787382 100644 --- a/cudax/test/stf/CMakeLists.txt +++ b/cudax/test/stf/CMakeLists.txt @@ -152,6 +152,7 @@ set( green_context/axpy_gc.cu green_context/cuda_graph.cu green_context/gc_grid.cu + green_context/gc_grid_managed.cu hash/ctx_hash.cu hash/logical_data.cu hashtable/fusion.cu diff --git a/cudax/test/stf/green_context/gc_grid.cu b/cudax/test/stf/green_context/gc_grid.cu index 89fd5450d62..7ae77b1c152 100644 --- a/cudax/test/stf/green_context/gc_grid.cu +++ b/cudax/test/stf/green_context/gc_grid.cu @@ -99,14 +99,7 @@ int main() for (int iter = 0; iter < NITER; iter++) { - // This test targets green-context grid execution. Keep the data placement - // independent from the grid to avoid also testing localized VMM mappings. - ctx.parallel_for( - blocked_partition(), - where, - handle_X.shape(), - handle_X.rw(data_place::managed()), - handle_Y.read(data_place::managed())) + ctx.parallel_for(blocked_partition(), where, handle_X.shape(), handle_X.rw(), handle_Y.read()) ->*[] __device__(size_t i, auto x, auto y) { x(i) += y(i); }; diff --git a/cudax/test/stf/green_context/gc_grid_managed.cu b/cudax/test/stf/green_context/gc_grid_managed.cu new file mode 100644 index 00000000000..11760adbff6 --- /dev/null +++ b/cudax/test/stf/green_context/gc_grid_managed.cu @@ -0,0 +1,116 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +// Green contexts are only supported since CUDA 12.4 +#if _CCCL_CTK_AT_LEAST(12, 4) +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + size_t n = x.extent(0); + for (int ind = tid; ind < n; ind += nthreads) + { + y(ind) += a * x(ind); + } +} + +void debug_info(cudaStream_t stream, CUgreenCtx g_ctx) +{ + // Get the green context associated to that CUDA stream + CUgreenCtx stream_cugc; + cuda_safe_call(cuStreamGetGreenCtx(CUstream(stream), &stream_cugc)); + assert(stream_cugc != nullptr); + + CUcontext stream_green_primary; + CUcontext place_green_primary; + + unsigned long long stream_ctxId; + unsigned long long place_ctxId; + + // Convert green contexts to primary contexts and get their ID + cuda_safe_call(cuCtxFromGreenCtx(&stream_green_primary, stream_cugc)); + cuda_safe_call(cuCtxGetId(stream_green_primary, &stream_ctxId)); + + cuda_safe_call(cuCtxFromGreenCtx(&place_green_primary, g_ctx)); + cuda_safe_call(cuCtxGetId(place_green_primary, &place_ctxId)); + + // Make sure the stream belongs to the same green context as the execution place + EXPECT(stream_ctxId == place_ctxId); +} +#endif // _CCCL_CTK_AT_LEAST(12, 4) + +int main() +{ +#if _CCCL_CTK_BELOW(12, 4) + fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n"); + return 0; +#else // ^^^ _CCCL_CTK_BELOW(12, 4) ^^^ / vvv _CCCL_CTK_AT_LEAST(12, 4) vvv + int ndevs; + const int num_sms = 8; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + stream_ctx ctx; + + int NITER = 8; + const int n = 16 * 1024 * 1024; + + std::vector X(n); + std::vector Y(n); + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + Y[ind] = 2.0 * ind - 3.0; + } + + auto handle_X = ctx.logical_data(make_slice(&X[0], n)); + auto handle_Y = ctx.logical_data(make_slice(&Y[0], n)); + + std::vector exec_places; + + // The green_context_helper class automates the creation of green context views + std::vector gc(ndevs); + for (int devid = 0; devid < ndevs; devid++) + { + gc[devid] = green_context_helper(num_sms, devid); + + auto& g_ctx = gc[devid]; + auto cnt = g_ctx.get_count(); + for (size_t i = 0; i < cnt; i++) + { + exec_places.push_back(exec_place::green_ctx(g_ctx.get_view(i))); + } + } + + auto where = make_grid(exec_places); + + for (int iter = 0; iter < NITER; iter++) + { + // Probe green-context grid execution independently from composite VMM mappings. + ctx.parallel_for( + blocked_partition(), + where, + handle_X.shape(), + handle_X.rw(data_place::managed()), + handle_Y.read(data_place::managed())) + ->*[] __device__(size_t i, auto x, auto y) { + x(i) += y(i); + }; + } + + ctx.finalize(); +#endif // ^^^ _CCCL_CTK_AT_LEAST(12, 4) ^^^ +} From 02492c87e176712b6ef4178e224fe1abd393f7f1 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Thu, 25 Jun 2026 13:44:48 +0200 Subject: [PATCH 7/9] ci: target green-context grid debug on 2-GPU H100 --- ci/matrix.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index b47bbabbcd4..8ddb9ec7724 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -22,6 +22,8 @@ workflows: # # IMPORTANT: Do NOT delete or remove the `override:` key below, even when it is empty. override: + - { jobs: ['run_gpu'], project: 'target', std: 20, ctk: '13.X', cxx: 'gcc', gpu: 'h100_2gpu', sm: 'gpu', + args: '--preset cudax-cpp20 --build-targets "cudax.test.stf.green_context.gc_grid cudax.test.stf.green_context.gc_grid_managed" --custom-test-cmd "CUDA_LAUNCH_BLOCKING=1 CUDA_LOG_FILE=stderr ctest --test-dir \"${BUILD_DIR}\" -R \"cudax.test.stf.green_context.gc_grid\" -V --output-on-failure"' } pull_request: # Old CTK: Oldest/newest supported host compilers: From 3684197b4271e725f37029cf9474de268598d75c Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Thu, 25 Jun 2026 15:06:05 +0200 Subject: [PATCH 8/9] ci: defer BUILD_DIR expansion in debug override --- ci/matrix.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 8ddb9ec7724..115a0c92b21 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -23,7 +23,7 @@ workflows: # IMPORTANT: Do NOT delete or remove the `override:` key below, even when it is empty. override: - { jobs: ['run_gpu'], project: 'target', std: 20, ctk: '13.X', cxx: 'gcc', gpu: 'h100_2gpu', sm: 'gpu', - args: '--preset cudax-cpp20 --build-targets "cudax.test.stf.green_context.gc_grid cudax.test.stf.green_context.gc_grid_managed" --custom-test-cmd "CUDA_LAUNCH_BLOCKING=1 CUDA_LOG_FILE=stderr ctest --test-dir \"${BUILD_DIR}\" -R \"cudax.test.stf.green_context.gc_grid\" -V --output-on-failure"' } + args: '--preset cudax-cpp20 --build-targets "cudax.test.stf.green_context.gc_grid cudax.test.stf.green_context.gc_grid_managed" --custom-test-cmd "CUDA_LAUNCH_BLOCKING=1 CUDA_LOG_FILE=stderr ctest --test-dir \"\${BUILD_DIR}\" -R \"cudax.test.stf.green_context.gc_grid\" -V --output-on-failure"' } pull_request: # Old CTK: Oldest/newest supported host compilers: From 8e0fd85d6bf79a828f9f7bfd4a15bd0527e6d188 Mon Sep 17 00:00:00 2001 From: Cedric AUGONNET Date: Fri, 26 Jun 2026 13:35:13 +0200 Subject: [PATCH 9/9] ci: filter generated args for target jobs --- ci/run_cpu_target.sh | 28 ++++++++++++++++++++++++---- ci/run_gpu_target.sh | 28 ++++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/ci/run_cpu_target.sh b/ci/run_cpu_target.sh index 8195188749b..f150e81d20f 100755 --- a/ci/run_cpu_target.sh +++ b/ci/run_cpu_target.sh @@ -9,12 +9,32 @@ set -euo pipefail ci_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) repo_dir=$(cd "${ci_dir}/.." && pwd) -user_args=("$@") -set -- +build_common_args=() +target_args=() +while [[ "$#" -gt 0 ]]; do + case "$1" in + -v | --verbose | -verbose | -pedantic | --pedantic) + build_common_args+=("$1") + shift + ;; + -std | -arch | -cuda | -cxx | -cmake-options) + build_common_args+=("$1" "$2") + shift 2 + ;; + *) + target_args+=("$1") + shift + ;; + esac +done + +set -- "${build_common_args[@]}" source "${ci_dir}/build_common.sh" -set -- "${user_args[@]}" cd "${repo_dir}" -cmd=("${ci_dir}/util/build_and_test_targets.sh" "$@") +cmd=("${ci_dir}/util/build_and_test_targets.sh" "${target_args[@]}") +if [[ "${#GLOBAL_CMAKE_OPTIONS[@]}" -gt 0 ]]; then + cmd+=(--cmake-options "${GLOBAL_CMAKE_OPTIONS[*]}") +fi printf '\033[34m%s\033[0m\n' "${cmd[*]}" "${cmd[@]}" diff --git a/ci/run_gpu_target.sh b/ci/run_gpu_target.sh index 4bc25bfa20d..37a665b3027 100755 --- a/ci/run_gpu_target.sh +++ b/ci/run_gpu_target.sh @@ -9,12 +9,32 @@ set -euo pipefail ci_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) repo_dir=$(cd "${ci_dir}/.." && pwd) -user_args=("$@") -set -- +build_common_args=() +target_args=() +while [[ "$#" -gt 0 ]]; do + case "$1" in + -v | --verbose | -verbose | -pedantic | --pedantic) + build_common_args+=("$1") + shift + ;; + -std | -arch | -cuda | -cxx | -cmake-options) + build_common_args+=("$1" "$2") + shift 2 + ;; + *) + target_args+=("$1") + shift + ;; + esac +done + +set -- "${build_common_args[@]}" source "${ci_dir}/build_common.sh" -set -- "${user_args[@]}" cd "${repo_dir}" -cmd=("${ci_dir}/util/build_and_test_targets.sh" "$@") +cmd=("${ci_dir}/util/build_and_test_targets.sh" "${target_args[@]}") +if [[ "${#GLOBAL_CMAKE_OPTIONS[@]}" -gt 0 ]]; then + cmd+=(--cmake-options "${GLOBAL_CMAKE_OPTIONS[*]}") +fi printf '\033[34m%s\033[0m\n' "${cmd[*]}" "${cmd[@]}"