From 34a8a8cce8c9c1a8772650e9193e36f47b6030db Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Wed, 24 Jun 2026 18:41:42 -0700
Subject: [PATCH 1/9] Add multi-gpu CI for cudax

---
 ci/matrix.yaml | 1 +
 1 file changed, 1 insertion(+)
diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 3653930095e..b47bbabbcd4 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -61,6 +61,7 @@ workflows:
     - {jobs: ['test'],                    project: ['libcudacxx', 'cudax'], std: 'max', gpu: 'h100' }
     # Multi-GPU coverage:
     - {jobs: ['test'], project: 'libcudacxx', std: 'max', gpu: 'h100_2gpu', sm: 'gpu'}
+    - {jobs: ['test'], project: 'cudax',      std: 'max', gpu: 'h100_2gpu', sm: 'gpu'}
     # RTX PRO 6000 coverage (limited due to small number of runners):
     - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'rtxpro6000'}
     # Misc:

From b95276ba41688deaec9a8265c037205952267144 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Thu, 25 Jun 2026 07:58:11 +0200
Subject: [PATCH 2/9] cudax: run green context graph test on all devices

---
 cudax/test/stf/green_context/cuda_graph.cu | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/cudax/test/stf/green_context/cuda_graph.cu b/cudax/test/stf/green_context/cuda_graph.cu
index 6e13261bfc9..9676c33e4c5 100644
--- a/cudax/test/stf/green_context/cuda_graph.cu
+++ b/cudax/test/stf/green_context/cuda_graph.cu
@@ -11,6 +11,8 @@
 #include <cuda/experimental/__places/exec/green_context.cuh>
 #include <cuda/experimental/stf.cuh>
 
+#include <vector>
+
 using namespace cuda::experimental::stf;
 
 // Green contexts are only supported since CUDA 12.4
@@ -56,15 +58,23 @@ int main()
   auto handle_Y = ctx.logical_data(make_slice(&Y[0], n));
 
   // The green_context_helper class automates the creation of green context views
-  green_context_helper gc(num_sms);
+  std::vector<green_context_helper> gc(ndevs);
+  for (int devid = 0; devid < ndevs; devid++)
+  {
+    gc[devid] = green_context_helper(num_sms, devid);
+  }
 
   for (int iter = 0; iter < NITER; iter++)
   {
-    auto cnt = gc.get_count();
-    ctx.task(exec_place::green_ctx(gc.get_view(iter % cnt)), handle_X.read(), handle_Y.rw())
-        ->*[&](cudaStream_t stream, auto dX, auto dY) {
-              axpy<<<16, 16, 0, stream>>>(alpha, dX, dY);
-            };
+    for (int devid = 0; devid < ndevs; devid++)
+    {
+      auto& g_ctx = gc[devid];
+      auto cnt    = g_ctx.get_count();
+      ctx.task(exec_place::green_ctx(g_ctx.get_view(iter % cnt)), handle_X.read(), handle_Y.rw())
+          ->*[&](cudaStream_t stream, auto dX, auto dY) {
+                axpy<<<16, 16, 0, stream>>>(alpha, dX, dY);
+              };
+    }
   }
 
   ctx.host_launch(handle_X.read(), handle_Y.read())->*[&](auto hX, auto hY) {

From 9ea42e29d169a607e4172aef76a58c917863f794 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Thu, 25 Jun 2026 09:23:37 +0200
Subject: [PATCH 3/9] cudax: use per-place streams for grid parallel_for

---
 .../__stf/internal/parallel_for_scope.cuh            | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
index cf7b79654ae..173a30038cf 100644
--- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
@@ -687,7 +687,7 @@ public:
           {
             auto active          = t.activate_place(i);
             const auto sub_shape = partitioner_t::apply(shape, pos4(i), e_place.get_dims());
-            do_parallel_for(f, active.place(), sub_shape, t);
+            do_parallel_for(f, active.place(), sub_shape, t, i);
           }
         }
       }
@@ -897,7 +897,11 @@ public:
   // Executes the loop on a device, or use the host implementation
   template <typename Fun, typename sub_shape_t>
   void do_parallel_for(
-    Fun&& f, const exec_place& sub_exec_place, const sub_shape_t& sub_shape, typename context::task_type& t)
+    Fun&& f,
+    const exec_place& sub_exec_place,
+    const sub_shape_t& sub_shape,
+    typename context::task_type& t,
+    size_t place_index = 0)
   {
     // parallel_for never calls this function with a host.
     _CCCL_ASSERT(sub_exec_place != exec_place::host(), "Internal CUDASTF error.");
@@ -905,7 +909,7 @@ public:
     if (sub_exec_place == exec_place::device_auto())
     {
       // We have all latitude - recurse with the current device.
-      return do_parallel_for(::std::forward<Fun>(f), exec_place::current_device(), sub_shape, t);
+      return do_parallel_for(::std::forward<Fun>(f), exec_place::current_device(), sub_shape, t, place_index);
     }
 
     using Fun_no_ref = ::std::remove_reference_t<Fun>;
@@ -941,7 +945,7 @@ public:
     if constexpr (::std::is_same_v<context, stream_ctx>)
     {
       reserved::loop<Fun_no_ref, sub_shape_t, deps_tup_t>
-        <<<static_cast<int>(blocks), static_cast<int>(block_size), 0, t.get_stream()>>>(
+        <<<static_cast<int>(blocks), static_cast<int>(block_size), 0, t.get_stream(place_index)>>>(
           static_cast<int>(n), sub_shape, mv(f), arg_instances);
     }
     else if constexpr (::std::is_same_v<context, graph_ctx>)

From 9f1bffa4d8d67c8224c60b4e2d4a45b051ac8bab Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Thu, 25 Jun 2026 09:29:42 +0200
Subject: [PATCH 4/9] cudax: format grid parallel_for stream fix

---
 .../__stf/internal/parallel_for_scope.cuh             | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
index 173a30038cf..c3ba1741b7e 100644
--- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh
@@ -896,12 +896,11 @@ public:
 
   // Executes the loop on a device, or use the host implementation
   template <typename Fun, typename sub_shape_t>
-  void do_parallel_for(
-    Fun&& f,
-    const exec_place& sub_exec_place,
-    const sub_shape_t& sub_shape,
-    typename context::task_type& t,
-    size_t place_index = 0)
+  void do_parallel_for(Fun&& f,
+                       const exec_place& sub_exec_place,
+                       const sub_shape_t& sub_shape,
+                       typename context::task_type& t,
+                       size_t place_index = 0)
   {
     // parallel_for never calls this function with a host.
     _CCCL_ASSERT(sub_exec_place != exec_place::host(), "Internal CUDASTF error.");

From 107f745a663aa7d329a19dedbdc4c46005a5e0c4 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Thu, 25 Jun 2026 10:40:37 +0200
Subject: [PATCH 5/9] cudax: keep gc_grid data placement managed

---
 cudax/test/stf/green_context/gc_grid.cu | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/cudax/test/stf/green_context/gc_grid.cu b/cudax/test/stf/green_context/gc_grid.cu
index 7ae77b1c152..89fd5450d62 100644
--- a/cudax/test/stf/green_context/gc_grid.cu
+++ b/cudax/test/stf/green_context/gc_grid.cu
@@ -99,7 +99,14 @@ int main()
 
   for (int iter = 0; iter < NITER; iter++)
   {
-    ctx.parallel_for(blocked_partition(), where, handle_X.shape(), handle_X.rw(), handle_Y.read())
+    // This test targets green-context grid execution. Keep the data placement
+    // independent from the grid to avoid also testing localized VMM mappings.
+    ctx.parallel_for(
+      blocked_partition(),
+      where,
+      handle_X.shape(),
+      handle_X.rw(data_place::managed()),
+      handle_Y.read(data_place::managed()))
         ->*[] __device__(size_t i, auto x, auto y) {
               x(i) += y(i);
             };

From 13a8057d9bf97f349c9480f8e6c182806457e6b3 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Thu, 25 Jun 2026 10:54:11 +0200
Subject: [PATCH 6/9] cudax: add managed green-context grid probe

---
 cudax/test/stf/CMakeLists.txt                 |   1 +
 cudax/test/stf/green_context/gc_grid.cu       |   9 +-
 .../test/stf/green_context/gc_grid_managed.cu | 116 ++++++++++++++++++
 3 files changed, 118 insertions(+), 8 deletions(-)
 create mode 100644 cudax/test/stf/green_context/gc_grid_managed.cu

diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt
index 6405723f09d..eca64787382 100644
--- a/cudax/test/stf/CMakeLists.txt
+++ b/cudax/test/stf/CMakeLists.txt
@@ -152,6 +152,7 @@ set(
   green_context/axpy_gc.cu
   green_context/cuda_graph.cu
   green_context/gc_grid.cu
+  green_context/gc_grid_managed.cu
   hash/ctx_hash.cu
   hash/logical_data.cu
   hashtable/fusion.cu
diff --git a/cudax/test/stf/green_context/gc_grid.cu b/cudax/test/stf/green_context/gc_grid.cu
index 89fd5450d62..7ae77b1c152 100644
--- a/cudax/test/stf/green_context/gc_grid.cu
+++ b/cudax/test/stf/green_context/gc_grid.cu
@@ -99,14 +99,7 @@ int main()
 
   for (int iter = 0; iter < NITER; iter++)
   {
-    // This test targets green-context grid execution. Keep the data placement
-    // independent from the grid to avoid also testing localized VMM mappings.
-    ctx.parallel_for(
-      blocked_partition(),
-      where,
-      handle_X.shape(),
-      handle_X.rw(data_place::managed()),
-      handle_Y.read(data_place::managed()))
+    ctx.parallel_for(blocked_partition(), where, handle_X.shape(), handle_X.rw(), handle_Y.read())
         ->*[] __device__(size_t i, auto x, auto y) {
               x(i) += y(i);
             };
diff --git a/cudax/test/stf/green_context/gc_grid_managed.cu b/cudax/test/stf/green_context/gc_grid_managed.cu
new file mode 100644
index 00000000000..11760adbff6
--- /dev/null
+++ b/cudax/test/stf/green_context/gc_grid_managed.cu
@@ -0,0 +1,116 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/experimental/__places/exec/green_context.cuh>
+#include <cuda/experimental/stf.cuh>
+
+using namespace cuda::experimental::stf;
+
+// Green contexts are only supported since CUDA 12.4
+#if _CCCL_CTK_AT_LEAST(12, 4)
+__global__ void axpy(double a, slice<const double> x, slice<double> y)
+{
+  int tid      = blockIdx.x * blockDim.x + threadIdx.x;
+  int nthreads = gridDim.x * blockDim.x;
+
+  size_t n = x.extent(0);
+  for (int ind = tid; ind < n; ind += nthreads)
+  {
+    y(ind) += a * x(ind);
+  }
+}
+
+void debug_info(cudaStream_t stream, CUgreenCtx g_ctx)
+{
+  // Get the green context associated to that CUDA stream
+  CUgreenCtx stream_cugc;
+  cuda_safe_call(cuStreamGetGreenCtx(CUstream(stream), &stream_cugc));
+  assert(stream_cugc != nullptr);
+
+  CUcontext stream_green_primary;
+  CUcontext place_green_primary;
+
+  unsigned long long stream_ctxId;
+  unsigned long long place_ctxId;
+
+  // Convert green contexts to primary contexts and get their ID
+  cuda_safe_call(cuCtxFromGreenCtx(&stream_green_primary, stream_cugc));
+  cuda_safe_call(cuCtxGetId(stream_green_primary, &stream_ctxId));
+
+  cuda_safe_call(cuCtxFromGreenCtx(&place_green_primary, g_ctx));
+  cuda_safe_call(cuCtxGetId(place_green_primary, &place_ctxId));
+
+  // Make sure the stream belongs to the same green context as the execution place
+  EXPECT(stream_ctxId == place_ctxId);
+}
+#endif // _CCCL_CTK_AT_LEAST(12, 4)
+
+int main()
+{
+#if _CCCL_CTK_BELOW(12, 4)
+  fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n");
+  return 0;
+#else // ^^^ _CCCL_CTK_BELOW(12, 4) ^^^ / vvv _CCCL_CTK_AT_LEAST(12, 4) vvv
+  int ndevs;
+  const int num_sms = 8;
+  cuda_safe_call(cudaGetDeviceCount(&ndevs));
+
+  stream_ctx ctx;
+
+  int NITER   = 8;
+  const int n = 16 * 1024 * 1024;
+
+  std::vector<double> X(n);
+  std::vector<double> Y(n);
+
+  for (int ind = 0; ind < n; ind++)
+  {
+    X[ind] = 1.0 * ind;
+    Y[ind] = 2.0 * ind - 3.0;
+  }
+
+  auto handle_X = ctx.logical_data(make_slice(&X[0], n));
+  auto handle_Y = ctx.logical_data(make_slice(&Y[0], n));
+
+  std::vector<exec_place> exec_places;
+
+  // The green_context_helper class automates the creation of green context views
+  std::vector<green_context_helper> gc(ndevs);
+  for (int devid = 0; devid < ndevs; devid++)
+  {
+    gc[devid] = green_context_helper(num_sms, devid);
+
+    auto& g_ctx = gc[devid];
+    auto cnt    = g_ctx.get_count();
+    for (size_t i = 0; i < cnt; i++)
+    {
+      exec_places.push_back(exec_place::green_ctx(g_ctx.get_view(i)));
+    }
+  }
+
+  auto where = make_grid(exec_places);
+
+  for (int iter = 0; iter < NITER; iter++)
+  {
+    // Probe green-context grid execution independently from composite VMM mappings.
+    ctx.parallel_for(
+      blocked_partition(),
+      where,
+      handle_X.shape(),
+      handle_X.rw(data_place::managed()),
+      handle_Y.read(data_place::managed()))
+        ->*[] __device__(size_t i, auto x, auto y) {
+              x(i) += y(i);
+            };
+  }
+
+  ctx.finalize();
+#endif // ^^^ _CCCL_CTK_AT_LEAST(12, 4) ^^^
+}

From 02492c87e176712b6ef4178e224fe1abd393f7f1 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Thu, 25 Jun 2026 13:44:48 +0200
Subject: [PATCH 7/9] ci: target green-context grid debug on 2-GPU H100

---
 ci/matrix.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index b47bbabbcd4..8ddb9ec7724 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -22,6 +22,8 @@ workflows:
   #
   # IMPORTANT: Do NOT delete or remove the `override:` key below, even when it is empty.
   override:
+    - { jobs: ['run_gpu'], project: 'target', std: 20, ctk: '13.X', cxx: 'gcc', gpu: 'h100_2gpu', sm: 'gpu',
+        args: '--preset cudax-cpp20 --build-targets "cudax.test.stf.green_context.gc_grid cudax.test.stf.green_context.gc_grid_managed" --custom-test-cmd "CUDA_LAUNCH_BLOCKING=1 CUDA_LOG_FILE=stderr ctest --test-dir \"${BUILD_DIR}\" -R \"cudax.test.stf.green_context.gc_grid\" -V --output-on-failure"' }
 
   pull_request:
     # Old CTK: Oldest/newest supported host compilers:

From 3684197b4271e725f37029cf9474de268598d75c Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Thu, 25 Jun 2026 15:06:05 +0200
Subject: [PATCH 8/9] ci: defer BUILD_DIR expansion in debug override

---
 ci/matrix.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 8ddb9ec7724..115a0c92b21 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -23,7 +23,7 @@ workflows:
   # IMPORTANT: Do NOT delete or remove the `override:` key below, even when it is empty.
   override:
     - { jobs: ['run_gpu'], project: 'target', std: 20, ctk: '13.X', cxx: 'gcc', gpu: 'h100_2gpu', sm: 'gpu',
-        args: '--preset cudax-cpp20 --build-targets "cudax.test.stf.green_context.gc_grid cudax.test.stf.green_context.gc_grid_managed" --custom-test-cmd "CUDA_LAUNCH_BLOCKING=1 CUDA_LOG_FILE=stderr ctest --test-dir \"${BUILD_DIR}\" -R \"cudax.test.stf.green_context.gc_grid\" -V --output-on-failure"' }
+        args: '--preset cudax-cpp20 --build-targets "cudax.test.stf.green_context.gc_grid cudax.test.stf.green_context.gc_grid_managed" --custom-test-cmd "CUDA_LAUNCH_BLOCKING=1 CUDA_LOG_FILE=stderr ctest --test-dir \"\${BUILD_DIR}\" -R \"cudax.test.stf.green_context.gc_grid\" -V --output-on-failure"' }
 
   pull_request:
     # Old CTK: Oldest/newest supported host compilers:

From 8e0fd85d6bf79a828f9f7bfd4a15bd0527e6d188 Mon Sep 17 00:00:00 2001
From: Cedric AUGONNET <caugonnet@nvidia.com>
Date: Fri, 26 Jun 2026 13:35:13 +0200
Subject: [PATCH 9/9] ci: filter generated args for target jobs

---
 ci/run_cpu_target.sh | 28 ++++++++++++++++++++++++----
 ci/run_gpu_target.sh | 28 ++++++++++++++++++++++++----
 2 files changed, 48 insertions(+), 8 deletions(-)

diff --git a/ci/run_cpu_target.sh b/ci/run_cpu_target.sh
index 8195188749b..f150e81d20f 100755
--- a/ci/run_cpu_target.sh
+++ b/ci/run_cpu_target.sh
@@ -9,12 +9,32 @@ set -euo pipefail
 ci_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 repo_dir=$(cd "${ci_dir}/.." && pwd)
 
-user_args=("$@")
-set --
+build_common_args=()
+target_args=()
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    -v | --verbose | -verbose | -pedantic | --pedantic)
+      build_common_args+=("$1")
+      shift
+      ;;
+    -std | -arch | -cuda | -cxx | -cmake-options)
+      build_common_args+=("$1" "$2")
+      shift 2
+      ;;
+    *)
+      target_args+=("$1")
+      shift
+      ;;
+  esac
+done
+
+set -- "${build_common_args[@]}"
 source "${ci_dir}/build_common.sh"
-set -- "${user_args[@]}"
 
 cd "${repo_dir}"
-cmd=("${ci_dir}/util/build_and_test_targets.sh" "$@")
+cmd=("${ci_dir}/util/build_and_test_targets.sh" "${target_args[@]}")
+if [[ "${#GLOBAL_CMAKE_OPTIONS[@]}" -gt 0 ]]; then
+  cmd+=(--cmake-options "${GLOBAL_CMAKE_OPTIONS[*]}")
+fi
 printf '\033[34m%s\033[0m\n' "${cmd[*]}"
 "${cmd[@]}"
diff --git a/ci/run_gpu_target.sh b/ci/run_gpu_target.sh
index 4bc25bfa20d..37a665b3027 100755
--- a/ci/run_gpu_target.sh
+++ b/ci/run_gpu_target.sh
@@ -9,12 +9,32 @@ set -euo pipefail
 ci_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 repo_dir=$(cd "${ci_dir}/.." && pwd)
 
-user_args=("$@")
-set --
+build_common_args=()
+target_args=()
+while [[ "$#" -gt 0 ]]; do
+  case "$1" in
+    -v | --verbose | -verbose | -pedantic | --pedantic)
+      build_common_args+=("$1")
+      shift
+      ;;
+    -std | -arch | -cuda | -cxx | -cmake-options)
+      build_common_args+=("$1" "$2")
+      shift 2
+      ;;
+    *)
+      target_args+=("$1")
+      shift
+      ;;
+  esac
+done
+
+set -- "${build_common_args[@]}"
 source "${ci_dir}/build_common.sh"
-set -- "${user_args[@]}"
 
 cd "${repo_dir}"
-cmd=("${ci_dir}/util/build_and_test_targets.sh" "$@")
+cmd=("${ci_dir}/util/build_and_test_targets.sh" "${target_args[@]}")
+if [[ "${#GLOBAL_CMAKE_OPTIONS[@]}" -gt 0 ]]; then
+  cmd+=(--cmake-options "${GLOBAL_CMAKE_OPTIONS[*]}")
+fi
 printf '\033[34m%s\033[0m\n' "${cmd[*]}"
 "${cmd[@]}"