Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ workflows:
#
# IMPORTANT: Do NOT delete or remove the `override:` key below, even when it is empty.
override:
- { jobs: ['run_gpu'], project: 'target', std: 20, ctk: '13.X', cxx: 'gcc', gpu: 'h100_2gpu', sm: 'gpu',
args: '--preset cudax-cpp20 --build-targets "cudax.test.stf.green_context.gc_grid cudax.test.stf.green_context.gc_grid_managed" --custom-test-cmd "CUDA_LAUNCH_BLOCKING=1 CUDA_LOG_FILE=stderr ctest --test-dir \"\${BUILD_DIR}\" -R \"cudax.test.stf.green_context.gc_grid\" -V --output-on-failure"' }

pull_request:
# Old CTK: Oldest/newest supported host compilers:
Expand Down Expand Up @@ -61,6 +63,7 @@ workflows:
- {jobs: ['test'], project: ['libcudacxx', 'cudax'], std: 'max', gpu: 'h100' }
# Multi-GPU coverage:
- {jobs: ['test'], project: 'libcudacxx', std: 'max', gpu: 'h100_2gpu', sm: 'gpu'}
- {jobs: ['test'], project: 'cudax', std: 'max', gpu: 'h100_2gpu', sm: 'gpu'}
# RTX PRO 6000 coverage (limited due to small number of runners):
- {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'rtxpro6000'}
# Misc:
Expand Down
28 changes: 24 additions & 4 deletions ci/run_cpu_target.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,32 @@ set -euo pipefail
ci_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
repo_dir=$(cd "${ci_dir}/.." && pwd)

user_args=("$@")
set --
build_common_args=()
target_args=()
while [[ "$#" -gt 0 ]]; do
case "$1" in
-v | --verbose | -verbose | -pedantic | --pedantic)
build_common_args+=("$1")
shift
;;
-std | -arch | -cuda | -cxx | -cmake-options)
build_common_args+=("$1" "$2")
shift 2
;;
*)
target_args+=("$1")
shift
;;
esac
done

set -- "${build_common_args[@]}"
source "${ci_dir}/build_common.sh"
set -- "${user_args[@]}"

cd "${repo_dir}"
cmd=("${ci_dir}/util/build_and_test_targets.sh" "$@")
cmd=("${ci_dir}/util/build_and_test_targets.sh" "${target_args[@]}")
if [[ "${#GLOBAL_CMAKE_OPTIONS[@]}" -gt 0 ]]; then
cmd+=(--cmake-options "${GLOBAL_CMAKE_OPTIONS[*]}")
fi
printf '\033[34m%s\033[0m\n' "${cmd[*]}"
"${cmd[@]}"
28 changes: 24 additions & 4 deletions ci/run_gpu_target.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,32 @@ set -euo pipefail
ci_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
repo_dir=$(cd "${ci_dir}/.." && pwd)

user_args=("$@")
set --
build_common_args=()
target_args=()
while [[ "$#" -gt 0 ]]; do
case "$1" in
-v | --verbose | -verbose | -pedantic | --pedantic)
build_common_args+=("$1")
shift
;;
-std | -arch | -cuda | -cxx | -cmake-options)
build_common_args+=("$1" "$2")
shift 2
;;
*)
target_args+=("$1")
shift
;;
esac
done

set -- "${build_common_args[@]}"
source "${ci_dir}/build_common.sh"
set -- "${user_args[@]}"

cd "${repo_dir}"
cmd=("${ci_dir}/util/build_and_test_targets.sh" "$@")
cmd=("${ci_dir}/util/build_and_test_targets.sh" "${target_args[@]}")
if [[ "${#GLOBAL_CMAKE_OPTIONS[@]}" -gt 0 ]]; then
cmd+=(--cmake-options "${GLOBAL_CMAKE_OPTIONS[*]}")
fi
printf '\033[34m%s\033[0m\n' "${cmd[*]}"
"${cmd[@]}"
Original file line number Diff line number Diff line change
Expand Up @@ -687,7 +687,7 @@ public:
{
auto active = t.activate_place(i);
const auto sub_shape = partitioner_t::apply(shape, pos4(i), e_place.get_dims());
do_parallel_for(f, active.place(), sub_shape, t);
do_parallel_for(f, active.place(), sub_shape, t, i);
}
}
}
Expand Down Expand Up @@ -896,16 +896,19 @@ public:

// Executes the loop on a device, or use the host implementation
template <typename Fun, typename sub_shape_t>
void do_parallel_for(
Fun&& f, const exec_place& sub_exec_place, const sub_shape_t& sub_shape, typename context::task_type& t)
void do_parallel_for(Fun&& f,
const exec_place& sub_exec_place,
const sub_shape_t& sub_shape,
typename context::task_type& t,
size_t place_index = 0)
{
// parallel_for never calls this function with a host.
_CCCL_ASSERT(sub_exec_place != exec_place::host(), "Internal CUDASTF error.");

if (sub_exec_place == exec_place::device_auto())
{
// We have all latitude - recurse with the current device.
return do_parallel_for(::std::forward<Fun>(f), exec_place::current_device(), sub_shape, t);
return do_parallel_for(::std::forward<Fun>(f), exec_place::current_device(), sub_shape, t, place_index);
}

using Fun_no_ref = ::std::remove_reference_t<Fun>;
Expand Down Expand Up @@ -941,7 +944,7 @@ public:
if constexpr (::std::is_same_v<context, stream_ctx>)
{
reserved::loop<Fun_no_ref, sub_shape_t, deps_tup_t>
<<<static_cast<int>(blocks), static_cast<int>(block_size), 0, t.get_stream()>>>(
<<<static_cast<int>(blocks), static_cast<int>(block_size), 0, t.get_stream(place_index)>>>(
static_cast<int>(n), sub_shape, mv(f), arg_instances);
}
else if constexpr (::std::is_same_v<context, graph_ctx>)
Expand Down
1 change: 1 addition & 0 deletions cudax/test/stf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ set(
green_context/axpy_gc.cu
green_context/cuda_graph.cu
green_context/gc_grid.cu
green_context/gc_grid_managed.cu
hash/ctx_hash.cu
hash/logical_data.cu
hashtable/fusion.cu
Expand Down
22 changes: 16 additions & 6 deletions cudax/test/stf/green_context/cuda_graph.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
#include <cuda/experimental/__places/exec/green_context.cuh>
#include <cuda/experimental/stf.cuh>

#include <vector>

using namespace cuda::experimental::stf;

// Green contexts are only supported since CUDA 12.4
Expand Down Expand Up @@ -56,15 +58,23 @@ int main()
auto handle_Y = ctx.logical_data(make_slice(&Y[0], n));

// The green_context_helper class automates the creation of green context views
green_context_helper gc(num_sms);
std::vector<green_context_helper> gc(ndevs);
for (int devid = 0; devid < ndevs; devid++)
{
gc[devid] = green_context_helper(num_sms, devid);
}

for (int iter = 0; iter < NITER; iter++)
{
auto cnt = gc.get_count();
ctx.task(exec_place::green_ctx(gc.get_view(iter % cnt)), handle_X.read(), handle_Y.rw())
->*[&](cudaStream_t stream, auto dX, auto dY) {
axpy<<<16, 16, 0, stream>>>(alpha, dX, dY);
};
for (int devid = 0; devid < ndevs; devid++)
{
auto& g_ctx = gc[devid];
auto cnt = g_ctx.get_count();
ctx.task(exec_place::green_ctx(g_ctx.get_view(iter % cnt)), handle_X.read(), handle_Y.rw())
->*[&](cudaStream_t stream, auto dX, auto dY) {
axpy<<<16, 16, 0, stream>>>(alpha, dX, dY);
};
}
}

ctx.host_launch(handle_X.read(), handle_Y.read())->*[&](auto hX, auto hY) {
Expand Down
116 changes: 116 additions & 0 deletions cudax/test/stf/green_context/gc_grid_managed.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <cuda/experimental/__places/exec/green_context.cuh>
#include <cuda/experimental/stf.cuh>

using namespace cuda::experimental::stf;

// Green contexts are only supported since CUDA 12.4
#if _CCCL_CTK_AT_LEAST(12, 4)
__global__ void axpy(double a, slice<const double> x, slice<double> y)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int nthreads = gridDim.x * blockDim.x;

size_t n = x.extent(0);
for (int ind = tid; ind < n; ind += nthreads)
{
y(ind) += a * x(ind);
}
}

void debug_info(cudaStream_t stream, CUgreenCtx g_ctx)
{
// Get the green context associated to that CUDA stream
CUgreenCtx stream_cugc;
cuda_safe_call(cuStreamGetGreenCtx(CUstream(stream), &stream_cugc));
assert(stream_cugc != nullptr);

CUcontext stream_green_primary;
CUcontext place_green_primary;

unsigned long long stream_ctxId;
unsigned long long place_ctxId;

// Convert green contexts to primary contexts and get their ID
cuda_safe_call(cuCtxFromGreenCtx(&stream_green_primary, stream_cugc));
cuda_safe_call(cuCtxGetId(stream_green_primary, &stream_ctxId));

cuda_safe_call(cuCtxFromGreenCtx(&place_green_primary, g_ctx));
cuda_safe_call(cuCtxGetId(place_green_primary, &place_ctxId));

// Make sure the stream belongs to the same green context as the execution place
EXPECT(stream_ctxId == place_ctxId);
}
#endif // _CCCL_CTK_AT_LEAST(12, 4)

int main()
{
#if _CCCL_CTK_BELOW(12, 4)
fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n");
return 0;
#else // ^^^ _CCCL_CTK_BELOW(12, 4) ^^^ / vvv _CCCL_CTK_AT_LEAST(12, 4) vvv
int ndevs;
const int num_sms = 8;
cuda_safe_call(cudaGetDeviceCount(&ndevs));

stream_ctx ctx;

int NITER = 8;
const int n = 16 * 1024 * 1024;

std::vector<double> X(n);
std::vector<double> Y(n);

for (int ind = 0; ind < n; ind++)
{
X[ind] = 1.0 * ind;
Y[ind] = 2.0 * ind - 3.0;
}

auto handle_X = ctx.logical_data(make_slice(&X[0], n));
auto handle_Y = ctx.logical_data(make_slice(&Y[0], n));

std::vector<exec_place> exec_places;

// The green_context_helper class automates the creation of green context views
std::vector<green_context_helper> gc(ndevs);
for (int devid = 0; devid < ndevs; devid++)
{
gc[devid] = green_context_helper(num_sms, devid);

auto& g_ctx = gc[devid];
auto cnt = g_ctx.get_count();
for (size_t i = 0; i < cnt; i++)
{
exec_places.push_back(exec_place::green_ctx(g_ctx.get_view(i)));
}
}

auto where = make_grid(exec_places);

for (int iter = 0; iter < NITER; iter++)
{
// Probe green-context grid execution independently from composite VMM mappings.
ctx.parallel_for(
blocked_partition(),
where,
handle_X.shape(),
handle_X.rw(data_place::managed()),
handle_Y.read(data_place::managed()))
->*[] __device__(size_t i, auto x, auto y) {
x(i) += y(i);
};
}

ctx.finalize();
#endif // ^^^ _CCCL_CTK_AT_LEAST(12, 4) ^^^
}
Loading