diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 3653930095e..115a0c92b21 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -22,6 +22,8 @@ workflows: # # IMPORTANT: Do NOT delete or remove the `override:` key below, even when it is empty. override: + - { jobs: ['run_gpu'], project: 'target', std: 20, ctk: '13.X', cxx: 'gcc', gpu: 'h100_2gpu', sm: 'gpu', + args: '--preset cudax-cpp20 --build-targets "cudax.test.stf.green_context.gc_grid cudax.test.stf.green_context.gc_grid_managed" --custom-test-cmd "CUDA_LAUNCH_BLOCKING=1 CUDA_LOG_FILE=stderr ctest --test-dir \"\${BUILD_DIR}\" -R \"cudax.test.stf.green_context.gc_grid\" -V --output-on-failure"' } pull_request: # Old CTK: Oldest/newest supported host compilers: @@ -61,6 +63,7 @@ workflows: - {jobs: ['test'], project: ['libcudacxx', 'cudax'], std: 'max', gpu: 'h100' } # Multi-GPU coverage: - {jobs: ['test'], project: 'libcudacxx', std: 'max', gpu: 'h100_2gpu', sm: 'gpu'} + - {jobs: ['test'], project: 'cudax', std: 'max', gpu: 'h100_2gpu', sm: 'gpu'} # RTX PRO 6000 coverage (limited due to small number of runners): - {jobs: ['test_lid0'], project: 'cub', std: 'max', cxx: 'gcc', gpu: 'rtxpro6000'} # Misc: diff --git a/ci/run_cpu_target.sh b/ci/run_cpu_target.sh index 8195188749b..f150e81d20f 100755 --- a/ci/run_cpu_target.sh +++ b/ci/run_cpu_target.sh @@ -9,12 +9,32 @@ set -euo pipefail ci_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) repo_dir=$(cd "${ci_dir}/.." && pwd) -user_args=("$@") -set -- +build_common_args=() +target_args=() +while [[ "$#" -gt 0 ]]; do + case "$1" in + -v | --verbose | -verbose | -pedantic | --pedantic) + build_common_args+=("$1") + shift + ;; + -std | -arch | -cuda | -cxx | -cmake-options) + build_common_args+=("$1" "$2") + shift 2 + ;; + *) + target_args+=("$1") + shift + ;; + esac +done + +set -- "${build_common_args[@]}" source "${ci_dir}/build_common.sh" -set -- "${user_args[@]}" cd "${repo_dir}" -cmd=("${ci_dir}/util/build_and_test_targets.sh" "$@") +cmd=("${ci_dir}/util/build_and_test_targets.sh" "${target_args[@]}") +if [[ "${#GLOBAL_CMAKE_OPTIONS[@]}" -gt 0 ]]; then + cmd+=(--cmake-options "${GLOBAL_CMAKE_OPTIONS[*]}") +fi printf '\033[34m%s\033[0m\n' "${cmd[*]}" "${cmd[@]}" diff --git a/ci/run_gpu_target.sh b/ci/run_gpu_target.sh index 4bc25bfa20d..37a665b3027 100755 --- a/ci/run_gpu_target.sh +++ b/ci/run_gpu_target.sh @@ -9,12 +9,32 @@ set -euo pipefail ci_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) repo_dir=$(cd "${ci_dir}/.." && pwd) -user_args=("$@") -set -- +build_common_args=() +target_args=() +while [[ "$#" -gt 0 ]]; do + case "$1" in + -v | --verbose | -verbose | -pedantic | --pedantic) + build_common_args+=("$1") + shift + ;; + -std | -arch | -cuda | -cxx | -cmake-options) + build_common_args+=("$1" "$2") + shift 2 + ;; + *) + target_args+=("$1") + shift + ;; + esac +done + +set -- "${build_common_args[@]}" source "${ci_dir}/build_common.sh" -set -- "${user_args[@]}" cd "${repo_dir}" -cmd=("${ci_dir}/util/build_and_test_targets.sh" "$@") +cmd=("${ci_dir}/util/build_and_test_targets.sh" "${target_args[@]}") +if [[ "${#GLOBAL_CMAKE_OPTIONS[@]}" -gt 0 ]]; then + cmd+=(--cmake-options "${GLOBAL_CMAKE_OPTIONS[*]}") +fi printf '\033[34m%s\033[0m\n' "${cmd[*]}" "${cmd[@]}" diff --git a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh index cf7b79654ae..c3ba1741b7e 100644 --- a/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/parallel_for_scope.cuh @@ -687,7 +687,7 @@ public: { auto active = t.activate_place(i); const auto sub_shape = partitioner_t::apply(shape, pos4(i), e_place.get_dims()); - do_parallel_for(f, active.place(), sub_shape, t); + do_parallel_for(f, active.place(), sub_shape, t, i); } } } @@ -896,8 +896,11 @@ public: // Executes the loop on a device, or use the host implementation template - void do_parallel_for( - Fun&& f, const exec_place& sub_exec_place, const sub_shape_t& sub_shape, typename context::task_type& t) + void do_parallel_for(Fun&& f, + const exec_place& sub_exec_place, + const sub_shape_t& sub_shape, + typename context::task_type& t, + size_t place_index = 0) { // parallel_for never calls this function with a host. _CCCL_ASSERT(sub_exec_place != exec_place::host(), "Internal CUDASTF error."); @@ -905,7 +908,7 @@ public: if (sub_exec_place == exec_place::device_auto()) { // We have all latitude - recurse with the current device. - return do_parallel_for(::std::forward(f), exec_place::current_device(), sub_shape, t); + return do_parallel_for(::std::forward(f), exec_place::current_device(), sub_shape, t, place_index); } using Fun_no_ref = ::std::remove_reference_t; @@ -941,7 +944,7 @@ public: if constexpr (::std::is_same_v) { reserved::loop - <<(blocks), static_cast(block_size), 0, t.get_stream()>>>( + <<(blocks), static_cast(block_size), 0, t.get_stream(place_index)>>>( static_cast(n), sub_shape, mv(f), arg_instances); } else if constexpr (::std::is_same_v) diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index 6405723f09d..eca64787382 100644 --- a/cudax/test/stf/CMakeLists.txt +++ b/cudax/test/stf/CMakeLists.txt @@ -152,6 +152,7 @@ set( green_context/axpy_gc.cu green_context/cuda_graph.cu green_context/gc_grid.cu + green_context/gc_grid_managed.cu hash/ctx_hash.cu hash/logical_data.cu hashtable/fusion.cu diff --git a/cudax/test/stf/green_context/cuda_graph.cu b/cudax/test/stf/green_context/cuda_graph.cu index 6e13261bfc9..9676c33e4c5 100644 --- a/cudax/test/stf/green_context/cuda_graph.cu +++ b/cudax/test/stf/green_context/cuda_graph.cu @@ -11,6 +11,8 @@ #include #include +#include + using namespace cuda::experimental::stf; // Green contexts are only supported since CUDA 12.4 @@ -56,15 +58,23 @@ int main() auto handle_Y = ctx.logical_data(make_slice(&Y[0], n)); // The green_context_helper class automates the creation of green context views - green_context_helper gc(num_sms); + std::vector gc(ndevs); + for (int devid = 0; devid < ndevs; devid++) + { + gc[devid] = green_context_helper(num_sms, devid); + } for (int iter = 0; iter < NITER; iter++) { - auto cnt = gc.get_count(); - ctx.task(exec_place::green_ctx(gc.get_view(iter % cnt)), handle_X.read(), handle_Y.rw()) - ->*[&](cudaStream_t stream, auto dX, auto dY) { - axpy<<<16, 16, 0, stream>>>(alpha, dX, dY); - }; + for (int devid = 0; devid < ndevs; devid++) + { + auto& g_ctx = gc[devid]; + auto cnt = g_ctx.get_count(); + ctx.task(exec_place::green_ctx(g_ctx.get_view(iter % cnt)), handle_X.read(), handle_Y.rw()) + ->*[&](cudaStream_t stream, auto dX, auto dY) { + axpy<<<16, 16, 0, stream>>>(alpha, dX, dY); + }; + } } ctx.host_launch(handle_X.read(), handle_Y.read())->*[&](auto hX, auto hY) { diff --git a/cudax/test/stf/green_context/gc_grid_managed.cu b/cudax/test/stf/green_context/gc_grid_managed.cu new file mode 100644 index 00000000000..11760adbff6 --- /dev/null +++ b/cudax/test/stf/green_context/gc_grid_managed.cu @@ -0,0 +1,116 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cuda::experimental::stf; + +// Green contexts are only supported since CUDA 12.4 +#if _CCCL_CTK_AT_LEAST(12, 4) +__global__ void axpy(double a, slice x, slice y) +{ + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int nthreads = gridDim.x * blockDim.x; + + size_t n = x.extent(0); + for (int ind = tid; ind < n; ind += nthreads) + { + y(ind) += a * x(ind); + } +} + +void debug_info(cudaStream_t stream, CUgreenCtx g_ctx) +{ + // Get the green context associated to that CUDA stream + CUgreenCtx stream_cugc; + cuda_safe_call(cuStreamGetGreenCtx(CUstream(stream), &stream_cugc)); + assert(stream_cugc != nullptr); + + CUcontext stream_green_primary; + CUcontext place_green_primary; + + unsigned long long stream_ctxId; + unsigned long long place_ctxId; + + // Convert green contexts to primary contexts and get their ID + cuda_safe_call(cuCtxFromGreenCtx(&stream_green_primary, stream_cugc)); + cuda_safe_call(cuCtxGetId(stream_green_primary, &stream_ctxId)); + + cuda_safe_call(cuCtxFromGreenCtx(&place_green_primary, g_ctx)); + cuda_safe_call(cuCtxGetId(place_green_primary, &place_ctxId)); + + // Make sure the stream belongs to the same green context as the execution place + EXPECT(stream_ctxId == place_ctxId); +} +#endif // _CCCL_CTK_AT_LEAST(12, 4) + +int main() +{ +#if _CCCL_CTK_BELOW(12, 4) + fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n"); + return 0; +#else // ^^^ _CCCL_CTK_BELOW(12, 4) ^^^ / vvv _CCCL_CTK_AT_LEAST(12, 4) vvv + int ndevs; + const int num_sms = 8; + cuda_safe_call(cudaGetDeviceCount(&ndevs)); + + stream_ctx ctx; + + int NITER = 8; + const int n = 16 * 1024 * 1024; + + std::vector X(n); + std::vector Y(n); + + for (int ind = 0; ind < n; ind++) + { + X[ind] = 1.0 * ind; + Y[ind] = 2.0 * ind - 3.0; + } + + auto handle_X = ctx.logical_data(make_slice(&X[0], n)); + auto handle_Y = ctx.logical_data(make_slice(&Y[0], n)); + + std::vector exec_places; + + // The green_context_helper class automates the creation of green context views + std::vector gc(ndevs); + for (int devid = 0; devid < ndevs; devid++) + { + gc[devid] = green_context_helper(num_sms, devid); + + auto& g_ctx = gc[devid]; + auto cnt = g_ctx.get_count(); + for (size_t i = 0; i < cnt; i++) + { + exec_places.push_back(exec_place::green_ctx(g_ctx.get_view(i))); + } + } + + auto where = make_grid(exec_places); + + for (int iter = 0; iter < NITER; iter++) + { + // Probe green-context grid execution independently from composite VMM mappings. + ctx.parallel_for( + blocked_partition(), + where, + handle_X.shape(), + handle_X.rw(data_place::managed()), + handle_Y.read(data_place::managed())) + ->*[] __device__(size_t i, auto x, auto y) { + x(i) += y(i); + }; + } + + ctx.finalize(); +#endif // ^^^ _CCCL_CTK_AT_LEAST(12, 4) ^^^ +}