diff --git a/.gitignore b/.gitignore index 3d6c84a83f..adc183090f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,10 @@ +## build logs +build_*.log + +## aarch64 build artifacts +cpp/build-aarch64/ +*.tar.bz2 + ## common __pycache__ *.pyc diff --git a/cpp/include/raft/util/reduction.cuh b/cpp/include/raft/util/reduction.cuh index 71492b8f17..38ff1417e7 100644 --- a/cpp/include/raft/util/reduction.cuh +++ b/cpp/include/raft/util/reduction.cuh @@ -52,6 +52,21 @@ DI T warpReduce(T val, ReduceLambda reduce_op) return logicalWarpReduce(val, reduce_op); } +/** + * @brief Warp-level reduction with raft::add_op + * @tparam T Value type to be reduced + * @param val input value + * @param reduce_op raft::add_op instance + * @return Reduction result. All lanes will have the valid result. + * @note Explicit overload to disambiguate from cub::detail::scan::warpReduce when + * called with raft::add_op; avoids ambiguous template instantiation on CUDA 13.2+ + */ +template +DI T warpReduce(T val, raft::add_op reduce_op) +{ + return logicalWarpReduce(val, reduce_op); +} + /** * @brief Warp-level reduction * @tparam T Value type to be reduced diff --git a/cpp/tests/util/reduction.cu b/cpp/tests/util/reduction.cu index 03756a1d2b..5a444740c3 100644 --- a/cpp/tests/util/reduction.cu +++ b/cpp/tests/util/reduction.cu @@ -31,6 +31,15 @@ RAFT_KERNEL test_reduction_kernel(const int* input, int* reduction_res, ReduceLa if (threadIdx.x == 0) { reduction_res[0] = th_val; } } +// regression: warpReduce(val, raft::add_op{}) must not cause ambiguity with CUB on CUDA 13.2+ +RAFT_KERNEL test_warp_reduce_with_add_op_kernel(const int* input, int* reduction_res) +{ + assert(gridDim.x == 1); + int th_val = input[threadIdx.x]; + th_val = raft::warpReduce(th_val, raft::add_op{}); + if (threadIdx.x % 32 == 0) { atomicAdd(reduction_res, th_val); } +} + template RAFT_KERNEL test_ranked_reduction_kernel(const int* input, int* reduction_res, @@ -131,6 +140,20 @@ struct reduction_launch { RAFT_CUDA_TRY(cudaPeekAtLastError()); ASSERT_EQ(ref_d.value(stream), ref_val); } + + static void run_warp_reduce_with_add_op(const rmm::device_uvector& arr_d, + int ref_val, + rmm::cuda_stream_view stream) + { + rmm::device_scalar ref_d(0, stream); + const int block_dim = 64; + const int grid_dim = 1; + test_warp_reduce_with_add_op_kernel<<>>(arr_d.data(), + ref_d.data()); + stream.synchronize(); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + ASSERT_EQ(ref_d.value(stream), ref_val); + } }; template @@ -162,6 +185,12 @@ class ReductionTest : public testing::TestWithParam> { // NOLI } void run_binary_reduction() { reduction_launch::run_binary(arr_d, 24, stream); } + + void run_warp_reduce_with_add_op() + { + // two warps of 32 threads; warp 0 sums elements 0-31 (=78), warp 1 sums 32-63 (=80) + reduction_launch::run_warp_reduce_with_add_op(arr_d, 158, stream); + } }; const std::vector test_vector{1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 5, 1, 2, 3, 4, 1, 2, @@ -173,13 +202,18 @@ const std::vector binary_test_vector{ auto reduction_input = ::testing::Values(test_vector); auto binary_reduction_input = ::testing::Values(binary_test_vector); -using ReductionTestInt = ReductionTest; // NOLINT -using BinaryReductionTestInt = ReductionTest; // NOLINT +using ReductionTestInt = ReductionTest; // NOLINT +using BinaryReductionTestInt = ReductionTest; // NOLINT +using WarpReduceAddOpTestInt = ReductionTest; // NOLINT TEST_P(ReductionTestInt, REDUCTIONS) { run_reduction(); } INSTANTIATE_TEST_CASE_P(ReductionTest, ReductionTestInt, reduction_input); // NOLINT TEST_P(BinaryReductionTestInt, BINARY_REDUCTION) { run_binary_reduction(); } // NOLINT INSTANTIATE_TEST_CASE_P(BinaryReductionTest, BinaryReductionTestInt, binary_reduction_input); // NOLINT +TEST_P(WarpReduceAddOpTestInt, WARP_REDUCE_WITH_ADD_OP) { run_warp_reduce_with_add_op(); } // NOLINT +INSTANTIATE_TEST_CASE_P(WarpReduceAddOpTest, + WarpReduceAddOpTestInt, + reduction_input); // NOLINT } // namespace raft::util