Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
## build logs
build_*.log

## aarch64 build artifacts
cpp/build-aarch64/
*.tar.bz2

## common
__pycache__
*.pyc
Expand Down
15 changes: 15 additions & 0 deletions cpp/include/raft/util/reduction.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,21 @@ DI T warpReduce(T val, ReduceLambda reduce_op)
return logicalWarpReduce<WarpSize>(val, reduce_op);
}

/**
* @brief Warp-level reduction with raft::add_op
* @tparam T Value type to be reduced
* @param val input value
* @param reduce_op raft::add_op instance
* @return Reduction result. All lanes will have the valid result.
* @note Explicit overload to disambiguate from cub::detail::scan::warpReduce when
* called with raft::add_op; avoids ambiguous template instantiation on CUDA 13.2+
*/
template <typename T>
DI T warpReduce(T val, raft::add_op reduce_op)
{
return logicalWarpReduce<WarpSize>(val, reduce_op);
}

/**
* @brief Warp-level reduction
* @tparam T Value type to be reduced
Expand Down
38 changes: 36 additions & 2 deletions cpp/tests/util/reduction.cu
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,15 @@ RAFT_KERNEL test_reduction_kernel(const int* input, int* reduction_res, ReduceLa
if (threadIdx.x == 0) { reduction_res[0] = th_val; }
}

// regression: warpReduce(val, raft::add_op{}) must not cause ambiguity with CUB on CUDA 13.2+
RAFT_KERNEL test_warp_reduce_with_add_op_kernel(const int* input, int* reduction_res)
{
assert(gridDim.x == 1);
int th_val = input[threadIdx.x];
th_val = raft::warpReduce(th_val, raft::add_op{});

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this really cause an ambiguity without the extra overload in util/reduction.cuh? It's called here with raft namespace, so I doubt CUB overload is ever picked up here.

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It showed up for me when trying to do a full source rebuild of cuvs on the dgx spark. When finally debugging my build failure, it traced down to raft, but only showed up when building for arm64.

@zbrad zbrad Jun 8, 2026

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I have the cuvs regression test for it that I'm submitting to cuvs, it's at cpp/tests/regression/warp_reduce_add_op.cu

if (threadIdx.x % 32 == 0) { atomicAdd(reduction_res, th_val); }
}

template <typename ReduceLambda>
RAFT_KERNEL test_ranked_reduction_kernel(const int* input,
int* reduction_res,
Expand Down Expand Up @@ -131,6 +140,20 @@ struct reduction_launch {
RAFT_CUDA_TRY(cudaPeekAtLastError());
ASSERT_EQ(ref_d.value(stream), ref_val);
}

static void run_warp_reduce_with_add_op(const rmm::device_uvector<int>& arr_d,
int ref_val,
rmm::cuda_stream_view stream)
{
rmm::device_scalar<int> ref_d(0, stream);
const int block_dim = 64;
const int grid_dim = 1;
test_warp_reduce_with_add_op_kernel<<<grid_dim, block_dim, 0, stream>>>(arr_d.data(),
ref_d.data());
stream.synchronize();
RAFT_CUDA_TRY(cudaPeekAtLastError());
ASSERT_EQ(ref_d.value(stream), ref_val);
}
};

template <typename T>
Expand Down Expand Up @@ -162,6 +185,12 @@ class ReductionTest : public testing::TestWithParam<std::vector<int>> { // NOLI
}

void run_binary_reduction() { reduction_launch::run_binary(arr_d, 24, stream); }

void run_warp_reduce_with_add_op()
{
// two warps of 32 threads; warp 0 sums elements 0-31 (=78), warp 1 sums 32-63 (=80)
reduction_launch::run_warp_reduce_with_add_op(arr_d, 158, stream);
}
};

const std::vector<int> test_vector{1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 5, 1, 2, 3, 4, 1, 2,
Expand All @@ -173,13 +202,18 @@ const std::vector<int> binary_test_vector{
auto reduction_input = ::testing::Values(test_vector);
auto binary_reduction_input = ::testing::Values(binary_test_vector);

using ReductionTestInt = ReductionTest<int>; // NOLINT
using BinaryReductionTestInt = ReductionTest<int>; // NOLINT
using ReductionTestInt = ReductionTest<int>; // NOLINT
using BinaryReductionTestInt = ReductionTest<int>; // NOLINT
using WarpReduceAddOpTestInt = ReductionTest<int>; // NOLINT
TEST_P(ReductionTestInt, REDUCTIONS) { run_reduction(); }
INSTANTIATE_TEST_CASE_P(ReductionTest, ReductionTestInt, reduction_input); // NOLINT
TEST_P(BinaryReductionTestInt, BINARY_REDUCTION) { run_binary_reduction(); } // NOLINT
INSTANTIATE_TEST_CASE_P(BinaryReductionTest,
BinaryReductionTestInt,
binary_reduction_input); // NOLINT
TEST_P(WarpReduceAddOpTestInt, WARP_REDUCE_WITH_ADD_OP) { run_warp_reduce_with_add_op(); } // NOLINT
INSTANTIATE_TEST_CASE_P(WarpReduceAddOpTest,
WarpReduceAddOpTestInt,
reduction_input); // NOLINT

} // namespace raft::util
Loading