diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index ddc54a63e57..23b3db2d05f 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -143,6 +143,9 @@ struct DeviceHistogram //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! //! @param[in] d_temp_storage //! @devicestorage //! @@ -169,11 +172,15 @@ struct DeviceHistogram //! @param[in] num_samples //! The number of input samples (i.e., the length of `d_samples`) //! - //! @param[in] stream + //! @param[in] env //! @rst - //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst - template + template > CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, size_t& temp_storage_bytes, @@ -183,7 +190,7 @@ struct DeviceHistogram LevelT lower_level, LevelT upper_level, OffsetT num_samples, - cudaStream_t stream = nullptr) + const EnvT& env = {}) { /// The sample value type of the input iterator using SampleT = cub::detail::it_value_t; @@ -198,7 +205,7 @@ struct DeviceHistogram num_samples, static_cast(1), sizeof(SampleT) * num_samples, - stream); + env); } //! @rst @@ -288,6 +295,9 @@ struct DeviceHistogram //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! //! @param[in] d_temp_storage //! @devicestorage //! @@ -321,11 +331,15 @@ struct DeviceHistogram //! The number of bytes between starts of consecutive rows in //! the region of interest //! - //! @param[in] stream + //! @param[in] env //! @rst - //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst - template + template > CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven( void* d_temp_storage, size_t& temp_storage_bytes, @@ -337,7 +351,7 @@ struct DeviceHistogram OffsetT num_row_samples, OffsetT num_rows, size_t row_stride_bytes, - cudaStream_t stream = nullptr) + const EnvT& env = {}) { return MultiHistogramEven<1, 1>( d_temp_storage, @@ -350,7 +364,7 @@ struct DeviceHistogram num_row_samples, num_rows, row_stride_bytes, - stream); + env); } //! @rst @@ -1622,7 +1636,7 @@ public: LevelT lower_level, LevelT upper_level, OffsetT num_samples, - EnvT env = {}) + const EnvT& env = {}) { using SampleT = cub::detail::it_value_t; return MultiHistogramEven<1, 1>( @@ -1740,7 +1754,7 @@ public: OffsetT num_row_samples, OffsetT num_rows, size_t row_stride_bytes, - EnvT env = {}) + const EnvT& env = {}) { return MultiHistogramEven<1, 1>( d_samples, diff --git a/cub/test/catch2_test_device_histogram_env.cu b/cub/test/catch2_test_device_histogram_env.cu index 836b70d34e2..0895462babe 100644 --- a/cub/test/catch2_test_device_histogram_env.cu +++ b/cub/test/catch2_test_device_histogram_env.cu @@ -64,6 +64,112 @@ TEST_CASE("DeviceHistogram::HistogramEven works with default environment", "[his REQUIRE(d_histogram == expected); } +TEST_CASE("DeviceHistogram::HistogramEven works with user provided memory and environment", "[histogram][device]") +{ + auto d_samples = c2h::device_vector{0, 2, 1, 0, 3, 4, 2, 1}; + int num_samples = static_cast(d_samples.size()); + int num_levels = 6; + int lower_level = 0; + int upper_level = 5; + auto d_histogram = c2h::device_vector(num_levels - 1, 0); + + c2h::device_vector expected{2, 2, 2, 1, 1}; + + size_t expected_bytes_allocated{}; + auto error = cub::DeviceHistogram::HistogramEven( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_samples); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + + auto d_temp = c2h::device_vector(expected_bytes_allocated, thrust::no_init); + void* temp_storage = thrust::raw_pointer_cast(d_temp.data()); + + auto test_histogram_even = [&](const auto& env) { + size_t num_bytes = 0; + error = cub::DeviceHistogram::HistogramEven( + nullptr, + num_bytes, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_samples, + env); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + REQUIRE(expected_bytes_allocated == num_bytes); + + error = cub::DeviceHistogram::HistogramEven( + temp_storage, + num_bytes, + thrust::raw_pointer_cast(d_samples.data()), + thrust::raw_pointer_cast(d_histogram.data()), + num_levels, + lower_level, + upper_level, + num_samples, + env); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + + // Verify result + REQUIRE(d_histogram == expected); + }; + + int current_device; + error = cudaGetDevice(¤t_device); + REQUIRE(error == cudaSuccess); + + SECTION("DeviceHistogram::HistogramEven works with cudaStream_t") + { + cuda::stream stream{cuda::devices[current_device]}; + test_histogram_even(stream.get()); + } + + SECTION("DeviceHistogram::HistogramEven works with cuda::stream") + { + cuda::stream stream{cuda::devices[current_device]}; + test_histogram_even(stream); + } + + SECTION("DeviceHistogram::HistogramEven works with cuda::stream_ref") + { + cuda::stream stream{cuda::devices[current_device]}; + cuda::stream_ref stream_ref{stream}; + test_histogram_even(stream_ref); + } + + SECTION("DeviceHistogram::HistogramEven works with cuda::std::execution::env") + { + cuda::std::execution::env env{}; + test_histogram_even(env); + } + + SECTION("DeviceHistogram::HistogramEven works with cuda::execution::gpu") + { + const auto policy = cuda::execution::gpu; + test_histogram_even(policy); + } + + SECTION("DeviceHistogram::HistogramEven works with cuda::execution::gpu with stream") + { + cuda::stream stream{cuda::devices[current_device]}; + const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream); + test_histogram_even(policy); + } +} + TEST_CASE("DeviceHistogram::HistogramRange works with default environment", "[histogram][device]") { auto d_samples = c2h::device_vector{2.2f, 6.1f, 7.5f, 2.9f, 3.5f, 0.3f, 2.9f, 2.1f};