diff --git a/cub/cub/device/device_histogram.cuh b/cub/cub/device/device_histogram.cuh index 23b3db2d05f..1d5b5e73196 100644 --- a/cub/cub/device/device_histogram.cuh +++ b/cub/cub/device/device_histogram.cuh @@ -1201,6 +1201,9 @@ public: //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! //! @param[in] d_temp_storage //! @devicestorage //! @@ -1236,16 +1239,17 @@ public: //! @param[in] num_pixels //! The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`) //! - //! @param[in] stream + //! @param[in] env //! @rst - //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst template + typename OffsetT, + typename EnvT = ::cuda::std::execution::env<>> CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( void* d_temp_storage, size_t& temp_storage_bytes, @@ -1254,7 +1258,7 @@ public: ::cuda::std::array num_levels, ::cuda::std::array d_levels, OffsetT num_pixels, - cudaStream_t stream = nullptr) + const EnvT& env = {}) { /// The sample value type of the input iterator using SampleT = cub::detail::it_value_t; @@ -1269,7 +1273,7 @@ public: num_pixels, (OffsetT) 1, (size_t) (sizeof(SampleT) * NUM_CHANNELS * num_pixels), - stream); + env); } //! Deprecate [Since 3.0] @@ -1402,6 +1406,9 @@ public: //! **[inferred]** Signed integer type for sequence offsets, list lengths, //! pointer differences, etc. @offset_size1 //! + //! @tparam EnvT + //! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`) + //! //! @param[in] d_temp_storage //! @devicestorage //! @@ -1444,16 +1451,17 @@ public: //! The number of bytes between starts of consecutive rows in the //! region of interest //! - //! @param[in] stream + //! @param[in] env //! @rst - //! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`. + //! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``. //! @endrst template + typename OffsetT, + typename EnvT = ::cuda::std::execution::env<>> CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange( void* d_temp_storage, size_t& temp_storage_bytes, @@ -1464,51 +1472,54 @@ public: OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, - cudaStream_t stream = nullptr) + const EnvT& env = {}) { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramRange"); - /// The sample value type of the input iterator using SampleT = cub::detail::it_value_t; ::cuda::std::bool_constant is_byte_sample; - auto policy_selector = - detail::histogram::policy_selector_from_types{}; + using default_policy_selector = + detail::histogram::policy_selector_from_types; + return detail::dispatch_with_env_and_tuning( + d_temp_storage, + temp_storage_bytes, + env, + [&](auto policy_selector, void* storage, size_t& bytes, auto stream) -> cudaError_t { + if constexpr (sizeof(OffsetT) > sizeof(int)) + { + if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX) + { + return detail::histogram::dispatch_range( + storage, + bytes, + d_samples, + d_histogram, + num_levels, + d_levels, + (int) num_row_pixels, + (int) num_rows, + (int) (row_stride_bytes / sizeof(SampleT)), + stream, + is_byte_sample, + policy_selector); + } + } - if constexpr (sizeof(OffsetT) > sizeof(int)) - { - if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX) - { - // Down-convert OffsetT data type return detail::histogram::dispatch_range( - d_temp_storage, - temp_storage_bytes, + storage, + bytes, d_samples, d_histogram, num_levels, d_levels, - (int) num_row_pixels, - (int) num_rows, - (int) (row_stride_bytes / sizeof(SampleT)), + num_row_pixels, + num_rows, + (OffsetT) (row_stride_bytes / sizeof(SampleT)), stream, is_byte_sample, policy_selector); - } - } - - return detail::histogram::dispatch_range( - d_temp_storage, - temp_storage_bytes, - d_samples, - d_histogram, - num_levels, - d_levels, - num_row_pixels, - num_rows, - (OffsetT) (row_stride_bytes / sizeof(SampleT)), - stream, - is_byte_sample, - policy_selector); + }); } //! Deprecate [Since 3.0] @@ -2363,7 +2374,7 @@ public: ::cuda::std::array num_levels, ::cuda::std::array d_levels, OffsetT num_pixels, - EnvT env = {}) + const EnvT& env = {}) { using SampleT = cub::detail::it_value_t; return MultiHistogramRange( @@ -2486,7 +2497,7 @@ public: OffsetT num_row_pixels, OffsetT num_rows, size_t row_stride_bytes, - EnvT env = {}) + const EnvT& env = {}) { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceHistogram::MultiHistogramRange"); diff --git a/cub/test/catch2_test_device_histogram_env.cu b/cub/test/catch2_test_device_histogram_env.cu index 0895462babe..06a39a07ad5 100644 --- a/cub/test/catch2_test_device_histogram_env.cu +++ b/cub/test/catch2_test_device_histogram_env.cu @@ -686,6 +686,127 @@ TEST_CASE("DeviceHistogram::MultiHistogramEven uses custom stream", "[histogram] REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); } +#if TEST_LAUNCH == 0 +C2H_TEST("DeviceHistogram::MultiHistogramRange works with user provided memory and environment", "[histogram][device]") +{ + [[maybe_unused]] constexpr int NUM_CHANNELS = 4; + [[maybe_unused]] constexpr int NUM_ACTIVE_CHANNELS = 3; + + auto d_samples = c2h::device_vector{0, 2, 1, 255, 3, 4, 2, 128}; + int num_pixels = 2; + + auto d_levels_r = c2h::device_vector{0, 2, 4}; + auto d_levels_g = c2h::device_vector{0, 3, 5}; + auto d_levels_b = c2h::device_vector{0, 1, 2, 3}; + + cuda::std::array num_levels = {3, 3, 4}; + + cuda::std::array d_levels = { + thrust::raw_pointer_cast(d_levels_r.data()), + thrust::raw_pointer_cast(d_levels_g.data()), + thrust::raw_pointer_cast(d_levels_b.data())}; + + auto d_histogram_r = c2h::device_vector(2, 0); + auto d_histogram_g = c2h::device_vector(2, 0); + auto d_histogram_b = c2h::device_vector(3, 0); + + cuda::std::array d_histogram = { + thrust::raw_pointer_cast(d_histogram_r.data()), + thrust::raw_pointer_cast(d_histogram_g.data()), + thrust::raw_pointer_cast(d_histogram_b.data())}; + + c2h::device_vector expected_r{1, 1}; + c2h::device_vector expected_g{1, 1}; + c2h::device_vector expected_b{0, 1, 1}; + + size_t expected_bytes_allocated{}; + auto error = cub::DeviceHistogram::MultiHistogramRange( + nullptr, + expected_bytes_allocated, + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + d_levels, + num_pixels); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + + auto d_temp = c2h::device_vector(expected_bytes_allocated, thrust::no_init); + void* temp_storage = thrust::raw_pointer_cast(d_temp.data()); + + auto test_multi_histogram_range = [&](const auto& env) { + size_t num_bytes = 0; + error = cub::DeviceHistogram::MultiHistogramRange( + nullptr, num_bytes, thrust::raw_pointer_cast(d_samples.data()), d_histogram, num_levels, d_levels, num_pixels, env); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + REQUIRE(expected_bytes_allocated == num_bytes); + + error = cub::DeviceHistogram::MultiHistogramRange( + temp_storage, + num_bytes, + thrust::raw_pointer_cast(d_samples.data()), + d_histogram, + num_levels, + d_levels, + num_pixels, + env); + REQUIRE(error == cudaSuccess); + REQUIRE(cudaSuccess == cudaPeekAtLastError()); + REQUIRE(cudaSuccess == cudaDeviceSynchronize()); + + // Verify result + REQUIRE(d_histogram_r == expected_r); + REQUIRE(d_histogram_g == expected_g); + REQUIRE(d_histogram_b == expected_b); + }; + + int current_device; + error = cudaGetDevice(¤t_device); + REQUIRE(error == cudaSuccess); + + SECTION("DeviceHistogram::MultiHistogramRange works with cudaStream_t") + { + cuda::stream stream{cuda::devices[current_device]}; + test_multi_histogram_range(stream.get()); + } + + SECTION("DeviceHistogram::MultiHistogramRange works with cuda::stream") + { + cuda::stream stream{cuda::devices[current_device]}; + test_multi_histogram_range(stream); + } + + SECTION("DeviceHistogram::MultiHistogramRange works with cuda::stream_ref") + { + cuda::stream stream{cuda::devices[current_device]}; + cuda::stream_ref stream_ref{stream}; + test_multi_histogram_range(stream_ref); + } + + SECTION("DeviceHistogram::MultiHistogramRange works with cuda::std::execution::env") + { + cuda::std::execution::env env{}; + test_multi_histogram_range(env); + } + + SECTION("DeviceHistogram::MultiHistogramRange works with cuda::execution::gpu") + { + const auto policy = cuda::execution::gpu; + test_multi_histogram_range(policy); + } + + SECTION("DeviceHistogram::MultiHistogramRange works with cuda::execution::gpu with stream") + { + cuda::stream stream{cuda::devices[current_device]}; + const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream); + test_multi_histogram_range(policy); + } +} +#endif // TEST_LAUNCH == 0 + C2H_TEST("DeviceHistogram::MultiHistogramRange uses environment", "[histogram][device]") { [[maybe_unused]] constexpr int NUM_CHANNELS = 4;