Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 51 additions & 40 deletions cub/cub/device/device_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1201,6 +1201,9 @@ public:
//! **[inferred]** Signed integer type for sequence offsets, list lengths,
//! pointer differences, etc. @offset_size1
//!
//! @tparam EnvT
//! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] d_temp_storage
//! @devicestorage
//!
Expand Down Expand Up @@ -1236,16 +1239,17 @@ public:
//! @param[in] num_pixels
//! The number of multi-channel pixels (i.e., the length of `d_samples / NUM_CHANNELS`)
//!
//! @param[in] stream
//! @param[in] env
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT>
typename OffsetT,
typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
void* d_temp_storage,
size_t& temp_storage_bytes,
Expand All @@ -1254,7 +1258,7 @@ public:
::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_levels,
::cuda::std::array<const LevelT*, NUM_ACTIVE_CHANNELS> d_levels,
OffsetT num_pixels,
cudaStream_t stream = nullptr)
const EnvT& env = {})
{
/// The sample value type of the input iterator
using SampleT = cub::detail::it_value_t<SampleIteratorT>;
Expand All @@ -1269,7 +1273,7 @@ public:
num_pixels,
(OffsetT) 1,
(size_t) (sizeof(SampleT) * NUM_CHANNELS * num_pixels),
stream);
env);
}

//! Deprecate [Since 3.0]
Expand Down Expand Up @@ -1402,6 +1406,9 @@ public:
//! **[inferred]** Signed integer type for sequence offsets, list lengths,
//! pointer differences, etc. @offset_size1
//!
//! @tparam EnvT
//! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] d_temp_storage
//! @devicestorage
//!
Expand Down Expand Up @@ -1444,16 +1451,17 @@ public:
//! The number of bytes between starts of consecutive rows in the
//! region of interest
//!
//! @param[in] stream
//! @param[in] env
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <int NUM_CHANNELS,
int NUM_ACTIVE_CHANNELS,
typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT>
typename OffsetT,
typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION static cudaError_t MultiHistogramRange(
void* d_temp_storage,
size_t& temp_storage_bytes,
Expand All @@ -1464,51 +1472,54 @@ public:
OffsetT num_row_pixels,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = nullptr)
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceHistogram::MultiHistogramRange");

/// The sample value type of the input iterator
using SampleT = cub::detail::it_value_t<SampleIteratorT>;
::cuda::std::bool_constant<sizeof(SampleT) == 1> is_byte_sample;

auto policy_selector =
detail::histogram::policy_selector_from_types<SampleT, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, false>{};
using default_policy_selector =
detail::histogram::policy_selector_from_types<SampleT, CounterT, NUM_CHANNELS, NUM_ACTIVE_CHANNELS, false>;
return detail::dispatch_with_env_and_tuning<default_policy_selector>(
d_temp_storage,
temp_storage_bytes,
env,
[&](auto policy_selector, void* storage, size_t& bytes, auto stream) -> cudaError_t {
if constexpr (sizeof(OffsetT) > sizeof(int))
{
if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
{
return detail::histogram::dispatch_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
storage,
bytes,
d_samples,
d_histogram,
num_levels,
d_levels,
(int) num_row_pixels,
(int) num_rows,
(int) (row_stride_bytes / sizeof(SampleT)),
stream,
is_byte_sample,
policy_selector);
}
}

if constexpr (sizeof(OffsetT) > sizeof(int))
{
if ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) INT_MAX)
{
// Down-convert OffsetT data type
return detail::histogram::dispatch_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
storage,
bytes,
d_samples,
d_histogram,
num_levels,
d_levels,
(int) num_row_pixels,
(int) num_rows,
(int) (row_stride_bytes / sizeof(SampleT)),
num_row_pixels,
num_rows,
(OffsetT) (row_stride_bytes / sizeof(SampleT)),
stream,
is_byte_sample,
policy_selector);
}
}

return detail::histogram::dispatch_range<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
d_temp_storage,
temp_storage_bytes,
d_samples,
d_histogram,
num_levels,
d_levels,
num_row_pixels,
num_rows,
(OffsetT) (row_stride_bytes / sizeof(SampleT)),
stream,
is_byte_sample,
policy_selector);
});
}

//! Deprecate [Since 3.0]
Expand Down Expand Up @@ -2363,7 +2374,7 @@ public:
::cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_levels,
::cuda::std::array<const LevelT*, NUM_ACTIVE_CHANNELS> d_levels,
OffsetT num_pixels,
EnvT env = {})
const EnvT& env = {})
{
using SampleT = cub::detail::it_value_t<SampleIteratorT>;
return MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
Expand Down Expand Up @@ -2486,7 +2497,7 @@ public:
OffsetT num_row_pixels,
OffsetT num_rows,
size_t row_stride_bytes,
EnvT env = {})
const EnvT& env = {})
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceHistogram::MultiHistogramRange");

Expand Down
121 changes: 121 additions & 0 deletions cub/test/catch2_test_device_histogram_env.cu
Original file line number Diff line number Diff line change
Expand Up @@ -686,6 +686,127 @@ TEST_CASE("DeviceHistogram::MultiHistogramEven uses custom stream", "[histogram]
REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream));
}

#if TEST_LAUNCH == 0
C2H_TEST("DeviceHistogram::MultiHistogramRange works with user provided memory and environment", "[histogram][device]")
{
[[maybe_unused]] constexpr int NUM_CHANNELS = 4;
[[maybe_unused]] constexpr int NUM_ACTIVE_CHANNELS = 3;

auto d_samples = c2h::device_vector<unsigned char>{0, 2, 1, 255, 3, 4, 2, 128};
int num_pixels = 2;

auto d_levels_r = c2h::device_vector<unsigned char>{0, 2, 4};
auto d_levels_g = c2h::device_vector<unsigned char>{0, 3, 5};
auto d_levels_b = c2h::device_vector<unsigned char>{0, 1, 2, 3};

cuda::std::array<int, NUM_ACTIVE_CHANNELS> num_levels = {3, 3, 4};

cuda::std::array<const unsigned char*, NUM_ACTIVE_CHANNELS> d_levels = {
thrust::raw_pointer_cast(d_levels_r.data()),
thrust::raw_pointer_cast(d_levels_g.data()),
thrust::raw_pointer_cast(d_levels_b.data())};

auto d_histogram_r = c2h::device_vector<int>(2, 0);
auto d_histogram_g = c2h::device_vector<int>(2, 0);
auto d_histogram_b = c2h::device_vector<int>(3, 0);

cuda::std::array<int*, NUM_ACTIVE_CHANNELS> d_histogram = {
thrust::raw_pointer_cast(d_histogram_r.data()),
thrust::raw_pointer_cast(d_histogram_g.data()),
thrust::raw_pointer_cast(d_histogram_b.data())};

c2h::device_vector<int> expected_r{1, 1};
c2h::device_vector<int> expected_g{1, 1};
c2h::device_vector<int> expected_b{0, 1, 1};

size_t expected_bytes_allocated{};
auto error = cub::DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
nullptr,
expected_bytes_allocated,
thrust::raw_pointer_cast(d_samples.data()),
d_histogram,
num_levels,
d_levels,
num_pixels);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

auto d_temp = c2h::device_vector<uint8_t>(expected_bytes_allocated, thrust::no_init);
void* temp_storage = thrust::raw_pointer_cast(d_temp.data());

auto test_multi_histogram_range = [&](const auto& env) {
size_t num_bytes = 0;
error = cub::DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
nullptr, num_bytes, thrust::raw_pointer_cast(d_samples.data()), d_histogram, num_levels, d_levels, num_pixels, env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
REQUIRE(expected_bytes_allocated == num_bytes);

error = cub::DeviceHistogram::MultiHistogramRange<NUM_CHANNELS, NUM_ACTIVE_CHANNELS>(
temp_storage,
num_bytes,
thrust::raw_pointer_cast(d_samples.data()),
d_histogram,
num_levels,
d_levels,
num_pixels,
env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

// Verify result
REQUIRE(d_histogram_r == expected_r);
REQUIRE(d_histogram_g == expected_g);
REQUIRE(d_histogram_b == expected_b);
};

int current_device;
error = cudaGetDevice(&current_device);
REQUIRE(error == cudaSuccess);

SECTION("DeviceHistogram::MultiHistogramRange works with cudaStream_t")
{
cuda::stream stream{cuda::devices[current_device]};
test_multi_histogram_range(stream.get());
}

SECTION("DeviceHistogram::MultiHistogramRange works with cuda::stream")
{
cuda::stream stream{cuda::devices[current_device]};
test_multi_histogram_range(stream);
}

SECTION("DeviceHistogram::MultiHistogramRange works with cuda::stream_ref")
{
cuda::stream stream{cuda::devices[current_device]};
cuda::stream_ref stream_ref{stream};
test_multi_histogram_range(stream_ref);
}

SECTION("DeviceHistogram::MultiHistogramRange works with cuda::std::execution::env")
{
cuda::std::execution::env env{};
test_multi_histogram_range(env);
}

SECTION("DeviceHistogram::MultiHistogramRange works with cuda::execution::gpu")
{
const auto policy = cuda::execution::gpu;
test_multi_histogram_range(policy);
}

SECTION("DeviceHistogram::MultiHistogramRange works with cuda::execution::gpu with stream")
{
cuda::stream stream{cuda::devices[current_device]};
const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
test_multi_histogram_range(policy);
}
}
#endif // TEST_LAUNCH == 0

C2H_TEST("DeviceHistogram::MultiHistogramRange uses environment", "[histogram][device]")
{
[[maybe_unused]] constexpr int NUM_CHANNELS = 4;
Expand Down
Loading