Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 26 additions & 12 deletions cub/cub/device/device_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ struct DeviceHistogram
//! **[inferred]** Signed integer type for sequence offsets, list lengths,
//! pointer differences, etc. @offset_size1
//!
//! @tparam EnvT
//! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] d_temp_storage
//! @devicestorage
//!
Expand All @@ -169,11 +172,15 @@ struct DeviceHistogram
//! @param[in] num_samples
//! The number of input samples (i.e., the length of `d_samples`)
//!
//! @param[in] stream
//! @param[in] env
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
template <typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT,
typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
void* d_temp_storage,
size_t& temp_storage_bytes,
Expand All @@ -183,7 +190,7 @@ struct DeviceHistogram
LevelT lower_level,
LevelT upper_level,
OffsetT num_samples,
cudaStream_t stream = nullptr)
const EnvT& env = {})
{
/// The sample value type of the input iterator
using SampleT = cub::detail::it_value_t<SampleIteratorT>;
Expand All @@ -198,7 +205,7 @@ struct DeviceHistogram
num_samples,
static_cast<OffsetT>(1),
sizeof(SampleT) * num_samples,
stream);
env);
}

//! @rst
Expand Down Expand Up @@ -288,6 +295,9 @@ struct DeviceHistogram
//! **[inferred]** Signed integer type for sequence offsets, list lengths,
//! pointer differences, etc. @offset_size1
//!
//! @tparam EnvT
//! **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
//!
//! @param[in] d_temp_storage
//! @devicestorage
//!
Expand Down Expand Up @@ -321,11 +331,15 @@ struct DeviceHistogram
//! The number of bytes between starts of consecutive rows in
//! the region of interest
//!
//! @param[in] stream
//! @param[in] env
//! @rst
//! **[optional]** CUDA stream to launch kernels within. Default is stream\ :sub:`0`.
//! **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
//! @endrst
template <typename SampleIteratorT, typename CounterT, typename LevelT, typename OffsetT>
template <typename SampleIteratorT,
typename CounterT,
typename LevelT,
typename OffsetT,
typename EnvT = ::cuda::std::execution::env<>>
CUB_RUNTIME_FUNCTION static cudaError_t HistogramEven(
void* d_temp_storage,
size_t& temp_storage_bytes,
Expand All @@ -337,7 +351,7 @@ struct DeviceHistogram
OffsetT num_row_samples,
OffsetT num_rows,
size_t row_stride_bytes,
cudaStream_t stream = nullptr)
const EnvT& env = {})
{
return MultiHistogramEven<1, 1>(
d_temp_storage,
Expand All @@ -350,7 +364,7 @@ struct DeviceHistogram
num_row_samples,
num_rows,
row_stride_bytes,
stream);
env);
}

//! @rst
Expand Down Expand Up @@ -1622,7 +1636,7 @@ public:
LevelT lower_level,
LevelT upper_level,
OffsetT num_samples,
EnvT env = {})
const EnvT& env = {})
{
using SampleT = cub::detail::it_value_t<SampleIteratorT>;
return MultiHistogramEven<1, 1>(
Expand Down Expand Up @@ -1740,7 +1754,7 @@ public:
OffsetT num_row_samples,
OffsetT num_rows,
size_t row_stride_bytes,
EnvT env = {})
const EnvT& env = {})
{
return MultiHistogramEven<1, 1>(
d_samples,
Expand Down
106 changes: 106 additions & 0 deletions cub/test/catch2_test_device_histogram_env.cu
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,112 @@ TEST_CASE("DeviceHistogram::HistogramEven works with default environment", "[his
REQUIRE(d_histogram == expected);
}

TEST_CASE("DeviceHistogram::HistogramEven works with user provided memory and environment", "[histogram][device]")
{
auto d_samples = c2h::device_vector<int>{0, 2, 1, 0, 3, 4, 2, 1};
int num_samples = static_cast<int>(d_samples.size());
int num_levels = 6;
int lower_level = 0;
int upper_level = 5;
auto d_histogram = c2h::device_vector<int>(num_levels - 1, 0);

c2h::device_vector<int> expected{2, 2, 2, 1, 1};

size_t expected_bytes_allocated{};
auto error = cub::DeviceHistogram::HistogramEven(
nullptr,
expected_bytes_allocated,
thrust::raw_pointer_cast(d_samples.data()),
thrust::raw_pointer_cast(d_histogram.data()),
num_levels,
lower_level,
upper_level,
num_samples);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

auto d_temp = c2h::device_vector<uint8_t>(expected_bytes_allocated, thrust::no_init);
void* temp_storage = thrust::raw_pointer_cast(d_temp.data());

auto test_histogram_even = [&](const auto& env) {
size_t num_bytes = 0;
error = cub::DeviceHistogram::HistogramEven(
nullptr,
num_bytes,
thrust::raw_pointer_cast(d_samples.data()),
thrust::raw_pointer_cast(d_histogram.data()),
num_levels,
lower_level,
upper_level,
num_samples,
env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());
REQUIRE(expected_bytes_allocated == num_bytes);

error = cub::DeviceHistogram::HistogramEven(
temp_storage,
num_bytes,
thrust::raw_pointer_cast(d_samples.data()),
thrust::raw_pointer_cast(d_histogram.data()),
num_levels,
lower_level,
upper_level,
num_samples,
env);
REQUIRE(error == cudaSuccess);
REQUIRE(cudaSuccess == cudaPeekAtLastError());
REQUIRE(cudaSuccess == cudaDeviceSynchronize());

// Verify result
REQUIRE(d_histogram == expected);
};

int current_device;
error = cudaGetDevice(&current_device);
REQUIRE(error == cudaSuccess);

SECTION("DeviceHistogram::HistogramEven works with cudaStream_t")
{
cuda::stream stream{cuda::devices[current_device]};
test_histogram_even(stream.get());
}

SECTION("DeviceHistogram::HistogramEven works with cuda::stream")
{
cuda::stream stream{cuda::devices[current_device]};
test_histogram_even(stream);
}

SECTION("DeviceHistogram::HistogramEven works with cuda::stream_ref")
{
cuda::stream stream{cuda::devices[current_device]};
cuda::stream_ref stream_ref{stream};
test_histogram_even(stream_ref);
}

SECTION("DeviceHistogram::HistogramEven works with cuda::std::execution::env")
{
cuda::std::execution::env env{};
test_histogram_even(env);
}

SECTION("DeviceHistogram::HistogramEven works with cuda::execution::gpu")
{
const auto policy = cuda::execution::gpu;
test_histogram_even(policy);
}

SECTION("DeviceHistogram::HistogramEven works with cuda::execution::gpu with stream")
{
cuda::stream stream{cuda::devices[current_device]};
const auto policy = cuda::execution::gpu.with(cuda::get_stream, stream);
test_histogram_even(policy);
}
}

TEST_CASE("DeviceHistogram::HistogramRange works with default environment", "[histogram][device]")
{
auto d_samples = c2h::device_vector<float>{2.2f, 6.1f, 7.5f, 2.9f, 3.5f, 0.3f, 2.9f, 2.1f};
Expand Down
Loading