Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions c/parallel/src/scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,13 @@ struct scan_kernel_source
return arg;
}

static auto lookahead_make_tile_state_kernel_arg(void* ts)
static auto lookahead_make_tile_state_kernel_arg(void* ts, ::cuda::std::uint32_t* atomic_counter = nullptr)
{
// we can ignore passing a wrong AccumT, since we only store a pointer, and the kernel will have the right type
cub::detail::scan::tile_state_kernel_arg_t<scan_tile_state, char> arg;
::cuda::std::__construct_at(&arg.lookahead, static_cast<cub::detail::warpspeed::tile_state_t<char>*>(ts));
::cuda::std::__construct_at(&arg.lookahead,
cub::detail::scan::lookahead_tile_state_arg_t<char>{
static_cast<cub::detail::warpspeed::tile_state_t<char>*>(ts), atomic_counter});
return arg;
}
};
Expand Down
23 changes: 13 additions & 10 deletions cub/cub/detail/warpspeed/look_ahead.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ struct alignas(_Alignment) tile_state_t : tile_state_unaligned_t<AccumT>

template <typename AccumT>
_CCCL_DEVICE_API void
storeTileAggregate(tile_state_t<AccumT>* ptrTileStates, scan_state scanState, AccumT aggr, int index)
storeTileAggregate(tile_state_t<AccumT>* ptrTileStates, scan_state scanState, AccumT aggr, int index, int num_tiles)
{
_CCCL_ASSERT(::cuda::is_aligned(ptrTileStates, alignof(tile_state_t<AccumT>)), "");
_CCCL_ASSERT(index >= 0 && index < gridDim.x, "Reading out of bounds tile state");
_CCCL_ASSERT(index >= 0 && index < num_tiles, "Reading out of bounds tile state");

if constexpr (sizeof(tile_state_t<AccumT>) <= cub::detail::warpspeed::max_native_atomic_size()
&& ::cuda::is_trivially_copyable_v<tile_state_t<AccumT>>)
Expand All @@ -99,10 +99,10 @@ storeTileAggregate(tile_state_t<AccumT>* ptrTileStates, scan_state scanState, Ac
}

template <typename AccumT>
_CCCL_DEVICE_API tile_state_t<AccumT> loadTileAggregate(tile_state_t<AccumT>* ptrTileStates, int index)
_CCCL_DEVICE_API tile_state_t<AccumT> loadTileAggregate(tile_state_t<AccumT>* ptrTileStates, int index, int num_tiles)
{
_CCCL_ASSERT(::cuda::is_aligned(ptrTileStates, alignof(tile_state_t<AccumT>)), "");
_CCCL_ASSERT(index >= 0 && index < gridDim.x, "Reading out of bounds tile state");
_CCCL_ASSERT(index >= 0 && index < num_tiles, "Reading out of bounds tile state");

tile_state_t<AccumT> res;
if constexpr (sizeof(tile_state_t<AccumT>) <= cub::detail::warpspeed::max_native_atomic_size()
Expand Down Expand Up @@ -149,14 +149,15 @@ _CCCL_DEVICE_API void warpLoadLookahead(
tile_state_t<AccumT> (&outTileStates)[numTileStatesPerThread],
tile_state_t<AccumT>* ptrTileStates,
int idxTileCur,
int idxTileNext)
int idxTileNext,
int num_tiles)
{
for (int i = 0; i < numTileStatesPerThread; ++i)
{
const int idxTileLookahead = idxTileCur + 32 * i + laneIdx;
if (idxTileLookahead < idxTileNext)
{
outTileStates[i] = loadTileAggregate(ptrTileStates, idxTileLookahead);
outTileStates[i] = loadTileAggregate(ptrTileStates, idxTileLookahead, num_tiles);
}
else
{
Expand All @@ -182,7 +183,8 @@ template <int numTileStatesPerThread, typename AccumT, typename ScanOpT>
const int idxTilePrev,
const AccumT aggrExclusiveCtaPrev,
const int idxTileNext,
ScanOpT& scan_op)
ScanOpT& scan_op,
const int num_tiles)
{
const int laneIdx = specialRegisters.laneIdx;
const ::cuda::std::uint32_t lanemaskEq = ::cuda::ptx::get_sreg_lanemask_eq();
Expand All @@ -203,7 +205,7 @@ template <int numTileStatesPerThread, typename AccumT, typename ScanOpT>
while (idxTileCur < idxTileNext)
{
tile_state_t<AccumT> regTmpStates[numTileStatesPerThread];
warpLoadLookahead(laneIdx, regTmpStates, ptrTileStates, idxTileCur, idxTileNext);
warpLoadLookahead(laneIdx, regTmpStates, ptrTileStates, idxTileCur, idxTileNext, num_tiles);

for (int idx = 0; idx < numTileStatesPerThread; ++idx)
{
Expand Down Expand Up @@ -274,7 +276,8 @@ template <int numTileStatesPerThread, typename AccumT, typename ScanOpT>
int& idxTilePrev,
AccumT& aggrExclusiveCtaPrev,
const int idxTileNext,
ScanOpT& scan_op)
ScanOpT& scan_op,
const int num_tiles)
{
const int laneIdx = specialRegisters.laneIdx;
const ::cuda::std::uint32_t lanemaskEq = ::cuda::ptx::get_sreg_lanemask_eq();
Expand All @@ -290,7 +293,7 @@ template <int numTileStatesPerThread, typename AccumT, typename ScanOpT>
while (idxTileCur < idxTileNext)
{
tile_state_t<AccumT> regTmpStates[numTileStatesPerThread];
warpLoadLookahead(laneIdx, regTmpStates, ptrTileStates, idxTileCur, idxTileNext);
warpLoadLookahead(laneIdx, regTmpStates, ptrTileStates, idxTileCur, idxTileNext, num_tiles);

for (int idx = 0; idx < numTileStatesPerThread; ++idx)
{
Expand Down
78 changes: 47 additions & 31 deletions cub/cub/detail/warpspeed/squad/load_store.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
#include <cuda/std/__type_traits/make_nbit_int.h>
#include <cuda/std/cstdint>

#include <nv/target>

CUB_NAMESPACE_BEGIN

namespace detail::warpspeed
Expand Down Expand Up @@ -280,16 +282,20 @@ squadStoreBulkSync(Squad squad, CpAsyncOobInfo<OutputT> cpAsyncOobInfo, const ::
asm volatile("" : "+l"(srcSmem));
# endif // _CCCL_CUDA_COMPILER(NVCC, <, 13, 3)
// Copy a subset of the first 16 bytes
if (::cuda::ptx::elect_sync(~0))
{
::cuda::ptx::cp_async_bulk_cp_mask(
::cuda::ptx::space_global,
::cuda::ptx::space_shared,
cpAsyncOobInfo.ptrGmemStartAlignDown,
srcSmem,
/*size*/ 16,
byteMaskStart);
}
NV_IF_ELSE_TARGET(

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: scalar copy is duplicated in 3 cases here, consider a helper function reducing code duplication.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

agreed. I guess with "scalar" you mean the 16-byte masked copy?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes

NV_PROVIDES_SM_100,
(if (::cuda::ptx::elect_sync(~0)) {
::cuda::ptx::cp_async_bulk_cp_mask(
::cuda::ptx::space_global,
::cuda::ptx::space_shared,
cpAsyncOobInfo.ptrGmemStartAlignDown,
srcSmem,
/*size*/ 16,
byteMaskStart);
}),
(const int rank = squad.threadRank(); if (rank < 16 && ((byteMaskStart >> rank) & 1u)) {
cpAsyncOobInfo.ptrGmemStartAlignDown[rank] = srcSmem[rank];
}));
Comment on lines +296 to +298

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: Please put additional braces around the macro arguments:

Suggested change
(const int rank = squad.threadRank(); if (rank < 16 && ((byteMaskStart >> rank) & 1u)) {
cpAsyncOobInfo.ptrGmemStartAlignDown[rank] = srcSmem[rank];
}));
({const int rank = squad.threadRank(); if (rank < 16 && ((byteMaskStart >> rank) & 1u)) {
cpAsyncOobInfo.ptrGmemStartAlignDown[rank] = srcSmem[rank];
}}));

This helps clang-format.

Comment on lines +285 to +298

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important question: Since you introduced code paths that access SMEM using regular loads and stores, is the line above:

::cuda::ptx::fence_proxy_async(::cuda::ptx::space_shared);

still valid? We need to acquire SMEM in the async proxy for the bulk copy, but now we are mixing bulk copies and regular loads from SMEM. Is this still legal?

@ahendriksen may be able to help here.

}
if (doEndCopy)
{
Expand All @@ -299,32 +305,42 @@ squadStoreBulkSync(Squad squad, CpAsyncOobInfo<OutputT> cpAsyncOobInfo, const ::
asm volatile("" : "+l"(cpAsyncOobInfo.ptrGmemEndAlignDown));
# endif // _CCCL_CUDA_COMPILER(NVHPC)

// Copy a subset of the last 16 bytes
if (::cuda::ptx::elect_sync(~0))
{
::cuda::ptx::cp_async_bulk_cp_mask(
::cuda::ptx::space_global,
::cuda::ptx::space_shared,
cpAsyncOobInfo.ptrGmemEndAlignDown,
ptrSmemMiddle + cpAsyncOobInfo.underCopySizeBytes,
/*size*/ 16,
byteMaskEnd);
}
// Copy a subset of the last 16 bytes
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_100,
(if (::cuda::ptx::elect_sync(~0)) {
::cuda::ptx::cp_async_bulk_cp_mask(
::cuda::ptx::space_global,
::cuda::ptx::space_shared,
cpAsyncOobInfo.ptrGmemEndAlignDown,
ptrSmemMiddle + cpAsyncOobInfo.underCopySizeBytes,
/*size*/ 16,
byteMaskEnd);
}),
(const int rank = squad.threadRank();
const ::cuda::std::byte* tail_smem_source = ptrSmemMiddle + cpAsyncOobInfo.underCopySizeBytes;
if (rank < 16 && ((byteMaskEnd >> rank) & 1u)) {
cpAsyncOobInfo.ptrGmemEndAlignDown[rank] = tail_smem_source[rank];
}));
Comment on lines +320 to +324

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: we compute the copy masks based on offsets and counts, so it seems counterintuitive to use them for the fallback loops here. We should just use the information we used to create the masks in the first place.

Here, this would be (I think):

Suggested change
(const int rank = squad.threadRank();
const ::cuda::std::byte* tail_smem_source = ptrSmemMiddle + cpAsyncOobInfo.underCopySizeBytes;
if (rank < 16 && ((byteMaskEnd >> rank) & 1u)) {
cpAsyncOobInfo.ptrGmemEndAlignDown[rank] = tail_smem_source[rank];
}));
(const int rank = squad.threadRank();
const ::cuda::std::byte* tail_smem_source = ptrSmemMiddle + cpAsyncOobInfo.underCopySizeBytes;
if (rank < cpAsyncOobInfo.smemEndBytesAfter16BBoundary) {
cpAsyncOobInfo.ptrGmemEndAlignDown[rank] = tail_smem_source[rank];
}));

Applies to the other occurrences of this logic as well.

}
}
else
{
// Copy a subset of the first 16 bytes
if (::cuda::ptx::elect_sync(~0))
{
::cuda::ptx::cp_async_bulk_cp_mask(
::cuda::ptx::space_global,
::cuda::ptx::space_shared,
cpAsyncOobInfo.ptrGmemStartAlignDown,
srcSmem,
/*size*/ 16,
byteMaskSmall);
}
NV_IF_ELSE_TARGET(
NV_PROVIDES_SM_100,
(if (::cuda::ptx::elect_sync(~0)) {
::cuda::ptx::cp_async_bulk_cp_mask(
::cuda::ptx::space_global,
::cuda::ptx::space_shared,
cpAsyncOobInfo.ptrGmemStartAlignDown,
srcSmem,
/*size*/ 16,
byteMaskSmall);
}),
(const int rank = squad.threadRank(); if (rank < 16 && ((byteMaskSmall >> rank) & 1u)) {
cpAsyncOobInfo.ptrGmemStartAlignDown[rank] = srcSmem[rank];
}));
}
// Commit and wait for store to have completed reading from shared memory
::cuda::ptx::cp_async_bulk_commit_group();
Expand Down
43 changes: 29 additions & 14 deletions cub/cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,13 @@ struct DeviceScanKernelSource
return arg;
}

CUB_RUNTIME_FUNCTION static constexpr auto lookahead_make_tile_state_kernel_arg(void* ts)
CUB_RUNTIME_FUNCTION static constexpr auto
lookahead_make_tile_state_kernel_arg(void* ts, ::cuda::std::uint32_t* atomic_counter = nullptr)
{
tile_state_kernel_arg_t<ScanTileStateT, AccumT> arg;
::cuda::std::__construct_at(&arg.lookahead, static_cast<warpspeed::tile_state_t<AccumT>*>(ts));
::cuda::std::__construct_at(
&arg.lookahead,
lookahead_tile_state_arg_t<AccumT>{static_cast<warpspeed::tile_state_t<AccumT>*>(ts), atomic_counter});
return arg;
}
};
Expand Down Expand Up @@ -1083,6 +1086,7 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead(
OffsetT num_items,
cudaStream_t stream,
bool dependent_launch,
bool atomic_scheduling,
KernelSource kernel_source,
KernelLauncherFactory launcher_factory)
{
Expand All @@ -1101,25 +1105,33 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead(
CUB_DETAIL_STATIC_ISH_ASSERT(lookahead_policy.lookahead_items_per_thread >= 1,
"Lookahead scan policy must look ahead at least 1 item per thread");

const int grid_dim =
const int num_tiles =
static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(lookahead_policy.tile_size())));

if (d_temp_storage == nullptr)
size_t allocation_sizes[2] = {
static_cast<size_t>(num_tiles) * kernel_source.lookahead_tile_state_size(), sizeof(::cuda::std::uint32_t)};
void* allocations[2] = {};
if (const auto error =
CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes)))
{
temp_storage_bytes = static_cast<size_t>(grid_dim) * kernel_source.lookahead_tile_state_size();
return cudaSuccess;
return error;
}

if (num_items == 0)
if (d_temp_storage == nullptr)
{
return cudaSuccess;
}
Comment on lines +1120 to 1123

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: The check here is correct, but where did the num_items == 0 and early return go? This should be retained.


void* d_tile_state = allocations[0];
::cuda::std::uint32_t* d_atomic_counter = static_cast<::cuda::std::uint32_t*>(allocations[1]);

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical: add a test checking alignment, as previous implementation doesn't enforce alignment on temporary allocations.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think an assertion should be fine. Each temporary storage allocation is 256 byte aligned.


int sm_count = 0;
if (const auto error = CubDebug(launcher_factory.MultiProcessorCount(sm_count)))
{
return error;
}

const int scan_grid_dim = atomic_scheduling ? ::cuda::std::min(sm_count, num_tiles) : num_tiles;

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: try using occupancy or higher CTA count per SM if it improves performance.

// Maximum dynamic shared memory size that we can use for temporary storage.
int max_dynamic_smem_size{};
if (const auto error =
Expand All @@ -1129,7 +1141,7 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead(
}

// TODO(bgruber): we probably need to ensure alignment of d_temp_storage

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion: Drop once a test is added, as we already enforce aligned allocation of temporaries above.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can be dropped today.

_CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.lookahead_tile_state_alignment()), "");
_CCCL_ASSERT(::cuda::is_aligned(d_tile_state, kernel_source.lookahead_tile_state_alignment()), "");

auto scan_kernel = kernel_source.ScanKernel();
[[maybe_unused]] auto kernel_src = kernel_source; // need to pull a copy to not access `this` during const. eval.
Expand Down Expand Up @@ -1188,7 +1200,7 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead(
// Invoke init kernel
{
constexpr auto init_kernel_threads = 128;
const auto init_grid_size = ::cuda::ceil_div(grid_dim, init_kernel_threads);
const auto init_grid_size = ::cuda::ceil_div(num_tiles, init_kernel_threads);

# ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, %lld>>>()\n",
Expand All @@ -1200,8 +1212,8 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead(
if (const auto error = CubDebug(
launcher_factory(init_grid_size, init_kernel_threads, 0, stream, dependent_launch)
.doit(kernel_source.InitKernel(),
kernel_source.lookahead_make_tile_state_kernel_arg(d_temp_storage),
grid_dim)))
kernel_source.lookahead_make_tile_state_kernel_arg(d_tile_state, d_atomic_counter),
num_tiles)))
{
return error;
}
Expand All @@ -1223,15 +1235,16 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead(
{
const int block_dim = detail::scan::num_total_threads(lookahead_policy);
# ifdef CUB_DEBUG_LOG
_CubLog("Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", grid_dim, block_dim, smem_size, (long long) stream);
_CubLog(
"Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", scan_grid_dim, block_dim, smem_size, (long long) stream);
# endif // CUB_DEBUG_LOG

if (const auto error = CubDebug(
launcher_factory(grid_dim, block_dim, smem_size, stream, dependent_launch)
launcher_factory(scan_grid_dim, block_dim, smem_size, stream, dependent_launch)
.doit(scan_kernel,
THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(d_in),
THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(d_out),
kernel_source.lookahead_make_tile_state_kernel_arg(d_temp_storage),
kernel_source.lookahead_make_tile_state_kernel_arg(d_tile_state, d_atomic_counter),
/* start_tile, unused */ 0,
::cuda::std::move(scan_op),
init_value,
Expand Down Expand Up @@ -1285,6 +1298,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke(
const bool dependent_launch = cc >= ::cuda::compute_capability{9, 0};
if CUB_DETAIL_CONSTEXPR_ISH (policy_getter().algorithm == ScanAlgorithm::lookahead)
{
const bool atomic_scheduling = cc == ::cuda::compute_capability{9, 0};
return invoke_lookahead(
policy_getter,
d_temp_storage,
Expand All @@ -1296,6 +1310,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke(
num_items,
stream,
dependent_launch,
atomic_scheduling,
kernel_source,
launcher_factory);
}
Expand Down
30 changes: 22 additions & 8 deletions cub/cub/device/dispatch/kernels/kernel_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,23 @@

#include <thrust/type_traits/is_contiguous_iterator.h>

#include <cuda/std/cstdint>

CUB_NAMESPACE_BEGIN

namespace detail::scan
{
// Kernel-argument payload for the lookahead scan algorithm. It is stored as the
// `lookahead` member of the `tile_state_kernel_arg_t` union and is built on the
// host by `lookahead_make_tile_state_kernel_arg(ts, atomic_counter)`.
//
// The type is brace-initialized as `{tile_states, atomic_counter}` by its
// factories, so the member order below is part of its contract.
template <typename AccumT>
struct lookahead_tile_state_arg_t
{
// Pointer to the tile-state storage; `DeviceScanInitKernel` hands it to
// `device_scan_init_lookahead_body` together with `num_tiles`.
warpspeed::tile_state_t<AccumT>* tile_states;
// Optional counter; the factories default it to nullptr. When non-null,
// `DeviceScanInitKernel` resets it to 0 from a single thread
// (blockIdx.x == 0 && threadIdx.x == 0) before the scan kernel runs.
// NOTE(review): presumably used to hand out tile indices when the scan grid
// is smaller than `num_tiles` (atomic scheduling) -- confirm in
// `device_scan_lookahead_body`.
::cuda::std::uint32_t* atomic_counter;
};
Comment thread
srinivasyadav18 marked this conversation as resolved.

template <typename ScanTileState, typename AccumT>
union tile_state_kernel_arg_t
{
warpspeed::tile_state_t<AccumT>* lookahead;
lookahead_tile_state_arg_t<AccumT> lookahead;
ScanTileState lookback;

// ScanTileState<AccumT> is not trivially [default|copy]-constructible, so because of
Expand Down Expand Up @@ -69,7 +78,11 @@ _CCCL_KERNEL_ATTRIBUTES __launch_bounds__(128) void DeviceScanInitKernel(
constexpr ScanPolicy policy = current_policy<PolicySelectorT>();
if constexpr (policy.algorithm == ScanAlgorithm::lookahead)
{
device_scan_init_lookahead_body(tile_state.lookahead, num_tiles);
device_scan_init_lookahead_body(tile_state.lookahead.tile_states, num_tiles);
if (tile_state.lookahead.atomic_counter != nullptr && blockIdx.x == 0 && threadIdx.x == 0)
{
*tile_state.lookahead.atomic_counter = 0;
}
}
else
#endif // _CCCL_CUDACC_AT_LEAST(12, 8)
Expand Down Expand Up @@ -205,12 +218,13 @@ __launch_bounds__(device_scan_launch_bounds<PolicySelector>, 1) _CCCL_KERNEL_ATT
if constexpr (active_policy.algorithm == ScanAlgorithm::lookahead)
{
#if _CCCL_CUDACC_AT_LEAST(12, 8)
NV_IF_TARGET(NV_PROVIDES_SM_100, ({
auto scan_params = scanKernelParams<it_value_t<InputIteratorT>, it_value_t<OutputIteratorT>, AccumT>{
d_in, d_out, tile_state.lookahead, num_items, num_stages};
device_scan_lookahead_body<PolicySelector, ForceInclusive, RealInitValueT, StableReductionOrder>(
scan_params, scan_op, init_value);
}));
NV_IF_TARGET(
NV_PROVIDES_SM_90, ({
auto scan_params = scanKernelParams<it_value_t<InputIteratorT>, it_value_t<OutputIteratorT>, AccumT>{
d_in, d_out, tile_state.lookahead.tile_states, tile_state.lookahead.atomic_counter, num_items, num_stages};
device_scan_lookahead_body<PolicySelector, ForceInclusive, RealInitValueT, StableReductionOrder>(
scan_params, scan_op, init_value);
}));
#else
static_assert(sizeof(d_in) == 0,
"Implementation bug: Tuning policy selected lookahead, but CUDA compiler does not support it");
Expand Down
Loading
Loading