-
Notifications
You must be signed in to change notification settings - Fork 419
warpspeed run_to_run deterministic scan for SM90 using atomic global counter #9565
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -25,6 +25,8 @@ | |||||||||||||||||||||
| #include <cuda/std/__type_traits/make_nbit_int.h> | ||||||||||||||||||||||
| #include <cuda/std/cstdint> | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| #include <nv/target> | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| CUB_NAMESPACE_BEGIN | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| namespace detail::warpspeed | ||||||||||||||||||||||
|
|
@@ -280,16 +282,20 @@ squadStoreBulkSync(Squad squad, CpAsyncOobInfo<OutputT> cpAsyncOobInfo, const :: | |||||||||||||||||||||
| asm volatile("" : "+l"(srcSmem)); | ||||||||||||||||||||||
| # endif // _CCCL_CUDA_COMPILER(NVCC, <, 13, 3) | ||||||||||||||||||||||
| // Copy a subset of the first 16 bytes | ||||||||||||||||||||||
| if (::cuda::ptx::elect_sync(~0)) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| ::cuda::ptx::cp_async_bulk_cp_mask( | ||||||||||||||||||||||
| ::cuda::ptx::space_global, | ||||||||||||||||||||||
| ::cuda::ptx::space_shared, | ||||||||||||||||||||||
| cpAsyncOobInfo.ptrGmemStartAlignDown, | ||||||||||||||||||||||
| srcSmem, | ||||||||||||||||||||||
| /*size*/ 16, | ||||||||||||||||||||||
| byteMaskStart); | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| NV_IF_ELSE_TARGET( | ||||||||||||||||||||||
| NV_PROVIDES_SM_100, | ||||||||||||||||||||||
| (if (::cuda::ptx::elect_sync(~0)) { | ||||||||||||||||||||||
| ::cuda::ptx::cp_async_bulk_cp_mask( | ||||||||||||||||||||||
| ::cuda::ptx::space_global, | ||||||||||||||||||||||
| ::cuda::ptx::space_shared, | ||||||||||||||||||||||
| cpAsyncOobInfo.ptrGmemStartAlignDown, | ||||||||||||||||||||||
| srcSmem, | ||||||||||||||||||||||
| /*size*/ 16, | ||||||||||||||||||||||
| byteMaskStart); | ||||||||||||||||||||||
| }), | ||||||||||||||||||||||
| (const int rank = squad.threadRank(); if (rank < 16 && ((byteMaskStart >> rank) & 1u)) { | ||||||||||||||||||||||
| cpAsyncOobInfo.ptrGmemStartAlignDown[rank] = srcSmem[rank]; | ||||||||||||||||||||||
| })); | ||||||||||||||||||||||
|
Comment on lines
+296
to
+298
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: Please put additional braces around the macro arguments:
Suggested change
This helps clang-format.
Comment on lines
+285
to
+298
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important question: Since you introduced code paths that access SMEM using regular loads and stores, is the line above: still valid? We need to acquire SMEM in the async proxy for the bulk copy, but now we are mixing bulk copies and regular loads from SMEM. Is this still legal? @ahendriksen may be able to help here. |
||||||||||||||||||||||
| } | ||||||||||||||||||||||
| if (doEndCopy) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
|
|
@@ -299,32 +305,42 @@ squadStoreBulkSync(Squad squad, CpAsyncOobInfo<OutputT> cpAsyncOobInfo, const :: | |||||||||||||||||||||
| asm volatile("" : "+l"(cpAsyncOobInfo.ptrGmemEndAlignDown)); | ||||||||||||||||||||||
| # endif // _CCCL_CUDA_COMPILER(NVHPC) | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| // Copy a subset of the last 16 bytes | ||||||||||||||||||||||
| if (::cuda::ptx::elect_sync(~0)) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| ::cuda::ptx::cp_async_bulk_cp_mask( | ||||||||||||||||||||||
| ::cuda::ptx::space_global, | ||||||||||||||||||||||
| ::cuda::ptx::space_shared, | ||||||||||||||||||||||
| cpAsyncOobInfo.ptrGmemEndAlignDown, | ||||||||||||||||||||||
| ptrSmemMiddle + cpAsyncOobInfo.underCopySizeBytes, | ||||||||||||||||||||||
| /*size*/ 16, | ||||||||||||||||||||||
| byteMaskEnd); | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| // Copy a subset of the first 16 bytes | ||||||||||||||||||||||
| NV_IF_ELSE_TARGET( | ||||||||||||||||||||||
| NV_PROVIDES_SM_100, | ||||||||||||||||||||||
| (if (::cuda::ptx::elect_sync(~0)) { | ||||||||||||||||||||||
| ::cuda::ptx::cp_async_bulk_cp_mask( | ||||||||||||||||||||||
| ::cuda::ptx::space_global, | ||||||||||||||||||||||
| ::cuda::ptx::space_shared, | ||||||||||||||||||||||
| cpAsyncOobInfo.ptrGmemEndAlignDown, | ||||||||||||||||||||||
| ptrSmemMiddle + cpAsyncOobInfo.underCopySizeBytes, | ||||||||||||||||||||||
| /*size*/ 16, | ||||||||||||||||||||||
| byteMaskEnd); | ||||||||||||||||||||||
| }), | ||||||||||||||||||||||
| (const int rank = squad.threadRank(); | ||||||||||||||||||||||
| const ::cuda::std::byte* tail_smem_source = ptrSmemMiddle + cpAsyncOobInfo.underCopySizeBytes; | ||||||||||||||||||||||
| if (rank < 16 && ((byteMaskEnd >> rank) & 1u)) { | ||||||||||||||||||||||
| cpAsyncOobInfo.ptrGmemEndAlignDown[rank] = tail_smem_source[rank]; | ||||||||||||||||||||||
| })); | ||||||||||||||||||||||
|
Comment on lines
+320
to
+324
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: we compute the copy masks based on offsets and counts, so it seems counter intuitive to use them for the fallback loops here. We should just use the information we used to create the masks in the first place. Here, this would be (I think):
Suggested change
Applies to the other occurrences of this logic as well. |
||||||||||||||||||||||
| } | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| else | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| // Copy a subset of the first 16 bytes | ||||||||||||||||||||||
| if (::cuda::ptx::elect_sync(~0)) | ||||||||||||||||||||||
| { | ||||||||||||||||||||||
| ::cuda::ptx::cp_async_bulk_cp_mask( | ||||||||||||||||||||||
| ::cuda::ptx::space_global, | ||||||||||||||||||||||
| ::cuda::ptx::space_shared, | ||||||||||||||||||||||
| cpAsyncOobInfo.ptrGmemStartAlignDown, | ||||||||||||||||||||||
| srcSmem, | ||||||||||||||||||||||
| /*size*/ 16, | ||||||||||||||||||||||
| byteMaskSmall); | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| NV_IF_ELSE_TARGET( | ||||||||||||||||||||||
| NV_PROVIDES_SM_100, | ||||||||||||||||||||||
| (if (::cuda::ptx::elect_sync(~0)) { | ||||||||||||||||||||||
| ::cuda::ptx::cp_async_bulk_cp_mask( | ||||||||||||||||||||||
| ::cuda::ptx::space_global, | ||||||||||||||||||||||
| ::cuda::ptx::space_shared, | ||||||||||||||||||||||
| cpAsyncOobInfo.ptrGmemStartAlignDown, | ||||||||||||||||||||||
| srcSmem, | ||||||||||||||||||||||
| /*size*/ 16, | ||||||||||||||||||||||
| byteMaskSmall); | ||||||||||||||||||||||
| }), | ||||||||||||||||||||||
| (const int rank = squad.threadRank(); if (rank < 16 && ((byteMaskSmall >> rank) & 1u)) { | ||||||||||||||||||||||
| cpAsyncOobInfo.ptrGmemStartAlignDown[rank] = srcSmem[rank]; | ||||||||||||||||||||||
| })); | ||||||||||||||||||||||
| } | ||||||||||||||||||||||
| // Commit and wait for store to have completed reading from shared memory | ||||||||||||||||||||||
| ::cuda::ptx::cp_async_bulk_commit_group(); | ||||||||||||||||||||||
|
|
||||||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -140,10 +140,13 @@ struct DeviceScanKernelSource | |
| return arg; | ||
| } | ||
|
|
||
| CUB_RUNTIME_FUNCTION static constexpr auto lookahead_make_tile_state_kernel_arg(void* ts) | ||
| CUB_RUNTIME_FUNCTION static constexpr auto | ||
| lookahead_make_tile_state_kernel_arg(void* ts, ::cuda::std::uint32_t* atomic_counter = nullptr) | ||
| { | ||
| tile_state_kernel_arg_t<ScanTileStateT, AccumT> arg; | ||
| ::cuda::std::__construct_at(&arg.lookahead, static_cast<warpspeed::tile_state_t<AccumT>*>(ts)); | ||
| ::cuda::std::__construct_at( | ||
| &arg.lookahead, | ||
| lookahead_tile_state_arg_t<AccumT>{static_cast<warpspeed::tile_state_t<AccumT>*>(ts), atomic_counter}); | ||
| return arg; | ||
| } | ||
| }; | ||
|
|
@@ -1083,6 +1086,7 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead( | |
| OffsetT num_items, | ||
| cudaStream_t stream, | ||
| bool dependent_launch, | ||
| bool atomic_scheduling, | ||
| KernelSource kernel_source, | ||
| KernelLauncherFactory launcher_factory) | ||
| { | ||
|
|
@@ -1101,25 +1105,33 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead( | |
| CUB_DETAIL_STATIC_ISH_ASSERT(lookahead_policy.lookahead_items_per_thread >= 1, | ||
| "Lookahead scan policy must look ahead at least 1 item per thread"); | ||
|
|
||
| const int grid_dim = | ||
| const int num_tiles = | ||
| static_cast<int>(::cuda::ceil_div(num_items, static_cast<OffsetT>(lookahead_policy.tile_size()))); | ||
|
|
||
| if (d_temp_storage == nullptr) | ||
| size_t allocation_sizes[2] = { | ||
| static_cast<size_t>(num_tiles) * kernel_source.lookahead_tile_state_size(), sizeof(::cuda::std::uint32_t)}; | ||
| void* allocations[2] = {}; | ||
| if (const auto error = | ||
| CubDebug(detail::alias_temporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) | ||
| { | ||
| temp_storage_bytes = static_cast<size_t>(grid_dim) * kernel_source.lookahead_tile_state_size(); | ||
| return cudaSuccess; | ||
| return error; | ||
| } | ||
|
|
||
| if (num_items == 0) | ||
| if (d_temp_storage == nullptr) | ||
| { | ||
| return cudaSuccess; | ||
| } | ||
|
Comment on lines
+1120
to
1123
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: The check here is correct, but where did the |
||
|
|
||
| void* d_tile_state = allocations[0]; | ||
| ::cuda::std::uint32_t* d_atomic_counter = static_cast<::cuda::std::uint32_t*>(allocations[1]); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. critical: add a test checking alignment, as previous implementation doesn't enforce alignment on temporary allocations.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think an assertion should be fine. Each temporary storage allocation is 256 byte aligned. |
||
|
|
||
| int sm_count = 0; | ||
| if (const auto error = CubDebug(launcher_factory.MultiProcessorCount(sm_count))) | ||
| { | ||
| return error; | ||
| } | ||
|
|
||
| const int scan_grid_dim = atomic_scheduling ? ::cuda::std::min(sm_count, num_tiles) : num_tiles; | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: try using occupancy or higher CTA count per SM if it improves performance. |
||
| // Maximum dynamic shared memory size that we can use for temporary storage. | ||
| int max_dynamic_smem_size{}; | ||
| if (const auto error = | ||
|
|
@@ -1129,7 +1141,7 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead( | |
| } | ||
|
|
||
| // TODO(bgruber): we probably need to ensure alignment of d_temp_storage | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion: Drop once a test is added, as we already enforce aligned allocation of temporaries above.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can be dropped today. |
||
| _CCCL_ASSERT(::cuda::is_aligned(d_temp_storage, kernel_source.lookahead_tile_state_alignment()), ""); | ||
| _CCCL_ASSERT(::cuda::is_aligned(d_tile_state, kernel_source.lookahead_tile_state_alignment()), ""); | ||
|
|
||
| auto scan_kernel = kernel_source.ScanKernel(); | ||
| [[maybe_unused]] auto kernel_src = kernel_source; // need to pull a copy to not access `this` during const. eval. | ||
|
|
@@ -1188,7 +1200,7 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead( | |
| // Invoke init kernel | ||
| { | ||
| constexpr auto init_kernel_threads = 128; | ||
| const auto init_grid_size = ::cuda::ceil_div(grid_dim, init_kernel_threads); | ||
| const auto init_grid_size = ::cuda::ceil_div(num_tiles, init_kernel_threads); | ||
|
|
||
| # ifdef CUB_DEBUG_LOG | ||
| _CubLog("Invoking DeviceScanInitKernel<<<%d, %d, 0, %lld>>>()\n", | ||
|
|
@@ -1200,8 +1212,8 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead( | |
| if (const auto error = CubDebug( | ||
| launcher_factory(init_grid_size, init_kernel_threads, 0, stream, dependent_launch) | ||
| .doit(kernel_source.InitKernel(), | ||
| kernel_source.lookahead_make_tile_state_kernel_arg(d_temp_storage), | ||
| grid_dim))) | ||
| kernel_source.lookahead_make_tile_state_kernel_arg(d_tile_state, d_atomic_counter), | ||
| num_tiles))) | ||
| { | ||
| return error; | ||
| } | ||
|
|
@@ -1223,15 +1235,16 @@ CUB_RUNTIME_FUNCTION _CCCL_HOST _CCCL_FORCEINLINE cudaError_t invoke_lookahead( | |
| { | ||
| const int block_dim = detail::scan::num_total_threads(lookahead_policy); | ||
| # ifdef CUB_DEBUG_LOG | ||
| _CubLog("Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", grid_dim, block_dim, smem_size, (long long) stream); | ||
| _CubLog( | ||
| "Invoking DeviceScanKernel<<<%d, %d, %d, %lld>>>()\n", scan_grid_dim, block_dim, smem_size, (long long) stream); | ||
| # endif // CUB_DEBUG_LOG | ||
|
|
||
| if (const auto error = CubDebug( | ||
| launcher_factory(grid_dim, block_dim, smem_size, stream, dependent_launch) | ||
| launcher_factory(scan_grid_dim, block_dim, smem_size, stream, dependent_launch) | ||
| .doit(scan_kernel, | ||
| THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(d_in), | ||
| THRUST_NS_QUALIFIER::try_unwrap_contiguous_iterator(d_out), | ||
| kernel_source.lookahead_make_tile_state_kernel_arg(d_temp_storage), | ||
| kernel_source.lookahead_make_tile_state_kernel_arg(d_tile_state, d_atomic_counter), | ||
| /* start_tile, unused */ 0, | ||
| ::cuda::std::move(scan_op), | ||
| init_value, | ||
|
|
@@ -1285,6 +1298,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke( | |
| const bool dependent_launch = cc >= ::cuda::compute_capability{9, 0}; | ||
| if CUB_DETAIL_CONSTEXPR_ISH (policy_getter().algorithm == ScanAlgorithm::lookahead) | ||
| { | ||
| const bool atomic_scheduling = cc == ::cuda::compute_capability{9, 0}; | ||
| return invoke_lookahead( | ||
| policy_getter, | ||
| d_temp_storage, | ||
|
|
@@ -1296,6 +1310,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke( | |
| num_items, | ||
| stream, | ||
| dependent_launch, | ||
| atomic_scheduling, | ||
| kernel_source, | ||
| launcher_factory); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
suggestion: scalar copy is duplicated in 3 cases here, consider a helper function reducing code duplication.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
agreed. I guess with "scalar" you mean the 16-byte masked copy?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes