-
Notifications
You must be signed in to change notification settings - Fork 419
Add cuda::bit_ffs to <cuda/bit> #9609
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
temujinkz
wants to merge
3
commits into
NVIDIA:main
Choose a base branch
from
temujinkz:cuda-bit-ffs
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+272
−0
Open
Changes from 2 commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| .. _libcudacxx-extended-api-bit-bit_ffs: | ||
|
|
||
| ``cuda::bit_ffs`` | ||
| ================= | ||
|
|
||
| Defined in the ``<cuda/bit>`` header. | ||
|
|
||
| .. code:: cuda | ||
|
|
||
| namespace cuda { | ||
|
|
||
| template <typename T> | ||
| [[nodiscard]] __host__ __device__ constexpr | ||
| int bit_ffs(T value) noexcept; | ||
|
|
||
| } // namespace cuda | ||
|
|
||
| The function returns one plus the index of the least significant set bit of ``value``, or ``0`` if ``value`` is zero. This matches the semantics of ``__builtin_ffs`` and CUDA's ``__ffs``. | ||
|
|
||
| **Parameters** | ||
|
|
||
| - ``value``: the unsigned integer value to scan. | ||
|
|
||
| **Return value** | ||
|
|
||
| - ``0`` if ``value`` is zero, otherwise the 1-based position of the least significant set bit. | ||
|
|
||
| **Constraints** | ||
|
|
||
| - ``T`` is an unsigned integral type. | ||
|
|
||
| .. note:: | ||
|
|
||
| Unlike ``cuda::std::countr_zero``, which returns the number of trailing zero bits, ``bit_ffs`` uses a 1-based position and is well defined for a zero input. | ||
|
|
||
| Example | ||
| ------- | ||
|
|
||
| .. code:: cuda | ||
|
|
||
| #include <cuda/bit> | ||
| #include <cuda/std/cassert> | ||
| #include <cuda/std/cstdint> | ||
|
|
||
| __global__ void bit_ffs_kernel() { | ||
| assert(cuda::bit_ffs(uint32_t{0}) == 0); | ||
| assert(cuda::bit_ffs(uint32_t{1}) == 1); | ||
| assert(cuda::bit_ffs(uint32_t{0b10101000}) == 4); | ||
| assert(cuda::bit_ffs(~uint32_t{0}) == 1); | ||
| } | ||
|
|
||
| int main() { | ||
| bit_ffs_kernel<<<1, 1>>>(); | ||
| cudaDeviceSynchronize(); | ||
| return 0; | ||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,127 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Part of libcu++, the C++ Standard Library for your entire system, | ||
| // under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #ifndef _CUDA___BIT_BIT_FFS_H | ||
| #define _CUDA___BIT_BIT_FFS_H | ||
|
|
||
| #include <cuda/std/detail/__config> | ||
|
|
||
| #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
| # pragma GCC system_header | ||
| #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
| # pragma clang system_header | ||
| #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
| # pragma system_header | ||
| #endif // no system header | ||
|
|
||
| #include <cuda/std/__bit/countr.h> | ||
| #include <cuda/std/__type_traits/is_unsigned_integer.h> | ||
| #include <cuda/std/cstdint> | ||
|
|
||
| #if _CCCL_COMPILER(MSVC) | ||
| # include <intrin.h> | ||
| #endif // _CCCL_COMPILER(MSVC) | ||
|
|
||
| #include <cuda/std/__cccl/prologue.h> | ||
|
|
||
| #if _CCCL_CHECK_BUILTIN(builtin_ffs) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) | ||
| # define _CCCL_BUILTIN_FFS(...) __builtin_ffs(__VA_ARGS__) | ||
| # define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__) | ||
| #endif // _CCCL_CHECK_BUILTIN(builtin_ffs) | ||
|
|
||
| // nvcc does not support __builtin_ffs in device code | ||
| #if _CCCL_CUDA_COMPILER(NVCC) && _CCCL_DEVICE_COMPILATION() | ||
| # undef _CCCL_BUILTIN_FFS | ||
| # undef _CCCL_BUILTIN_FFSLL | ||
| #endif // _CCCL_CUDA_COMPILER(NVCC) && _CCCL_DEVICE_COMPILATION() | ||
|
|
||
| _CCCL_BEGIN_NAMESPACE_CUDA | ||
|
|
||
| template <class _Tp> | ||
| [[nodiscard]] _CCCL_API constexpr int __bit_ffs_impl_constexpr(_Tp __value) noexcept | ||
| { | ||
| return (__value == _Tp{0}) ? 0 : ::cuda::std::countr_zero(__value) + 1; | ||
| } | ||
|
|
||
| #if !_CCCL_COMPILER(NVRTC) | ||
| template <class _Tp> | ||
| [[nodiscard]] _CCCL_HOST_API int __bit_ffs_impl_host(_Tp __value) noexcept | ||
| { | ||
| # if defined(_CCCL_BUILTIN_FFS) | ||
| if constexpr (sizeof(_Tp) <= sizeof(int)) | ||
| { | ||
| return _CCCL_BUILTIN_FFS(static_cast<int>(static_cast<unsigned>(__value))); | ||
| } | ||
| else | ||
| { | ||
| return _CCCL_BUILTIN_FFSLL(static_cast<long long>(static_cast<unsigned long long>(__value))); | ||
| } | ||
| # elif _CCCL_COMPILER(MSVC) | ||
| unsigned long __where{}; | ||
| unsigned char __found{}; | ||
| if constexpr (sizeof(_Tp) <= sizeof(uint32_t)) | ||
| { | ||
| __found = ::_BitScanForward(&__where, static_cast<uint32_t>(__value)); | ||
| } | ||
| else | ||
| { | ||
| __found = ::_BitScanForward64(&__where, static_cast<uint64_t>(__value)); | ||
| } | ||
| return __found ? static_cast<int>(__where) + 1 : 0; | ||
| # else | ||
| return ::cuda::__bit_ffs_impl_constexpr(__value); | ||
| # endif // _CCCL_BUILTIN_FFS | ||
| } | ||
| #endif // !_CCCL_COMPILER(NVRTC) | ||
|
|
||
| #if _CCCL_CUDA_COMPILATION() | ||
| template <class _Tp> | ||
| [[nodiscard]] _CCCL_DEVICE_API int __bit_ffs_impl_device(_Tp __value) noexcept | ||
| { | ||
| if constexpr (sizeof(_Tp) <= sizeof(int)) | ||
| { | ||
| return ::__ffs(static_cast<int>(static_cast<unsigned>(__value))); | ||
| } | ||
| else | ||
| { | ||
| return ::__ffsll(static_cast<long long>(static_cast<unsigned long long>(__value))); | ||
| } | ||
| } | ||
| #endif // _CCCL_CUDA_COMPILATION() | ||
|
|
||
| // Returns one plus the index of the least significant set bit of __value, or 0 if __value is zero. | ||
| // This matches the semantics of __builtin_ffs and CUDA's __ffs. Unlike cuda::std::countr_zero, the | ||
| // result is 1-based and the zero input is well defined (it returns 0). | ||
| _CCCL_TEMPLATE(class _Tp) | ||
| _CCCL_REQUIRES(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>) | ||
| [[nodiscard]] _CCCL_API constexpr int bit_ffs(_Tp __value) noexcept | ||
| { | ||
| if constexpr (sizeof(_Tp) <= sizeof(unsigned long long)) | ||
| { | ||
| #if !_CCCL_TILE_COMPILATION() // error: asm statement is unsupported in tile code | ||
| _CCCL_IF_NOT_CONSTEVAL_DEFAULT | ||
| { | ||
| NV_IF_ELSE_TARGET( | ||
| NV_IS_HOST, (return ::cuda::__bit_ffs_impl_host(__value);), (return ::cuda::__bit_ffs_impl_device(__value);)) | ||
| } | ||
| #endif // !_CCCL_TILE_COMPILATION() | ||
| return ::cuda::__bit_ffs_impl_constexpr(__value); | ||
| } | ||
| else | ||
| { | ||
| return ::cuda::__bit_ffs_impl_constexpr(__value); | ||
| } | ||
| } | ||
|
|
||
| _CCCL_END_NAMESPACE_CUDA | ||
|
|
||
| #include <cuda/std/__cccl/epilogue.h> | ||
|
|
||
| #endif // _CUDA___BIT_BIT_FFS_H |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #include <cuda/bit> | ||
| #include <cuda/std/cassert> | ||
| #include <cuda/std/cstdint> | ||
| #include <cuda/std/type_traits> | ||
|
|
||
| #include "test_macros.h" | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
|
|
||
| // bit_ffs must only accept unsigned integer types, not bool, signed integers, floating point, or enums. | ||
| enum class Enum | ||
| { | ||
| value | ||
| }; | ||
|
|
||
| template <class T> | ||
| _CCCL_CONCEPT can_bit_ffs = _CCCL_REQUIRES_EXPR((T), T value)(cuda::bit_ffs(value)); | ||
|
|
||
| static_assert(can_bit_ffs<unsigned char>); | ||
| static_assert(can_bit_ffs<unsigned int>); | ||
| static_assert(can_bit_ffs<unsigned long long>); | ||
| static_assert(!can_bit_ffs<bool>); | ||
| static_assert(!can_bit_ffs<int>); | ||
| static_assert(!can_bit_ffs<float>); | ||
| static_assert(!can_bit_ffs<Enum>); | ||
|
|
||
| template <typename T> | ||
| TEST_FUNC constexpr void test() | ||
| { | ||
| using nl = cuda::std::numeric_limits<T>; | ||
| [[maybe_unused]] constexpr T all_ones = static_cast<T>(~T{0}); | ||
|
|
||
| // a zero input is well defined and returns 0 | ||
| assert(cuda::bit_ffs(T{0}) == 0); | ||
| // the least significant bit set maps to position 1 | ||
| assert(cuda::bit_ffs(T{1}) == 1); | ||
| assert(cuda::bit_ffs(all_ones) == 1); | ||
| // a single bit set at position k maps to result k + 1 | ||
| assert(cuda::bit_ffs(static_cast<T>(T{1} << 1)) == 2); | ||
| assert(cuda::bit_ffs(static_cast<T>(T{1} << 3)) == 4); | ||
| assert(cuda::bit_ffs(static_cast<T>(T{1} << (nl::digits - 1))) == nl::digits); | ||
| // the lowest set bit wins when several bits are set | ||
| assert(cuda::bit_ffs(static_cast<T>(0b10101000)) == 4); | ||
| } | ||
|
|
||
| TEST_FUNC constexpr bool test() | ||
| { | ||
| test<unsigned char>(); | ||
| test<unsigned short>(); | ||
| test<unsigned>(); | ||
| test<unsigned long>(); | ||
| test<unsigned long long>(); | ||
|
|
||
| test<uint8_t>(); | ||
| test<uint16_t>(); | ||
| test<uint32_t>(); | ||
| test<uint64_t>(); | ||
| test<size_t>(); | ||
| test<uintmax_t>(); | ||
| test<uintptr_t>(); | ||
|
|
||
| #if _CCCL_HAS_INT128() | ||
| test<__uint128_t>(); | ||
| #endif // _CCCL_HAS_INT128() | ||
| return true; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a test or two that we cannot invoke the function with something like: template<typename T>
_CCCL_CONCEPT can_bit_ffs = _CCCL_REQUIRES_EXPR((T), T val)((cuda::bit_ffs(val)); |
||
| } | ||
|
|
||
| int main(int, char**) | ||
| { | ||
| assert(test()); | ||
| static_assert(test()); | ||
| return 0; | ||
| } | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would also add an example for all bit set