-
Notifications
You must be signed in to change notification settings - Fork 419
Add cuda::bit_ffs to <cuda/bit> #9609
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,55 @@ | ||
| .. _libcudacxx-extended-api-bit-bit_ffs: | ||
|
|
||
| ``cuda::bit_ffs`` | ||
| ================= | ||
|
|
||
| Defined in the ``<cuda/bit>`` header. | ||
|
|
||
| .. code:: cuda | ||
|
|
||
| namespace cuda { | ||
|
|
||
| template <typename T> | ||
| [[nodiscard]] __host__ __device__ constexpr | ||
| int bit_ffs(T value) noexcept; | ||
|
|
||
| } // namespace cuda | ||
|
|
||
| The function returns one plus the index of the least significant set bit of ``value``, or ``0`` if ``value`` is zero. This matches the semantics of ``__builtin_ffs`` and CUDA's ``__ffs``. | ||
|
|
||
| **Parameters** | ||
|
|
||
| - ``value``: the unsigned integer value to scan. | ||
|
|
||
| **Return value** | ||
|
|
||
| - ``0`` if ``value`` is zero, otherwise the 1-based position of the least significant set bit. | ||
|
|
||
| **Constraints** | ||
|
|
||
| - ``T`` is an unsigned integral type. | ||
|
|
||
| .. note:: | ||
|
|
||
| Unlike ``cuda::std::countr_zero``, which returns the number of trailing zero bits, ``bit_ffs`` uses a 1-based position and is well defined for a zero input. | ||
|
|
||
| Example | ||
| ------- | ||
|
|
||
| .. code:: cuda | ||
|
|
||
| #include <cuda/bit> | ||
| #include <cuda/std/cassert> | ||
| #include <cuda/std/cstdint> | ||
|
|
||
| __global__ void bit_ffs_kernel() { | ||
| assert(cuda::bit_ffs(uint32_t{0}) == 0); | ||
| assert(cuda::bit_ffs(uint32_t{1}) == 1); | ||
| assert(cuda::bit_ffs(uint32_t{0b10101000}) == 4); | ||
| } | ||
|
|
||
| int main() { | ||
| bit_ffs_kernel<<<1, 1>>>(); | ||
| cudaDeviceSynchronize(); | ||
| return 0; | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
| //===----------------------------------------------------------------------===// | ||
| // | ||
| // Part of libcu++, the C++ Standard Library for your entire system, | ||
| // under the Apache License v2.0 with LLVM Exceptions. | ||
| // See https://llvm.org/LICENSE.txt for license information. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. | ||
| // | ||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| #ifndef _CUDA___BIT_BIT_FFS_H | ||
| #define _CUDA___BIT_BIT_FFS_H | ||
|
|
||
| #include <cuda/std/detail/__config> | ||
|
|
||
| #if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) | ||
| # pragma GCC system_header | ||
| #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) | ||
| # pragma clang system_header | ||
| #elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) | ||
| # pragma system_header | ||
| #endif // no system header | ||
|
|
||
| #include <cuda/std/__bit/countr.h> | ||
| #include <cuda/std/__type_traits/is_unsigned_integer.h> | ||
|
|
||
| #include <cuda/std/__cccl/prologue.h> | ||
|
|
||
| _CCCL_BEGIN_NAMESPACE_CUDA | ||
|
|
||
| // Returns one plus the index of the least significant set bit of __value, or 0 if __value is zero. | ||
| // This matches the semantics of __builtin_ffs and CUDA's __ffs. Unlike cuda::std::countr_zero, the | ||
| // result is 1-based and the zero input is well defined (it returns 0). | ||
| _CCCL_TEMPLATE(class _Tp) | ||
| _CCCL_REQUIRES(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>) | ||
| [[nodiscard]] _CCCL_API constexpr int bit_ffs(_Tp __value) noexcept | ||
| { | ||
| return (__value == _Tp{0}) ? 0 : ::cuda::std::countr_zero(__value) + 1; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the implementation can be optimized. please make sure that the implementation produces the same code of |
||
| } | ||
|
|
||
| _CCCL_END_NAMESPACE_CUDA | ||
|
|
||
| #include <cuda/std/__cccl/epilogue.h> | ||
|
|
||
| #endif // _CUDA___BIT_BIT_FFS_H | ||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,66 @@ | ||||||
| //===----------------------------------------------------------------------===// | ||||||
| // | ||||||
| // Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. | ||||||
| // See https://llvm.org/LICENSE.txt for license information. | ||||||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
| // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. | ||||||
| // | ||||||
| //===----------------------------------------------------------------------===// | ||||||
|
|
||||||
| #include <cuda/bit> | ||||||
| #include <cuda/std/cassert> | ||||||
| #include <cuda/std/cstdint> | ||||||
| #include <cuda/std/type_traits> | ||||||
|
|
||||||
| #include "test_macros.h" | ||||||
|
coderabbitai[bot] marked this conversation as resolved.
|
||||||
|
|
||||||
| template <typename T> | ||||||
| TEST_FUNC constexpr bool test() | ||||||
| { | ||||||
| using nl = cuda::std::numeric_limits<T>; | ||||||
| constexpr T all_ones = static_cast<T>(~T{0}); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
|
|
||||||
| // a zero input is well defined and returns 0 | ||||||
| static_assert(cuda::bit_ffs(T{0}) == 0); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: All those should only be
Suggested change
We are invoking the function in a constexpr context, so otherwise we would never test the runtime path |
||||||
| // the least significant bit set maps to position 1 | ||||||
| static_assert(cuda::bit_ffs(T{1}) == 1); | ||||||
| static_assert(cuda::bit_ffs(all_ones) == 1); | ||||||
| // a single bit set at position k maps to result k + 1 | ||||||
| static_assert(cuda::bit_ffs(static_cast<T>(T{1} << 1)) == 2); | ||||||
| static_assert(cuda::bit_ffs(static_cast<T>(T{1} << 3)) == 4); | ||||||
| static_assert(cuda::bit_ffs(static_cast<T>(T{1} << (nl::digits - 1))) == nl::digits); | ||||||
| // the lowest set bit wins when several bits are set | ||||||
| static_assert(cuda::bit_ffs(static_cast<T>(0b10101000)) == 4); | ||||||
|
|
||||||
| unused(all_ones); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
| return true; | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: This function is only called from the |
||||||
| } | ||||||
|
|
||||||
| TEST_FUNC constexpr bool test() | ||||||
| { | ||||||
| test<unsigned char>(); | ||||||
| test<unsigned short>(); | ||||||
| test<unsigned>(); | ||||||
| test<unsigned long>(); | ||||||
| test<unsigned long long>(); | ||||||
|
|
||||||
| test<uint8_t>(); | ||||||
| test<uint16_t>(); | ||||||
| test<uint32_t>(); | ||||||
| test<uint64_t>(); | ||||||
| test<size_t>(); | ||||||
| test<uintmax_t>(); | ||||||
| test<uintptr_t>(); | ||||||
|
|
||||||
| #if _CCCL_HAS_INT128() | ||||||
| test<__uint128_t>(); | ||||||
| #endif // _CCCL_HAS_INT128() | ||||||
| return true; | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please add a test or two that we cannot invoke the function with something like: template<typename T>
_CCCL_CONCEPT can_bit_ffs = _CCCL_REQUIRES_EXPR((T), T val)((cuda::bit_ffs(val)); |
||||||
| } | ||||||
|
|
||||||
| int main(int, char**) | ||||||
| { | ||||||
| assert(test()); | ||||||
| static_assert(test()); | ||||||
| return 0; | ||||||
| } | ||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would also add an example for all bit set