From 6395197ca9830eb238a9eaa592022b20464e1beb Mon Sep 17 00:00:00 2001 From: temujinkz Date: Fri, 26 Jun 2026 13:32:24 +0500 Subject: [PATCH 1/3] Add cuda::bit_ffs to Implements a type-safe find-first-set that returns 1 + the index of the least significant set bit, or 0 for a zero input, matching __builtin_ffs and CUDA's __ffs (unlike countr_zero). Adds tests and docs. Closes #6108 Signed-off-by: temujinkz --- docs/libcudacxx/extended_api/bit.rst | 6 ++ docs/libcudacxx/extended_api/bit/bit_ffs.rst | 55 ++++++++++++++++ libcudacxx/include/cuda/__bit/bit_ffs.h | 45 +++++++++++++ libcudacxx/include/cuda/bit | 1 + .../test/libcudacxx/cuda/bit/bit_ffs.pass.cpp | 66 +++++++++++++++++++ 5 files changed, 173 insertions(+) create mode 100644 docs/libcudacxx/extended_api/bit/bit_ffs.rst create mode 100644 libcudacxx/include/cuda/__bit/bit_ffs.h create mode 100644 libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp diff --git a/docs/libcudacxx/extended_api/bit.rst b/docs/libcudacxx/extended_api/bit.rst index 521a663c1fc..85455fa8e2f 100644 --- a/docs/libcudacxx/extended_api/bit.rst +++ b/docs/libcudacxx/extended_api/bit.rst @@ -8,6 +8,7 @@ Bit :maxdepth: 1 bit/bitmask + bit/bit_ffs bit/bit_reverse bit/bitfield_insert bit/bitfield_extract @@ -26,6 +27,11 @@ Bit - CCCL 3.0.0 - CUDA 13.0 + * - :ref:`bit_ffs ` + - Find the first (least significant) set bit + - CCCL 3.5.0 + - CUDA 13.x + * - :ref:`bit_reverse ` - Reverse the order of bits - CCCL 3.0.0 diff --git a/docs/libcudacxx/extended_api/bit/bit_ffs.rst b/docs/libcudacxx/extended_api/bit/bit_ffs.rst new file mode 100644 index 00000000000..d2920331302 --- /dev/null +++ b/docs/libcudacxx/extended_api/bit/bit_ffs.rst @@ -0,0 +1,55 @@ +.. _libcudacxx-extended-api-bit-bit_ffs: + +``cuda::bit_ffs`` +================= + +Defined in the ```` header. + +.. code:: cuda + + namespace cuda { + + template + [[nodiscard]] __host__ __device__ constexpr + int bit_ffs(T value) noexcept; + + } // namespace cuda + +The function returns one plus the index of the least significant set bit of ``value``, or ``0`` if ``value`` is zero. This matches the semantics of ``__builtin_ffs`` and CUDA's ``__ffs``. + +**Parameters** + +- ``value``: the unsigned integer value to scan. + +**Return value** + +- ``0`` if ``value`` is zero, otherwise the 1-based position of the least significant set bit. + +**Constraints** + +- ``T`` is an unsigned integral type. + +.. note:: + + Unlike ``cuda::std::countr_zero``, which returns the number of trailing zero bits, ``bit_ffs`` uses a 1-based position and is well defined for a zero input. + +Example +------- + +.. code:: cuda + + #include + #include + #include + + __global__ void bit_ffs_kernel() { + assert(cuda::bit_ffs(uint32_t{0}) == 0); + assert(cuda::bit_ffs(uint32_t{1}) == 1); + assert(cuda::bit_ffs(uint32_t{0b10101000}) == 4); + } + + int main() { + bit_ffs_kernel<<<1, 1>>>(); + cudaDeviceSynchronize(); + return 0; + } diff --git a/libcudacxx/include/cuda/__bit/bit_ffs.h b/libcudacxx/include/cuda/__bit/bit_ffs.h new file mode 100644 index 00000000000..f04a94c3232 --- /dev/null +++ b/libcudacxx/include/cuda/__bit/bit_ffs.h @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA___BIT_BIT_FFS_H +#define _CUDA___BIT_BIT_FFS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +// Returns one plus the index of the least significant set bit of __value, or 0 if __value is zero. +// This matches the semantics of __builtin_ffs and CUDA's __ffs. Unlike cuda::std::countr_zero, the +// result is 1-based and the zero input is well defined (it returns 0). +_CCCL_TEMPLATE(class _Tp) +_CCCL_REQUIRES(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>) +[[nodiscard]] _CCCL_API constexpr int bit_ffs(_Tp __value) noexcept +{ + return (__value == _Tp{0}) ? 0 : ::cuda::std::countr_zero(__value) + 1; +} + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // _CUDA___BIT_BIT_FFS_H diff --git a/libcudacxx/include/cuda/bit b/libcudacxx/include/cuda/bit index 6e7325fef87..13922404cc0 100644 --- a/libcudacxx/include/cuda/bit +++ b/libcudacxx/include/cuda/bit @@ -21,6 +21,7 @@ # pragma system_header #endif // no system header +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp new file mode 100644 index 00000000000..04018bd25f4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp @@ -0,0 +1,66 @@ +//===----------------------------------------------------------------------===// +// +// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "test_macros.h" + +template +TEST_FUNC constexpr bool test() +{ + using nl = cuda::std::numeric_limits; + constexpr T all_ones = static_cast(~T{0}); + + // a zero input is well defined and returns 0 + static_assert(cuda::bit_ffs(T{0}) == 0); + // the least significant bit set maps to position 1 + static_assert(cuda::bit_ffs(T{1}) == 1); + static_assert(cuda::bit_ffs(all_ones) == 1); + // a single bit set at position k maps to result k + 1 + static_assert(cuda::bit_ffs(static_cast(T{1} << 1)) == 2); + static_assert(cuda::bit_ffs(static_cast(T{1} << 3)) == 4); + static_assert(cuda::bit_ffs(static_cast(T{1} << (nl::digits - 1))) == nl::digits); + // the lowest set bit wins when several bits are set + static_assert(cuda::bit_ffs(static_cast(0b10101000)) == 4); + + unused(all_ones); + return true; +} + +TEST_FUNC constexpr bool test() +{ + test(); + test(); + test(); + test(); + test(); + + test(); + test(); + test(); + test(); + test(); + test(); + test(); + +#if _CCCL_HAS_INT128() + test<__uint128_t>(); +#endif // _CCCL_HAS_INT128() + return true; +} + +int main(int, char**) +{ + assert(test()); + static_assert(test()); + return 0; +} From bfb8bd14baaacfafdc7863fa585e8cdb2ab928ec Mon Sep 17 00:00:00 2001 From: temujinkz Date: Sat, 27 Jun 2026 13:35:41 +0500 Subject: [PATCH 2/3] bit_ffs: address review (ffs builtins, constexpr tests, trait checks) Review feedback from @miscco and @fbusato on #9609: - route through __builtin_ffs / __builtin_ffsll on host and __ffs / __ffsll on device (constexpr fallback) so codegen matches __ffs / __ffsll - test: use assert instead of static_assert so the runtime path is exercised, mark the per-type helper void, [[maybe_unused]] the constant - test: add a concept check that bit_ffs rejects bool, signed, float, and enums - docs: add an all-bits-set example Signed-off-by: temujinkz --- docs/libcudacxx/extended_api/bit/bit_ffs.rst | 1 + libcudacxx/include/cuda/__bit/bit_ffs.h | 84 ++++++++++++++++++- .../test/libcudacxx/cuda/bit/bit_ffs.pass.cpp | 40 ++++++--- 3 files changed, 111 insertions(+), 14 deletions(-) diff --git a/docs/libcudacxx/extended_api/bit/bit_ffs.rst b/docs/libcudacxx/extended_api/bit/bit_ffs.rst index d2920331302..5636cdc5a1e 100644 --- a/docs/libcudacxx/extended_api/bit/bit_ffs.rst +++ b/docs/libcudacxx/extended_api/bit/bit_ffs.rst @@ -46,6 +46,7 @@ Example assert(cuda::bit_ffs(uint32_t{0}) == 0); assert(cuda::bit_ffs(uint32_t{1}) == 1); assert(cuda::bit_ffs(uint32_t{0b10101000}) == 4); + assert(cuda::bit_ffs(~uint32_t{0}) == 1); } int main() { diff --git a/libcudacxx/include/cuda/__bit/bit_ffs.h b/libcudacxx/include/cuda/__bit/bit_ffs.h index f04a94c3232..2673996518b 100644 --- a/libcudacxx/include/cuda/__bit/bit_ffs.h +++ b/libcudacxx/include/cuda/__bit/bit_ffs.h @@ -23,11 +23,79 @@ #include #include +#include + +#if _CCCL_COMPILER(MSVC) +# include +#endif // _CCCL_COMPILER(MSVC) #include +#if _CCCL_CHECK_BUILTIN(builtin_ffs) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC) +# define _CCCL_BUILTIN_FFS(...) __builtin_ffs(__VA_ARGS__) +# define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__) +#endif // _CCCL_CHECK_BUILTIN(builtin_ffs) + +// nvcc does not support __builtin_ffs in device code +#if _CCCL_CUDA_COMPILER(NVCC) && _CCCL_DEVICE_COMPILATION() +# undef _CCCL_BUILTIN_FFS +# undef _CCCL_BUILTIN_FFSLL +#endif // _CCCL_CUDA_COMPILER(NVCC) && _CCCL_DEVICE_COMPILATION() + _CCCL_BEGIN_NAMESPACE_CUDA +template +[[nodiscard]] _CCCL_API constexpr int __bit_ffs_impl_constexpr(_Tp __value) noexcept +{ + return (__value == _Tp{0}) ? 0 : ::cuda::std::countr_zero(__value) + 1; +} + +#if !_CCCL_COMPILER(NVRTC) +template +[[nodiscard]] _CCCL_HOST_API int __bit_ffs_impl_host(_Tp __value) noexcept +{ +# if defined(_CCCL_BUILTIN_FFS) + if constexpr (sizeof(_Tp) <= sizeof(int)) + { + return _CCCL_BUILTIN_FFS(static_cast(static_cast(__value))); + } + else + { + return _CCCL_BUILTIN_FFSLL(static_cast(static_cast(__value))); + } +# elif _CCCL_COMPILER(MSVC) + unsigned long __where{}; + unsigned char __found{}; + if constexpr (sizeof(_Tp) <= sizeof(uint32_t)) + { + __found = ::_BitScanForward(&__where, static_cast(__value)); + } + else + { + __found = ::_BitScanForward64(&__where, static_cast(__value)); + } + return __found ? static_cast(__where) + 1 : 0; +# else + return ::cuda::__bit_ffs_impl_constexpr(__value); +# endif // _CCCL_BUILTIN_FFS +} +#endif // !_CCCL_COMPILER(NVRTC) + +#if _CCCL_CUDA_COMPILATION() +template +[[nodiscard]] _CCCL_DEVICE_API int __bit_ffs_impl_device(_Tp __value) noexcept +{ + if constexpr (sizeof(_Tp) <= sizeof(int)) + { + return ::__ffs(static_cast(static_cast(__value))); + } + else + { + return ::__ffsll(static_cast(static_cast(__value))); + } +} +#endif // _CCCL_CUDA_COMPILATION() + // Returns one plus the index of the least significant set bit of __value, or 0 if __value is zero. // This matches the semantics of __builtin_ffs and CUDA's __ffs. Unlike cuda::std::countr_zero, the // result is 1-based and the zero input is well defined (it returns 0). @@ -35,7 +103,21 @@ _CCCL_TEMPLATE(class _Tp) _CCCL_REQUIRES(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>) [[nodiscard]] _CCCL_API constexpr int bit_ffs(_Tp __value) noexcept { - return (__value == _Tp{0}) ? 0 : ::cuda::std::countr_zero(__value) + 1; + if constexpr (sizeof(_Tp) <= sizeof(unsigned long long)) + { +#if !_CCCL_TILE_COMPILATION() // error: asm statement is unsupported in tile code + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + NV_IF_ELSE_TARGET( + NV_IS_HOST, (return ::cuda::__bit_ffs_impl_host(__value);), (return ::cuda::__bit_ffs_impl_device(__value);)) + } +#endif // !_CCCL_TILE_COMPILATION() + return ::cuda::__bit_ffs_impl_constexpr(__value); + } + else + { + return ::cuda::__bit_ffs_impl_constexpr(__value); + } } _CCCL_END_NAMESPACE_CUDA diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp index 04018bd25f4..9dd32ad401f 100644 --- a/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp @@ -14,26 +14,40 @@ #include "test_macros.h" +// bit_ffs must only accept unsigned integer types, not bool, signed integers, floating point, or enums. +enum class Enum +{ + value +}; + +template +_CCCL_CONCEPT can_bit_ffs = _CCCL_REQUIRES_EXPR((T), T value)(cuda::bit_ffs(value)); + +static_assert(can_bit_ffs); +static_assert(can_bit_ffs); +static_assert(can_bit_ffs); +static_assert(!can_bit_ffs); +static_assert(!can_bit_ffs); +static_assert(!can_bit_ffs); +static_assert(!can_bit_ffs); + template -TEST_FUNC constexpr bool test() +TEST_FUNC constexpr void test() { - using nl = cuda::std::numeric_limits; - constexpr T all_ones = static_cast(~T{0}); + using nl = cuda::std::numeric_limits; + [[maybe_unused]] constexpr T all_ones = static_cast(~T{0}); // a zero input is well defined and returns 0 - static_assert(cuda::bit_ffs(T{0}) == 0); + assert(cuda::bit_ffs(T{0}) == 0); // the least significant bit set maps to position 1 - static_assert(cuda::bit_ffs(T{1}) == 1); - static_assert(cuda::bit_ffs(all_ones) == 1); + assert(cuda::bit_ffs(T{1}) == 1); + assert(cuda::bit_ffs(all_ones) == 1); // a single bit set at position k maps to result k + 1 - static_assert(cuda::bit_ffs(static_cast(T{1} << 1)) == 2); - static_assert(cuda::bit_ffs(static_cast(T{1} << 3)) == 4); - static_assert(cuda::bit_ffs(static_cast(T{1} << (nl::digits - 1))) == nl::digits); + assert(cuda::bit_ffs(static_cast(T{1} << 1)) == 2); + assert(cuda::bit_ffs(static_cast(T{1} << 3)) == 4); + assert(cuda::bit_ffs(static_cast(T{1} << (nl::digits - 1))) == nl::digits); // the lowest set bit wins when several bits are set - static_assert(cuda::bit_ffs(static_cast(0b10101000)) == 4); - - unused(all_ones); - return true; + assert(cuda::bit_ffs(static_cast(0b10101000)) == 4); } TEST_FUNC constexpr bool test() From 9833415c819dad7fc79b7e80b1eeb5e269b84a62 Mon Sep 17 00:00:00 2001 From: temujinkz Date: Sat, 27 Jun 2026 16:37:03 +0500 Subject: [PATCH 3/3] bit_ffs: include and directly Per the IWYU coding guideline (and CodeRabbit), the test used numeric_limits and size_t through transitive includes. Add the direct headers. Signed-off-by: temujinkz --- libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp b/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp index 9dd32ad401f..17c8f965224 100644 --- a/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp @@ -9,7 +9,9 @@ #include #include +#include #include +#include #include #include "test_macros.h"