Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/libcudacxx/extended_api/bit.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Bit
:maxdepth: 1

bit/bitmask
bit/bit_ffs
bit/bit_reverse
bit/bitfield_insert
bit/bitfield_extract
Expand All @@ -26,6 +27,11 @@ Bit
- CCCL 3.0.0
- CUDA 13.0

* - :ref:`bit_ffs <libcudacxx-extended-api-bit-bit_ffs>`
- Find the first (least significant) set bit
- CCCL 3.5.0
- CUDA 13.x

* - :ref:`bit_reverse <libcudacxx-extended-api-bit-bit_reverse>`
- Reverse the order of bits
- CCCL 3.0.0
Expand Down
56 changes: 56 additions & 0 deletions docs/libcudacxx/extended_api/bit/bit_ffs.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
.. _libcudacxx-extended-api-bit-bit_ffs:

``cuda::bit_ffs``
=================

Defined in the ``<cuda/bit>`` header.

.. code:: cuda

namespace cuda {

template <typename T>
[[nodiscard]] __host__ __device__ constexpr
int bit_ffs(T value) noexcept;

} // namespace cuda

The function returns one plus the index of the least significant set bit of ``value``, or ``0`` if ``value`` is zero. This matches the semantics of ``__builtin_ffs`` and CUDA's ``__ffs``.

**Parameters**

- ``value``: the unsigned integer value to scan.

**Return value**

- ``0`` if ``value`` is zero, otherwise the 1-based position of the least significant set bit.

**Constraints**

- ``T`` is an unsigned integral type.

.. note::

Unlike ``cuda::std::countr_zero``, which returns the number of trailing zero bits, ``bit_ffs`` uses a 1-based position and is well defined for a zero input.

Example
-------

.. code:: cuda

#include <cuda/bit>
#include <cuda/std/cassert>
#include <cuda/std/cstdint>

__global__ void bit_ffs_kernel() {
assert(cuda::bit_ffs(uint32_t{0}) == 0);
assert(cuda::bit_ffs(uint32_t{1}) == 1);
assert(cuda::bit_ffs(uint32_t{0b10101000}) == 4);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would also add an example for all bit set

assert(cuda::bit_ffs(~uint32_t{0}) == 1);
}

int main() {
bit_ffs_kernel<<<1, 1>>>();
cudaDeviceSynchronize();
return 0;
}
127 changes: 127 additions & 0 deletions libcudacxx/include/cuda/__bit/bit_ffs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___BIT_BIT_FFS_H
#define _CUDA___BIT_BIT_FFS_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__bit/countr.h>
#include <cuda/std/__type_traits/is_unsigned_integer.h>
#include <cuda/std/cstdint>

#if _CCCL_COMPILER(MSVC)
# include <intrin.h>
#endif // _CCCL_COMPILER(MSVC)

#include <cuda/std/__cccl/prologue.h>

#if _CCCL_CHECK_BUILTIN(builtin_ffs) || _CCCL_COMPILER(GCC) || _CCCL_COMPILER(CLANG) || _CCCL_COMPILER(NVHPC)
# define _CCCL_BUILTIN_FFS(...) __builtin_ffs(__VA_ARGS__)
# define _CCCL_BUILTIN_FFSLL(...) __builtin_ffsll(__VA_ARGS__)
#endif // _CCCL_CHECK_BUILTIN(builtin_ffs)

// nvcc does not support __builtin_ffs in device code
#if _CCCL_CUDA_COMPILER(NVCC) && _CCCL_DEVICE_COMPILATION()
# undef _CCCL_BUILTIN_FFS
# undef _CCCL_BUILTIN_FFSLL
#endif // _CCCL_CUDA_COMPILER(NVCC) && _CCCL_DEVICE_COMPILATION()

_CCCL_BEGIN_NAMESPACE_CUDA

template <class _Tp>
[[nodiscard]] _CCCL_API constexpr int __bit_ffs_impl_constexpr(_Tp __value) noexcept
{
return (__value == _Tp{0}) ? 0 : ::cuda::std::countr_zero(__value) + 1;
}

#if !_CCCL_COMPILER(NVRTC)
template <class _Tp>
[[nodiscard]] _CCCL_HOST_API int __bit_ffs_impl_host(_Tp __value) noexcept
{
# if defined(_CCCL_BUILTIN_FFS)
if constexpr (sizeof(_Tp) <= sizeof(int))
{
return _CCCL_BUILTIN_FFS(static_cast<int>(static_cast<unsigned>(__value)));
}
else
{
return _CCCL_BUILTIN_FFSLL(static_cast<long long>(static_cast<unsigned long long>(__value)));
}
# elif _CCCL_COMPILER(MSVC)
unsigned long __where{};
unsigned char __found{};
if constexpr (sizeof(_Tp) <= sizeof(uint32_t))
{
__found = ::_BitScanForward(&__where, static_cast<uint32_t>(__value));
}
else
{
__found = ::_BitScanForward64(&__where, static_cast<uint64_t>(__value));
}
return __found ? static_cast<int>(__where) + 1 : 0;
# else
return ::cuda::__bit_ffs_impl_constexpr(__value);
# endif // _CCCL_BUILTIN_FFS
}
#endif // !_CCCL_COMPILER(NVRTC)

#if _CCCL_CUDA_COMPILATION()
template <class _Tp>
[[nodiscard]] _CCCL_DEVICE_API int __bit_ffs_impl_device(_Tp __value) noexcept
{
if constexpr (sizeof(_Tp) <= sizeof(int))
{
return ::__ffs(static_cast<int>(static_cast<unsigned>(__value)));
}
else
{
return ::__ffsll(static_cast<long long>(static_cast<unsigned long long>(__value)));
}
}
#endif // _CCCL_CUDA_COMPILATION()

// Returns one plus the index of the least significant set bit of __value, or 0 if __value is zero.
// This matches the semantics of __builtin_ffs and CUDA's __ffs. Unlike cuda::std::countr_zero, the
// result is 1-based and the zero input is well defined (it returns 0).
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>)
[[nodiscard]] _CCCL_API constexpr int bit_ffs(_Tp __value) noexcept
{
if constexpr (sizeof(_Tp) <= sizeof(unsigned long long))
{
#if !_CCCL_TILE_COMPILATION() // error: asm statement is unsupported in tile code
_CCCL_IF_NOT_CONSTEVAL_DEFAULT
{
NV_IF_ELSE_TARGET(
NV_IS_HOST, (return ::cuda::__bit_ffs_impl_host(__value);), (return ::cuda::__bit_ffs_impl_device(__value);))
}
#endif // !_CCCL_TILE_COMPILATION()
return ::cuda::__bit_ffs_impl_constexpr(__value);
}
else
{
return ::cuda::__bit_ffs_impl_constexpr(__value);
}
}

_CCCL_END_NAMESPACE_CUDA

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA___BIT_BIT_FFS_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/bit
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# pragma system_header
#endif // no system header

#include <cuda/__bit/bit_ffs.h>
#include <cuda/__bit/bit_reverse.h>
#include <cuda/__bit/bitfield.h>
#include <cuda/__bit/bitmask.h>
Expand Down
80 changes: 80 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <cuda/bit>
#include <cuda/std/cassert>
#include <cuda/std/cstdint>
#include <cuda/std/type_traits>

#include "test_macros.h"
Comment thread
coderabbitai[bot] marked this conversation as resolved.

// bit_ffs must only accept unsigned integer types, not bool, signed integers, floating point, or enums.
enum class Enum
{
value
};

template <class T>
_CCCL_CONCEPT can_bit_ffs = _CCCL_REQUIRES_EXPR((T), T value)(cuda::bit_ffs(value));

static_assert(can_bit_ffs<unsigned char>);
static_assert(can_bit_ffs<unsigned int>);
static_assert(can_bit_ffs<unsigned long long>);
static_assert(!can_bit_ffs<bool>);
static_assert(!can_bit_ffs<int>);
static_assert(!can_bit_ffs<float>);
static_assert(!can_bit_ffs<Enum>);

template <typename T>
TEST_FUNC constexpr void test()
{
using nl = cuda::std::numeric_limits<T>;
[[maybe_unused]] constexpr T all_ones = static_cast<T>(~T{0});

// a zero input is well defined and returns 0
assert(cuda::bit_ffs(T{0}) == 0);
// the least significant bit set maps to position 1
assert(cuda::bit_ffs(T{1}) == 1);
assert(cuda::bit_ffs(all_ones) == 1);
// a single bit set at position k maps to result k + 1
assert(cuda::bit_ffs(static_cast<T>(T{1} << 1)) == 2);
assert(cuda::bit_ffs(static_cast<T>(T{1} << 3)) == 4);
assert(cuda::bit_ffs(static_cast<T>(T{1} << (nl::digits - 1))) == nl::digits);
// the lowest set bit wins when several bits are set
assert(cuda::bit_ffs(static_cast<T>(0b10101000)) == 4);
}

TEST_FUNC constexpr bool test()
{
test<unsigned char>();
test<unsigned short>();
test<unsigned>();
test<unsigned long>();
test<unsigned long long>();

test<uint8_t>();
test<uint16_t>();
test<uint32_t>();
test<uint64_t>();
test<size_t>();
test<uintmax_t>();
test<uintptr_t>();

#if _CCCL_HAS_INT128()
test<__uint128_t>();
#endif // _CCCL_HAS_INT128()
return true;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a test or two that we cannot invoke the function with bool, a signed integer or float and an enumeration

something like:

template<typename T>
_CCCL_CONCEPT can_bit_ffs = _CCCL_REQUIRES_EXPR((T), T val)((cuda::bit_ffs(val));

}

int main(int, char**)
{
assert(test());
static_assert(test());
return 0;
}