Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/libcudacxx/extended_api/bit.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Bit
:maxdepth: 1

bit/bitmask
bit/bit_ffs
bit/bit_reverse
bit/bitfield_insert
bit/bitfield_extract
Expand All @@ -26,6 +27,11 @@ Bit
- CCCL 3.0.0
- CUDA 13.0

* - :ref:`bit_ffs <libcudacxx-extended-api-bit-bit_ffs>`
- Find the first (least significant) set bit
- CCCL 3.5.0
- CUDA 13.x

* - :ref:`bit_reverse <libcudacxx-extended-api-bit-bit_reverse>`
- Reverse the order of bits
- CCCL 3.0.0
Expand Down
55 changes: 55 additions & 0 deletions docs/libcudacxx/extended_api/bit/bit_ffs.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
.. _libcudacxx-extended-api-bit-bit_ffs:

``cuda::bit_ffs``
=================

Defined in the ``<cuda/bit>`` header.

.. code:: cuda

namespace cuda {

template <typename T>
[[nodiscard]] __host__ __device__ constexpr
int bit_ffs(T value) noexcept;

} // namespace cuda

The function returns one plus the index of the least significant set bit of ``value``, or ``0`` if ``value`` is zero. This matches the semantics of ``__builtin_ffs`` and CUDA's ``__ffs``.

**Parameters**

- ``value``: the unsigned integer value to scan.

**Return value**

- ``0`` if ``value`` is zero, otherwise the 1-based position of the least significant set bit.

**Constraints**

- ``T`` is an unsigned integral type.

.. note::

Unlike ``cuda::std::countr_zero``, which returns the number of trailing zero bits, ``bit_ffs`` uses a 1-based position and is well defined for a zero input.

Example
-------

.. code:: cuda

#include <cuda/bit>
#include <cuda/std/cassert>
#include <cuda/std/cstdint>

__global__ void bit_ffs_kernel() {
assert(cuda::bit_ffs(uint32_t{0}) == 0);
assert(cuda::bit_ffs(uint32_t{1}) == 1);
assert(cuda::bit_ffs(uint32_t{0b10101000}) == 4);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would also add an example for all bit set

}

int main() {
bit_ffs_kernel<<<1, 1>>>();
cudaDeviceSynchronize();
return 0;
}
45 changes: 45 additions & 0 deletions libcudacxx/include/cuda/__bit/bit_ffs.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___BIT_BIT_FFS_H
#define _CUDA___BIT_BIT_FFS_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cuda/std/__bit/countr.h>
#include <cuda/std/__type_traits/is_unsigned_integer.h>

#include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA

// Returns one plus the index of the least significant set bit of __value, or 0 if __value is zero.
// This matches the semantics of __builtin_ffs and CUDA's __ffs. Unlike cuda::std::countr_zero, the
// result is 1-based and the zero input is well defined (it returns 0).
_CCCL_TEMPLATE(class _Tp)
_CCCL_REQUIRES(::cuda::std::__cccl_is_unsigned_integer_v<_Tp>)
[[nodiscard]] _CCCL_API constexpr int bit_ffs(_Tp __value) noexcept
{
return (__value == _Tp{0}) ? 0 : ::cuda::std::countr_zero(__value) + 1;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the implementation can be optimized. please make sure that the implementation produces the same code of __ffs and __ffsll https://godbolt.org/z/z4717fqGx

}

_CCCL_END_NAMESPACE_CUDA

#include <cuda/std/__cccl/epilogue.h>

#endif // _CUDA___BIT_BIT_FFS_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/bit
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# pragma system_header
#endif // no system header

#include <cuda/__bit/bit_ffs.h>
#include <cuda/__bit/bit_reverse.h>
#include <cuda/__bit/bitfield.h>
#include <cuda/__bit/bitmask.h>
Expand Down
66 changes: 66 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/bit/bit_ffs.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <cuda/bit>
#include <cuda/std/cassert>
#include <cuda/std/cstdint>
#include <cuda/std/type_traits>

#include "test_macros.h"
Comment thread
coderabbitai[bot] marked this conversation as resolved.

template <typename T>
TEST_FUNC constexpr bool test()
{
using nl = cuda::std::numeric_limits<T>;
constexpr T all_ones = static_cast<T>(~T{0});

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
constexpr T all_ones = static_cast<T>(~T{0});
[[maybe_unused]] constexpr T all_ones = static_cast<T>(~T{0});


// a zero input is well defined and returns 0
static_assert(cuda::bit_ffs(T{0}) == 0);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: All those should only be

Suggested change
static_assert(cuda::bit_ffs(T{0}) == 0);
assert(cuda::bit_ffs(T{0}) == 0);

We are invoking the function in a constexpr context, so otherwise we would never test the runtime path

// the least significant bit set maps to position 1
static_assert(cuda::bit_ffs(T{1}) == 1);
static_assert(cuda::bit_ffs(all_ones) == 1);
// a single bit set at position k maps to result k + 1
static_assert(cuda::bit_ffs(static_cast<T>(T{1} << 1)) == 2);
static_assert(cuda::bit_ffs(static_cast<T>(T{1} << 3)) == 4);
static_assert(cuda::bit_ffs(static_cast<T>(T{1} << (nl::digits - 1))) == nl::digits);
// the lowest set bit wins when several bits are set
static_assert(cuda::bit_ffs(static_cast<T>(0b10101000)) == 4);

unused(all_ones);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
unused(all_ones);

return true;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nitpick: This function is only called from the test() function, so it does not need to return anything

}

TEST_FUNC constexpr bool test()
{
test<unsigned char>();
test<unsigned short>();
test<unsigned>();
test<unsigned long>();
test<unsigned long long>();

test<uint8_t>();
test<uint16_t>();
test<uint32_t>();
test<uint64_t>();
test<size_t>();
test<uintmax_t>();
test<uintptr_t>();

#if _CCCL_HAS_INT128()
test<__uint128_t>();
#endif // _CCCL_HAS_INT128()
return true;

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a test or two that we cannot invoke the function with bool, a signed integer or float and an enumeration

something like:

template<typename T>
_CCCL_CONCEPT can_bit_ffs = _CCCL_REQUIRES_EXPR((T), T val)((cuda::bit_ffs(val));

}

int main(int, char**)
{
assert(test());
static_assert(test());
return 0;
}
Loading