Skip to content
Open
9 changes: 6 additions & 3 deletions source/config/featureFlagEnums.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2022-2026 NV Access Limited, Bill Dengler, Rob Meredith, Leonard de Ruijter, Wang Chong
# This file is covered by the GNU General Public License.
# See the file COPYING for more details.
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

"""
Feature flag value enumerations.
Expand Down Expand Up @@ -146,16 +146,19 @@ class WordNavigationUnitFlag(DisplayStringEnum):
AUTO = enum.auto()
UNISCRIBE = enum.auto()
CHINESE = enum.auto()
ICU = enum.auto()
Comment thread
LeonarddeR marked this conversation as resolved.

@property
def _displayStringLabels(self) -> dict["WordNavigationUnitFlag", str]:
return {
# Translators: Label for a method of word segmentation.
self.AUTO: _("Auto"),
Comment thread
LeonarddeR marked this conversation as resolved.
Outdated
# Translators: Label for a method of word segmentation.
self.UNISCRIBE: _("Standard"),
self.UNISCRIBE: _("Windows (legacy)"),
Comment thread
LeonarddeR marked this conversation as resolved.
Outdated
# Translators: Label for a method of word segmentation.
self.CHINESE: _("Chinese"),
# Translators: Label for a method of word segmentation.
self.ICU: _("Windows Unicode (ICU)"),

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
self.ICU: _("Windows Unicode (ICU)"),
self.ICU: _("Unicode (ICU)"),

}


Expand Down
4 changes: 3 additions & 1 deletion source/textInfos/offsets.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong
# Copyright (C) 2006-2026 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

Expand Down Expand Up @@ -266,6 +266,8 @@ def wordSegFlag(self) -> WordSegFlag | None:
return WordSegFlag.AUTO
case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE:
return WordSegFlag.CHINESE
case config.featureFlagEnums.WordNavigationUnitFlag.ICU:
return WordSegFlag.ICU
case _:
log.error(f"Unknown word segmentation standard, {self.wordSegConf.calculated()!r}")
return None
Expand Down
36 changes: 29 additions & 7 deletions source/textUtils/_wordSeg/wordSegStrategy.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2025 NV Access Limited, Wang Chong
# Copyright (C) 2025-2026 NV Access Limited, Wang Chong, Leonard de Ruijter
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

Expand Down Expand Up @@ -142,10 +142,13 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
"""Return (start inclusive, end exclusive) or None. Offsets are str offsets relative to self.text."""
pass

@abstractmethod
def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
"""Segmented result with separators."""
pass
"""Segmented result with separators.

The default returns the text unchanged; only strategies that insert separators
for braille output (e.g. Chinese) override this.
"""
return self.text

def getWordOffsetRange(
self,
Expand Down Expand Up @@ -226,9 +229,6 @@ def _calculateUniscribeOffsets(
def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
    """Return the segment containing ``offset`` by delegating to ``_calculateUniscribeOffsets``.

    :param offset: Offset within ``self.text`` to look up.
    :return: The result of ``self._calculateUniscribeOffsets(self.text, offset)``:
        a (start, end) tuple, or None.
    """
    return self._calculateUniscribeOffsets(self.text, offset)

def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
return self.text


class ChineseWordSegmentationStrategy(WordSegmentationStrategy):
_lib: CDLL | None = None
Expand Down Expand Up @@ -347,3 +347,25 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
def __init__(self, text: str, encoding: str | None = None) -> None:
    """Initialise the base strategy and store the result of ``_callCppJieba``.

    :param text: Text to segment; forwarded to the base class initialiser.
    :param encoding: Optional encoding; forwarded to the base class initialiser.
    """
    super().__init__(text, encoding)
    # Computed once at construction and kept on the instance.
    # NOTE(review): presumably the end offsets of each segmented word — confirm against _callCppJieba.
    self.wordEnds = self._callCppJieba()


class IcuWordSegmentationStrategy(WordSegmentationStrategy):
    """Word segmentation strategy backed by ICU (Windows built-in ICU library).

    Word boundaries are obtained from ``textUtils.icu.calculateWordOffsets``, i.e.
    Unicode Standard Annex #29 default rules plus automatic dictionary-based
    segmentation selected by the script of the text.
    This class only implements getSegmentForOffset; segmentedText is not overridden
    here, so this strategy adds no braille separator insertion of its own.
    """

    def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
        """Return the range of the word containing ``offset``, or None on failure.

        :param offset: Offset relative to ``self.text``.
        :return: (start inclusive, end exclusive), or None when the ICU lookup
            returned None.
        """
        # Imported inside the method rather than at module level, as in the original code.
        from textUtils import icu

        if self.encoding != textUtils.WCHAR_ENCODING:
            # ICU works in UTF-16 code units: map the str offset to a wide offset,
            # query ICU, then map the resulting range back to str offsets.
            converter = textUtils.WideStringOffsetConverter(self.text)
            encodedOffset = converter.strToEncodedOffsets(offset, offset)[0]
            wideRange = icu.calculateWordOffsets(self.text, encodedOffset)
            return None if wideRange is None else converter.encodedToStrOffsets(*wideRange)
        # The offset is already expressed in UTF-16 code units; no conversion is needed.
        return icu.calculateWordOffsets(self.text, offset)
48 changes: 27 additions & 21 deletions source/textUtils/_wordSeg/wordSegmenter.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2025-2026 NV Access Limited, Wang Chong
# Copyright (C) 2025-2026 NV Access Limited, Wang Chong, Leonard de Ruijter
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

Expand All @@ -10,6 +10,7 @@

from ..segFlag import WordSegFlag
from . import wordSegStrategy
from winBindings.icu import ICU_AVAILABLE as _ICU_AVAILABLE
Comment thread
LeonarddeR marked this conversation as resolved.
Outdated


_GET_SEGMENT_RECOVERABLE_EXCEPTIONS = (
Expand Down Expand Up @@ -43,29 +44,34 @@ def __init__(
def _chooseStrategy(
self,
) -> wordSegStrategy.WordSegmentationStrategy:
"""Choose the appropriate segmentation strategy based on the text content."""
if self.wordSegFlag == WordSegFlag.AUTO:
if (
wordSegStrategy.ChineseWordSegmentationStrategy._lib
and WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search(
self.text,
)
"""Choose the segmentation strategy, falling back Chinese -> ICU -> Uniscribe.

The CHINESE flag always uses the Chinese strategy when cppjieba is loaded; under
AUTO the Chinese strategy is used only for Chinese (non-kana) text. ICU is used
for AUTO and ICU, and as the fallback when cppjieba is unavailable: it follows
UAX#29 plus script-driven dictionary segmentation, handling complex scripts that
Uniscribe breaks poorly. Uniscribe is the final fallback and the only strategy
for the UNISCRIBE flag (it stays pinned where it is strictly required, e.g.
EditTextInfo, to match the Windows edit control / Notepad).
"""
flag = self.wordSegFlag
# Chinese: always for the CHINESE flag, or under AUTO for Chinese (non-kana) text.
if (
flag in (WordSegFlag.AUTO, WordSegFlag.CHINESE)
and wordSegStrategy.ChineseWordSegmentationStrategy._lib
):
if flag == WordSegFlag.CHINESE or (
WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search(self.text)
and not WordSegmenter._KANA.search(self.text)
):
return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding)
return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
match self.wordSegFlag:
case WordSegFlag.UNISCRIBE:
return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
case WordSegFlag.CHINESE:
if wordSegStrategy.ChineseWordSegmentationStrategy._lib:
return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding)
log.debugWarning(
"Chinese word segmenter is currently unavailable. Falling back to Uniscribe.",
)
return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
case _:
pass
elif flag == WordSegFlag.CHINESE:
log.debugWarning("Chinese word segmenter is unavailable. Falling back to ICU/Uniscribe.")
# ICU for everything except the explicit UNISCRIBE flag.
if flag != WordSegFlag.UNISCRIBE and _ICU_AVAILABLE:
return wordSegStrategy.IcuWordSegmentationStrategy(self.text, self.encoding)
elif flag == WordSegFlag.ICU:
log.debugWarning("ICU word segmenter is unavailable. Falling back to Uniscribe.")
return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)

def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
Expand Down
115 changes: 115 additions & 0 deletions source/textUtils/icu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2026 NV Access Limited, Leonard de Ruijter
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

"""ICU-based text boundary utilities using the Windows built-in ICU library.

Requires Windows 10 version 1703 (Creators Update) or later.
"""

import ctypes
from contextlib import contextmanager

import winBindings.icu as _icu
Comment thread
LeonarddeR marked this conversation as resolved.
Outdated
from logHandler import log

_ROOT_LOCALE: bytes = b""
"""ICU root locale. Word and character segmentation are script-driven, not
locale-driven (see calculateWordOffsets), so the root locale is always used.
"""


@contextmanager
def _breakIterator(kind: int, locale: bytes, buf: ctypes.Array[ctypes.c_wchar]):
    """Open an ICU BreakIterator over ``buf``, yield it, and close it on exit.

    The text buffer belongs to the caller, who must keep it alive for as long as
    the ``with`` block runs: ICU requires the text pointer to stay valid while the
    iterator is in use.

    :param kind: One of the UBRK_* constants from winBindings.icu.
    :param locale: ICU locale byte string (the root locale, _ROOT_LOCALE).
    :param buf: NUL-terminated UTF-16 buffer (ctypes.create_unicode_buffer) to analyze.
    :raises RuntimeError: If ICU reports an error opening the iterator.
    """
    # The last element of the buffer is the terminating NUL, which is not part of the text.
    charCount = len(buf) - 1
    status = _icu.UErrorCode(0)
    handle = _icu.ubrk_open(kind, locale, buf, charCount, ctypes.byref(status))
    openFailed = _icu.U_FAILURE(status.value) or not handle
    if openFailed:
        raise RuntimeError(f"ubrk_open failed with status {status.value}")
    try:
        yield handle
    finally:
        # Always release the iterator, including when the with-body raises.
        _icu.ubrk_close(handle)


def calculateWordOffsets(
    text: str,
    offset: int,
) -> tuple[int, int] | None:
    """Return the UTF-16 start and end offsets of the word located at ``offset``.

    Boundaries follow Unicode Standard Annex #29 default rules plus automatic
    dictionary-based segmentation for scripts such as Thai, Lao, Khmer, and CJK
    ideographs. ICU picks the dictionary from the script of the characters rather
    than from the locale, so the root locale is used unconditionally and no
    language is passed in. (Locale-sensitive break types such as line and sentence
    breaking would need a locale, but they are not used here.)

    Whitespace handling mirrors NVDA's Uniscribe implementation (textUtils.cpp):
    trailing whitespace belongs to the word before it. When ``offset`` lies inside
    a whitespace run, the result is the preceding segment plus that run.

    Note: ICU merges a run of identical whitespace into one segment but splits
    mixed whitespace (e.g. space + tab) into separate segments, so a mixed run is
    not merged into a single word. This is deliberately not special-cased.

    :param text: The line text as a Python str.
    :param offset: UTF-16 code unit offset within text at which to find the boundary.
    :return: (startOffset, endOffset) as UTF-16 code unit indices (endOffset exclusive),
        or None if the ICU call failed.
    """
    # On Windows a c_wchar buffer is indexed by UTF-16 code unit, so slicing it with
    # ICU's offsets yields exactly the segment ICU means.
    buf = ctypes.create_unicode_buffer(text)
    textLength = len(buf) - 1  # exclude the terminating NUL
    if offset >= textLength:
        # At or past the end of the text: report a one-unit range at the offset.
        return (offset, offset + 1)

    try:
        with _breakIterator(_icu.UBRK_WORD, _ROOT_LOCALE, buf) as bi:
            # Locate the ICU segment [segStart, segEnd) containing offset: anchor on the
            # boundary after offset, then step back to the boundary before that one.
            # (ubrk_preceding(offset + 1) would snap back for multi-unit segments.)
            segEnd = _icu.ubrk_following(bi, offset)
            if segEnd == _icu.UBRK_DONE:
                segEnd = textLength
            segStart = _icu.ubrk_preceding(bi, segEnd)
            if segStart == _icu.UBRK_DONE:
                segStart = 0

            if not buf[segStart:segEnd].isspace():
                # Word/punctuation segment: absorb an immediately following
                # whitespace segment, if there is one.
                followingEnd = _icu.ubrk_following(bi, segEnd)
                if followingEnd == _icu.UBRK_DONE or not buf[segEnd:followingEnd].isspace():
                    return (segStart, segEnd)
                return (segStart, followingEnd)

            # Whitespace segment: attach it to the segment before it, unless the
            # whitespace starts the text and there is nothing to attach to.
            if segStart <= 0:
                return (segStart, segEnd)
            precedingStart = _icu.ubrk_preceding(bi, segStart)
            if precedingStart == _icu.UBRK_DONE:
                precedingStart = 0
            return (precedingStart, segEnd)
    except RuntimeError:
        log.debugWarning("ICU word break iterator failed", exc_info=True)
        return None
4 changes: 3 additions & 1 deletion source/textUtils/segFlag.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2025 NV Access Limited, Wang Chong
# Copyright (C) 2025-2026 NV Access Limited, Wang Chong, Leonard de Ruijter
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

Expand All @@ -9,6 +9,7 @@
_AUTO: int = 1 << 0
_UNISCRIBE: int = 1 << 1
_CHINESE: int = 1 << 2
_ICU: int = 1 << 3


class CharSegFlag(IntFlag):
Expand All @@ -26,3 +27,4 @@ class WordSegFlag(IntFlag):
AUTO = _AUTO
UNISCRIBE = _UNISCRIBE
CHINESE = _CHINESE
ICU = _ICU
Loading
Loading