From fca75b7668fd39a73efccb9287c615e350f8ae44 Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter Date: Mon, 15 Jun 2026 19:33:25 +0200 Subject: [PATCH 01/12] Add ICU word segmentation backend Add the Windows ICU ctypes bindings (winBindings/icu.py) and the textUtils.icu word-offset primitive (calculateWordOffsets), wire them into the word segmentation strategy framework via IcuWordSegmentationStrategy, and expose an ICU option through WordSegFlag and the WordNavigationUnitFlag feature flag. Word AUTO now prefers ICU whenever the ICU library is available (Chinese word segmentation still takes precedence for Chinese text), with Uniscribe as the fallback. ICU follows Unicode Standard Annex #29 and provides dictionary-based, locale-aware segmentation for complex scripts such as Thai, Lao and Khmer. Character segmentation is unchanged. Co-Authored-By: Claude Sonnet 4.6 --- source/config/featureFlagEnums.py | 5 +- source/textInfos/offsets.py | 2 + source/textUtils/_wordSeg/wordSegStrategy.py | 25 +++ source/textUtils/_wordSeg/wordSegmenter.py | 46 +++-- source/textUtils/icu.py | 117 +++++++++++ source/textUtils/segFlag.py | 2 + source/winBindings/icu.py | 164 +++++++++++++++ .../unit/test_textUtils_backendComparison.py | 189 ++++++++++++++++++ tests/unit/test_wordSegIcu.py | 93 +++++++++ user_docs/en/changes.md | 4 + user_docs/en/userGuide.md | 9 +- 11 files changed, 631 insertions(+), 25 deletions(-) create mode 100644 source/textUtils/icu.py create mode 100644 source/winBindings/icu.py create mode 100644 tests/unit/test_textUtils_backendComparison.py create mode 100644 tests/unit/test_wordSegIcu.py diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index 68f6e019aad..88833783ec7 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -146,6 +146,7 @@ class WordNavigationUnitFlag(DisplayStringEnum): AUTO = enum.auto() UNISCRIBE = enum.auto() CHINESE = enum.auto() + ICU = enum.auto() @property def 
_displayStringLabels(self) -> dict["WordNavigationUnitFlag", str]: @@ -153,9 +154,11 @@ def _displayStringLabels(self) -> dict["WordNavigationUnitFlag", str]: # Translators: Label for a method of word segmentation. self.AUTO: _("Auto"), # Translators: Label for a method of word segmentation. - self.UNISCRIBE: _("Standard"), + self.UNISCRIBE: _("Windows (legacy)"), # Translators: Label for a method of word segmentation. self.CHINESE: _("Chinese"), + # Translators: Label for a method of word segmentation. + self.ICU: _("Windows Unicode (ICU)"), } diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index 91b66b90a02..9b63f48c67f 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -266,6 +266,8 @@ def wordSegFlag(self) -> WordSegFlag | None: return WordSegFlag.AUTO case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE: return WordSegFlag.CHINESE + case config.featureFlagEnums.WordNavigationUnitFlag.ICU: + return WordSegFlag.ICU case _: log.error(f"Unknown word segmentation standard, {self.wordSegConf.calculated()!r}") return None diff --git a/source/textUtils/_wordSeg/wordSegStrategy.py b/source/textUtils/_wordSeg/wordSegStrategy.py index 27e3f0e116b..272ad7ad98f 100644 --- a/source/textUtils/_wordSeg/wordSegStrategy.py +++ b/source/textUtils/_wordSeg/wordSegStrategy.py @@ -347,3 +347,28 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: def __init__(self, text: str, encoding: str | None = None) -> None: super().__init__(text, encoding) self.wordEnds = self._callCppJieba() + + +class IcuWordSegmentationStrategy(WordSegmentationStrategy): + """ICU-based word segmentation (Windows built-in ICU library). + + Word boundaries follow Unicode Standard Annex #29 default rules plus automatic + dictionary-based segmentation selected by the script of the text. + SegmentedText returns the text unchanged (no braille separator insertion). 
+ """ + + def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: + from textUtils import icu + + if self.encoding == textUtils.WCHAR_ENCODING: + return icu.calculateWordOffsets(self.text, offset) + # Convert the str offset to a UTF-16 offset for ICU, then convert the result back. + offsetConverter = textUtils.WideStringOffsetConverter(self.text) + wideOffset = offsetConverter.strToEncodedOffsets(offset, offset)[0] + result = icu.calculateWordOffsets(self.text, wideOffset) + if result is None: + return None + return offsetConverter.encodedToStrOffsets(*result) + + def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: + return self.text diff --git a/source/textUtils/_wordSeg/wordSegmenter.py b/source/textUtils/_wordSeg/wordSegmenter.py index 745af441e2e..a3a9d3e5d4f 100644 --- a/source/textUtils/_wordSeg/wordSegmenter.py +++ b/source/textUtils/_wordSeg/wordSegmenter.py @@ -10,6 +10,7 @@ from ..segFlag import WordSegFlag from . import wordSegStrategy +from winBindings.icu import ICU_AVAILABLE as _ICU_AVAILABLE _GET_SEGMENT_RECOVERABLE_EXCEPTIONS = ( @@ -43,29 +44,34 @@ def __init__( def _chooseStrategy( self, ) -> wordSegStrategy.WordSegmentationStrategy: - """Choose the appropriate segmentation strategy based on the text content.""" - if self.wordSegFlag == WordSegFlag.AUTO: - if ( - wordSegStrategy.ChineseWordSegmentationStrategy._lib - and WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search( - self.text, - ) + """Choose the segmentation strategy, falling back Chinese -> ICU -> Uniscribe. + + The CHINESE flag always uses the Chinese strategy when cppjieba is loaded; under + AUTO the Chinese strategy is used only for Chinese (non-kana) text. ICU is used + for AUTO and ICU, and as the fallback when cppjieba is unavailable: it follows + UAX#29 plus script-driven dictionary segmentation, handling complex scripts that + Uniscribe breaks poorly. 
Uniscribe is the final fallback and the only strategy + for the UNISCRIBE flag (it stays pinned where it is strictly required, e.g. + EditTextInfo, to match the Windows edit control / Notepad). + """ + flag = self.wordSegFlag + # Chinese: always for the CHINESE flag, or under AUTO for Chinese (non-kana) text. + if ( + flag in (WordSegFlag.AUTO, WordSegFlag.CHINESE) + and wordSegStrategy.ChineseWordSegmentationStrategy._lib + ): + if flag == WordSegFlag.CHINESE or ( + WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search(self.text) and not WordSegmenter._KANA.search(self.text) ): return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) - return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) - match self.wordSegFlag: - case WordSegFlag.UNISCRIBE: - return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) - case WordSegFlag.CHINESE: - if wordSegStrategy.ChineseWordSegmentationStrategy._lib: - return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding) - log.debugWarning( - "Chinese word segmenter is currently unavailable. Falling back to Uniscribe.", - ) - return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) - case _: - pass + elif flag == WordSegFlag.CHINESE: + log.debugWarning("Chinese word segmenter is unavailable. Falling back to ICU/Uniscribe.") + # ICU for everything except the explicit UNISCRIBE flag. + if flag != WordSegFlag.UNISCRIBE and _ICU_AVAILABLE: + return wordSegStrategy.IcuWordSegmentationStrategy(self.text, self.encoding) + elif flag == WordSegFlag.ICU: + log.debugWarning("ICU word segmenter is unavailable. 
Falling back to Uniscribe.") return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding) def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: diff --git a/source/textUtils/icu.py b/source/textUtils/icu.py new file mode 100644 index 00000000000..2ca10b4e7a8 --- /dev/null +++ b/source/textUtils/icu.py @@ -0,0 +1,117 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2026 NV Access Limited, Leonard de Ruijter +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +"""ICU-based text boundary utilities using the Windows built-in ICU library. + +Requires Windows 10 version 1703 (Creators Update) or later. +""" + +import ctypes +from contextlib import contextmanager + +import winBindings.icu as _icu +from logHandler import log + +_ROOT_LOCALE: bytes = b"" +"""ICU root locale. Word and character segmentation are script-driven, not +locale-driven (see calculateWordOffsets), so the root locale is always used. +""" + + +@contextmanager +def _breakIterator(kind: int, locale: bytes, text: str): + """Context manager that opens an ICU BreakIterator, yields it, then closes it. + + The ctypes buffer is kept alive for the duration of the block, satisfying + ICU's requirement that the text pointer remains valid while the iterator is in use. + + :param kind: One of the UBRK_* constants from winBindings.icu. + :param locale: ICU locale byte string (the root locale, _ROOT_LOCALE). + :param text: Python str to analyze. + :raises RuntimeError: If ICU reports an error opening the iterator. 
+ """ + buf = ctypes.create_unicode_buffer(text) + textLength = len(buf) - 1 + status = _icu.UErrorCode(0) + bi = _icu.ubrk_open(kind, locale, buf, textLength, ctypes.byref(status)) + if _icu.U_FAILURE(status.value) or not bi: + raise RuntimeError(f"ubrk_open failed with status {status.value}") + try: + yield bi + finally: + _icu.ubrk_close(bi) + + +def calculateWordOffsets( + text: str, + offset: int, +) -> tuple[int, int] | None: + """Calculate the UTF-16 start and end offsets of the word at the given offset. + + Word boundaries follow Unicode Standard Annex #29 default rules plus automatic + dictionary-based segmentation for scripts such as Thai, Lao, Khmer, and CJK + ideographs. ICU selects the dictionary by the script of the characters, not by + the locale, so no language is passed: any locale (including unrecognised codes) + would yield identical word boundaries and ICU never errors on an unknown locale + (it silently falls back to the root locale). The root locale is therefore used + unconditionally. (Locale-sensitive break types such as line and sentence + breaking would need a locale, but those are not used here.) + + Trailing whitespace is included in the preceding word segment, matching the + behaviour of NVDA's Uniscribe implementation (textUtils.cpp). When the offset + falls inside a whitespace run, the returned segment is the preceding word plus + the whitespace. + + Note: ICU coalesces a run of identical whitespace into one segment but splits + mixed whitespace (e.g. space + tab) into separate segments, so a mixed run is + not merged into a single word. This is not worth special-casing: the legacy + Uniscribe/Notepad behaviour for mixed whitespace runs is itself inconsistent. + + :param text: The line text as a Python str. + :param offset: UTF-16 code unit offset within text at which to find the boundary. + :return: (startOffset, endOffset) as UTF-16 code unit indices (endOffset exclusive), + or None if the ICU call failed. 
+ """ + utf16_bytes = text.encode("utf-16-le", errors="surrogatepass") + textLength = len(utf16_bytes) // 2 + if offset >= textLength: + return (offset, offset + 1) + locale = _ROOT_LOCALE + + def _segText(segStart: int, segEnd: int) -> str: + return utf16_bytes[segStart * 2 : segEnd * 2].decode("utf-16-le", errors="surrogatepass") + + try: + with _breakIterator(_icu.UBRK_WORD, locale, text) as bi: + # Find [start, end) — the ICU segment containing offset. + # ICU offsets are UTF-16 code units snapped to code point starts, so anchor on the boundary following + # offset and take the boundary preceding that. (ubrk_preceding(offset + 1) + # would snap back for multi-unit segments.) + end = _icu.ubrk_following(bi, offset) + if end == _icu.UBRK_DONE: + end = textLength + start = _icu.ubrk_preceding(bi, end) + if start == _icu.UBRK_DONE: + start = 0 + + if _segText(start, end).isspace(): + # Offset is inside a whitespace run. Attach this run to the + # preceding segment (mirroring the Uniscribe trailing-space rule). + if start > 0: + wordStart = _icu.ubrk_preceding(bi, start) + if wordStart == _icu.UBRK_DONE: + wordStart = 0 + return (wordStart, end) + else: + # Offset is inside a word/punctuation segment. Extend the end + # through any immediately following whitespace run. 
+ nextEnd = _icu.ubrk_following(bi, end) + if nextEnd != _icu.UBRK_DONE and _segText(end, nextEnd).isspace(): + return (start, nextEnd) + + return (start, end) + except RuntimeError: + log.debugWarning("ICU word break iterator failed", exc_info=True) + return None diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py index e996c244211..10f6d48ef3f 100644 --- a/source/textUtils/segFlag.py +++ b/source/textUtils/segFlag.py @@ -9,6 +9,7 @@ _AUTO: int = 1 << 0 _UNISCRIBE: int = 1 << 1 _CHINESE: int = 1 << 2 +_ICU: int = 1 << 3 class CharSegFlag(IntFlag): @@ -26,3 +27,4 @@ class WordSegFlag(IntFlag): AUTO = _AUTO UNISCRIBE = _UNISCRIBE CHINESE = _CHINESE + ICU = _ICU diff --git a/source/winBindings/icu.py b/source/winBindings/icu.py new file mode 100644 index 00000000000..68394602b6d --- /dev/null +++ b/source/winBindings/icu.py @@ -0,0 +1,164 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2026 NV Access Limited, Leonard de Ruijter +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +"""ctypes bindings for the Windows built-in ICU library. + +ICU has been built into Windows since Windows 10 version 1703 (Creators Update). +The combined icu.dll is available from Windows 10 version 1903 (May 2019 Update). +Only the C APIs are exposed; no C++ APIs are available due to ABI instability. + +See: https://learn.microsoft.com/windows/win32/intl/international-components-for-unicode--icu- +""" + +import ctypes +from ctypes import c_int32, c_void_p, c_char_p, c_wchar_p, POINTER + +# Try the combined icu.dll (Windows 10 1903+) first, then icuuc.dll (Windows 10 1703+). +# ubrk_* functions are part of the "common" library, present in both. 
+_lib: ctypes.WinDLL | None = None +for _dllName in ("icu.dll", "icuuc.dll"): + try: + _lib = ctypes.WinDLL(_dllName) + break + except OSError: + pass + +ICU_AVAILABLE: bool = _lib is not None +"""True if an ICU library was successfully loaded.""" + +UBRK_CHARACTER: int = 0 +"""Break iterator type for character boundaries.""" +UBRK_WORD: int = 1 +"""Break iterator type for word boundaries.""" +UBRK_LINE: int = 2 +"""Break iterator type for line-break boundaries.""" +UBRK_SENTENCE: int = 3 +"""Break iterator type for sentence boundaries.""" + +UBRK_WORD_NONE: int = 0 +"""Rule status tag: start of non-word boundary range (whitespace or punctuation between words).""" +UBRK_WORD_NONE_LIMIT: int = 100 +"""Rule status tag: exclusive end of non-word boundary range.""" +UBRK_WORD_NUMBER: int = 100 +"""Rule status tag: start of number boundary range.""" +UBRK_WORD_NUMBER_LIMIT: int = 200 +"""Rule status tag: exclusive end of number boundary range.""" +UBRK_WORD_LETTER: int = 200 +"""Rule status tag: start of letter boundary range; values >= this are actual word boundaries.""" +UBRK_WORD_LETTER_LIMIT: int = 300 +"""Rule status tag: exclusive end of letter boundary range.""" +UBRK_WORD_KANA: int = 300 +"""Rule status tag: start of kana boundary range.""" +UBRK_WORD_KANA_LIMIT: int = 400 +"""Rule status tag: exclusive end of kana boundary range.""" +UBRK_WORD_IDEO: int = 400 +"""Rule status tag: start of ideograph boundary range.""" +UBRK_WORD_IDEO_LIMIT: int = 500 +"""Rule status tag: exclusive end of ideograph boundary range.""" + +UBRK_DONE: int = -1 +"""Returned by iterator functions when there are no more boundaries.""" + +UErrorCode = c_int32 +"""Signed 32-bit integer error code. U_ZERO_ERROR = 0; positive values indicate errors.""" + + +def U_FAILURE(code: int) -> bool: + """Return True if the given UErrorCode indicates an error.""" + return code > 0 + + +if ICU_AVAILABLE: + assert _lib is not None + + ubrk_open = _lib.ubrk_open + """Create a new break iterator. 
+ + :param kind: UBreakIteratorType (one of the UBRK_* constants). + :param locale: Null-terminated UTF-8 locale ID, or NULL/empty for the root locale. + :param text: UTF-16 text to analyze. + :param textLength: Number of UTF-16 code units, or -1 for NUL-terminated. + :param status: In/out UErrorCode; pass a pointer to a zero-initialised value. + :return: Opaque UBreakIterator* handle; must be freed with ubrk_close. + """ + ubrk_open.restype = c_void_p + ubrk_open.argtypes = ( + c_int32, # kind: UBreakIteratorType + c_char_p, # locale: UTF-8 locale ID or NULL + c_wchar_p, # text: UTF-16 text to analyze + c_int32, # textLength: code units, or -1 for NUL-terminated + POINTER(UErrorCode), # status: in/out error code + ) + + ubrk_close = _lib.ubrk_close + """Free a break iterator created by ubrk_open.""" + ubrk_close.restype = None + ubrk_close.argtypes = ( + c_void_p, # bi: UBreakIterator* handle to free + ) + + ubrk_setText = _lib.ubrk_setText + """Rebind an existing iterator to new text without reallocating. + + ICU holds a reference to the text buffer; the caller must keep it alive for the + lifetime of the iterator. + """ + ubrk_setText.restype = None + ubrk_setText.argtypes = ( + c_void_p, # bi: UBreakIterator* handle + c_wchar_p, # text: new UTF-16 text buffer + c_int32, # textLength: code units, or -1 for NUL-terminated + POINTER(UErrorCode), # status: in/out error code + ) + + ubrk_first = _lib.ubrk_first + """Move to the first boundary (start of text) and return its position.""" + ubrk_first.restype = c_int32 + ubrk_first.argtypes = ( + c_void_p, # bi: UBreakIterator* handle + ) + + ubrk_next = _lib.ubrk_next + """Advance to the next boundary and return its position. + + Returns UBRK_DONE when past the end of the text. + """ + ubrk_next.restype = c_int32 + ubrk_next.argtypes = ( + c_void_p, # bi: UBreakIterator* handle + ) + + ubrk_preceding = _lib.ubrk_preceding + """Return the largest boundary position strictly less than offset. 
+ + Sets the iterator to that position. + """ + ubrk_preceding.restype = c_int32 + ubrk_preceding.argtypes = ( + c_void_p, # bi: UBreakIterator* handle + c_int32, # offset: position to search before + ) + + ubrk_following = _lib.ubrk_following + """Return the smallest boundary position strictly greater than offset. + + Sets the iterator to that position. Returns UBRK_DONE if past the end. + """ + ubrk_following.restype = c_int32 + ubrk_following.argtypes = ( + c_void_p, # bi: UBreakIterator* handle + c_int32, # offset: position to search after + ) + + ubrk_getRuleStatus = _lib.ubrk_getRuleStatus + """Return the rule status tag for the most recently returned boundary. + + For UBRK_WORD iterators, values < UBRK_WORD_NONE_LIMIT indicate non-word boundaries + (whitespace or punctuation); values >= UBRK_WORD_LETTER are actual word boundaries. + """ + ubrk_getRuleStatus.restype = c_int32 + ubrk_getRuleStatus.argtypes = ( + c_void_p, # bi: UBreakIterator* handle + ) diff --git a/tests/unit/test_textUtils_backendComparison.py b/tests/unit/test_textUtils_backendComparison.py new file mode 100644 index 00000000000..4f3cfaf3340 --- /dev/null +++ b/tests/unit/test_textUtils_backendComparison.py @@ -0,0 +1,189 @@ +# A part of NonVisual Desktop Access (NVDA) +# This file is covered by the GNU General Public License. +# See the file COPYING for more details. +# Copyright (C) 2026 NV Access Limited, Leonard de Ruijter + +"""Comparison tests between the Uniscribe and ICU word boundary backends. + +These tests document where the two backends agree and where they diverge, +using the same inputs on both sides. Tests that require ICU are skipped +when the ICU library is not present on the system. + +Word-offset comparisons are done by constructing a WordSegmenter with the +appropriate WordSegFlag and calling getSegmentForOffset. 
+""" + +import unittest + +import textUtils +from winBindings.icu import ICU_AVAILABLE +from textUtils._wordSeg.wordSegmenter import WordSegmenter +from textUtils.segFlag import WordSegFlag + + +skipIfNoICU = unittest.skipUnless(ICU_AVAILABLE, "ICU library not available on this system") + +# Encoding used for all WordSegmenter calls — matches what NVDA uses internally. +_ENCODING = textUtils.WCHAR_ENCODING + + +def _icuWordOffsets(text: str, offset: int) -> tuple[int, int] | None: + """Get word offsets via the ICU backend (UTF-16 offsets).""" + return WordSegmenter(text, _ENCODING, WordSegFlag.ICU).getSegmentForOffset(offset) + + +def _uniscribeWordOffsets(text: str, offset: int) -> tuple[int, int] | None: + """Get word offsets via the Uniscribe backend (UTF-16 offsets).""" + return WordSegmenter(text, _ENCODING, WordSegFlag.UNISCRIBE).getSegmentForOffset(offset) + + +# --------------------------------------------------------------------------- +# calculateWordOffsets — agreement on plain Latin / Hebrew text +# --------------------------------------------------------------------------- + + +@skipIfNoICU +class TestWordOffsetsEnglish(unittest.TestCase): + """Word offset comparison for English text. + + Both backends include trailing whitespace as part of the preceding word. + NVDA's Uniscribe implementation (textUtils.cpp) does this natively; + the ICU implementation mirrors that behaviour explicitly. + """ + + TEXT = "hello world" + + def _assertSameWordOffsets(self, offset: int) -> tuple[int, int] | None: + icu_result = _icuWordOffsets(self.TEXT, offset) + uni_result = _uniscribeWordOffsets(self.TEXT, offset) + self.assertEqual( + icu_result, + uni_result, + f"Backends disagree on word offsets for {self.TEXT!r} at offset {offset}: " + f"ICU={icu_result!r} Uniscribe={uni_result!r}", + ) + return icu_result + + def test_first_word(self): + # Both backends: "hello " — trailing space included. 
+ result = self._assertSameWordOffsets(0) + self.assertEqual(result, (0, 6)) + + def test_mid_first_word(self): + result = self._assertSameWordOffsets(2) + self.assertEqual(result, (0, 6)) + + def test_space(self): + # Both backends: querying at the space returns the preceding word+space. + result = self._assertSameWordOffsets(5) + self.assertEqual(result, (0, 6)) + + def test_second_word(self): + # Both backends: "world" — no trailing space at end of string. + result = self._assertSameWordOffsets(6) + self.assertEqual(result, (6, 11)) + + def test_mid_second_word(self): + result = self._assertSameWordOffsets(8) + self.assertEqual(result, (6, 11)) + + +@skipIfNoICU +class TestWordOffsetsHebrew(unittest.TestCase): + """Word offset comparison for Hebrew text — שלום עולם (hello world).""" + + TEXT = "שלום עולם" + + def _assertSameWordOffsets(self, offset: int) -> tuple[int, int] | None: + icu_result = _icuWordOffsets(self.TEXT, offset) + uni_result = _uniscribeWordOffsets(self.TEXT, offset) + self.assertEqual( + icu_result, + uni_result, + f"Backends disagree on word offsets for {self.TEXT!r} at offset {offset}: " + f"ICU={icu_result!r} Uniscribe={uni_result!r}", + ) + return icu_result + + def test_first_word(self): + # Both backends: "שלום " — trailing space included, offsets (0, 5). + result = self._assertSameWordOffsets(0) + self.assertEqual(result, (0, 5)) + + def test_mid_first_word(self): + result = self._assertSameWordOffsets(2) + self.assertEqual(result, (0, 5)) + + def test_space(self): + # Both backends: querying at offset 4 (space) returns the preceding word+space. + result = self._assertSameWordOffsets(4) + self.assertEqual(result, (0, 5)) + + def test_second_word(self): + # Both backends: "עולם" — no trailing space. + result = self._assertSameWordOffsets(5) + self.assertEqual(result, (5, 9)) + + +# --------------------------------------------------------------------------- +# Complex-script cases. 
+# +# Uniscribe uses the Windows Script Processor (ScriptBreak) for word boundaries. +# On Windows 10/11 it DOES segment some space-less scripts correctly: for Thai +# (and Lao) both backends agree, so those scripts are not a differentiator. +# +# For other scripts — notably Japanese kana/kanji and Khmer — Uniscribe falls +# back to character- or syllable-cluster-level boundaries, whereas ICU applies +# UAX#29 rules with dictionary segmentation and returns whole words. These are +# the cases where the ICU backend actually helps; see #20343. +# --------------------------------------------------------------------------- + + +@skipIfNoICU +class TestWordOffsetsThaiParity(unittest.TestCase): + """Thai: both backends already produce real word segments and agree. + + Documents that Thai is NOT where ICU differs — Uniscribe segments it correctly + on current Windows, so we assert parity here rather than divergence. + """ + + def test_thai_greeting_agrees(self): + # "สวัสดีครับ" → "สวัสดี" / "ครับ" with both backends. + text = "สวัสดีครับ" + icu_result = _icuWordOffsets(text, 0) + uni_result = _uniscribeWordOffsets(text, 0) + self.assertEqual(icu_result, uni_result) + # Both return a real multi-code-point word, not a single character. + start, end = icu_result + self.assertGreater(end - start, 1) + + +@skipIfNoICU +class TestWordOffsetsComplexScriptDivergence(unittest.TestCase): + """Japanese and Khmer: Uniscribe falls back to character/cluster level; ICU groups words. + + These assert the actual divergence (not just that ICU returns non-None), which is + the behaviour the ICU backend exists to fix (#20343). + """ + + def test_japanese_uniscribe_is_character_level(self): + # "これは日本語です": Uniscribe returns single code points; ICU groups words. + text = "これは日本語です" + # Offset 0 ("こ"): ICU groups "これ"; Uniscribe returns just "こ". 
+ icu_result = _icuWordOffsets(text, 0) + uni_result = _uniscribeWordOffsets(text, 0) + self.assertNotEqual(icu_result, uni_result) + # Uniscribe is character-level here (single code point). + self.assertEqual(uni_result[1] - uni_result[0], 1) + # ICU groups more than one code point into a word. + self.assertGreater(icu_result[1] - icu_result[0], 1) + + def test_khmer_uniscribe_breaks_clusters(self): + # "ខ្ញុំស្រលាញ់": Uniscribe breaks the second word into syllable clusters; ICU keeps it whole. + text = "ខ្ញុំស្រលាញ់" + # Offset 5 is the start of the second Khmer word "ស្រលាញ់". + icu_result = _icuWordOffsets(text, 5) + uni_result = _uniscribeWordOffsets(text, 5) + self.assertNotEqual(icu_result, uni_result) + # ICU spans the whole word; Uniscribe returns a shorter cluster. + self.assertGreater(icu_result[1] - icu_result[0], uni_result[1] - uni_result[0]) diff --git a/tests/unit/test_wordSegIcu.py b/tests/unit/test_wordSegIcu.py new file mode 100644 index 00000000000..76a6a97813d --- /dev/null +++ b/tests/unit/test_wordSegIcu.py @@ -0,0 +1,93 @@ +# A part of NonVisual Desktop Access (NVDA) +# This file is covered by the GNU General Public License. +# See the file COPYING for more details. 
+# Copyright (C) 2025-2026 NV Access Limited, Wang Chong + +"""Unit tests for ICU word segmentation strategy.""" + +import unittest +from unittest.mock import patch + +from textUtils._wordSeg import wordSegStrategy + + +class TestIcuStrategy(unittest.TestCase): + def test_icu_strategy_getSegmentForOffset_calls_primitive(self): + text = "hello world" + with patch("textUtils.icu.calculateWordOffsets", return_value=(0, 6)) as mockCalc: + strat = wordSegStrategy.IcuWordSegmentationStrategy(text, None) + result = strat.getSegmentForOffset(2) + mockCalc.assert_called_once_with(text, 2) + self.assertEqual(result, (0, 6)) + + def test_icu_segmentedText_returns_text_unchanged(self): + strat = wordSegStrategy.IcuWordSegmentationStrategy("hello", None) + self.assertEqual(strat.segmentedText(), "hello") + + def test_explicit_icu_flag_selects_icu_when_available(self): + from textUtils._wordSeg import wordSegmenter + from textUtils.segFlag import WordSegFlag + + with patch.object(wordSegmenter, "_ICU_AVAILABLE", True): + seg = wordSegmenter.WordSegmenter("hello", None, WordSegFlag.ICU) + self.assertIsInstance(seg.strategy, wordSegStrategy.IcuWordSegmentationStrategy) + + def test_explicit_icu_flag_falls_back_when_unavailable(self): + from textUtils._wordSeg import wordSegmenter + from textUtils.segFlag import WordSegFlag + + with patch.object(wordSegmenter, "_ICU_AVAILABLE", False): + seg = wordSegmenter.WordSegmenter("hello", None, WordSegFlag.ICU) + self.assertIsInstance(seg.strategy, wordSegStrategy.UniscribeWordSegmentationStrategy) + + def test_auto_selects_icu_for_thai(self): + from textUtils._wordSeg import wordSegmenter + from textUtils.segFlag import WordSegFlag + + thai = "สวัสดีครับ" + with ( + patch.object(wordSegmenter, "_ICU_AVAILABLE", True), + patch.object( + wordSegStrategy.ChineseWordSegmentationStrategy, + "_lib", + None, + ), + ): + seg = wordSegmenter.WordSegmenter(thai, None, WordSegFlag.AUTO) + self.assertIsInstance(seg.strategy, 
wordSegStrategy.IcuWordSegmentationStrategy) + + def test_auto_prefers_icu_for_latin_when_available(self): + from textUtils._wordSeg import wordSegmenter + from textUtils.segFlag import WordSegFlag + + with ( + patch.object(wordSegmenter, "_ICU_AVAILABLE", True), + patch.object( + wordSegStrategy.ChineseWordSegmentationStrategy, + "_lib", + None, + ), + ): + seg = wordSegmenter.WordSegmenter("hello world", None, WordSegFlag.AUTO) + self.assertIsInstance(seg.strategy, wordSegStrategy.IcuWordSegmentationStrategy) + + def test_auto_falls_back_to_uniscribe_when_icu_unavailable(self): + from textUtils._wordSeg import wordSegmenter + from textUtils.segFlag import WordSegFlag + + with ( + patch.object(wordSegmenter, "_ICU_AVAILABLE", False), + patch.object( + wordSegStrategy.ChineseWordSegmentationStrategy, + "_lib", + None, + ), + ): + seg = wordSegmenter.WordSegmenter("hello world", None, WordSegFlag.AUTO) + self.assertIsInstance(seg.strategy, wordSegStrategy.UniscribeWordSegmentationStrategy) + + def test_word_navigation_unit_flag_has_icu(self): + from config.featureFlagEnums import WordNavigationUnitFlag + + self.assertTrue(hasattr(WordNavigationUnitFlag, "ICU")) + self.assertTrue(WordNavigationUnitFlag.ICU.displayString) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index e2775d5b91d..9967ed6d47e 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -9,6 +9,10 @@ * Add-ons can be removed from the "Updatable add-ons" tab in the Add-on Store. (#15030, @nvdaes) * Chinese text can now be navigated by word using built-in input gestures. A Word Segmentation Standard setting was added to the "Document Navigation" panel. (#18735, @CrazySteve0605, @Cary-rowen) +* Word segmentation can now use the Windows built-in ICU library for boundary detection: Unicode Standard Annex #29 rules plus automatic dictionary-based segmentation selected by the script of the text. 
+ A new "Windows Unicode (ICU)" option was added to the Word Segmentation Standard setting in the "Document Navigation" panel. + Under "Auto", ICU is now preferred over the legacy Windows segmentation wherever available, while Chinese word segmentation continues to take precedence for Chinese text. + This fixes word navigation in browse mode for Japanese and other scripts where the legacy Windows segmentation falls back to character-level boundaries. (#18735, #20343) * Braille output for Chinese now includes spaces between words. (#18865, @CrazySteve0605, @Cary-rowen) * Added sequential two-flick touch gestures that combine two flicks performed in quick succession into a single gesture, increasing the number of touch gestures that can be bound to scripts. (#19938, @kefaslungu) * Twelve combinations are recognised: opposite-direction pairs (e.g. flick right then flick left) and perpendicular L-shaped pairs (e.g. flick right then flick up). diff --git a/user_docs/en/userGuide.md b/user_docs/en/userGuide.md index 4bd29401318..4e1c007ebb6 100644 --- a/user_docs/en/userGuide.md +++ b/user_docs/en/userGuide.md @@ -3565,14 +3565,15 @@ When a Chinese braille output table is in use, NVDA can insert spaces between Ch | . {.hideHeaderRow} |.| |---|---| -| Options | Default (Auto), Auto, Standard, Chinese | +| Options | Default (Auto), Auto, Windows (legacy), Chinese, Windows Unicode (ICU) | | Default | Auto | | Option | Behaviour | |---|---| -| Auto | Use Chinese word segmentation for Chinese text when available. For other text, use standard word segmentation. | -| Standard | Use standard Windows word segmentation. | -| Chinese | Use Chinese word segmentation. If Chinese word segmentation is not available, NVDA falls back to standard word segmentation. | +| Auto | Use Chinese word segmentation for Chinese text when available. Otherwise, prefer Windows Unicode (ICU) word segmentation when available, falling back to the legacy Windows word segmentation. 
| +| Windows (legacy) | Use the legacy Windows (Uniscribe) word segmentation. | +| Chinese | Use Chinese word segmentation. If Chinese word segmentation is not available, NVDA falls back to the legacy Windows word segmentation. | +| Windows Unicode (ICU) | Use the Windows built-in ICU library for word boundary detection: Unicode Standard Annex #29 rules plus automatic dictionary-based segmentation selected by the script of the text (Chinese, Japanese, Thai, Lao, Khmer and Burmese). This is particularly useful for scripts such as Japanese and Khmer, and for multi-character emoji sequences, where the legacy Windows word segmentation falls back to character-level boundaries. | #### Math Settings {#MathSettings} From 4ab9acc755d8d46e3b835d85c5c417457bb0418a Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter Date: Mon, 22 Jun 2026 11:05:47 +0200 Subject: [PATCH 02/12] Add emoji test --- .../unit/test_textUtils_backendComparison.py | 35 +++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/tests/unit/test_textUtils_backendComparison.py b/tests/unit/test_textUtils_backendComparison.py index 4f3cfaf3340..6f3a3a3b7a3 100644 --- a/tests/unit/test_textUtils_backendComparison.py +++ b/tests/unit/test_textUtils_backendComparison.py @@ -187,3 +187,38 @@ def test_khmer_uniscribe_breaks_clusters(self): self.assertNotEqual(icu_result, uni_result) # ICU spans the whole word; Uniscribe returns a shorter cluster. self.assertGreater(icu_result[1] - icu_result[0], uni_result[1] - uni_result[0]) + + +@skipIfNoICU +class TestWordOffsetsEmojiZwjSequence(unittest.TestCase): + """Multi-person emoji ZWJ sequence with skin-tone modifiers. + + "👩🏻‍👧🏻‍👦🏻" (woman + girl + boy family, each with a light skin-tone modifier, + joined by ZERO WIDTH JOINER) is a single UAX#29 word. ICU treats the whole + sequence as one segment; Uniscribe falls back to grapheme/surrogate-level + boundaries and returns only the leading part. See #20343. 
+ + The sequence is 14 UTF-16 code units: + 👩 (2) 🏻 (2) ZWJ (1) 👧 (2) 🏻 (2) ZWJ (1) 👦 (2) 🏻 (2). + """ + + TEXT = "👩🏻‍👧🏻‍👦🏻" + # UTF-16 code-unit length of the whole sequence. + LENGTH = len(TEXT.encode("utf-16-le")) // 2 + + def test_length_is_as_expected(self): + # Guards the constant the assertions below rely on. + self.assertEqual(self.LENGTH, 14) + + def test_icu_groups_whole_sequence(self): + # ICU returns the entire ZWJ sequence as one word from offset 0. + self.assertEqual(_icuWordOffsets(self.TEXT, 0), (0, self.LENGTH)) + + def test_backends_diverge(self): + # Uniscribe does not group the whole sequence; ICU does. + icu_result = _icuWordOffsets(self.TEXT, 0) + uni_result = _uniscribeWordOffsets(self.TEXT, 0) + self.assertNotEqual(icu_result, uni_result) + # ICU spans the full sequence; Uniscribe returns a shorter leading run. + self.assertEqual(icu_result, (0, self.LENGTH)) + self.assertLess(uni_result[1] - uni_result[0], self.LENGTH) From 6369257a804fca86b3f79e54083b6c6553d0a205 Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter Date: Mon, 22 Jun 2026 15:57:16 +0200 Subject: [PATCH 03/12] Simplification --- source/textUtils/_wordSeg/wordSegStrategy.py | 17 ++-- source/textUtils/icu.py | 26 +++---- source/winBindings/icu.py | 81 ++------------------ 3 files changed, 25 insertions(+), 99 deletions(-) diff --git a/source/textUtils/_wordSeg/wordSegStrategy.py b/source/textUtils/_wordSeg/wordSegStrategy.py index 272ad7ad98f..3b269d3e492 100644 --- a/source/textUtils/_wordSeg/wordSegStrategy.py +++ b/source/textUtils/_wordSeg/wordSegStrategy.py @@ -1,5 +1,5 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2025 NV Access Limited, Wang Chong +# Copyright (C) 2025-2026 NV Access Limited, Wang Chong, Leonard de Ruijter # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt @@ -142,10 +142,13 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: """Return (start inclusive, end exclusive) or None. Offsets are str offsets relative to self.text.""" pass - @abstractmethod def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: - """Segmented result with separators.""" - pass + """Segmented result with separators. + + The default returns the text unchanged; only strategies that insert separators + for braille output (e.g. Chinese) override this. + """ + return self.text def getWordOffsetRange( self, @@ -226,9 +229,6 @@ def _calculateUniscribeOffsets( def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: return self._calculateUniscribeOffsets(self.text, offset) - def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: - return self.text - class ChineseWordSegmentationStrategy(WordSegmentationStrategy): _lib: CDLL | None = None @@ -369,6 +369,3 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None: if result is None: return None return offsetConverter.encodedToStrOffsets(*result) - - def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str: - return self.text diff --git a/source/textUtils/icu.py b/source/textUtils/icu.py index 2ca10b4e7a8..8b07b78b55f 100644 --- a/source/textUtils/icu.py +++ b/source/textUtils/icu.py @@ -21,18 +21,18 @@ @contextmanager -def _breakIterator(kind: int, locale: bytes, text: str): +def _breakIterator(kind: int, locale: bytes, buf: ctypes.Array[ctypes.c_wchar]): """Context manager that opens an ICU BreakIterator, yields it, then closes it. - The ctypes buffer is kept alive for the duration of the block, satisfying - ICU's requirement that the text pointer remains valid while the iterator is in use. 
+ The caller owns the text buffer and must keep it alive for the duration of the + block, satisfying ICU's requirement that the text pointer remains valid while + the iterator is in use. :param kind: One of the UBRK_* constants from winBindings.icu. :param locale: ICU locale byte string (the root locale, _ROOT_LOCALE). - :param text: Python str to analyze. + :param buf: NUL-terminated UTF-16 buffer (ctypes.create_unicode_buffer) to analyze. :raises RuntimeError: If ICU reports an error opening the iterator. """ - buf = ctypes.create_unicode_buffer(text) textLength = len(buf) - 1 status = _icu.UErrorCode(0) bi = _icu.ubrk_open(kind, locale, buf, textLength, ctypes.byref(status)) @@ -74,17 +74,15 @@ def calculateWordOffsets( :return: (startOffset, endOffset) as UTF-16 code unit indices (endOffset exclusive), or None if the ICU call failed. """ - utf16_bytes = text.encode("utf-16-le", errors="surrogatepass") - textLength = len(utf16_bytes) // 2 + # A c_wchar buffer is UTF-16 code-unit indexed on Windows, so buf[a:b] is exactly + # the segment ICU's offsets refer to (lone surrogates decode as non-space). + buf = ctypes.create_unicode_buffer(text) + textLength = len(buf) - 1 if offset >= textLength: return (offset, offset + 1) - locale = _ROOT_LOCALE - - def _segText(segStart: int, segEnd: int) -> str: - return utf16_bytes[segStart * 2 : segEnd * 2].decode("utf-16-le", errors="surrogatepass") try: - with _breakIterator(_icu.UBRK_WORD, locale, text) as bi: + with _breakIterator(_icu.UBRK_WORD, _ROOT_LOCALE, buf) as bi: # Find [start, end) — the ICU segment containing offset. # ICU offsets are code-point indexed, so anchor on the boundary following # offset and take the boundary preceding that. (ubrk_preceding(offset + 1) @@ -96,7 +94,7 @@ def _segText(segStart: int, segEnd: int) -> str: if start == _icu.UBRK_DONE: start = 0 - if _segText(start, end).isspace(): + if buf[start:end].isspace(): # Offset is inside a whitespace run. 
Attach this run to the # preceding segment (mirroring the Uniscribe trailing-space rule). if start > 0: @@ -108,7 +106,7 @@ def _segText(segStart: int, segEnd: int) -> str: # Offset is inside a word/punctuation segment. Extend the end # through any immediately following whitespace run. nextEnd = _icu.ubrk_following(bi, end) - if nextEnd != _icu.UBRK_DONE and _segText(end, nextEnd).isspace(): + if nextEnd != _icu.UBRK_DONE and buf[end:nextEnd].isspace(): return (start, nextEnd) return (start, end) diff --git a/source/winBindings/icu.py b/source/winBindings/icu.py index 68394602b6d..a5b4d1629fe 100644 --- a/source/winBindings/icu.py +++ b/source/winBindings/icu.py @@ -28,35 +28,8 @@ ICU_AVAILABLE: bool = _lib is not None """True if an ICU library was successfully loaded.""" -UBRK_CHARACTER: int = 0 -"""Break iterator type for character boundaries.""" UBRK_WORD: int = 1 """Break iterator type for word boundaries.""" -UBRK_LINE: int = 2 -"""Break iterator type for line-break boundaries.""" -UBRK_SENTENCE: int = 3 -"""Break iterator type for sentence boundaries.""" - -UBRK_WORD_NONE: int = 0 -"""Rule status tag: start of non-word boundary range (whitespace or punctuation between words).""" -UBRK_WORD_NONE_LIMIT: int = 100 -"""Rule status tag: exclusive end of non-word boundary range.""" -UBRK_WORD_NUMBER: int = 100 -"""Rule status tag: start of number boundary range.""" -UBRK_WORD_NUMBER_LIMIT: int = 200 -"""Rule status tag: exclusive end of number boundary range.""" -UBRK_WORD_LETTER: int = 200 -"""Rule status tag: start of letter boundary range; values >= this are actual word boundaries.""" -UBRK_WORD_LETTER_LIMIT: int = 300 -"""Rule status tag: exclusive end of letter boundary range.""" -UBRK_WORD_KANA: int = 300 -"""Rule status tag: start of kana boundary range.""" -UBRK_WORD_KANA_LIMIT: int = 400 -"""Rule status tag: exclusive end of kana boundary range.""" -UBRK_WORD_IDEO: int = 400 -"""Rule status tag: start of ideograph boundary range.""" -UBRK_WORD_IDEO_LIMIT: 
int = 500 -"""Rule status tag: exclusive end of ideograph boundary range.""" UBRK_DONE: int = -1 """Returned by iterator functions when there are no more boundaries.""" @@ -85,10 +58,10 @@ def U_FAILURE(code: int) -> bool: """ ubrk_open.restype = c_void_p ubrk_open.argtypes = ( - c_int32, # kind: UBreakIteratorType - c_char_p, # locale: UTF-8 locale ID or NULL - c_wchar_p, # text: UTF-16 text to analyze - c_int32, # textLength: code units, or -1 for NUL-terminated + c_int32, # kind: UBreakIteratorType + c_char_p, # locale: UTF-8 locale ID or NULL + c_wchar_p, # text: UTF-16 text to analyze + c_int32, # textLength: code units, or -1 for NUL-terminated POINTER(UErrorCode), # status: in/out error code ) @@ -99,37 +72,6 @@ def U_FAILURE(code: int) -> bool: c_void_p, # bi: UBreakIterator* handle to free ) - ubrk_setText = _lib.ubrk_setText - """Rebind an existing iterator to new text without reallocating. - - ICU holds a reference to the text buffer; the caller must keep it alive for the - lifetime of the iterator. - """ - ubrk_setText.restype = None - ubrk_setText.argtypes = ( - c_void_p, # bi: UBreakIterator* handle - c_wchar_p, # text: new UTF-16 text buffer - c_int32, # textLength: code units, or -1 for NUL-terminated - POINTER(UErrorCode), # status: in/out error code - ) - - ubrk_first = _lib.ubrk_first - """Move to the first boundary (start of text) and return its position.""" - ubrk_first.restype = c_int32 - ubrk_first.argtypes = ( - c_void_p, # bi: UBreakIterator* handle - ) - - ubrk_next = _lib.ubrk_next - """Advance to the next boundary and return its position. - - Returns UBRK_DONE when past the end of the text. - """ - ubrk_next.restype = c_int32 - ubrk_next.argtypes = ( - c_void_p, # bi: UBreakIterator* handle - ) - ubrk_preceding = _lib.ubrk_preceding """Return the largest boundary position strictly less than offset. 
@@ -138,7 +80,7 @@ def U_FAILURE(code: int) -> bool: ubrk_preceding.restype = c_int32 ubrk_preceding.argtypes = ( c_void_p, # bi: UBreakIterator* handle - c_int32, # offset: position to search before + c_int32, # offset: position to search before ) ubrk_following = _lib.ubrk_following @@ -149,16 +91,5 @@ def U_FAILURE(code: int) -> bool: ubrk_following.restype = c_int32 ubrk_following.argtypes = ( c_void_p, # bi: UBreakIterator* handle - c_int32, # offset: position to search after - ) - - ubrk_getRuleStatus = _lib.ubrk_getRuleStatus - """Return the rule status tag for the most recently returned boundary. - - For UBRK_WORD iterators, values < UBRK_WORD_NONE_LIMIT indicate non-word boundaries - (whitespace or punctuation); values >= UBRK_WORD_LETTER are actual word boundaries. - """ - ubrk_getRuleStatus.restype = c_int32 - ubrk_getRuleStatus.argtypes = ( - c_void_p, # bi: UBreakIterator* handle + c_int32, # offset: position to search after ) From 0392c0854b6f0fee6beda6d333107d24db983233 Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter Date: Mon, 22 Jun 2026 16:01:28 +0200 Subject: [PATCH 04/12] Fix copyright --- source/config/featureFlagEnums.py | 4 ++-- source/textInfos/offsets.py | 2 +- source/textUtils/_wordSeg/wordSegmenter.py | 2 +- source/textUtils/segFlag.py | 2 +- tests/unit/test_textUtils_backendComparison.py | 4 ++-- tests/unit/test_wordSegIcu.py | 6 +++--- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index 88833783ec7..668a568d0d0 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) # Copyright (C) 2022-2026 NV Access Limited, Bill Dengler, Rob Meredith, Leonard de Ruijter, Wang Chong -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. 
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt """ Feature flag value enumerations. diff --git a/source/textInfos/offsets.py b/source/textInfos/offsets.py index 9b63f48c67f..a5f06589813 100755 --- a/source/textInfos/offsets.py +++ b/source/textInfos/offsets.py @@ -1,5 +1,5 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong +# Copyright (C) 2006-2026 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt diff --git a/source/textUtils/_wordSeg/wordSegmenter.py b/source/textUtils/_wordSeg/wordSegmenter.py index a3a9d3e5d4f..23e2d405c0a 100644 --- a/source/textUtils/_wordSeg/wordSegmenter.py +++ b/source/textUtils/_wordSeg/wordSegmenter.py @@ -1,5 +1,5 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2025-2026 NV Access Limited, Wang Chong +# Copyright (C) 2025-2026 NV Access Limited, Wang Chong, Leonard de Ruijter # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt diff --git a/source/textUtils/segFlag.py b/source/textUtils/segFlag.py index 10f6d48ef3f..ceb42757fb9 100644 --- a/source/textUtils/segFlag.py +++ b/source/textUtils/segFlag.py @@ -1,5 +1,5 @@ # A part of NonVisual Desktop Access (NVDA) -# Copyright (C) 2025 NV Access Limited, Wang Chong +# Copyright (C) 2025-2026 NV Access Limited, Wang Chong, Leonard de Ruijter # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt diff --git a/tests/unit/test_textUtils_backendComparison.py b/tests/unit/test_textUtils_backendComparison.py index 6f3a3a3b7a3..b887457694a 100644 --- a/tests/unit/test_textUtils_backendComparison.py +++ b/tests/unit/test_textUtils_backendComparison.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. # Copyright (C) 2026 NV Access Limited, Leonard de Ruijter +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt """Comparison tests between the Uniscribe and ICU word boundary backends. diff --git a/tests/unit/test_wordSegIcu.py b/tests/unit/test_wordSegIcu.py index 76a6a97813d..be0943f69a3 100644 --- a/tests/unit/test_wordSegIcu.py +++ b/tests/unit/test_wordSegIcu.py @@ -1,7 +1,7 @@ # A part of NonVisual Desktop Access (NVDA) -# This file is covered by the GNU General Public License. -# See the file COPYING for more details. 
-# Copyright (C) 2025-2026 NV Access Limited, Wang Chong +# Copyright (C) 2025-2026 NV Access Limited, Wang Chong, Leonard de Ruijter +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt """Unit tests for ICU word segmentation strategy.""" From fc2173c99d163cebd9fc4ba63ea43608af70f77e Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter Date: Mon, 22 Jun 2026 16:23:29 +0200 Subject: [PATCH 05/12] Cleanup --- .../unit/test_textUtils_backendComparison.py | 129 +++++++----------- tests/unit/test_wordSegIcu.py | 22 --- 2 files changed, 48 insertions(+), 103 deletions(-) diff --git a/tests/unit/test_textUtils_backendComparison.py b/tests/unit/test_textUtils_backendComparison.py index b887457694a..ca7c25bef73 100644 --- a/tests/unit/test_textUtils_backendComparison.py +++ b/tests/unit/test_textUtils_backendComparison.py @@ -37,23 +37,22 @@ def _uniscribeWordOffsets(text: str, offset: int) -> tuple[int, int] | None: return WordSegmenter(text, _ENCODING, WordSegFlag.UNISCRIBE).getSegmentForOffset(offset) -# --------------------------------------------------------------------------- -# calculateWordOffsets — agreement on plain Latin / Hebrew text -# --------------------------------------------------------------------------- +class _WordOffsetsParityTest(unittest.TestCase): + """Base for per-script word offset parity tests. - -@skipIfNoICU -class TestWordOffsetsEnglish(unittest.TestCase): - """Word offset comparison for English text. - - Both backends include trailing whitespace as part of the preceding word. - NVDA's Uniscribe implementation (textUtils.cpp) does this natively; - the ICU implementation mirrors that behaviour explicitly. + Subclasses set TEXT and add test_* methods that assert the exact span via + _assertSameWordOffsets. 
Has no test methods of its own, so the loader runs nothing. """ - TEXT = "hello world" + TEXT: str def _assertSameWordOffsets(self, offset: int) -> tuple[int, int] | None: + """Assert both backends return the same word offsets for self.TEXT at offset. + + :param offset: UTF-16 code unit offset within self.TEXT to query. + :return: The (start, end) offsets, so callers can additionally assert the exact span. + :raises AssertionError: If the ICU and Uniscribe backends disagree. + """ icu_result = _icuWordOffsets(self.TEXT, offset) uni_result = _uniscribeWordOffsets(self.TEXT, offset) self.assertEqual( @@ -64,8 +63,20 @@ def _assertSameWordOffsets(self, offset: int) -> tuple[int, int] | None: ) return icu_result + +@skipIfNoICU +class TestWordOffsetsEnglish(_WordOffsetsParityTest): + """Word offset comparison for English text. + + Both backends include trailing whitespace as part of the preceding word. + NVDA's Uniscribe implementation (textUtils.cpp) does this natively; + the ICU implementation mirrors that behaviour explicitly. + """ + + TEXT = "hello world" + def test_first_word(self): - # Both backends: "hello " — trailing space included. + """Both backends: "hello " — trailing space included.""" result = self._assertSameWordOffsets(0) self.assertEqual(result, (0, 6)) @@ -74,12 +85,12 @@ def test_mid_first_word(self): self.assertEqual(result, (0, 6)) def test_space(self): - # Both backends: querying at the space returns the preceding word+space. + """Both backends: querying at the space returns the preceding word+space.""" result = self._assertSameWordOffsets(5) self.assertEqual(result, (0, 6)) def test_second_word(self): - # Both backends: "world" — no trailing space at end of string. 
+ """Both backends: "world" — no trailing space at end of string.""" result = self._assertSameWordOffsets(6) self.assertEqual(result, (6, 11)) @@ -89,85 +100,41 @@ def test_mid_second_word(self): @skipIfNoICU -class TestWordOffsetsHebrew(unittest.TestCase): - """Word offset comparison for Hebrew text — שלום עולם (hello world).""" +class TestWordOffsetsHebrew(_WordOffsetsParityTest): + """Word offset comparison for vocalized (Biblical) Hebrew — שָׁלוֹם עוֹלָם (peace, world). - TEXT = "שלום עולם" + The niqqud (combining vowel and shin points) attach to their base letters, so each + word stays a single segment: שָׁלוֹם is 7 UTF-16 code units, עוֹלָם is 6. + """ - def _assertSameWordOffsets(self, offset: int) -> tuple[int, int] | None: - icu_result = _icuWordOffsets(self.TEXT, offset) - uni_result = _uniscribeWordOffsets(self.TEXT, offset) - self.assertEqual( - icu_result, - uni_result, - f"Backends disagree on word offsets for {self.TEXT!r} at offset {offset}: " - f"ICU={icu_result!r} Uniscribe={uni_result!r}", - ) - return icu_result + TEXT = "שָׁלוֹם עוֹלָם" def test_first_word(self): - # Both backends: "שלום " — trailing space included, offsets (0, 5). + """Both backends: "שָׁלוֹם " — trailing space included, offsets (0, 8).""" result = self._assertSameWordOffsets(0) - self.assertEqual(result, (0, 5)) + self.assertEqual(result, (0, 8)) def test_mid_first_word(self): result = self._assertSameWordOffsets(2) - self.assertEqual(result, (0, 5)) + self.assertEqual(result, (0, 8)) def test_space(self): - # Both backends: querying at offset 4 (space) returns the preceding word+space. - result = self._assertSameWordOffsets(4) - self.assertEqual(result, (0, 5)) + """Both backends: querying at offset 7 (space) returns the preceding word+space.""" + result = self._assertSameWordOffsets(7) + self.assertEqual(result, (0, 8)) def test_second_word(self): - # Both backends: "עולם" — no trailing space. 
- result = self._assertSameWordOffsets(5) - self.assertEqual(result, (5, 9)) - - -# --------------------------------------------------------------------------- -# Complex-script cases. -# -# Uniscribe uses the Windows Script Processor (ScriptBreak) for word boundaries. -# On Windows 10/11 it DOES segment some space-less scripts correctly: for Thai -# (and Lao) both backends agree, so those scripts are not a differentiator. -# -# For other scripts — notably Japanese kana/kanji and Khmer — Uniscribe falls -# back to character- or syllable-cluster-level boundaries, whereas ICU applies -# UAX#29 rules with dictionary segmentation and returns whole words. These are -# the cases where the ICU backend actually helps; see #20343. -# --------------------------------------------------------------------------- - - -@skipIfNoICU -class TestWordOffsetsThaiParity(unittest.TestCase): - """Thai: both backends already produce real word segments and agree. - - Documents that Thai is NOT where ICU differs — Uniscribe segments it correctly - on current Windows, so we assert parity here rather than divergence. - """ - - def test_thai_greeting_agrees(self): - # "สวัสดีครับ" → "สวัสดี" / "ครับ" with both backends. - text = "สวัสดีครับ" - icu_result = _icuWordOffsets(text, 0) - uni_result = _uniscribeWordOffsets(text, 0) - self.assertEqual(icu_result, uni_result) - # Both return a real multi-code-point word, not a single character. - start, end = icu_result - self.assertGreater(end - start, 1) + """Both backends: "עוֹלָם" — no trailing space.""" + result = self._assertSameWordOffsets(8) + self.assertEqual(result, (8, 14)) @skipIfNoICU class TestWordOffsetsComplexScriptDivergence(unittest.TestCase): - """Japanese and Khmer: Uniscribe falls back to character/cluster level; ICU groups words. - - These assert the actual divergence (not just that ICU returns non-None), which is - the behaviour the ICU backend exists to fix (#20343). 
- """ + """Japanese and Khmer: Uniscribe falls back to character/cluster level; ICU groups words.""" def test_japanese_uniscribe_is_character_level(self): - # "これは日本語です": Uniscribe returns single code points; ICU groups words. + """Japanese "これは日本語です": Uniscribe returns single code points; ICU groups words.""" text = "これは日本語です" # Offset 0 ("こ"): ICU groups "これ"; Uniscribe returns just "こ". icu_result = _icuWordOffsets(text, 0) @@ -179,7 +146,7 @@ def test_japanese_uniscribe_is_character_level(self): self.assertGreater(icu_result[1] - icu_result[0], 1) def test_khmer_uniscribe_breaks_clusters(self): - # "ខ្ញុំស្រលាញ់": Uniscribe breaks the second word into syllable clusters; ICU keeps it whole. + """Khmer "ខ្ញុំស្រលាញ់": Uniscribe breaks the second word into syllable clusters; ICU keeps it whole.""" text = "ខ្ញុំស្រលាញ់" # Offset 5 is the start of the second Khmer word "ស្រលាញ់". icu_result = _icuWordOffsets(text, 5) @@ -196,7 +163,7 @@ class TestWordOffsetsEmojiZwjSequence(unittest.TestCase): "👩🏻‍👧🏻‍👦🏻" (woman + girl + boy family, each with a light skin-tone modifier, joined by ZERO WIDTH JOINER) is a single UAX#29 word. ICU treats the whole sequence as one segment; Uniscribe falls back to grapheme/surrogate-level - boundaries and returns only the leading part. See #20343. + boundaries and returns only the leading part. The sequence is 14 UTF-16 code units: 👩 (2) 🏻 (2) ZWJ (1) 👧 (2) 🏻 (2) ZWJ (1) 👦 (2) 🏻 (2). @@ -207,15 +174,15 @@ class TestWordOffsetsEmojiZwjSequence(unittest.TestCase): LENGTH = len(TEXT.encode("utf-16-le")) // 2 def test_length_is_as_expected(self): - # Guards the constant the assertions below rely on. + """Guard the constant the assertions below rely on.""" self.assertEqual(self.LENGTH, 14) def test_icu_groups_whole_sequence(self): - # ICU returns the entire ZWJ sequence as one word from offset 0. 
+ """ICU returns the entire ZWJ sequence as one word from offset 0.""" self.assertEqual(_icuWordOffsets(self.TEXT, 0), (0, self.LENGTH)) def test_backends_diverge(self): - # Uniscribe does not group the whole sequence; ICU does. + """Uniscribe does not group the whole sequence; ICU does.""" icu_result = _icuWordOffsets(self.TEXT, 0) uni_result = _uniscribeWordOffsets(self.TEXT, 0) self.assertNotEqual(icu_result, uni_result) diff --git a/tests/unit/test_wordSegIcu.py b/tests/unit/test_wordSegIcu.py index be0943f69a3..0fea0da8f43 100644 --- a/tests/unit/test_wordSegIcu.py +++ b/tests/unit/test_wordSegIcu.py @@ -40,22 +40,6 @@ def test_explicit_icu_flag_falls_back_when_unavailable(self): seg = wordSegmenter.WordSegmenter("hello", None, WordSegFlag.ICU) self.assertIsInstance(seg.strategy, wordSegStrategy.UniscribeWordSegmentationStrategy) - def test_auto_selects_icu_for_thai(self): - from textUtils._wordSeg import wordSegmenter - from textUtils.segFlag import WordSegFlag - - thai = "สวัสดีครับ" - with ( - patch.object(wordSegmenter, "_ICU_AVAILABLE", True), - patch.object( - wordSegStrategy.ChineseWordSegmentationStrategy, - "_lib", - None, - ), - ): - seg = wordSegmenter.WordSegmenter(thai, None, WordSegFlag.AUTO) - self.assertIsInstance(seg.strategy, wordSegStrategy.IcuWordSegmentationStrategy) - def test_auto_prefers_icu_for_latin_when_available(self): from textUtils._wordSeg import wordSegmenter from textUtils.segFlag import WordSegFlag @@ -85,9 +69,3 @@ def test_auto_falls_back_to_uniscribe_when_icu_unavailable(self): ): seg = wordSegmenter.WordSegmenter("hello world", None, WordSegFlag.AUTO) self.assertIsInstance(seg.strategy, wordSegStrategy.UniscribeWordSegmentationStrategy) - - def test_word_navigation_unit_flag_has_icu(self): - from config.featureFlagEnums import WordNavigationUnitFlag - - self.assertTrue(hasattr(WordNavigationUnitFlag, "ICU")) - self.assertTrue(WordNavigationUnitFlag.ICU.displayString) From c398a1a784ce8c101e4f150379c21cb81d9cd578 Mon Sep 17 
00:00:00 2001 From: Leonard de Ruijter Date: Mon, 22 Jun 2026 16:38:15 +0200 Subject: [PATCH 06/12] Update changes --- user_docs/en/changes.md | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index 9967ed6d47e..8e3dcdd01d1 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -8,11 +8,9 @@ * Add-ons can be removed from the "Updatable add-ons" tab in the Add-on Store. (#15030, @nvdaes) * Chinese text can now be navigated by word using built-in input gestures. - A Word Segmentation Standard setting was added to the "Document Navigation" panel. (#18735, @CrazySteve0605, @Cary-rowen) -* Word segmentation can now use the Windows built-in ICU library for boundary detection: Unicode Standard Annex #29 rules plus automatic dictionary-based segmentation selected by the script of the text. - A new "Windows Unicode (ICU)" option was added to the Word Segmentation Standard setting in the "Document Navigation" panel. - Under "Auto", ICU is now preferred over the legacy Windows segmentation wherever available, while Chinese word segmentation continues to take precedence for Chinese text. - This fixes word navigation in browse mode for Japanese and other scripts where the legacy Windows segmentation falls back to character-level boundaries. (#18735, #20343) + *A Word Segmentation Standard setting was added to the "Document Navigation" panel. (#18735, @CrazySteve0605, @Cary-rowen) + * Word segmentation can also use the Windows built-in ICU library for boundary detection, improving navigation for Japanese and emoji. (#20343, @LeonarddeR) + * By default, ICU is preferred over the legacy Windows segmentation wherever available, while Chinese word segmentation takes precedence for Chinese text. * Braille output for Chinese now includes spaces between words. 
(#18865, @CrazySteve0605, @Cary-rowen) * Added sequential two-flick touch gestures that combine two flicks performed in quick succession into a single gesture, increasing the number of touch gestures that can be bound to scripts. (#19938, @kefaslungu) * Twelve combinations are recognised: opposite-direction pairs (e.g. flick right then flick left) and perpendicular L-shaped pairs (e.g. flick right then flick up). From 2c43e835a55a904e6d2de5e3b058fbfd1e001eef Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter Date: Mon, 22 Jun 2026 16:42:58 +0200 Subject: [PATCH 07/12] Fix user guide --- user_docs/en/userGuide.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/user_docs/en/userGuide.md b/user_docs/en/userGuide.md index 4e1c007ebb6..2d709c5d444 100644 --- a/user_docs/en/userGuide.md +++ b/user_docs/en/userGuide.md @@ -3571,9 +3571,9 @@ When a Chinese braille output table is in use, NVDA can insert spaces between Ch | Option | Behaviour | |---|---| | Auto | Use Chinese word segmentation for Chinese text when available. Otherwise, prefer Windows Unicode (ICU) word segmentation when available, falling back to the legacy Windows word segmentation. | -| Windows (legacy) | Use the legacy Windows (Uniscribe) word segmentation. | -| Chinese | Use Chinese word segmentation. If Chinese word segmentation is not available, NVDA falls back to the legacy Windows word segmentation. | -| Windows Unicode (ICU) | Use the Windows built-in ICU library for word boundary detection: Unicode Standard Annex #29 rules plus automatic dictionary-based segmentation selected by the script of the text (Chinese, Japanese, Thai, Lao, Khmer and Burmese). This is particularly useful for scripts such as Japanese and Khmer, and for multi-character emoji sequences, where the legacy Windows word segmentation falls back to character-level boundaries. | +| Windows (legacy) | Use the legacy Windows word segmentation as used in Notepad classic and other legacy Win32 edit controls. 
| +| Chinese | Use Chinese word segmentation. If Chinese word segmentation is not available, NVDA uses the fallback path as in `Auto`. | +| Windows Unicode (ICU) | Use the Windows built-in ICU library for word segmentation available in Windows 10 version 1703 (Creators Update) and later, falling back to legacy. | #### Math Settings {#MathSettings} From 939d48990ff2dde3a7a64a8c2f9a806d5dc2ee1e Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter Date: Mon, 22 Jun 2026 17:44:11 +0200 Subject: [PATCH 08/12] Copilot review actions --- source/textUtils/icu.py | 2 +- user_docs/en/changes.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/source/textUtils/icu.py b/source/textUtils/icu.py index 8b07b78b55f..7443d9d14a4 100644 --- a/source/textUtils/icu.py +++ b/source/textUtils/icu.py @@ -84,7 +84,7 @@ def calculateWordOffsets( try: with _breakIterator(_icu.UBRK_WORD, _ROOT_LOCALE, buf) as bi: # Find [start, end) — the ICU segment containing offset. - # ICU offsets are code-point indexed, so anchor on the boundary following + # ICU offsets are UTF-16 code-unit indexed, so anchor on the boundary following # offset and take the boundary preceding that. (ubrk_preceding(offset + 1) # would snap back for multi-unit segments.) end = _icu.ubrk_following(bi, offset) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index 8e3dcdd01d1..3f98cb6907b 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -8,7 +8,7 @@ * Add-ons can be removed from the "Updatable add-ons" tab in the Add-on Store. (#15030, @nvdaes) * Chinese text can now be navigated by word using built-in input gestures. - *A Word Segmentation Standard setting was added to the "Document Navigation" panel. (#18735, @CrazySteve0605, @Cary-rowen) + * A Word Segmentation Standard setting was added to the "Document Navigation" panel. 
(#18735, @CrazySteve0605, @Cary-rowen) * Word segmentation can also use the Windows built-in ICU library for boundary detection, improving navigation for Japanese and emoji. (#20343, @LeonarddeR) * By default, ICU is preferred over the legacy Windows segmentation wherever available, while Chinese word segmentation takes precedence for Chinese text. * Braille output for Chinese now includes spaces between words. (#18865, @CrazySteve0605, @Cary-rowen) From 98a1f385746fc2f1d0864b594538da81cd3268e5 Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter <3049216+LeonarddeR@users.noreply.github.com> Date: Fri, 26 Jun 2026 18:27:05 +0200 Subject: [PATCH 09/12] Apply suggestions from code review Co-authored-by: Sascha Cowley <16543535+SaschaCowley@users.noreply.github.com> --- source/config/featureFlagEnums.py | 4 ++-- source/winBindings/icu.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index 668a568d0d0..9bbad17e6e3 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -152,9 +152,9 @@ class WordNavigationUnitFlag(DisplayStringEnum): def _displayStringLabels(self) -> dict["WordNavigationUnitFlag", str]: return { # Translators: Label for a method of word segmentation. - self.AUTO: _("Auto"), + self.AUTO: _("Automatic"), # Translators: Label for a method of word segmentation. - self.UNISCRIBE: _("Windows (legacy)"), + self.UNISCRIBE: _("Legacy (Uniscribe)"), # Translators: Label for a method of word segmentation. self.CHINESE: _("Chinese"), # Translators: Label for a method of word segmentation. diff --git a/source/winBindings/icu.py b/source/winBindings/icu.py index a5b4d1629fe..803f459dbac 100644 --- a/source/winBindings/icu.py +++ b/source/winBindings/icu.py @@ -9,7 +9,8 @@ The combined icu.dll is available from Windows 10 version 1903 (May 2019 Update). Only the C APIs are exposed; no C++ APIs are available due to ABI instability. 
-See: https://learn.microsoft.com/windows/win32/intl/international-components-for-unicode--icu- +.. seealso:: + https://learn.microsoft.com/windows/win32/intl/international-components-for-unicode--icu- """ import ctypes From 86d4bcaa76a9af505e32cb996de4ed112b1961d2 Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter Date: Fri, 26 Jun 2026 18:44:55 +0200 Subject: [PATCH 10/12] Review actions --- source/config/featureFlagEnums.py | 10 ++-- source/textUtils/_wordSeg/wordSegmenter.py | 4 +- source/textUtils/icu.py | 30 +++++----- source/winBindings/icu.py | 64 +++++++++++++--------- tests/unit/test_wordSegIcu.py | 8 +-- user_docs/en/userGuide.md | 8 +-- 6 files changed, 68 insertions(+), 56 deletions(-) diff --git a/source/config/featureFlagEnums.py b/source/config/featureFlagEnums.py index 9bbad17e6e3..99e247d0f28 100644 --- a/source/config/featureFlagEnums.py +++ b/source/config/featureFlagEnums.py @@ -144,21 +144,21 @@ class WordNavigationUnitFlag(DisplayStringEnum): DEFAULT = enum.auto() AUTO = enum.auto() - UNISCRIBE = enum.auto() CHINESE = enum.auto() ICU = enum.auto() + UNISCRIBE = enum.auto() @property def _displayStringLabels(self) -> dict["WordNavigationUnitFlag", str]: return { # Translators: Label for a method of word segmentation. - self.AUTO: _("Automatic"), - # Translators: Label for a method of word segmentation. - self.UNISCRIBE: _("Legacy (Uniscribe)"), + self.AUTO: _("Automatic"), # Translators: Label for a method of word segmentation. self.CHINESE: _("Chinese"), # Translators: Label for a method of word segmentation. - self.ICU: _("Windows Unicode (ICU)"), + self.ICU: _("Unicode (ICU)"), + # Translators: Label for a method of word segmentation. 
+ self.UNISCRIBE: _("Legacy (Uniscribe)"), } diff --git a/source/textUtils/_wordSeg/wordSegmenter.py b/source/textUtils/_wordSeg/wordSegmenter.py index 23e2d405c0a..2a0c0ba7579 100644 --- a/source/textUtils/_wordSeg/wordSegmenter.py +++ b/source/textUtils/_wordSeg/wordSegmenter.py @@ -10,7 +10,7 @@ from ..segFlag import WordSegFlag from . import wordSegStrategy -from winBindings.icu import ICU_AVAILABLE as _ICU_AVAILABLE +from winBindings.icu import ICU_AVAILABLE _GET_SEGMENT_RECOVERABLE_EXCEPTIONS = ( @@ -68,7 +68,7 @@ def _chooseStrategy( elif flag == WordSegFlag.CHINESE: log.debugWarning("Chinese word segmenter is unavailable. Falling back to ICU/Uniscribe.") # ICU for everything except the explicit UNISCRIBE flag. - if flag != WordSegFlag.UNISCRIBE and _ICU_AVAILABLE: + if flag != WordSegFlag.UNISCRIBE and ICU_AVAILABLE: return wordSegStrategy.IcuWordSegmentationStrategy(self.text, self.encoding) elif flag == WordSegFlag.ICU: log.debugWarning("ICU word segmenter is unavailable. Falling back to Uniscribe.") diff --git a/source/textUtils/icu.py b/source/textUtils/icu.py index 7443d9d14a4..dc8e2344a8c 100644 --- a/source/textUtils/icu.py +++ b/source/textUtils/icu.py @@ -11,7 +11,7 @@ import ctypes from contextlib import contextmanager -import winBindings.icu as _icu +import winBindings.icu as icu from logHandler import log _ROOT_LOCALE: bytes = b"" @@ -28,20 +28,20 @@ def _breakIterator(kind: int, locale: bytes, buf: ctypes.Array[ctypes.c_wchar]): block, satisfying ICU's requirement that the text pointer remains valid while the iterator is in use. - :param kind: One of the UBRK_* constants from winBindings.icu. + :param kind: One of the UBRK members from winBindings.icu. :param locale: ICU locale byte string (the root locale, _ROOT_LOCALE). :param buf: NUL-terminated UTF-16 buffer (ctypes.create_unicode_buffer) to analyze. :raises RuntimeError: If ICU reports an error opening the iterator. 
""" textLength = len(buf) - 1 - status = _icu.UErrorCode(0) - bi = _icu.ubrk_open(kind, locale, buf, textLength, ctypes.byref(status)) - if _icu.U_FAILURE(status.value) or not bi: + status = icu.UErrorCode(0) + bi = icu.ubrk_open(kind, locale, buf, textLength, ctypes.byref(status)) + if icu.U_FAILURE(status.value) or not bi: raise RuntimeError(f"ubrk_open failed with status {status.value}") try: yield bi finally: - _icu.ubrk_close(bi) + icu.ubrk_close(bi) def calculateWordOffsets( @@ -82,31 +82,31 @@ def calculateWordOffsets( return (offset, offset + 1) try: - with _breakIterator(_icu.UBRK_WORD, _ROOT_LOCALE, buf) as bi: + with _breakIterator(icu.UBRK.WORD, _ROOT_LOCALE, buf) as bi: # Find [start, end) — the ICU segment containing offset. # ICU offsets are UTF-16 code-unit indexed, so anchor on the boundary following # offset and take the boundary preceding that. (ubrk_preceding(offset + 1) # would snap back for multi-unit segments.) - end = _icu.ubrk_following(bi, offset) - if end == _icu.UBRK_DONE: + end = icu.ubrk_following(bi, offset) + if end == icu.UBRK_DONE: end = textLength - start = _icu.ubrk_preceding(bi, end) - if start == _icu.UBRK_DONE: + start = icu.ubrk_preceding(bi, end) + if start == icu.UBRK_DONE: start = 0 if buf[start:end].isspace(): # Offset is inside a whitespace run. Attach this run to the # preceding segment (mirroring the Uniscribe trailing-space rule). if start > 0: - wordStart = _icu.ubrk_preceding(bi, start) - if wordStart == _icu.UBRK_DONE: + wordStart = icu.ubrk_preceding(bi, start) + if wordStart == icu.UBRK_DONE: wordStart = 0 return (wordStart, end) else: # Offset is inside a word/punctuation segment. Extend the end # through any immediately following whitespace run. 
- nextEnd = _icu.ubrk_following(bi, end) - if nextEnd != _icu.UBRK_DONE and buf[end:nextEnd].isspace(): + nextEnd = icu.ubrk_following(bi, end) + if nextEnd != icu.UBRK_DONE and buf[end:nextEnd].isspace(): return (start, nextEnd) return (start, end) diff --git a/source/winBindings/icu.py b/source/winBindings/icu.py index 803f459dbac..b6338e5a8b7 100644 --- a/source/winBindings/icu.py +++ b/source/winBindings/icu.py @@ -9,29 +9,36 @@ The combined icu.dll is available from Windows 10 version 1903 (May 2019 Update). Only the C APIs are exposed; no C++ APIs are available due to ABI instability. -.. seealso:: - https://learn.microsoft.com/windows/win32/intl/international-components-for-unicode--icu- -""" +The ``ubrk_*`` function bindings are only defined when :data:`ICU_AVAILABLE` is True. -import ctypes -from ctypes import c_int32, c_void_p, c_char_p, c_wchar_p, POINTER +.. seealso:: + https://learn.microsoft.com/windows/win32/intl/international-components-for-unicode--icu- +""" -# Try the combined icu.dll (Windows 10 1903+) first, then icuuc.dll (Windows 10 1703+). +from ctypes import ( + WINFUNCTYPE, + windll, + c_int32, + c_void_p, + c_char_p, + c_wchar_p, + POINTER, +) +from enum import IntEnum + +# Load the combined icu.dll (Windows 10 1903+) first, then icuuc.dll (Windows 10 1703+). # ubrk_* functions are part of the "common" library, present in both. -_lib: ctypes.WinDLL | None = None -for _dllName in ("icu.dll", "icuuc.dll"): +try: + dll = windll.icu +except OSError: try: - _lib = ctypes.WinDLL(_dllName) - break + dll = windll.icuuc except OSError: - pass + dll = None -ICU_AVAILABLE: bool = _lib is not None +ICU_AVAILABLE: bool = dll is not None """True if an ICU library was successfully loaded.""" -UBRK_WORD: int = 1 -"""Break iterator type for word boundaries.""" - UBRK_DONE: int = -1 """Returned by iterator functions when there are no more boundaries.""" @@ -39,25 +46,29 @@ """Signed 32-bit integer error code. 
U_ZERO_ERROR = 0; positive values indicate errors.""" +class UBRK(IntEnum): + """The possible types of text boundaries (UBreakIteratorType).""" + + WORD = 1 + """Word breaks.""" + + def U_FAILURE(code: int) -> bool: """Return True if the given UErrorCode indicates an error.""" return code > 0 if ICU_AVAILABLE: - assert _lib is not None - - ubrk_open = _lib.ubrk_open + ubrk_open = WINFUNCTYPE(None)(("ubrk_open", dll)) """Create a new break iterator. - :param kind: UBreakIteratorType (one of the UBRK_* constants). + :param kind: UBreakIteratorType (one of the UBRK members). :param locale: Null-terminated UTF-8 locale ID, or NULL/empty for the root locale. :param text: UTF-16 text to analyze. :param textLength: Number of UTF-16 code units, or -1 for NUL-terminated. :param status: In/out UErrorCode; pass a pointer to a zero-initialised value. :return: Opaque UBreakIterator* handle; must be freed with ubrk_close. """ - ubrk_open.restype = c_void_p ubrk_open.argtypes = ( c_int32, # kind: UBreakIteratorType c_char_p, # locale: UTF-8 locale ID or NULL @@ -65,32 +76,33 @@ def U_FAILURE(code: int) -> bool: c_int32, # textLength: code units, or -1 for NUL-terminated POINTER(UErrorCode), # status: in/out error code ) + ubrk_open.restype = c_void_p - ubrk_close = _lib.ubrk_close + ubrk_close = WINFUNCTYPE(None)(("ubrk_close", dll)) """Free a break iterator created by ubrk_open.""" - ubrk_close.restype = None ubrk_close.argtypes = ( c_void_p, # bi: UBreakIterator* handle to free ) + ubrk_close.restype = None - ubrk_preceding = _lib.ubrk_preceding + ubrk_preceding = WINFUNCTYPE(None)(("ubrk_preceding", dll)) """Return the largest boundary position strictly less than offset. Sets the iterator to that position. 
""" - ubrk_preceding.restype = c_int32 ubrk_preceding.argtypes = ( c_void_p, # bi: UBreakIterator* handle c_int32, # offset: position to search before ) + ubrk_preceding.restype = c_int32 - ubrk_following = _lib.ubrk_following + ubrk_following = WINFUNCTYPE(None)(("ubrk_following", dll)) """Return the smallest boundary position strictly greater than offset. Sets the iterator to that position. Returns UBRK_DONE if past the end. """ - ubrk_following.restype = c_int32 ubrk_following.argtypes = ( c_void_p, # bi: UBreakIterator* handle c_int32, # offset: position to search after ) + ubrk_following.restype = c_int32 diff --git a/tests/unit/test_wordSegIcu.py b/tests/unit/test_wordSegIcu.py index 0fea0da8f43..0d0c30b6651 100644 --- a/tests/unit/test_wordSegIcu.py +++ b/tests/unit/test_wordSegIcu.py @@ -28,7 +28,7 @@ def test_explicit_icu_flag_selects_icu_when_available(self): from textUtils._wordSeg import wordSegmenter from textUtils.segFlag import WordSegFlag - with patch.object(wordSegmenter, "_ICU_AVAILABLE", True): + with patch.object(wordSegmenter, "ICU_AVAILABLE", True): seg = wordSegmenter.WordSegmenter("hello", None, WordSegFlag.ICU) self.assertIsInstance(seg.strategy, wordSegStrategy.IcuWordSegmentationStrategy) @@ -36,7 +36,7 @@ def test_explicit_icu_flag_falls_back_when_unavailable(self): from textUtils._wordSeg import wordSegmenter from textUtils.segFlag import WordSegFlag - with patch.object(wordSegmenter, "_ICU_AVAILABLE", False): + with patch.object(wordSegmenter, "ICU_AVAILABLE", False): seg = wordSegmenter.WordSegmenter("hello", None, WordSegFlag.ICU) self.assertIsInstance(seg.strategy, wordSegStrategy.UniscribeWordSegmentationStrategy) @@ -45,7 +45,7 @@ def test_auto_prefers_icu_for_latin_when_available(self): from textUtils.segFlag import WordSegFlag with ( - patch.object(wordSegmenter, "_ICU_AVAILABLE", True), + patch.object(wordSegmenter, "ICU_AVAILABLE", True), patch.object( wordSegStrategy.ChineseWordSegmentationStrategy, "_lib", @@ -60,7 +60,7 @@ 
def test_auto_falls_back_to_uniscribe_when_icu_unavailable(self): from textUtils.segFlag import WordSegFlag with ( - patch.object(wordSegmenter, "_ICU_AVAILABLE", False), + patch.object(wordSegmenter, "ICU_AVAILABLE", False), patch.object( wordSegStrategy.ChineseWordSegmentationStrategy, "_lib", diff --git a/user_docs/en/userGuide.md b/user_docs/en/userGuide.md index 2d709c5d444..2564fd9f359 100644 --- a/user_docs/en/userGuide.md +++ b/user_docs/en/userGuide.md @@ -3565,15 +3565,15 @@ When a Chinese braille output table is in use, NVDA can insert spaces between Ch | . {.hideHeaderRow} |.| |---|---| -| Options | Default (Auto), Auto, Windows (legacy), Chinese, Windows Unicode (ICU) | +| Options | Default (Auto), Auto, Chinese, Unicode (ICU), Legacy (Uniscribe) | | Default | Auto | | Option | Behaviour | |---|---| -| Auto | Use Chinese word segmentation for Chinese text when available. Otherwise, prefer Windows Unicode (ICU) word segmentation when available, falling back to the legacy Windows word segmentation. | -| Windows (legacy) | Use the legacy Windows word segmentation as used in Notepad classic and other legacy Win32 edit controls. | +| Auto | Use Chinese word segmentation for Chinese text when available. Otherwise, prefer Unicode (ICU) word segmentation when available, falling back to the legacy Windows word segmentation. | | Chinese | Use Chinese word segmentation. If Chinese word segmentation is not available, NVDA uses the fallback path as in `Auto`. | -| Windows Unicode (ICU) | Use the Windows built-in ICU library for word segmentation available in Windows 10 version 1703 (Creators Update) and later, falling back to legacy. | +| Unicode (ICU) | Use the Windows built-in ICU library for word segmentation available in Windows 10 version 1703 (Creators Update) and later, falling back to legacy. | +| Legacy (Uniscribe) | Use the legacy Windows word segmentation as used in Notepad classic and other legacy Win32 edit controls. 
| #### Math Settings {#MathSettings} From 75c068e828ee85295c0fb3d483a8b6ef9b468d1f Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter Date: Fri, 26 Jun 2026 18:46:09 +0200 Subject: [PATCH 11/12] Fix user guide --- user_docs/en/userGuide.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/user_docs/en/userGuide.md b/user_docs/en/userGuide.md index 2564fd9f359..e71ee9cd7c0 100644 --- a/user_docs/en/userGuide.md +++ b/user_docs/en/userGuide.md @@ -3565,13 +3565,13 @@ When a Chinese braille output table is in use, NVDA can insert spaces between Ch | . {.hideHeaderRow} |.| |---|---| -| Options | Default (Auto), Auto, Chinese, Unicode (ICU), Legacy (Uniscribe) | -| Default | Auto | +| Options | Default (Automatic), Automatic, Chinese, Unicode (ICU), Legacy (Uniscribe) | +| Default | Automatic | | Option | Behaviour | |---|---| -| Auto | Use Chinese word segmentation for Chinese text when available. Otherwise, prefer Unicode (ICU) word segmentation when available, falling back to the legacy Windows word segmentation. | -| Chinese | Use Chinese word segmentation. If Chinese word segmentation is not available, NVDA uses the fallback path as in `Auto`. | +| Automatic | Use Chinese word segmentation for Chinese text when available. Otherwise, prefer Unicode (ICU) word segmentation when available, falling back to the legacy Windows word segmentation. | +| Chinese | Use Chinese word segmentation. If Chinese word segmentation is not available, NVDA uses the fallback path as in `Automatic`. | | Unicode (ICU) | Use the Windows built-in ICU library for word segmentation available in Windows 10 version 1703 (Creators Update) and later, falling back to legacy. | | Legacy (Uniscribe) | Use the legacy Windows word segmentation as used in Notepad classic and other legacy Win32 edit controls. 
| From cd06242967d9757abd2b53d3bd3590e0e253421e Mon Sep 17 00:00:00 2001 From: Leonard de Ruijter Date: Fri, 26 Jun 2026 20:07:37 +0200 Subject: [PATCH 12/12] Fix system tests --- tests/system/robot/chromeTests.py | 4 ++- .../unit/test_textUtils_backendComparison.py | 31 +++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/system/robot/chromeTests.py b/tests/system/robot/chromeTests.py index 0a223521dfd..c9c04690bd7 100644 --- a/tests/system/robot/chromeTests.py +++ b/tests/system/robot/chromeTests.py @@ -1184,9 +1184,11 @@ def test_ariaRoleDescription_inline_browseMode(): "Unlabeled graphic Our", ) actualSpeech = _chrome.getSpeechAfterKey("control+rightArrow") + # ICU word segmentation (the AUTO default) follows UAX#29, which treats the + # trailing period as its own word segment, so it is no longer read with "logo". _asserts.strings_match( actualSpeech, - "logo.", + "logo", ) diff --git a/tests/unit/test_textUtils_backendComparison.py b/tests/unit/test_textUtils_backendComparison.py index ca7c25bef73..1c92d58b0a1 100644 --- a/tests/unit/test_textUtils_backendComparison.py +++ b/tests/unit/test_textUtils_backendComparison.py @@ -189,3 +189,34 @@ def test_backends_diverge(self): # ICU spans the full sequence; Uniscribe returns a shorter leading run. self.assertEqual(icu_result, (0, self.LENGTH)) self.assertLess(uni_result[1] - uni_result[0], self.LENGTH) + + +@skipIfNoICU +class TestWordOffsetsTrailingPunctuationDivergence(unittest.TestCase): + """Trailing punctuation: ICU splits it into its own word; Uniscribe keeps it attached. + + UAX#29 treats the full stop as a separate word segment, so ICU returns "logo" + and then "." as two words. NVDA's Uniscribe implementation keeps the trailing + punctuation attached to the preceding word ("logo."). ICU's behaviour matches + modern Windows edit controls such as the Start menu search field. + """ + + TEXT = "logo." 
+ + def test_icu_splits_word_from_punctuation(self): + """ICU: "logo" (0, 4) then "." (4, 5).""" + self.assertEqual(_icuWordOffsets(self.TEXT, 0), (0, 4)) + self.assertEqual(_icuWordOffsets(self.TEXT, 4), (4, 5)) + + def test_uniscribe_keeps_punctuation_attached(self): + """Uniscribe: "logo." kept whole (0, 5).""" + self.assertEqual(_uniscribeWordOffsets(self.TEXT, 0), (0, 5)) + + def test_backends_diverge(self): + """ICU stops before the full stop; Uniscribe includes it.""" + icu_result = _icuWordOffsets(self.TEXT, 0) + uni_result = _uniscribeWordOffsets(self.TEXT, 0) + self.assertNotEqual(icu_result, uni_result) + # ICU ends the word before the punctuation; Uniscribe runs through it. + self.assertEqual(icu_result, (0, 4)) + self.assertEqual(uni_result, (0, 5))