nvaccess · LeonarddeR · Jun 15, 2026 · Jun 22, 2026 · Jun 22, 2026 · Jun 22, 2026
@@ -1,7 +1,7 @@
 # A part of NonVisual Desktop Access (NVDA)
 # Copyright (C) 2022-2026 NV Access Limited, Bill Dengler, Rob Meredith, Leonard de Ruijter, Wang Chong
-# This file is covered by the GNU General Public License.
-# See the file COPYING for more details.
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
 """
 Feature flag value enumerations.
@@ -146,16 +146,19 @@ class WordNavigationUnitFlag(DisplayStringEnum):
 	AUTO = enum.auto()
 	UNISCRIBE = enum.auto()
 	CHINESE = enum.auto()
+	ICU = enum.auto()
 
 	@property
 	def _displayStringLabels(self) -> dict["WordNavigationUnitFlag", str]:
 		return {
 			# Translators: Label for a method of word segmentation.
 			self.AUTO: _("Auto"),
 			# Translators: Label for a method of word segmentation.
-			self.UNISCRIBE: _("Standard"),
+			self.UNISCRIBE: _("Windows (legacy)"),
 			# Translators: Label for a method of word segmentation.
 			self.CHINESE: _("Chinese"),
+			# Translators: Label for a method of word segmentation.
+			self.ICU: _("Windows Unicode (ICU)"),
-			self.ICU: _("Windows Unicode (ICU)"),
+			self.ICU: _("Unicode (ICU)"),
-			self.ICU: _("Windows Unicode (ICU)"),
+			self.ICU: _("Unicode (ICU)"),
 		}
 
 

@@ -1,5 +1,5 @@
 # A part of NonVisual Desktop Access (NVDA)
-# Copyright (C) 2006-2025 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong
+# Copyright (C) 2006-2026 NV Access Limited, Babbage B.V., Leonard de Ruijter, Wang Chong
 # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
 # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
@@ -266,6 +266,8 @@ def wordSegFlag(self) -> WordSegFlag | None:
 				return WordSegFlag.AUTO
 			case config.featureFlagEnums.WordNavigationUnitFlag.CHINESE:
 				return WordSegFlag.CHINESE
+			case config.featureFlagEnums.WordNavigationUnitFlag.ICU:
+				return WordSegFlag.ICU
 			case _:
 				log.error(f"Unknown word segmentation standard, {self.wordSegConf.calculated()!r}")
 		return None

@@ -1,5 +1,5 @@
 # A part of NonVisual Desktop Access (NVDA)
-# Copyright (C) 2025 NV Access Limited, Wang Chong
+# Copyright (C) 2025-2026 NV Access Limited, Wang Chong, Leonard de Ruijter
 # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
 # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
@@ -142,10 +142,13 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
 		"""Return (start inclusive, end exclusive) or None. Offsets are str offsets relative to self.text."""
 		pass
 
-	@abstractmethod
 	def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
-		"""Segmented result with separators."""
-		pass
+		"""Segmented result with separators.
+
+		The default returns the text unchanged; only strategies that insert separators
+		for braille output (e.g. Chinese) override this.
+		"""
+		return self.text
 
 	def getWordOffsetRange(
 		self,
@@ -226,9 +229,6 @@ def _calculateUniscribeOffsets(
 	def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
 		return self._calculateUniscribeOffsets(self.text, offset)
 
-	def segmentedText(self, sep: str = " ", newSepIndex: list[int] | None = None) -> str:
-		return self.text
-
 
 class ChineseWordSegmentationStrategy(WordSegmentationStrategy):
 	_lib: CDLL | None = None
@@ -347,3 +347,25 @@ def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
 	def __init__(self, text: str, encoding: str | None = None) -> None:
 		super().__init__(text, encoding)
 		self.wordEnds = self._callCppJieba()
+
+
+class IcuWordSegmentationStrategy(WordSegmentationStrategy):
+	"""ICU-based word segmentation (Windows built-in ICU library).
+
+	Word boundaries follow Unicode Standard Annex #29 default rules plus automatic
+	dictionary-based segmentation selected by the script of the text.
+	The inherited segmentedText returns the text unchanged (no braille separator insertion).
+	"""
+
+	def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:
+		from textUtils import icu  # NOTE(review): function-scope import, presumably to avoid an import cycle — confirm
+
+		if self.encoding == textUtils.WCHAR_ENCODING:
+			return icu.calculateWordOffsets(self.text, offset)  # offset and result are passed through unconverted
+		# Convert the str offset to a UTF-16 offset for ICU, then convert the result back.
+		offsetConverter = textUtils.WideStringOffsetConverter(self.text)
+		wideOffset = offsetConverter.strToEncodedOffsets(offset, offset)[0]
+		result = icu.calculateWordOffsets(self.text, wideOffset)
+		if result is None:
+			return None
+		return offsetConverter.encodedToStrOffsets(*result)
@@ -1,5 +1,5 @@
 # A part of NonVisual Desktop Access (NVDA)
-# Copyright (C) 2025-2026 NV Access Limited, Wang Chong
+# Copyright (C) 2025-2026 NV Access Limited, Wang Chong, Leonard de Ruijter
 # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
 # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
@@ -10,6 +10,7 @@
 
 from ..segFlag import WordSegFlag
 from . import wordSegStrategy
+from winBindings.icu import ICU_AVAILABLE as _ICU_AVAILABLE
 
 
 _GET_SEGMENT_RECOVERABLE_EXCEPTIONS = (
@@ -43,29 +44,34 @@ def __init__(
 	def _chooseStrategy(
 		self,
 	) -> wordSegStrategy.WordSegmentationStrategy:
-		"""Choose the appropriate segmentation strategy based on the text content."""
-		if self.wordSegFlag == WordSegFlag.AUTO:
-			if (
-				wordSegStrategy.ChineseWordSegmentationStrategy._lib
-				and WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search(
-					self.text,
-				)
+		"""Choose the segmentation strategy, falling back Chinese -> ICU -> Uniscribe.
+
+		The CHINESE flag always uses the Chinese strategy when cppjieba is loaded; under
+		AUTO the Chinese strategy is used only for Chinese (non-kana) text.  ICU is used
+		for AUTO and ICU, and as the fallback when cppjieba is unavailable: it follows
+		UAX#29 plus script-driven dictionary segmentation, handling complex scripts that
+		Uniscribe breaks poorly.  Uniscribe is the final fallback and the only strategy
+		for the UNISCRIBE flag (it stays pinned where it is strictly required, e.g.
+		EditTextInfo, to match the Windows edit control / Notepad).
+		"""
+		flag = self.wordSegFlag
+		# Chinese: always for the CHINESE flag, or under AUTO for Chinese (non-kana) text.
+		if (
+			flag in (WordSegFlag.AUTO, WordSegFlag.CHINESE)
+			and wordSegStrategy.ChineseWordSegmentationStrategy._lib
+		):
+			if flag == WordSegFlag.CHINESE or (
+				WordSegmenter._CHINESE_CHARACTER_AND_JAPANESE_KANJI.search(self.text)
 				and not WordSegmenter._KANA.search(self.text)
 			):
 				return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding)
-			return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
-		match self.wordSegFlag:
-			case WordSegFlag.UNISCRIBE:
-				return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
-			case WordSegFlag.CHINESE:
-				if wordSegStrategy.ChineseWordSegmentationStrategy._lib:
-					return wordSegStrategy.ChineseWordSegmentationStrategy(self.text, self.encoding)
-				log.debugWarning(
-					"Chinese word segmenter is currently unavailable. Falling back to Uniscribe.",
-				)
-				return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
-			case _:
-				pass
+		elif flag == WordSegFlag.CHINESE:
+			log.debugWarning("Chinese word segmenter is unavailable. Falling back to ICU/Uniscribe.")
+		# ICU for everything except the explicit UNISCRIBE flag.
+		if flag != WordSegFlag.UNISCRIBE and _ICU_AVAILABLE:
+			return wordSegStrategy.IcuWordSegmentationStrategy(self.text, self.encoding)
+		elif flag == WordSegFlag.ICU:
+			log.debugWarning("ICU word segmenter is unavailable. Falling back to Uniscribe.")
 		return wordSegStrategy.UniscribeWordSegmentationStrategy(self.text, self.encoding)
 
 	def getSegmentForOffset(self, offset: int) -> tuple[int, int] | None:

@@ -0,0 +1,115 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2026 NV Access Limited, Leonard de Ruijter
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+"""ICU-based text boundary utilities using the Windows built-in ICU library.
+
+Requires Windows 10 version 1703 (Creators Update) or later.
+"""
+
+import ctypes
+from contextlib import contextmanager
+
+import winBindings.icu as _icu
+from logHandler import log
+
+_ROOT_LOCALE: bytes = b""
+"""ICU root locale. Word and character segmentation are script-driven, not
+locale-driven (see calculateWordOffsets), so the root locale is always used.
+"""
+
+
+@contextmanager
+def _breakIterator(kind: int, locale: bytes, buf: ctypes.Array[ctypes.c_wchar]):
+	"""Context manager that opens an ICU BreakIterator, yields it, then closes it.
+
+	The caller owns the text buffer and must keep it alive for the duration of the
+	block, satisfying ICU's requirement that the text pointer remains valid while
+	the iterator is in use.
+
+	:param kind: One of the UBRK_* constants from winBindings.icu.
+	:param locale: ICU locale byte string (the root locale, _ROOT_LOCALE).
+	:param buf: NUL-terminated UTF-16 buffer (ctypes.create_unicode_buffer) to analyze.
+	:raises RuntimeError: If ubrk_open reports a failure status or returns a null iterator.
+	"""
+	textLength = len(buf) - 1  # exclude the terminating NUL from the length passed to ICU
+	status = _icu.UErrorCode(0)
+	bi = _icu.ubrk_open(kind, locale, buf, textLength, ctypes.byref(status))
+	if _icu.U_FAILURE(status.value) or not bi:
+		raise RuntimeError(f"ubrk_open failed with status {status.value}")
+	try:
+		yield bi
+	finally:
+		_icu.ubrk_close(bi)  # always release the iterator, even if the with-body raises
+
+
+def calculateWordOffsets(
+	text: str,
+	offset: int,
+) -> tuple[int, int] | None:
+	"""Calculate the UTF-16 start and end offsets of the word at the given offset.
+
+	Word boundaries follow Unicode Standard Annex #29 default rules plus automatic
+	dictionary-based segmentation for scripts such as Thai, Lao, Khmer, and CJK
+	ideographs.  ICU selects the dictionary by the script of the characters, not by
+	the locale, so no language is passed: any locale (including unrecognised codes)
+	would yield identical word boundaries and ICU never errors on an unknown locale
+	(it silently falls back to the root locale).  The root locale is therefore used
+	unconditionally.  (Locale-sensitive break types such as line and sentence
+	breaking would need a locale, but those are not used here.)
+
+	Trailing whitespace is included in the preceding word segment, matching the
+	behaviour of NVDA's Uniscribe implementation (textUtils.cpp).  When the offset
+	falls inside a whitespace run, the returned segment is the preceding word plus
+	the whitespace.
+
+	Note: ICU coalesces a run of identical whitespace into one segment but splits
+	mixed whitespace (e.g. space + tab) into separate segments, so a mixed run is
+	not merged into a single word.  This is not worth special-casing: the legacy
+	Uniscribe/Notepad behaviour for mixed whitespace runs is itself inconsistent.
+
+	:param text: The line text as a Python str.
+	:param offset: UTF-16 code unit offset within text at which to find the boundary.
+	:return: (startOffset, endOffset) as UTF-16 code unit indices (endOffset exclusive);
+	    (offset, offset + 1) when offset is at or past the end of text; None if opening the ICU iterator raised RuntimeError.
+	"""
+	# A c_wchar buffer is UTF-16 code-unit indexed on Windows, so buf[a:b] is exactly
+	# the segment ICU's offsets refer to (lone surrogates decode as non-space).
+	buf = ctypes.create_unicode_buffer(text)
+	textLength = len(buf) - 1  # exclude the terminating NUL
+	if offset >= textLength:
+		return (offset, offset + 1)  # no ICU call is made for offsets at or past the end (covers empty text)
+
+	try:
+		with _breakIterator(_icu.UBRK_WORD, _ROOT_LOCALE, buf) as bi:
+			# Find [start, end) — the ICU segment containing offset.
+			# ICU offsets are UTF-16 code-unit indexed, so anchor on the boundary following
+			# offset and take the boundary preceding that. (ubrk_preceding(offset + 1)
+			# would snap back for multi-unit segments.)
+			end = _icu.ubrk_following(bi, offset)
+			if end == _icu.UBRK_DONE:
+				end = textLength
+			start = _icu.ubrk_preceding(bi, end)
+			if start == _icu.UBRK_DONE:
+				start = 0
+
+			if buf[start:end].isspace():
+				# Offset is inside a whitespace run.  Attach this run to the preceding segment (mirroring the
+				# Uniscribe trailing-space rule); a run at the very start of the text falls through and is returned alone.
+				if start > 0:
+					wordStart = _icu.ubrk_preceding(bi, start)
+					if wordStart == _icu.UBRK_DONE:
+						wordStart = 0
+					return (wordStart, end)
+			else:
+				# Offset is inside a word/punctuation segment.  Extend the end
+				# through the immediately following segment when it is entirely whitespace.
+				nextEnd = _icu.ubrk_following(bi, end)
+				if nextEnd != _icu.UBRK_DONE and buf[end:nextEnd].isspace():
+					return (start, nextEnd)
+
+			return (start, end)
+	except RuntimeError:
+		log.debugWarning("ICU word break iterator failed", exc_info=True)
+		return None
@@ -1,5 +1,5 @@
 # A part of NonVisual Desktop Access (NVDA)
-# Copyright (C) 2025 NV Access Limited, Wang Chong
+# Copyright (C) 2025-2026 NV Access Limited, Wang Chong, Leonard de Ruijter
 # This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
 # For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
 
@@ -9,6 +9,7 @@
 _AUTO: int = 1 << 0
 _UNISCRIBE: int = 1 << 1
 _CHINESE: int = 1 << 2
+_ICU: int = 1 << 3
 
 
 class CharSegFlag(IntFlag):
@@ -26,3 +27,4 @@ class WordSegFlag(IntFlag):
 	AUTO = _AUTO
 	UNISCRIBE = _UNISCRIBE
 	CHINESE = _CHINESE
+	ICU = _ICU