From c99a3650fa9bc4a947af0d21e4b73144591c64dd Mon Sep 17 00:00:00 2001 From: Zane Chee Date: Fri, 26 Jun 2026 01:30:07 +0800 Subject: [PATCH] fix(cua-driver/macos): prefer AX scroll actions --- .../platform-macos/src/input/ax_actions.rs | 44 +++++++++++++++++++ .../crates/platform-macos/src/tools/scroll.rs | 37 ++++++++++++---- 2 files changed, 72 insertions(+), 9 deletions(-) diff --git a/libs/cua-driver/rust/crates/platform-macos/src/input/ax_actions.rs b/libs/cua-driver/rust/crates/platform-macos/src/input/ax_actions.rs index 18f2cddea..e01d77c34 100644 --- a/libs/cua-driver/rust/crates/platform-macos/src/input/ax_actions.rs +++ b/libs/cua-driver/rust/crates/platform-macos/src/input/ax_actions.rs @@ -16,6 +16,36 @@ pub fn perform_ax_action(element_ptr: usize, action: &str) -> anyhow::Result<()> } } +/// Perform a native AX scroll action when the target element advertises one. +pub fn perform_ax_scroll_action_if_supported( + element_ptr: usize, + direction: &str, + by: &str, + amount: usize, +) -> anyhow::Result<Option<&'static str>> { + let actions = unsafe { copy_action_names(element_ptr as AXUIElementRef) }; + let Some(candidates) = scroll_action_candidates(direction, by) else { + return Ok(None); + }; + let Some(action) = candidates + .iter() + .copied() + .find(|candidate| actions.iter().any(|advertised| advertised.as_str() == *candidate)) + else { + return Ok(None); + }; + + for _ in 0..amount { + let err = unsafe { perform_action(element_ptr as AXUIElementRef, action) }; + if err != kAXErrorSuccess { + anyhow::bail!("AXUIElementPerformAction({action}) failed with error {err}"); + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + + Ok(Some(action)) +} + fn map_action(action: &str) -> &'static str { match action.to_lowercase().as_str() { "press" | "click" => "AXPress", @@ -28,6 +58,20 @@ fn map_action(action: &str) -> &'static str { } } +fn scroll_action_candidates(direction: &str, by: &str) -> Option<[&'static str; 2]> { + Some(match (direction, by) { + ("up", 
"page") => ["AXScrollUpByPage", "AXScrollUp"], + ("down", "page") => ["AXScrollDownByPage", "AXScrollDown"], + ("left", "page") => ["AXScrollLeftByPage", "AXScrollLeft"], + ("right", "page") => ["AXScrollRightByPage", "AXScrollRight"], + ("up", _) => ["AXScrollUp", "AXScrollUpByPage"], + ("down", _) => ["AXScrollDown", "AXScrollDownByPage"], + ("left", _) => ["AXScrollLeft", "AXScrollLeftByPage"], + ("right", _) => ["AXScrollRight", "AXScrollRightByPage"], + _ => return None, + }) +} + /// Set AXFocused=true on an element (for pre-focusing before key press). pub fn focus_element(element_ptr: usize) -> anyhow::Result<()> { let err = unsafe { diff --git a/libs/cua-driver/rust/crates/platform-macos/src/tools/scroll.rs b/libs/cua-driver/rust/crates/platform-macos/src/tools/scroll.rs index 7a1e3374e..d62d483ad 100644 --- a/libs/cua-driver/rust/crates/platform-macos/src/tools/scroll.rs +++ b/libs/cua-driver/rust/crates/platform-macos/src/tools/scroll.rs @@ -22,9 +22,10 @@ static DEF: std::sync::OnceLock = std::sync::OnceLock::new(); fn def() -> &'static ToolDef { DEF.get_or_init(|| ToolDef { name: "scroll".into(), - description: "Scroll the target pid's focused region by synthesized keystrokes.\n\n\ - Mapping: by='page' → PageDown/PageUp × amount; by='line' → DownArrow/UpArrow × amount. \ - Horizontal variants use Left/Right arrow keys.\n\n\ + description: "Scroll the target pid's focused region.\n\n\ + If the target element advertises a native AX scroll action, that action is used first. \ + Otherwise, by='page' maps to PageDown/PageUp × amount and by='line' maps to \ + DownArrow/UpArrow × amount. Horizontal variants use Left/Right arrow keys.\n\n\ Optional element_index + window_id pre-focuses the element before scrolling.".into(), input_schema: serde_json::json!({ "type": "object", @@ -46,7 +47,7 @@ fn def() -> &'static ToolDef { "type": "integer", "minimum": 1, "maximum": 50, - "description": "Number of keystroke repetitions. Default: 3." 
+ "description": "Number of native AX scroll action or fallback keystroke repetitions. Default: 3." }, "window_id": { "type": "integer" }, "element_index": { "type": "integer" }, @@ -115,9 +116,11 @@ impl Tool for ScrollTool { _ => "down", }; let key = key.to_owned(); + let ax_direction = direction.to_owned(); + let ax_by = by.to_owned(); // ── Focus-suppression wrap (Swift WindowChangeDetector + FocusGuard) ── - // Scroll keystrokes (PageDown / arrow) into search-box autocomplete + // Scroll actions or fallback keystrokes into search-box autocomplete // can spawn floating helper windows; rare but real. Wrap for parity // with the other action tools. // @@ -130,7 +133,7 @@ impl Tool for ScrollTool { let result = focus_guard::with_focus_suppressed( Some(pid), prior_front, - "scroll.CGEvent", + "scroll", || async move { // Pre-focus the element under suppression so its // side-effects are captured by the snapshot + lease. @@ -139,6 +142,22 @@ impl Tool for ScrollTool { crate::input::ax_actions::focus_element(element_ptr) }).await; tokio::time::sleep(std::time::Duration::from_millis(30)).await; + + let direction = ax_direction.clone(); + let by = ax_by.clone(); + match tokio::task::spawn_blocking(move || { + crate::input::ax_actions::perform_ax_scroll_action_if_supported( + element_ptr, + &direction, + &by, + amount, + ) + }).await { + Ok(Ok(Some(action))) => return Ok(Ok(action.to_owned())), + Ok(Ok(None)) => {}, + Ok(Err(e)) => return Ok(Err(e)), + Err(e) => return Err(e), + } } tokio::task::spawn_blocking(move || { @@ -148,7 +167,7 @@ impl Tool for ScrollTool { } std::thread::sleep(std::time::Duration::from_millis(50)); } - Ok(()) + Ok("key synthesis".to_owned()) }) .await }, @@ -158,8 +177,8 @@ impl Tool for ScrollTool { let changes = snapshot.detect_async().await; match result { - Ok(Ok(())) => ToolResult::text(format!( - "Scrolled {direction} by {by} × {amount}.{}", + Ok(Ok(method)) => ToolResult::text(format!( + "Scrolled {direction} by {by} × {amount} via 
{method}.{}", changes.result_suffix() )), Ok(Err(e)) => ToolResult::error(format!("Scroll failed: {e}")),