From f3fa8ab2bd3362053c8b309c6c349cc4c70dca1e Mon Sep 17 00:00:00 2001 From: "Arpit Singh (SW-CLOUD)" Date: Mon, 29 Jun 2026 23:41:49 -0700 Subject: [PATCH 1/2] feat(evaluator): add ATIF trace and log read handles for evidence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add read handles over candidate evidence for agent-eval metrics: - TraceHandle exposes a trace descriptor as an ATIF Trajectory with step/tool-call/token-usage views; lightweight ATIF read models live in values/atif.py (validated on read via parse_atif, raw payload persisted, no normalization — producers emit conformant ATIF). - LogHandle for log-bundle access and LocalFilesystemEvidence helpers (diff, unified_diff, run_verifier with symlink hardening). - WellKnownEvidenceKey literal and example metrics updates. Vendored mirror synced via make vendor. Signed-off-by: Arpit Singh (SW-CLOUD) --- .../run_agent_eval/example_metrics.py | 64 +++++-- .../src/nemo_evaluator_sdk/values/__init__.py | 20 +++ .../src/nemo_evaluator_sdk/values/atif.py | 78 ++++++++ .../src/nemo_evaluator_sdk/values/evidence.py | 166 ++++++++++++------ .../tests/agent_eval/test_evidence.py | 84 ++++++++- .../tests/agent_eval/test_example_metrics.py | 34 ++++ .../beta/evaluator/values/__init__.py | 20 +++ .../beta/evaluator/values/atif.py | 78 ++++++++ .../beta/evaluator/values/evidence.py | 166 ++++++++++++------ 9 files changed, 580 insertions(+), 130 deletions(-) create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/atif.py create mode 100644 sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/atif.py diff --git a/packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py b/packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py index 9e6118dedf..cf90400b10 100644 --- a/packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py +++ b/packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py @@ 
-3,19 +3,23 @@ """Reference metrics-over-evidence for this example (not SDK API). -These show how to score from the SDK's filesystem evidence handle instead of a -stamped verifier reward: +These show how to score from the SDK's evidence handles instead of a stamped +verifier reward: * :class:`TestsPassMetric` runs a command against ``final_state`` filesystem evidence (in a throwaway overlay) and scores on exit 0. * :class:`NoTestCheatingMetric` diffs ``initial_state`` against ``final_state`` and fails if the agent touched protected (e.g. test) paths. +* :class:`InefficientRetryLoopMetric` reads the ATIF ``trace`` and fails + when the same tool call repeats consecutively past a threshold. """ from __future__ import annotations +import json from collections.abc import Sequence +from nemo_evaluator_sdk.agent_eval.trials import EVIDENCE_FINAL_STATE, EVIDENCE_INITIAL_STATE, EVIDENCE_TRACE from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult @@ -26,7 +30,7 @@ def __init__( self, command: Sequence[str], *, - evidence_name: str = "final_state", + evidence_name: str = EVIDENCE_FINAL_STATE, cwd: str = ".", timeout_s: float = 300.0, ) -> None: @@ -60,13 +64,9 @@ def __init__( *, protected: Sequence[str] = ("tests/",), change_types: Sequence[str] = ("added", "modified", "deleted"), - initial_name: str = "initial_state", - final_name: str = "final_state", ) -> None: self._protected = tuple(protected) self._change_types = set(change_types) - self._initial_name = initial_name - self._final_name = final_name @property def type(self) -> str: @@ -78,12 +78,56 @@ def output_spec(self) -> list[MetricOutputSpec]: async def compute_scores(self, input: MetricInput) -> MetricResult: evidence = input.candidate.evidence clean = True - if evidence is not None and evidence.get(self._initial_name) and evidence.get(self._final_name): - initial = await evidence.filesystem(self._initial_name) - final = await evidence.filesystem(self._final_name) + if
evidence is not None and evidence.get(EVIDENCE_INITIAL_STATE) and evidence.get(EVIDENCE_FINAL_STATE): + initial = await evidence.filesystem(EVIDENCE_INITIAL_STATE) + final = await evidence.filesystem(EVIDENCE_FINAL_STATE) diff = await initial.diff(final) violations = [ entry for prefix in self._protected for entry in diff.changed(prefix=prefix, kinds=self._change_types) ] clean = not violations return MetricResult(outputs=[MetricOutput(name="no_test_cheating", value=clean)]) + + +class InefficientRetryLoopMetric: + """Score ``False`` when the same tool call repeats consecutively past ``threshold`` times.""" + + def __init__(self, *, threshold: int = 2) -> None: + self._threshold = threshold + + @property + def type(self) -> str: + return "inefficient_retry_loop" + + def output_spec(self) -> list[MetricOutputSpec]: + return [ + MetricOutputSpec.boolean("efficient_tool_use"), + MetricOutputSpec.discrete_score("max_repeated_tool_calls"), + ] + + async def compute_scores(self, input: MetricInput) -> MetricResult: + max_repeats = 0 + evidence = input.candidate.evidence + if evidence is not None and evidence.get(EVIDENCE_TRACE) is not None: + calls = await (await evidence.trace(EVIDENCE_TRACE)).tool_calls() + # Count the longest run of *consecutive* identical calls (a retry loop), not the + # global frequency, so legitimate reuse separated by other work isn't flagged. + previous_key: str | None = None + current_repeats = 0 + for call in calls: + # Canonicalize for comparison only (sorted keys): semantically identical calls + # match regardless of argument insertion order; execution order is untouched. 
+ key = json.dumps( + {"function_name": call.function_name, "arguments": call.arguments or {}}, + sort_keys=True, + separators=(",", ":"), + ) + current_repeats = current_repeats + 1 if key == previous_key else 1 + previous_key = key + max_repeats = max(max_repeats, current_repeats) + return MetricResult( + outputs=[ + MetricOutput(name="efficient_tool_use", value=max_repeats <= self._threshold), + MetricOutput(name="max_repeated_tool_calls", value=max_repeats), + ] + ) diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py index 9c4e11eb7e..5190a5ff7b 100644 --- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py @@ -4,6 +4,13 @@ """Public value types for evaluator SDK runtime.""" from nemo_evaluator_sdk.values.agents import Agent +from nemo_evaluator_sdk.values.atif import ( + FinalMetrics, + Metrics, + Step, + ToolCall, + Trajectory, +) from nemo_evaluator_sdk.values.common import SecretRef, SupportedJobTypes from nemo_evaluator_sdk.values.dataset_schemas import ( FieldMapping, @@ -17,6 +24,10 @@ FilesystemDiff, FilesystemEntry, LocalFilesystemEvidence, + LogHandle, + TraceHandle, + WellKnownEvidenceKey, + parse_atif, ) from nemo_evaluator_sdk.values.metrics import ( BLEU, @@ -109,6 +120,15 @@ "ContinuousScore", "FilesystemDiff", "FilesystemEntry", + "FinalMetrics", + "LogHandle", + "Metrics", + "Step", + "ToolCall", + "Trajectory", + "TraceHandle", + "WellKnownEvidenceKey", + "parse_atif", "DatasetRow", "DatasetRows", "DefaultAggregateFieldName", diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/atif.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/atif.py new file mode 100644 index 0000000000..e5a4c6ec28 --- /dev/null +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/atif.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: 
Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Lightweight ATIF read models for the evaluator SDK. + +The evaluator *ingests* traces that producers emit in the Agent Trajectory +Interchange Format (ATIF; RFC 0001, schema_version ``ATIF-v1.x``). It does not +produce or normalize ATIF, and it only reads a small subset of the schema, so +this module models *just that subset* rather than vendoring the full reference +implementation. + +These are deliberately permissive (``extra="ignore"``): fields the SDK does not +consume (images, content parts, observations, sub-agents, agent metadata, ...) +are accepted and dropped, so a trace emitted against a newer ATIF revision still +validates. Validation here means "this payload carries the ATIF fields the +evaluator's metrics rely on" — not full RFC conformance, which is the producer's +responsibility. The authoritative spec is RFC 0001. +""" + +from typing import Any, Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator + + +class ToolCall(BaseModel): + """A single tool/function invocation within an agent step.""" + + model_config = ConfigDict(extra="ignore") + + tool_call_id: str | None = Field(default=None, description="Producer-assigned tool call id, if any.") + function_name: str = Field(description="Name of the invoked tool/function.") + arguments: dict[str, Any] | None = Field(default=None, description="Arguments passed to the tool.") + + +class Metrics(BaseModel): + """Per-step token metrics.""" + + model_config = ConfigDict(extra="ignore") + + prompt_tokens: int | None = None + completion_tokens: int | None = None + + +class FinalMetrics(BaseModel): + """Trajectory-level aggregate token metrics.""" + + model_config = ConfigDict(extra="ignore") + + total_prompt_tokens: int | None = None + total_completion_tokens: int | None = None + + +class Step(BaseModel): + """One step in an agent trajectory.""" + + model_config = 
ConfigDict(extra="ignore") + + source: Literal["system", "user", "agent"] = Field(description="Who produced this step.") + message: str = Field(default="", description="Step text content.") + tool_calls: list[ToolCall] | None = Field(default=None, description="Tool calls issued in this step.") + metrics: Metrics | None = Field(default=None, description="Per-step token metrics, if reported.") + + +class Trajectory(BaseModel): + """An ATIF agent trajectory (read view over the subset the SDK consumes).""" + + model_config = ConfigDict(extra="ignore") + + schema_version: str = Field(description="ATIF schema version, e.g. 'ATIF-v1.7'.") + steps: list[Step] = Field(min_length=1, description="Ordered trajectory steps.") + final_metrics: FinalMetrics | None = Field(default=None, description="Aggregate token metrics, if reported.") + + @field_validator("schema_version") + @classmethod + def _looks_like_atif(cls, value: str) -> str: + # Cheap sanity gate so arbitrary JSON isn't silently accepted as a trace. + if not value.startswith("ATIF-"): + raise ValueError(f"unexpected trace schema_version {value!r}; expected an 'ATIF-*' version") + return value diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py index 16f7248651..368e480e33 100644 --- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py @@ -8,6 +8,7 @@ import asyncio import difflib import hashlib +import json import os import shutil import signal @@ -18,6 +19,11 @@ from pydantic import BaseModel, ConfigDict, Field, JsonValue, PrivateAttr, model_validator +from nemo_evaluator_sdk.values.atif import FinalMetrics, Step, ToolCall, Trajectory + +# Well-known evidence keys (mirrored by the ``EVIDENCE_*`` constants in ``agent_eval.trials``). 
+WellKnownEvidenceKey = Literal["initial_state", "trace", "logs", "final_state", "verifier_logs"] + class FilesystemEntry(BaseModel): """One path that differs between two filesystem snapshots.""" @@ -100,11 +106,7 @@ async def read_text(self, relative_path: str | Path, *, encoding: str = "utf-8") return await asyncio.to_thread(path.read_text, encoding=encoding) async def iter_paths(self, relative_path: str | Path = ".", *, recursive: bool = False) -> list[str]: - """List entries (files *and* directories) rooted at ``relative_path``. - - Use this to walk a subtree or test for (non-)emptiness. For a flat, - files-only listing matched by a glob pattern, use :meth:`list_files`. - """ + """List entries (files and directories) rooted at ``relative_path``.""" base = self.path(relative_path) return await asyncio.to_thread(self._iter_paths_sync, base, recursive) @@ -120,11 +122,7 @@ async def read_bytes(self, relative_path: str | Path) -> bytes: return await asyncio.to_thread(path.read_bytes) async def list_files(self, pattern: str = "**/*") -> list[str]: - """List relative posix paths of files (not directories) matching ``pattern``. - - Complements :meth:`iter_paths`: this is the flat, glob-filtered, - files-only view; ``iter_paths`` walks a subtree and includes directories. - """ + """List relative posix paths of files (not directories) matching ``pattern``.""" return await asyncio.to_thread(self._list_sync, pattern) def _list_sync(self, pattern: str) -> list[str]: @@ -135,12 +133,7 @@ def _list_sync(self, pattern: str) -> list[str]: ) async def diff(self, other: LocalFilesystemEvidence) -> FilesystemDiff: - """Diff this snapshot (before) against ``other`` (after) by file content hash. - - Cost note: this hashes every file in both trees by reading each fully, so - it is O(total bytes). Fine for task-sized evidence; revisit (streamed - hashing / size+mtime prefilter) if used on large artifact trees. 
- """ + """Diff this snapshot (before) against ``other`` (after) by file content hash.""" return await asyncio.to_thread(self._diff_sync, other) def _diff_sync(self, other: LocalFilesystemEvidence) -> FilesystemDiff: @@ -162,12 +155,7 @@ async def unified_diff( *, context: int = 3, ) -> str: - """Unified diff of one path between this snapshot (before) and ``other`` (after). - - Opt-in, per-path companion to :meth:`diff` (which reports only which paths - changed). Returns ``""`` when the two versions are identical or binary - (non-UTF-8). Path access is traversal-guarded like the rest of the handle. - """ + """Unified diff of one path between this snapshot (before) and ``other`` (after); ``""`` if identical or binary.""" return await asyncio.to_thread(self._unified_diff_sync, other, relative_path, context) def _unified_diff_sync(self, other: LocalFilesystemEvidence, relative_path: str | Path, context: int) -> str: @@ -193,12 +181,7 @@ def _hashes(self) -> dict[str, str]: return hashes def _safe_files(self) -> list[Path]: - """Regular files under the root, never descending into or reading symlinks that escape it. - - ``os.walk(followlinks=False)`` keeps a ``vendor -> /`` style dir symlink from - exploding the walk, and escaping file symlinks (``leak -> /etc/passwd``) are - dropped so their target is never hashed. - """ + """Regular files under the root, skipping symlinks whose target escapes it.""" files: list[Path] = [] for dirpath, _dirnames, filenames in os.walk(self._root, followlinks=False): for name in filenames: @@ -215,24 +198,12 @@ async def run_verifier( cwd: str = ".", timeout_s: float | None = None, ) -> CommandResult: - """Run ``command`` (no shell) against a throwaway copy of the evidence. - - The evidence is copied to a temp overlay so the command can never mutate - stored evidence (pytest caches, build artifacts, ...). ``command`` is a - list passed straight to exec, so there is no shell parsing of it. 
- - This is NOT a sandbox: the command runs with the host's privileges and - full filesystem/network access. ``command`` is supplied by the (trusted) - metric author, never by the agent under test. Cost note: the whole tree - is copied on every call, so verifying large evidence repeatedly is heavy. - """ + """Run ``command`` (no shell) against a throwaway copy of the evidence; not a sandbox (host privileges).""" overlay = Path(tempfile.mkdtemp(prefix="evidence-verify-")).resolve() try: workdir = (overlay / cwd).resolve() if workdir != overlay and overlay not in workdir.parents: raise ValueError(f"verifier cwd {cwd!r} resolves outside evidence overlay") - # symlinks=True copies links as-is (no host deref); the ignore hook drops - # links whose target escapes the evidence root so the verifier can't read them. await asyncio.to_thread( shutil.copytree, self._root, @@ -246,13 +217,7 @@ async def run_verifier( await asyncio.to_thread(shutil.rmtree, overlay, True) def _ignore_escaping_symlinks(self, directory: str, names: list[str]) -> set[str]: - """copytree ignore hook: skip symlinks that can't be safely preserved in the overlay. - - Drops links whose resolved target escapes the evidence root (host-file reads) - and absolute links: ``symlinks=True`` would recreate the latter verbatim, so a - verifier write through ``link -> /real/evidence/answer.txt`` would mutate the - stored evidence instead of the throwaway copy. - """ + """copytree ignore hook: drop absolute symlinks and links whose target escapes the evidence root.""" ignored: set[str] = set() for name in names: full = Path(directory) / name @@ -264,8 +229,7 @@ def _ignore_escaping_symlinks(self, directory: str, names: list[str]) -> set[str @staticmethod async def _exec(command: list[str], cwd: Path, timeout_s: float | None) -> CommandResult: - # start_new_session makes the child its own process-group leader, so a timeout - # can reap the whole tree (grandchildren it spawned) rather than just the child. 
+ # start_new_session: child leads its own process group so a timeout can reap the whole tree. process = await asyncio.create_subprocess_exec( *command, cwd=str(cwd), @@ -276,11 +240,11 @@ async def _exec(command: list[str], cwd: Path, timeout_s: float | None) -> Comma try: stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout_s) except TimeoutError: - # wait_for cancels communicate() but leaves the tree running; kill the group. + # wait_for leaves the tree running; kill the whole process group. try: os.killpg(os.getpgid(process.pid), signal.SIGKILL) except ProcessLookupError: - pass # already exited between the timeout and the kill + pass await process.wait() return CommandResult(exit_code=-1, timed_out=True) return CommandResult( @@ -325,6 +289,75 @@ def _requires_ref_or_data(self) -> EvidenceDescriptor: return self +# ATIF ingest: producers emit conformant ATIF (see values/atif.py, RFC 0001); we validate on read, not normalize. +def parse_atif(payload: Any) -> Trajectory: + """Validate a payload as a canonical ATIF :class:`Trajectory` (raises ``ValidationError`` if non-conformant).""" + return payload if isinstance(payload, Trajectory) else Trajectory.model_validate(payload) + + +class TraceHandle: + """Lazily validated read handle exposing a trace descriptor as an ATIF :class:`Trajectory`.""" + + def __init__(self, descriptor: EvidenceDescriptor) -> None: + self._descriptor = descriptor + self._trajectory: Trajectory | None = None + + async def trace(self) -> Trajectory: + """Return the ATIF trajectory, reading and validating on first access.""" + if self._trajectory is None: + payload = await asyncio.to_thread(self._load_payload) + self._trajectory = parse_atif(payload) + return self._trajectory + + def _load_payload(self) -> Any: + descriptor = self._descriptor + if descriptor.data is not None: + return descriptor.data + if descriptor.ref is None: + raise ValueError("trace evidence descriptor requires ref or data") + return 
json.loads(_local_filesystem_ref(descriptor.ref).read_text(encoding="utf-8")) + + async def steps(self) -> list[Step]: + """Return the ATIF steps in order.""" + return (await self.trace()).steps + + async def tool_calls(self) -> list[ToolCall]: + """Return all tool calls flattened across agent steps, in order.""" + calls: list[ToolCall] = [] + for step in await self.steps(): + calls.extend(step.tool_calls or []) + return calls + + async def token_usage(self) -> FinalMetrics: + """Return aggregate token usage (trajectory ``final_metrics``, else summed per step).""" + trajectory = await self.trace() + if trajectory.final_metrics is not None: + return trajectory.final_metrics + prompt = sum((step.metrics.prompt_tokens or 0) for step in trajectory.steps if step.metrics is not None) + completion = sum((step.metrics.completion_tokens or 0) for step in trajectory.steps if step.metrics is not None) + return FinalMetrics(total_prompt_tokens=prompt or None, total_completion_tokens=completion or None) + + +class LogHandle: + """Read handle over a log-bundle directory.""" + + def __init__(self, root: str | Path) -> None: + self._fs = LocalFilesystemEvidence(root) + + async def list_files(self) -> list[str]: + """Return relative paths of log files in the bundle.""" + return await self._fs.list_files("**/*") + + async def read_text(self, name: str) -> str: + """Read one log file's full text.""" + return await self._fs.read_text(name) + + async def tail(self, name: str, lines: int = 50) -> str: + """Return the last ``lines`` lines of a log file.""" + text = await self._fs.read_text(name) + return "\n".join(text.splitlines()[-lines:]) + + class CandidateEvidence(BaseModel): """Named evidence descriptors attached to an AgentEvalAttempt.""" @@ -339,6 +372,8 @@ class CandidateEvidence(BaseModel): description="Free-form metadata associated with the evidence collection.", ) _filesystem_cache: dict[str, LocalFilesystemEvidence] = PrivateAttr(default_factory=dict) + _trace_cache: 
dict[str, TraceHandle] = PrivateAttr(default_factory=dict) + _log_cache: dict[str, LogHandle] = PrivateAttr(default_factory=dict) @model_validator(mode="before") @classmethod @@ -383,13 +418,30 @@ async def filesystem(self, name: str) -> LocalFilesystemEvidence: self._filesystem_cache[name] = handle return handle + async def trace(self, name: str = "trace") -> TraceHandle: + """Return a cached trace handle for a named trace descriptor (read lazily on first access).""" + cached = self._trace_cache.get(name) + if cached is not None: + return cached + handle = TraceHandle(self.require(name, kind="trace")) + self._trace_cache[name] = handle + return handle -def _local_filesystem_ref(ref: str) -> Path: - """Resolve a local filesystem ref to a Path. + async def logs(self, name: str = "logs") -> LogHandle: + """Return a cached log-bundle handle for a named logs descriptor.""" + cached = self._log_cache.get(name) + if cached is not None: + return cached + descriptor = self.require(name, kind="logs") + if descriptor.ref is None: + raise ValueError(f"logs evidence descriptor {name!r} requires a local ref") + handle = LogHandle(_local_filesystem_ref(descriptor.ref)) + self._log_cache[name] = handle + return handle - Accepts POSIX paths, ``file://`` URIs, and Windows drive paths (e.g. ``C:\\dir``). - Network and cloud URI schemes (http, https, s3, gs, ...) are rejected. - """ + +def _local_filesystem_ref(ref: str) -> Path: + """Resolve a local ref (POSIX path, ``file://`` URI, or Windows drive path) to a Path; reject network/cloud URIs.""" parsed = urlparse(ref) # A single-letter scheme is a Windows drive letter (e.g. "C:\\dir"), not a URI scheme. 
if len(parsed.scheme) == 1 and parsed.scheme.isalpha(): diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_evidence.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_evidence.py index 7de29cf409..d58902d71a 100644 --- a/packages/nemo_evaluator_sdk/tests/agent_eval/test_evidence.py +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_evidence.py @@ -2,6 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import asyncio +import json +import os +import time from pathlib import Path import pytest @@ -11,6 +14,7 @@ EvidenceDescriptor, LocalFilesystemEvidence, ) +from pydantic import ValidationError def test_metric_input_preserves_candidate_evidence_out_of_metadata() -> None: @@ -151,14 +155,27 @@ async def test_verifier_timeout_kills_the_whole_process_tree(tmp_path: Path) -> root.mkdir() handle = LocalFilesystemEvidence(root) - # A grandchild backgrounded by the verifier would write the marker after 1s if it - # survived; killing the whole process group on timeout stops it first. - marker = tmp_path / "survived.txt" - result = await handle.run_verifier(["sh", "-c", f"(sleep 1; touch '{marker}') & sleep 5"], timeout_s=0.3) + # The verifier backgrounds a long-lived child and records its PID. Killing only the + # direct shell would orphan that child; killing the whole process group reaps it. + # (We assert the child PID is gone rather than watching for a follow-on side effect, + # which would race the kill: reaping the child can let the shell run its next command + # in the window before the shell itself is signalled.) + pidfile = tmp_path / "child.pid" + result = await handle.run_verifier(["sh", "-c", f"sleep 30 & echo $! > '{pidfile}'; wait"], timeout_s=0.3) assert result.timed_out - await asyncio.sleep(1.5) - assert not marker.exists() + child_pid = int(pidfile.read_text().strip()) + # Poll until the killed group is fully reaped rather than assuming a fixed window; + # on a busy runner the child can briefly linger as a zombie even when cleanup is correct. 
+ deadline = time.monotonic() + 2.0 + while True: + try: + os.kill(child_pid, 0) + except ProcessLookupError: + break + if time.monotonic() >= deadline: + pytest.fail(f"child process {child_pid} still alive after cleanup timeout") + await asyncio.sleep(0.05) @pytest.mark.asyncio @@ -177,3 +194,58 @@ async def test_unified_diff_reports_text_patch_and_skips_binary(tmp_path: Path) assert "-b" in patch and "+c" in patch and patch.startswith("--- a/f.txt") assert await before.unified_diff(after, "img.bin") == "" # binary: no textual patch assert await before.unified_diff(before, "f.txt") == "" # identical: empty + + +_ATIF_TRAJECTORY = { + "schema_version": "ATIF-v1.7", + "agent": {"name": "demo", "version": "1.0"}, + "steps": [ + {"step_id": 1, "source": "user", "message": "do it"}, + { + "step_id": 2, + "source": "agent", + "message": "calling tool", + "tool_calls": [{"tool_call_id": "c1", "function_name": "search", "arguments": {"q": "x"}}], + "observation": {"results": [{"source_call_id": "c1", "content": "result text"}]}, + }, + ], + "final_metrics": {"total_prompt_tokens": 10, "total_completion_tokens": 5}, +} + + +@pytest.mark.asyncio +async def test_trace_handle_reads_atif(tmp_path: Path) -> None: + trace_path = tmp_path / "trajectory.json" + trace_path.write_text(json.dumps(_ATIF_TRAJECTORY), encoding="utf-8") + + evidence = CandidateEvidence( + descriptors={"trace": EvidenceDescriptor(kind="trace", ref=str(trace_path), format="atif")} + ) + handle = await evidence.trace("trace") + assert handle is await evidence.trace("trace") # cached + trajectory = await handle.trace() + assert trajectory.schema_version == "ATIF-v1.7" + assert [step.source for step in trajectory.steps] == ["user", "agent"] + assert (await handle.tool_calls())[0].function_name == "search" + assert (await handle.token_usage()).total_prompt_tokens == 10 + + # A non-conformant trace is rejected at read time (producers must emit ATIF). 
+ bad = CandidateEvidence( + descriptors={"trace": EvidenceDescriptor(kind="trace", format="atif", data={"steps": "not-a-list"})} + ) + with pytest.raises(ValidationError): + await (await bad.trace("trace")).trace() + + +@pytest.mark.asyncio +async def test_logs_handle_reads_and_tails(tmp_path: Path) -> None: + log_dir = tmp_path / "logs" + log_dir.mkdir() + (log_dir / "agent.log").write_text("line1\nline2\nline3\n", encoding="utf-8") + + evidence = CandidateEvidence(descriptors={"logs": EvidenceDescriptor(kind="logs", format="dir", ref=str(log_dir))}) + handle = await evidence.logs("logs") + assert handle is await evidence.logs("logs") # cached + assert await handle.list_files() == ["agent.log"] + assert await handle.read_text("agent.log") == "line1\nline2\nline3\n" + assert await handle.tail("agent.log", 2) == "line2\nline3" diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_example_metrics.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_example_metrics.py index 9f25fabdd5..1a9acca47f 100644 --- a/packages/nemo_evaluator_sdk/tests/agent_eval/test_example_metrics.py +++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_example_metrics.py @@ -4,6 +4,7 @@ """Exercise the example's reference metrics-over-evidence.""" import importlib.util +import json from pathlib import Path import pytest @@ -55,3 +56,36 @@ async def test_tests_pass_and_no_test_cheating(tmp_path: Path) -> None: ) cheated = await example_metrics.NoTestCheatingMetric().compute_scores(_input_with_evidence(evidence_cheated)) assert cheated.outputs[0].value is False + + +@pytest.mark.asyncio +async def test_inefficient_retry_loop(tmp_path: Path) -> None: + def trajectory(repeats: int) -> dict: + calls = [ + {"tool_call_id": f"c{i}", "function_name": "search", "arguments": {"q": "same"}} for i in range(repeats) + ] + return { + "schema_version": "ATIF-v1.7", + "agent": {"name": "demo", "version": "1.0"}, + "steps": [{"step_id": 1, "source": "agent", "message": "", "tool_calls": 
calls}], + } + + looping = tmp_path / "loop.json" + looping.write_text(json.dumps(trajectory(5)), encoding="utf-8") + clean = tmp_path / "clean.json" + clean.write_text(json.dumps(trajectory(1)), encoding="utf-8") + + metric = example_metrics.InefficientRetryLoopMetric(threshold=2) + + loop_result = await metric.compute_scores( + _input_with_evidence( + CandidateEvidence(descriptors={"trace": EvidenceDescriptor(kind="trace", ref=str(looping))}) + ) + ) + assert loop_result.outputs[0].value is False + assert loop_result.outputs[1].value == 5 + + clean_result = await metric.compute_scores( + _input_with_evidence(CandidateEvidence(descriptors={"trace": EvidenceDescriptor(kind="trace", ref=str(clean))})) + ) + assert clean_result.outputs[0].value is True diff --git a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/__init__.py index 328bcabb0c..dfe65e5646 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/__init__.py +++ b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/__init__.py @@ -4,6 +4,13 @@ """Public value types for evaluator SDK runtime.""" from nemo_platform.beta.evaluator.values.agents import Agent +from nemo_platform.beta.evaluator.values.atif import ( + FinalMetrics, + Metrics, + Step, + ToolCall, + Trajectory, +) from nemo_platform.beta.evaluator.values.common import SecretRef, SupportedJobTypes from nemo_platform.beta.evaluator.values.dataset_schemas import ( FieldMapping, @@ -17,6 +24,10 @@ FilesystemDiff, FilesystemEntry, LocalFilesystemEvidence, + LogHandle, + TraceHandle, + WellKnownEvidenceKey, + parse_atif, ) from nemo_platform.beta.evaluator.values.metrics import ( BLEU, @@ -109,6 +120,15 @@ "ContinuousScore", "FilesystemDiff", "FilesystemEntry", + "FinalMetrics", + "LogHandle", + "Metrics", + "Step", + "ToolCall", + "Trajectory", + "TraceHandle", + "WellKnownEvidenceKey", + "parse_atif", "DatasetRow", 
"DatasetRows", "DefaultAggregateFieldName", diff --git a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/atif.py b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/atif.py new file mode 100644 index 0000000000..e5a4c6ec28 --- /dev/null +++ b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/atif.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +"""Lightweight ATIF read models for the evaluator SDK. + +The evaluator *ingests* traces that producers emit in the Agent Trajectory +Interchange Format (ATIF; RFC 0001, schema_version ``ATIF-v1.x``). It does not +produce or normalize ATIF, and it only reads a small subset of the schema, so +this module models *just that subset* rather than vendoring the full reference +implementation. + +These are deliberately permissive (``extra="ignore"``): fields the SDK does not +consume (images, content parts, observations, sub-agents, agent metadata, ...) +are accepted and dropped, so a trace emitted against a newer ATIF revision still +validates. Validation here means "this payload carries the ATIF fields the +evaluator's metrics rely on" — not full RFC conformance, which is the producer's +responsibility. The authoritative spec is RFC 0001. 
+""" + +from typing import Any, Literal + +from pydantic import BaseModel, ConfigDict, Field, field_validator + + +class ToolCall(BaseModel): + """A single tool/function invocation within an agent step.""" + + model_config = ConfigDict(extra="ignore") + + tool_call_id: str | None = Field(default=None, description="Producer-assigned tool call id, if any.") + function_name: str = Field(description="Name of the invoked tool/function.") + arguments: dict[str, Any] | None = Field(default=None, description="Arguments passed to the tool.") + + +class Metrics(BaseModel): + """Per-step token metrics.""" + + model_config = ConfigDict(extra="ignore") + + prompt_tokens: int | None = None + completion_tokens: int | None = None + + +class FinalMetrics(BaseModel): + """Trajectory-level aggregate token metrics.""" + + model_config = ConfigDict(extra="ignore") + + total_prompt_tokens: int | None = None + total_completion_tokens: int | None = None + + +class Step(BaseModel): + """One step in an agent trajectory.""" + + model_config = ConfigDict(extra="ignore") + + source: Literal["system", "user", "agent"] = Field(description="Who produced this step.") + message: str = Field(default="", description="Step text content.") + tool_calls: list[ToolCall] | None = Field(default=None, description="Tool calls issued in this step.") + metrics: Metrics | None = Field(default=None, description="Per-step token metrics, if reported.") + + +class Trajectory(BaseModel): + """An ATIF agent trajectory (read view over the subset the SDK consumes).""" + + model_config = ConfigDict(extra="ignore") + + schema_version: str = Field(description="ATIF schema version, e.g. 
'ATIF-v1.7'.") + steps: list[Step] = Field(min_length=1, description="Ordered trajectory steps.") + final_metrics: FinalMetrics | None = Field(default=None, description="Aggregate token metrics, if reported.") + + @field_validator("schema_version") + @classmethod + def _looks_like_atif(cls, value: str) -> str: + # Cheap sanity gate so arbitrary JSON isn't silently accepted as a trace. + if not value.startswith("ATIF-"): + raise ValueError(f"unexpected trace schema_version {value!r}; expected an 'ATIF-*' version") + return value diff --git a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py index 16f7248651..de215543a0 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py +++ b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py @@ -8,6 +8,7 @@ import asyncio import difflib import hashlib +import json import os import shutil import signal @@ -18,6 +19,11 @@ from pydantic import BaseModel, ConfigDict, Field, JsonValue, PrivateAttr, model_validator +from nemo_platform.beta.evaluator.values.atif import FinalMetrics, Step, ToolCall, Trajectory + +# Well-known evidence keys (mirrored by the ``EVIDENCE_*`` constants in ``agent_eval.trials``). +WellKnownEvidenceKey = Literal["initial_state", "trace", "logs", "final_state", "verifier_logs"] + class FilesystemEntry(BaseModel): """One path that differs between two filesystem snapshots.""" @@ -100,11 +106,7 @@ async def read_text(self, relative_path: str | Path, *, encoding: str = "utf-8") return await asyncio.to_thread(path.read_text, encoding=encoding) async def iter_paths(self, relative_path: str | Path = ".", *, recursive: bool = False) -> list[str]: - """List entries (files *and* directories) rooted at ``relative_path``. - - Use this to walk a subtree or test for (non-)emptiness. 
For a flat, - files-only listing matched by a glob pattern, use :meth:`list_files`. - """ + """List entries (files and directories) rooted at ``relative_path``.""" base = self.path(relative_path) return await asyncio.to_thread(self._iter_paths_sync, base, recursive) @@ -120,11 +122,7 @@ async def read_bytes(self, relative_path: str | Path) -> bytes: return await asyncio.to_thread(path.read_bytes) async def list_files(self, pattern: str = "**/*") -> list[str]: - """List relative posix paths of files (not directories) matching ``pattern``. - - Complements :meth:`iter_paths`: this is the flat, glob-filtered, - files-only view; ``iter_paths`` walks a subtree and includes directories. - """ + """List relative posix paths of files (not directories) matching ``pattern``.""" return await asyncio.to_thread(self._list_sync, pattern) def _list_sync(self, pattern: str) -> list[str]: @@ -135,12 +133,7 @@ def _list_sync(self, pattern: str) -> list[str]: ) async def diff(self, other: LocalFilesystemEvidence) -> FilesystemDiff: - """Diff this snapshot (before) against ``other`` (after) by file content hash. - - Cost note: this hashes every file in both trees by reading each fully, so - it is O(total bytes). Fine for task-sized evidence; revisit (streamed - hashing / size+mtime prefilter) if used on large artifact trees. - """ + """Diff this snapshot (before) against ``other`` (after) by file content hash.""" return await asyncio.to_thread(self._diff_sync, other) def _diff_sync(self, other: LocalFilesystemEvidence) -> FilesystemDiff: @@ -162,12 +155,7 @@ async def unified_diff( *, context: int = 3, ) -> str: - """Unified diff of one path between this snapshot (before) and ``other`` (after). - - Opt-in, per-path companion to :meth:`diff` (which reports only which paths - changed). Returns ``""`` when the two versions are identical or binary - (non-UTF-8). Path access is traversal-guarded like the rest of the handle. 
- """ + """Unified diff of one path between this snapshot (before) and ``other`` (after); ``""`` if identical or binary.""" return await asyncio.to_thread(self._unified_diff_sync, other, relative_path, context) def _unified_diff_sync(self, other: LocalFilesystemEvidence, relative_path: str | Path, context: int) -> str: @@ -193,12 +181,7 @@ def _hashes(self) -> dict[str, str]: return hashes def _safe_files(self) -> list[Path]: - """Regular files under the root, never descending into or reading symlinks that escape it. - - ``os.walk(followlinks=False)`` keeps a ``vendor -> /`` style dir symlink from - exploding the walk, and escaping file symlinks (``leak -> /etc/passwd``) are - dropped so their target is never hashed. - """ + """Regular files under the root, skipping symlinks whose target escapes it.""" files: list[Path] = [] for dirpath, _dirnames, filenames in os.walk(self._root, followlinks=False): for name in filenames: @@ -215,24 +198,12 @@ async def run_verifier( cwd: str = ".", timeout_s: float | None = None, ) -> CommandResult: - """Run ``command`` (no shell) against a throwaway copy of the evidence. - - The evidence is copied to a temp overlay so the command can never mutate - stored evidence (pytest caches, build artifacts, ...). ``command`` is a - list passed straight to exec, so there is no shell parsing of it. - - This is NOT a sandbox: the command runs with the host's privileges and - full filesystem/network access. ``command`` is supplied by the (trusted) - metric author, never by the agent under test. Cost note: the whole tree - is copied on every call, so verifying large evidence repeatedly is heavy. 
- """ + """Run ``command`` (no shell) against a throwaway copy of the evidence; not a sandbox (host privileges).""" overlay = Path(tempfile.mkdtemp(prefix="evidence-verify-")).resolve() try: workdir = (overlay / cwd).resolve() if workdir != overlay and overlay not in workdir.parents: raise ValueError(f"verifier cwd {cwd!r} resolves outside evidence overlay") - # symlinks=True copies links as-is (no host deref); the ignore hook drops - # links whose target escapes the evidence root so the verifier can't read them. await asyncio.to_thread( shutil.copytree, self._root, @@ -246,13 +217,7 @@ async def run_verifier( await asyncio.to_thread(shutil.rmtree, overlay, True) def _ignore_escaping_symlinks(self, directory: str, names: list[str]) -> set[str]: - """copytree ignore hook: skip symlinks that can't be safely preserved in the overlay. - - Drops links whose resolved target escapes the evidence root (host-file reads) - and absolute links: ``symlinks=True`` would recreate the latter verbatim, so a - verifier write through ``link -> /real/evidence/answer.txt`` would mutate the - stored evidence instead of the throwaway copy. - """ + """copytree ignore hook: drop absolute symlinks and links whose target escapes the evidence root.""" ignored: set[str] = set() for name in names: full = Path(directory) / name @@ -264,8 +229,7 @@ def _ignore_escaping_symlinks(self, directory: str, names: list[str]) -> set[str @staticmethod async def _exec(command: list[str], cwd: Path, timeout_s: float | None) -> CommandResult: - # start_new_session makes the child its own process-group leader, so a timeout - # can reap the whole tree (grandchildren it spawned) rather than just the child. + # start_new_session: child leads its own process group so a timeout can reap the whole tree. 
process = await asyncio.create_subprocess_exec( *command, cwd=str(cwd), @@ -276,11 +240,11 @@ async def _exec(command: list[str], cwd: Path, timeout_s: float | None) -> Comma try: stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout_s) except TimeoutError: - # wait_for cancels communicate() but leaves the tree running; kill the group. + # wait_for leaves the tree running; kill the whole process group. try: os.killpg(os.getpgid(process.pid), signal.SIGKILL) except ProcessLookupError: - pass # already exited between the timeout and the kill + pass await process.wait() return CommandResult(exit_code=-1, timed_out=True) return CommandResult( @@ -325,6 +289,75 @@ def _requires_ref_or_data(self) -> EvidenceDescriptor: return self +# ATIF ingest: producers emit conformant ATIF (see values/atif.py, RFC 0001); we validate on read, not normalize. +def parse_atif(payload: Any) -> Trajectory: + """Validate a payload as a canonical ATIF :class:`Trajectory` (raises ``ValidationError`` if non-conformant).""" + return payload if isinstance(payload, Trajectory) else Trajectory.model_validate(payload) + + +class TraceHandle: + """Lazily validated read handle exposing a trace descriptor as an ATIF :class:`Trajectory`.""" + + def __init__(self, descriptor: EvidenceDescriptor) -> None: + self._descriptor = descriptor + self._trajectory: Trajectory | None = None + + async def trace(self) -> Trajectory: + """Return the ATIF trajectory, reading and validating on first access.""" + if self._trajectory is None: + payload = await asyncio.to_thread(self._load_payload) + self._trajectory = parse_atif(payload) + return self._trajectory + + def _load_payload(self) -> Any: + descriptor = self._descriptor + if descriptor.data is not None: + return descriptor.data + if descriptor.ref is None: + raise ValueError("trace evidence descriptor requires ref or data") + return json.loads(_local_filesystem_ref(descriptor.ref).read_text(encoding="utf-8")) + + async def steps(self) 
-> list[Step]: + """Return the ATIF steps in order.""" + return (await self.trace()).steps + + async def tool_calls(self) -> list[ToolCall]: + """Return all tool calls flattened across agent steps, in order.""" + calls: list[ToolCall] = [] + for step in await self.steps(): + calls.extend(step.tool_calls or []) + return calls + + async def token_usage(self) -> FinalMetrics: + """Return aggregate token usage (trajectory ``final_metrics``, else summed per step).""" + trajectory = await self.trace() + if trajectory.final_metrics is not None: + return trajectory.final_metrics + prompt = sum((step.metrics.prompt_tokens or 0) for step in trajectory.steps if step.metrics is not None) + completion = sum((step.metrics.completion_tokens or 0) for step in trajectory.steps if step.metrics is not None) + return FinalMetrics(total_prompt_tokens=prompt or None, total_completion_tokens=completion or None) + + +class LogHandle: + """Read handle over a log-bundle directory.""" + + def __init__(self, root: str | Path) -> None: + self._fs = LocalFilesystemEvidence(root) + + async def list_files(self) -> list[str]: + """Return relative paths of log files in the bundle.""" + return await self._fs.list_files("**/*") + + async def read_text(self, name: str) -> str: + """Read one log file's full text.""" + return await self._fs.read_text(name) + + async def tail(self, name: str, lines: int = 50) -> str: + """Return the last ``lines`` lines of a log file.""" + text = await self._fs.read_text(name) + return "\n".join(text.splitlines()[-lines:]) + + class CandidateEvidence(BaseModel): """Named evidence descriptors attached to an AgentEvalAttempt.""" @@ -339,6 +372,8 @@ class CandidateEvidence(BaseModel): description="Free-form metadata associated with the evidence collection.", ) _filesystem_cache: dict[str, LocalFilesystemEvidence] = PrivateAttr(default_factory=dict) + _trace_cache: dict[str, TraceHandle] = PrivateAttr(default_factory=dict) + _log_cache: dict[str, LogHandle] = 
PrivateAttr(default_factory=dict) @model_validator(mode="before") @classmethod @@ -383,13 +418,30 @@ async def filesystem(self, name: str) -> LocalFilesystemEvidence: self._filesystem_cache[name] = handle return handle + async def trace(self, name: str = "trace") -> TraceHandle: + """Return a cached trace handle for a named trace descriptor (read lazily on first access).""" + cached = self._trace_cache.get(name) + if cached is not None: + return cached + handle = TraceHandle(self.require(name, kind="trace")) + self._trace_cache[name] = handle + return handle -def _local_filesystem_ref(ref: str) -> Path: - """Resolve a local filesystem ref to a Path. + async def logs(self, name: str = "logs") -> LogHandle: + """Return a cached log-bundle handle for a named logs descriptor.""" + cached = self._log_cache.get(name) + if cached is not None: + return cached + descriptor = self.require(name, kind="logs") + if descriptor.ref is None: + raise ValueError(f"logs evidence descriptor {name!r} requires a local ref") + handle = LogHandle(_local_filesystem_ref(descriptor.ref)) + self._log_cache[name] = handle + return handle - Accepts POSIX paths, ``file://`` URIs, and Windows drive paths (e.g. ``C:\\dir``). - Network and cloud URI schemes (http, https, s3, gs, ...) are rejected. - """ + +def _local_filesystem_ref(ref: str) -> Path: + """Resolve a local ref (POSIX path, ``file://`` URI, or Windows drive path) to a Path; reject network/cloud URIs.""" parsed = urlparse(ref) # A single-letter scheme is a Windows drive letter (e.g. "C:\\dir"), not a URI scheme. if len(parsed.scheme) == 1 and parsed.scheme.isalpha(): From fcd211dbee6d10baa44f0de40684ccd23e285b7a Mon Sep 17 00:00:00 2001 From: "Arpit Singh (SW-CLOUD)" Date: Tue, 30 Jun 2026 11:36:06 -0700 Subject: [PATCH 2/2] docs(evaluator): address review comments on evidence handles - Restore the symlink-safety comment in run_verifier explaining why copytree(symlinks=True) is safe (the ignore hook drops escaping links). 
- Reference AgentEvalTrial (not AgentEvalAttempt) in the CandidateEvidence docstring. Vendored mirror synced via make vendor. Signed-off-by: Arpit Singh (SW-CLOUD) --- .../src/nemo_evaluator_sdk/values/evidence.py | 4 +++- plugins/nemo-evaluator/openapi/openapi.yaml | 4 ++-- .../src/nemo_platform/beta/evaluator/values/evidence.py | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py index 368e480e33..2f70605951 100644 --- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py +++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py @@ -204,6 +204,8 @@ async def run_verifier( workdir = (overlay / cwd).resolve() if workdir != overlay and overlay not in workdir.parents: raise ValueError(f"verifier cwd {cwd!r} resolves outside evidence overlay") + # symlinks=True copies links as-is (no host deref); the ignore hook drops links whose + # target escapes the evidence root so the verifier can't read or write through them. await asyncio.to_thread( shutil.copytree, self._root, @@ -359,7 +361,7 @@ async def tail(self, name: str, lines: int = 50) -> str: class CandidateEvidence(BaseModel): - """Named evidence descriptors attached to an AgentEvalAttempt.""" + """Named evidence descriptors attached to an AgentEvalTrial.""" model_config = ConfigDict(extra="forbid") diff --git a/plugins/nemo-evaluator/openapi/openapi.yaml b/plugins/nemo-evaluator/openapi/openapi.yaml index e19f28ce6f..bbc8578245 100644 --- a/plugins/nemo-evaluator/openapi/openapi.yaml +++ b/plugins/nemo-evaluator/openapi/openapi.yaml @@ -1588,7 +1588,7 @@ components: additionalProperties: false type: object title: CandidateEvidenceInput - description: Named evidence descriptors attached to an AgentEvalAttempt. + description: Named evidence descriptors attached to an AgentEvalTrial. 
CandidateEvidenceOutput: properties: descriptors: @@ -1606,7 +1606,7 @@ components: additionalProperties: false type: object title: CandidateEvidenceOutput - description: Named evidence descriptors attached to an AgentEvalAttempt. + description: Named evidence descriptors attached to an AgentEvalTrial. CloudpickleMetricPayload: properties: kind: diff --git a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py index de215543a0..3ad7f200e9 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py +++ b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py @@ -204,6 +204,8 @@ async def run_verifier( workdir = (overlay / cwd).resolve() if workdir != overlay and overlay not in workdir.parents: raise ValueError(f"verifier cwd {cwd!r} resolves outside evidence overlay") + # symlinks=True copies links as-is (no host deref); the ignore hook drops links whose + # target escapes the evidence root so the verifier can't read or write through them. await asyncio.to_thread( shutil.copytree, self._root, @@ -359,7 +361,7 @@ async def tail(self, name: str, lines: int = 50) -> str: class CandidateEvidence(BaseModel): - """Named evidence descriptors attached to an AgentEvalAttempt.""" + """Named evidence descriptors attached to an AgentEvalTrial.""" model_config = ConfigDict(extra="forbid")