From f3fa8ab2bd3362053c8b309c6c349cc4c70dca1e Mon Sep 17 00:00:00 2001
From: "Arpit Singh (SW-CLOUD)" <arpsingh@nvidia.com>
Date: Mon, 29 Jun 2026 23:41:49 -0700
Subject: [PATCH 1/2] feat(evaluator): add ATIF trace and log read handles for
 evidence
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add read handles over candidate evidence for agent-eval metrics:

- TraceHandle exposes a trace descriptor as an ATIF Trajectory with
  step/tool-call/token-usage views; lightweight ATIF read models live in
  values/atif.py (validated on read via parse_atif, raw payload persisted,
  no normalization — producers emit conformant ATIF).
- LogHandle for log-bundle access and LocalFilesystemEvidence helpers
  (diff, unified_diff, run_verifier with symlink hardening).
- WellKnownEvidenceKey literal and example metrics updates.

Vendored mirror synced via make vendor.

Signed-off-by: Arpit Singh (SW-CLOUD) <arpsingh@nvidia.com>
---
 .../run_agent_eval/example_metrics.py         |  64 +++++--
 .../src/nemo_evaluator_sdk/values/__init__.py |  20 +++
 .../src/nemo_evaluator_sdk/values/atif.py     |  78 ++++++++
 .../src/nemo_evaluator_sdk/values/evidence.py | 166 ++++++++++++------
 .../tests/agent_eval/test_evidence.py         |  84 ++++++++-
 .../tests/agent_eval/test_example_metrics.py  |  34 ++++
 .../beta/evaluator/values/__init__.py         |  20 +++
 .../beta/evaluator/values/atif.py             |  78 ++++++++
 .../beta/evaluator/values/evidence.py         | 166 ++++++++++++------
 9 files changed, 580 insertions(+), 130 deletions(-)
 create mode 100644 packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/atif.py
 create mode 100644 sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/atif.py

diff --git a/packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py b/packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py
index 9e6118dedf..cf90400b10 100644
--- a/packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py
+++ b/packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py
@@ -3,19 +3,23 @@
 
 """Reference metrics-over-evidence for this example (not SDK API).
 
-These show how to score from the SDK's filesystem evidence handle instead of a
-stamped verifier reward:
+These show how to score from the SDK's evidence handles instead of a stamped
+verifier reward:
 
 * :class:`TestsPassMetric` runs a command against ``final_state`` filesystem
   evidence (in a throwaway overlay) and scores on exit 0.
 * :class:`NoTestCheatingMetric` diffs ``initial_state`` against ``final_state``
   and fails if the agent touched protected (e.g. test) paths.
+* :class:`InefficientRetryLoopMetric` reads the ATIF ``trace`` and fails when
+  the same tool call repeats consecutively past a threshold.
 """
 
 from __future__ import annotations
 
+import json
 from collections.abc import Sequence
 
+from nemo_evaluator_sdk.agent_eval.trials import EVIDENCE_FINAL_STATE, EVIDENCE_INITIAL_STATE, EVIDENCE_TRACE
 from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult
 
 
@@ -26,7 +30,7 @@ def __init__(
         self,
         command: Sequence[str],
         *,
-        evidence_name: str = "final_state",
+        evidence_name: str = EVIDENCE_FINAL_STATE,
         cwd: str = ".",
         timeout_s: float = 300.0,
     ) -> None:
@@ -60,13 +64,9 @@ def __init__(
         *,
         protected: Sequence[str] = ("tests/",),
         change_types: Sequence[str] = ("added", "modified", "deleted"),
-        initial_name: str = "initial_state",
-        final_name: str = "final_state",
     ) -> None:
         self._protected = tuple(protected)
         self._change_types = set(change_types)
-        self._initial_name = initial_name
-        self._final_name = final_name
 
     @property
     def type(self) -> str:
@@ -78,12 +78,56 @@ def output_spec(self) -> list[MetricOutputSpec]:
     async def compute_scores(self, input: MetricInput) -> MetricResult:
         evidence = input.candidate.evidence
         clean = True
-        if evidence is not None and evidence.get(self._initial_name) and evidence.get(self._final_name):
-            initial = await evidence.filesystem(self._initial_name)
-            final = await evidence.filesystem(self._final_name)
+        if evidence is not None and evidence.get(EVIDENCE_INITIAL_STATE) and evidence.get(EVIDENCE_FINAL_STATE):
+            initial = await evidence.filesystem(EVIDENCE_INITIAL_STATE)
+            final = await evidence.filesystem(EVIDENCE_FINAL_STATE)
             diff = await initial.diff(final)
             violations = [
                 entry for prefix in self._protected for entry in diff.changed(prefix=prefix, kinds=self._change_types)
             ]
             clean = not violations
         return MetricResult(outputs=[MetricOutput(name="no_test_cheating", value=clean)])
+
+
+class InefficientRetryLoopMetric:
+    """Score ``efficient_tool_use`` ``False`` when one tool call runs back-to-back more than ``threshold`` times."""
+
+    def __init__(self, *, threshold: int = 2) -> None:
+        self._threshold = threshold
+
+    @property
+    def type(self) -> str:
+        return "inefficient_retry_loop"
+
+    def output_spec(self) -> list[MetricOutputSpec]:
+        efficient = MetricOutputSpec.boolean("efficient_tool_use")
+        longest_run = MetricOutputSpec.discrete_score("max_repeated_tool_calls")
+        # Order matches the outputs emitted by ``compute_scores``.
+        return [efficient, longest_run]
+
+    async def compute_scores(self, input: MetricInput) -> MetricResult:
+        evidence = input.candidate.evidence
+        tool_calls = []
+        if evidence is not None and evidence.get(EVIDENCE_TRACE) is not None:
+            tool_calls = await (await evidence.trace(EVIDENCE_TRACE)).tool_calls()
+        # Longest streak of back-to-back identical calls (a retry loop), not total frequency across the trace.
+        longest_streak = 0
+        streak = 0
+        last_signature: str | None = None
+        for tool_call in tool_calls:
+            # Sorted-key JSON is used purely as a comparison key, so argument insertion order is irrelevant.
+            signature = json.dumps(
+                {"function_name": tool_call.function_name, "arguments": tool_call.arguments or {}},
+                sort_keys=True,
+                separators=(",", ":"),
+            )
+            if signature == last_signature:
+                streak += 1
+            else:
+                streak = 1
+            last_signature = signature
+            if streak > longest_streak:
+                longest_streak = streak
+        efficient = MetricOutput(name="efficient_tool_use", value=longest_streak <= self._threshold)
+        longest = MetricOutput(name="max_repeated_tool_calls", value=longest_streak)
+        return MetricResult(outputs=[efficient, longest])
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py
index 9c4e11eb7e..5190a5ff7b 100644
--- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/__init__.py
@@ -4,6 +4,13 @@
 """Public value types for evaluator SDK runtime."""
 
 from nemo_evaluator_sdk.values.agents import Agent
+from nemo_evaluator_sdk.values.atif import (
+    FinalMetrics,
+    Metrics,
+    Step,
+    ToolCall,
+    Trajectory,
+)
 from nemo_evaluator_sdk.values.common import SecretRef, SupportedJobTypes
 from nemo_evaluator_sdk.values.dataset_schemas import (
     FieldMapping,
@@ -17,6 +24,10 @@
     FilesystemDiff,
     FilesystemEntry,
     LocalFilesystemEvidence,
+    LogHandle,
+    TraceHandle,
+    WellKnownEvidenceKey,
+    parse_atif,
 )
 from nemo_evaluator_sdk.values.metrics import (
     BLEU,
@@ -109,6 +120,15 @@
     "ContinuousScore",
     "FilesystemDiff",
     "FilesystemEntry",
+    "FinalMetrics",
+    "LogHandle",
+    "Metrics",
+    "Step",
+    "ToolCall",
+    "Trajectory",
+    "TraceHandle",
+    "WellKnownEvidenceKey",
+    "parse_atif",
     "DatasetRow",
     "DatasetRows",
     "DefaultAggregateFieldName",
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/atif.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/atif.py
new file mode 100644
index 0000000000..e5a4c6ec28
--- /dev/null
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/atif.py
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Lightweight ATIF read models for the evaluator SDK.
+
+The evaluator *ingests* traces that producers emit in the Agent Trajectory
+Interchange Format (ATIF; RFC 0001, schema_version ``ATIF-v1.x``). It does not
+produce or normalize ATIF, and it only reads a small subset of the schema, so
+this module models *just that subset* rather than vendoring the full reference
+implementation.
+
+These are deliberately permissive (``extra="ignore"``): fields the SDK does not
+consume (images, content parts, observations, sub-agents, agent metadata, ...)
+are accepted and dropped, so a trace emitted against a newer ATIF revision still
+validates. Validation here means "this payload carries the ATIF fields the
+evaluator's metrics rely on" — not full RFC conformance, which is the producer's
+responsibility. The authoritative spec is RFC 0001.
+"""
+
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+class ToolCall(BaseModel):
+    """A single tool/function invocation within an agent step; only ``function_name`` is required."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    tool_call_id: str | None = Field(default=None, description="Producer-assigned tool call id, if any.")
+    function_name: str = Field(description="Name of the invoked tool/function.")
+    arguments: dict[str, Any] | None = Field(default=None, description="Arguments passed to the tool.")
+
+
+class Metrics(BaseModel):
+    """Per-step token counts; both fields are optional and unknown fields are ignored."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    prompt_tokens: int | None = None
+    completion_tokens: int | None = None
+
+
+class FinalMetrics(BaseModel):
+    """Trajectory-level aggregate token counts; both totals are optional and unknown fields are ignored."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    total_prompt_tokens: int | None = None
+    total_completion_tokens: int | None = None
+
+
+class Step(BaseModel):
+    """One step in an agent trajectory; only ``source`` is required and unknown fields are ignored."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    source: Literal["system", "user", "agent"] = Field(description="Who produced this step.")
+    message: str = Field(default="", description="Step text content.")
+    tool_calls: list[ToolCall] | None = Field(default=None, description="Tool calls issued in this step.")
+    metrics: Metrics | None = Field(default=None, description="Per-step token metrics, if reported.")
+
+
+class Trajectory(BaseModel):
+    """Read view of an ATIF agent trajectory, limited to the fields the SDK consumes."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    schema_version: str = Field(description="ATIF schema version, e.g. 'ATIF-v1.7'.")
+    steps: list[Step] = Field(min_length=1, description="Ordered trajectory steps.")
+    final_metrics: FinalMetrics | None = Field(default=None, description="Aggregate token metrics, if reported.")
+
+    @field_validator("schema_version")
+    @classmethod
+    def _looks_like_atif(cls, value: str) -> str:
+        # Reject version tags that are not ATIF so unrelated JSON cannot pass as a trace.
+        if value.startswith("ATIF-"):
+            return value
+        raise ValueError(f"unexpected trace schema_version {value!r}; expected an 'ATIF-*' version")
diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py
index 16f7248651..368e480e33 100644
--- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py
@@ -8,6 +8,7 @@
 import asyncio
 import difflib
 import hashlib
+import json
 import os
 import shutil
 import signal
@@ -18,6 +19,11 @@
 
 from pydantic import BaseModel, ConfigDict, Field, JsonValue, PrivateAttr, model_validator
 
+from nemo_evaluator_sdk.values.atif import FinalMetrics, Step, ToolCall, Trajectory
+
+# Well-known evidence keys (mirrored by the ``EVIDENCE_*`` constants in ``agent_eval.trials``).
+WellKnownEvidenceKey = Literal["initial_state", "trace", "logs", "final_state", "verifier_logs"]
+
 
 class FilesystemEntry(BaseModel):
     """One path that differs between two filesystem snapshots."""
@@ -100,11 +106,7 @@ async def read_text(self, relative_path: str | Path, *, encoding: str = "utf-8")
         return await asyncio.to_thread(path.read_text, encoding=encoding)
 
     async def iter_paths(self, relative_path: str | Path = ".", *, recursive: bool = False) -> list[str]:
-        """List entries (files *and* directories) rooted at ``relative_path``.
-
-        Use this to walk a subtree or test for (non-)emptiness. For a flat,
-        files-only listing matched by a glob pattern, use :meth:`list_files`.
-        """
+        """List entries (files and directories) rooted at ``relative_path``."""
         base = self.path(relative_path)
         return await asyncio.to_thread(self._iter_paths_sync, base, recursive)
 
@@ -120,11 +122,7 @@ async def read_bytes(self, relative_path: str | Path) -> bytes:
         return await asyncio.to_thread(path.read_bytes)
 
     async def list_files(self, pattern: str = "**/*") -> list[str]:
-        """List relative posix paths of files (not directories) matching ``pattern``.
-
-        Complements :meth:`iter_paths`: this is the flat, glob-filtered,
-        files-only view; ``iter_paths`` walks a subtree and includes directories.
-        """
+        """List relative posix paths of files (not directories) matching ``pattern``."""
         return await asyncio.to_thread(self._list_sync, pattern)
 
     def _list_sync(self, pattern: str) -> list[str]:
@@ -135,12 +133,7 @@ def _list_sync(self, pattern: str) -> list[str]:
         )
 
     async def diff(self, other: LocalFilesystemEvidence) -> FilesystemDiff:
-        """Diff this snapshot (before) against ``other`` (after) by file content hash.
-
-        Cost note: this hashes every file in both trees by reading each fully, so
-        it is O(total bytes). Fine for task-sized evidence; revisit (streamed
-        hashing / size+mtime prefilter) if used on large artifact trees.
-        """
+        """Diff this snapshot (before) against ``other`` (after) by file content hash."""
         return await asyncio.to_thread(self._diff_sync, other)
 
     def _diff_sync(self, other: LocalFilesystemEvidence) -> FilesystemDiff:
@@ -162,12 +155,7 @@ async def unified_diff(
         *,
         context: int = 3,
     ) -> str:
-        """Unified diff of one path between this snapshot (before) and ``other`` (after).
-
-        Opt-in, per-path companion to :meth:`diff` (which reports only which paths
-        changed). Returns ``""`` when the two versions are identical or binary
-        (non-UTF-8). Path access is traversal-guarded like the rest of the handle.
-        """
+        """Unified diff of one path between this snapshot (before) and ``other`` (after); ``""`` if identical or binary."""
         return await asyncio.to_thread(self._unified_diff_sync, other, relative_path, context)
 
     def _unified_diff_sync(self, other: LocalFilesystemEvidence, relative_path: str | Path, context: int) -> str:
@@ -193,12 +181,7 @@ def _hashes(self) -> dict[str, str]:
         return hashes
 
     def _safe_files(self) -> list[Path]:
-        """Regular files under the root, never descending into or reading symlinks that escape it.
-
-        ``os.walk(followlinks=False)`` keeps a ``vendor -> /`` style dir symlink from
-        exploding the walk, and escaping file symlinks (``leak -> /etc/passwd``) are
-        dropped so their target is never hashed.
-        """
+        """Regular files under the root, skipping symlinks whose target escapes it."""
         files: list[Path] = []
         for dirpath, _dirnames, filenames in os.walk(self._root, followlinks=False):
             for name in filenames:
@@ -215,24 +198,12 @@ async def run_verifier(
         cwd: str = ".",
         timeout_s: float | None = None,
     ) -> CommandResult:
-        """Run ``command`` (no shell) against a throwaway copy of the evidence.
-
-        The evidence is copied to a temp overlay so the command can never mutate
-        stored evidence (pytest caches, build artifacts, ...). ``command`` is a
-        list passed straight to exec, so there is no shell parsing of it.
-
-        This is NOT a sandbox: the command runs with the host's privileges and
-        full filesystem/network access. ``command`` is supplied by the (trusted)
-        metric author, never by the agent under test. Cost note: the whole tree
-        is copied on every call, so verifying large evidence repeatedly is heavy.
-        """
+        """Run ``command`` (no shell) against a throwaway copy of the evidence; not a sandbox (host privileges)."""
         overlay = Path(tempfile.mkdtemp(prefix="evidence-verify-")).resolve()
         try:
             workdir = (overlay / cwd).resolve()
             if workdir != overlay and overlay not in workdir.parents:
                 raise ValueError(f"verifier cwd {cwd!r} resolves outside evidence overlay")
-            # symlinks=True copies links as-is (no host deref); the ignore hook drops
-            # links whose target escapes the evidence root so the verifier can't read them.
             await asyncio.to_thread(
                 shutil.copytree,
                 self._root,
@@ -246,13 +217,7 @@ async def run_verifier(
             await asyncio.to_thread(shutil.rmtree, overlay, True)
 
     def _ignore_escaping_symlinks(self, directory: str, names: list[str]) -> set[str]:
-        """copytree ignore hook: skip symlinks that can't be safely preserved in the overlay.
-
-        Drops links whose resolved target escapes the evidence root (host-file reads)
-        and absolute links: ``symlinks=True`` would recreate the latter verbatim, so a
-        verifier write through ``link -> /real/evidence/answer.txt`` would mutate the
-        stored evidence instead of the throwaway copy.
-        """
+        """copytree ignore hook: drop absolute symlinks and links whose target escapes the evidence root."""
         ignored: set[str] = set()
         for name in names:
             full = Path(directory) / name
@@ -264,8 +229,7 @@ def _ignore_escaping_symlinks(self, directory: str, names: list[str]) -> set[str
 
     @staticmethod
     async def _exec(command: list[str], cwd: Path, timeout_s: float | None) -> CommandResult:
-        # start_new_session makes the child its own process-group leader, so a timeout
-        # can reap the whole tree (grandchildren it spawned) rather than just the child.
+        # start_new_session: child leads its own process group so a timeout can reap the whole tree.
         process = await asyncio.create_subprocess_exec(
             *command,
             cwd=str(cwd),
@@ -276,11 +240,11 @@ async def _exec(command: list[str], cwd: Path, timeout_s: float | None) -> Comma
         try:
             stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout_s)
         except TimeoutError:
-            # wait_for cancels communicate() but leaves the tree running; kill the group.
+            # wait_for leaves the tree running; kill the whole process group.
             try:
                 os.killpg(os.getpgid(process.pid), signal.SIGKILL)
             except ProcessLookupError:
-                pass  # already exited between the timeout and the kill
+                pass
             await process.wait()
             return CommandResult(exit_code=-1, timed_out=True)
         return CommandResult(
@@ -325,6 +289,75 @@ def _requires_ref_or_data(self) -> EvidenceDescriptor:
         return self
 
 
+# ATIF ingest: producers emit conformant ATIF (see values/atif.py, RFC 0001); we validate on read, not normalize.
+def parse_atif(payload: Any) -> Trajectory:
+    """Return ``payload`` as a :class:`Trajectory`; non-``Trajectory`` input is validated (raises ``ValidationError``)."""
+    return payload if isinstance(payload, Trajectory) else Trajectory.model_validate(payload)
+
+
+class TraceHandle:
+    """Read handle over a trace descriptor; the ATIF :class:`Trajectory` is parsed on first use and then reused."""
+
+    def __init__(self, descriptor: EvidenceDescriptor) -> None:
+        self._trajectory: Trajectory | None = None
+        self._descriptor = descriptor
+
+    async def trace(self) -> Trajectory:
+        """Return the validated trajectory, loading the payload in a worker thread on first access."""
+        if self._trajectory is not None:
+            return self._trajectory
+        self._trajectory = parse_atif(await asyncio.to_thread(self._load_payload))
+        return self._trajectory
+
+    def _load_payload(self) -> Any:
+        # Inline ``data`` takes precedence; otherwise ``ref`` is read as UTF-8 JSON from the local filesystem.
+        source = self._descriptor
+        if source.data is not None:
+            return source.data
+        if source.ref is None:
+            raise ValueError("trace evidence descriptor requires ref or data")
+        return json.loads(_local_filesystem_ref(source.ref).read_text(encoding="utf-8"))
+
+    async def steps(self) -> list[Step]:
+        """Return the trajectory's steps in their recorded order."""
+        return (await self.trace()).steps
+
+    async def tool_calls(self) -> list[ToolCall]:
+        """Return every tool call across all steps as one flat list, preserving step order."""
+        return [call for step in await self.steps() for call in (step.tool_calls or [])]
+
+    async def token_usage(self) -> FinalMetrics:
+        """Return ``final_metrics`` when present; otherwise totals summed from per-step metrics (``None`` if zero)."""
+        trajectory = await self.trace()
+        reported = trajectory.final_metrics
+        if reported is not None:
+            return reported
+        per_step = [step.metrics for step in trajectory.steps if step.metrics is not None]
+        prompt = sum((metrics.prompt_tokens or 0) for metrics in per_step)
+        completion = sum((metrics.completion_tokens or 0) for metrics in per_step)
+        return FinalMetrics(total_prompt_tokens=prompt or None, total_completion_tokens=completion or None)
+
+
+class LogHandle:
+    """Read handle over a log-bundle directory; all access is delegated to a :class:`LocalFilesystemEvidence`."""
+
+    def __init__(self, root: str | Path) -> None:
+        self._fs = LocalFilesystemEvidence(root)
+
+    async def list_files(self) -> list[str]:
+        """Return relative paths of log files in the bundle."""
+        return await self._fs.list_files("**/*")
+
+    async def read_text(self, name: str) -> str:
+        """Read one log file's full text (UTF-8, the filesystem handle's default encoding)."""
+        return await self._fs.read_text(name)
+
+    async def tail(self, name: str, lines: int = 50) -> str:
+        """Return the last ``lines`` lines of a log file joined by newlines; empty string when ``lines <= 0``."""
+        text = await self._fs.read_text(name)
+        return "\n".join(text.splitlines()[-lines:]) if lines > 0 else ""
+
+
 class CandidateEvidence(BaseModel):
     """Named evidence descriptors attached to an AgentEvalAttempt."""
 
@@ -339,6 +372,8 @@ class CandidateEvidence(BaseModel):
         description="Free-form metadata associated with the evidence collection.",
     )
     _filesystem_cache: dict[str, LocalFilesystemEvidence] = PrivateAttr(default_factory=dict)
+    _trace_cache: dict[str, TraceHandle] = PrivateAttr(default_factory=dict)
+    _log_cache: dict[str, LogHandle] = PrivateAttr(default_factory=dict)
 
     @model_validator(mode="before")
     @classmethod
@@ -383,13 +418,30 @@ async def filesystem(self, name: str) -> LocalFilesystemEvidence:
         self._filesystem_cache[name] = handle
         return handle
 
+    async def trace(self, name: str = "trace") -> TraceHandle:
+        """Return the trace handle for ``name``, creating and caching it on first request (payload is read lazily)."""
+        handle = self._trace_cache.get(name)
+        if handle is None:
+            # Cache miss: build the handle from the descriptor; nothing is read until the handle is used.
+            handle = TraceHandle(self.require(name, kind="trace"))
+            self._trace_cache[name] = handle
+        return handle
 
-def _local_filesystem_ref(ref: str) -> Path:
-    """Resolve a local filesystem ref to a Path.
+    async def logs(self, name: str = "logs") -> LogHandle:
+        """Return the log-bundle handle for ``name``, creating and caching it on first request."""
+        handle = self._log_cache.get(name)
+        if handle is None:
+            ref = self.require(name, kind="logs").ref
+            if ref is None:
+                # Log bundles are directories on disk, so an inline-only descriptor cannot be served.
+                raise ValueError(f"logs evidence descriptor {name!r} requires a local ref")
+            handle = LogHandle(_local_filesystem_ref(ref))
+            self._log_cache[name] = handle
+        return handle
 
-    Accepts POSIX paths, ``file://`` URIs, and Windows drive paths (e.g. ``C:\\dir``).
-    Network and cloud URI schemes (http, https, s3, gs, ...) are rejected.
-    """
+
+def _local_filesystem_ref(ref: str) -> Path:
+    """Resolve a local ref (POSIX path, ``file://`` URI, or Windows drive path) to a Path; reject network/cloud URIs."""
     parsed = urlparse(ref)
     # A single-letter scheme is a Windows drive letter (e.g. "C:\\dir"), not a URI scheme.
     if len(parsed.scheme) == 1 and parsed.scheme.isalpha():
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_evidence.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_evidence.py
index 7de29cf409..d58902d71a 100644
--- a/packages/nemo_evaluator_sdk/tests/agent_eval/test_evidence.py
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_evidence.py
@@ -2,6 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import asyncio
+import json
+import os
+import time
 from pathlib import Path
 
 import pytest
@@ -11,6 +14,7 @@
     EvidenceDescriptor,
     LocalFilesystemEvidence,
 )
+from pydantic import ValidationError
 
 
 def test_metric_input_preserves_candidate_evidence_out_of_metadata() -> None:
@@ -151,14 +155,27 @@ async def test_verifier_timeout_kills_the_whole_process_tree(tmp_path: Path) ->
     root.mkdir()
     handle = LocalFilesystemEvidence(root)
 
-    # A grandchild backgrounded by the verifier would write the marker after 1s if it
-    # survived; killing the whole process group on timeout stops it first.
-    marker = tmp_path / "survived.txt"
-    result = await handle.run_verifier(["sh", "-c", f"(sleep 1; touch '{marker}') & sleep 5"], timeout_s=0.3)
+    # The verifier backgrounds a long-lived child and records its PID. Killing only the
+    # direct shell would orphan that child; killing the whole process group reaps it.
+    # (We assert the child PID is gone rather than watching for a follow-on side effect,
+    # which would race the kill: reaping the child can let the shell run its next command
+    # in the window before the shell itself is signalled.)
+    pidfile = tmp_path / "child.pid"
+    result = await handle.run_verifier(["sh", "-c", f"sleep 30 & echo $! > '{pidfile}'; wait"], timeout_s=0.3)
     assert result.timed_out
 
-    await asyncio.sleep(1.5)
-    assert not marker.exists()
+    child_pid = int(pidfile.read_text().strip())
+    # Poll until the killed group is fully reaped rather than assuming a fixed window;
+    # on a busy runner the child can briefly linger as a zombie even when cleanup is correct.
+    deadline = time.monotonic() + 2.0
+    while True:
+        try:
+            os.kill(child_pid, 0)
+        except ProcessLookupError:
+            break
+        if time.monotonic() >= deadline:
+            pytest.fail(f"child process {child_pid} still alive after cleanup timeout")
+        await asyncio.sleep(0.05)
 
 
 @pytest.mark.asyncio
@@ -177,3 +194,58 @@ async def test_unified_diff_reports_text_patch_and_skips_binary(tmp_path: Path)
     assert "-b" in patch and "+c" in patch and patch.startswith("--- a/f.txt")
     assert await before.unified_diff(after, "img.bin") == ""  # binary: no textual patch
     assert await before.unified_diff(before, "f.txt") == ""  # identical: empty
+
+
+_ATIF_TRAJECTORY = {
+    "schema_version": "ATIF-v1.7",
+    "agent": {"name": "demo", "version": "1.0"},
+    "steps": [
+        {"step_id": 1, "source": "user", "message": "do it"},
+        {
+            "step_id": 2,
+            "source": "agent",
+            "message": "calling tool",
+            "tool_calls": [{"tool_call_id": "c1", "function_name": "search", "arguments": {"q": "x"}}],
+            "observation": {"results": [{"source_call_id": "c1", "content": "result text"}]},
+        },
+    ],
+    "final_metrics": {"total_prompt_tokens": 10, "total_completion_tokens": 5},
+}
+
+
+@pytest.mark.asyncio
+async def test_trace_handle_reads_atif(tmp_path: Path) -> None:
+    source = tmp_path / "trajectory.json"
+    source.write_text(json.dumps(_ATIF_TRAJECTORY), encoding="utf-8")
+    on_disk = EvidenceDescriptor(kind="trace", ref=str(source), format="atif")
+    evidence = CandidateEvidence(descriptors={"trace": on_disk})
+
+    trace = await evidence.trace("trace")
+    assert await evidence.trace("trace") is trace  # second lookup returns the cached handle
+    parsed = await trace.trace()
+    assert parsed.schema_version == "ATIF-v1.7"
+    assert [step.source for step in parsed.steps] == ["user", "agent"]
+    first_call, *_ = await trace.tool_calls()
+    assert first_call.function_name == "search"
+    usage = await trace.token_usage()
+    assert usage.total_prompt_tokens == 10
+
+    # A payload that is not valid ATIF is rejected when the trace is read.
+    malformed = EvidenceDescriptor(kind="trace", format="atif", data={"steps": "not-a-list"})
+    bad = CandidateEvidence(descriptors={"trace": malformed})
+    with pytest.raises(ValidationError):
+        await (await bad.trace("trace")).trace()
+
+
+@pytest.mark.asyncio
+async def test_logs_handle_reads_and_tails(tmp_path: Path) -> None:
+    bundle = tmp_path / "logs"
+    bundle.mkdir()
+    (bundle / "agent.log").write_text("line1\nline2\nline3\n", encoding="utf-8")
+    descriptor = EvidenceDescriptor(kind="logs", format="dir", ref=str(bundle))
+    evidence = CandidateEvidence(descriptors={"logs": descriptor})
+    logs = await evidence.logs("logs")
+    assert await evidence.logs("logs") is logs  # second lookup returns the cached handle
+    assert await logs.list_files() == ["agent.log"]
+    assert await logs.read_text("agent.log") == "line1\nline2\nline3\n"
+    assert await logs.tail("agent.log", 2) == "line2\nline3"
diff --git a/packages/nemo_evaluator_sdk/tests/agent_eval/test_example_metrics.py b/packages/nemo_evaluator_sdk/tests/agent_eval/test_example_metrics.py
index 9f25fabdd5..1a9acca47f 100644
--- a/packages/nemo_evaluator_sdk/tests/agent_eval/test_example_metrics.py
+++ b/packages/nemo_evaluator_sdk/tests/agent_eval/test_example_metrics.py
@@ -4,6 +4,7 @@
 """Exercise the example's reference metrics-over-evidence."""
 
 import importlib.util
+import json
 from pathlib import Path
 
 import pytest
@@ -55,3 +56,36 @@ async def test_tests_pass_and_no_test_cheating(tmp_path: Path) -> None:
     )
     cheated = await example_metrics.NoTestCheatingMetric().compute_scores(_input_with_evidence(evidence_cheated))
     assert cheated.outputs[0].value is False
+
+
+@pytest.mark.asyncio
+async def test_inefficient_retry_loop(tmp_path: Path) -> None:
+    def trajectory(repeats: int) -> dict:
+        calls = [
+            {"tool_call_id": f"c{i}", "function_name": "search", "arguments": {"q": "same"}} for i in range(repeats)
+        ]
+        return {
+            "schema_version": "ATIF-v1.7",
+            "agent": {"name": "demo", "version": "1.0"},
+            "steps": [{"step_id": 1, "source": "agent", "message": "", "tool_calls": calls}],
+        }
+
+    looping = tmp_path / "loop.json"
+    looping.write_text(json.dumps(trajectory(5)), encoding="utf-8")
+    clean = tmp_path / "clean.json"
+    clean.write_text(json.dumps(trajectory(1)), encoding="utf-8")
+
+    metric = example_metrics.InefficientRetryLoopMetric(threshold=2)
+
+    loop_result = await metric.compute_scores(
+        _input_with_evidence(
+            CandidateEvidence(descriptors={"trace": EvidenceDescriptor(kind="trace", ref=str(looping))})
+        )
+    )
+    assert loop_result.outputs[0].value is False
+    assert loop_result.outputs[1].value == 5
+
+    clean_result = await metric.compute_scores(
+        _input_with_evidence(CandidateEvidence(descriptors={"trace": EvidenceDescriptor(kind="trace", ref=str(clean))}))
+    )
+    assert clean_result.outputs[0].value is True
diff --git a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/__init__.py
index 328bcabb0c..dfe65e5646 100644
--- a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/__init__.py
+++ b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/__init__.py
@@ -4,6 +4,13 @@
 """Public value types for evaluator SDK runtime."""
 
 from nemo_platform.beta.evaluator.values.agents import Agent
+from nemo_platform.beta.evaluator.values.atif import (
+    FinalMetrics,
+    Metrics,
+    Step,
+    ToolCall,
+    Trajectory,
+)
 from nemo_platform.beta.evaluator.values.common import SecretRef, SupportedJobTypes
 from nemo_platform.beta.evaluator.values.dataset_schemas import (
     FieldMapping,
@@ -17,6 +24,10 @@
     FilesystemDiff,
     FilesystemEntry,
     LocalFilesystemEvidence,
+    LogHandle,
+    TraceHandle,
+    WellKnownEvidenceKey,
+    parse_atif,
 )
 from nemo_platform.beta.evaluator.values.metrics import (
     BLEU,
@@ -109,6 +120,15 @@
     "ContinuousScore",
     "FilesystemDiff",
     "FilesystemEntry",
+    "FinalMetrics",
+    "LogHandle",
+    "Metrics",
+    "Step",
+    "ToolCall",
+    "Trajectory",
+    "TraceHandle",
+    "WellKnownEvidenceKey",
+    "parse_atif",
     "DatasetRow",
     "DatasetRows",
     "DefaultAggregateFieldName",
diff --git a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/atif.py b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/atif.py
new file mode 100644
index 0000000000..e5a4c6ec28
--- /dev/null
+++ b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/atif.py
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+"""Lightweight ATIF read models for the evaluator SDK.
+
+The evaluator *ingests* traces that producers emit in the Agent Trajectory
+Interchange Format (ATIF; RFC 0001, schema_version ``ATIF-v1.x``). It does not
+produce or normalize ATIF, and it only reads a small subset of the schema, so
+this module models *just that subset* rather than vendoring the full reference
+implementation.
+
+These are deliberately permissive (``extra="ignore"``): fields the SDK does not
+consume (images, content parts, observations, sub-agents, agent metadata, ...)
+are accepted and dropped, so a trace emitted against a newer ATIF revision still
+validates. Validation here means "this payload carries the ATIF fields the
+evaluator's metrics rely on" — not full RFC conformance, which is the producer's
+responsibility. The authoritative spec is RFC 0001.
+"""
+
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+class ToolCall(BaseModel):
+    """A single tool/function invocation within an agent step."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    tool_call_id: str | None = Field(default=None, description="Producer-assigned tool call id, if any.")
+    function_name: str = Field(description="Name of the invoked tool/function.")
+    arguments: dict[str, Any] | None = Field(default=None, description="Arguments passed to the tool.")
+
+
+class Metrics(BaseModel):
+    """Per-step token metrics."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    prompt_tokens: int | None = None
+    completion_tokens: int | None = None
+
+
+class FinalMetrics(BaseModel):
+    """Trajectory-level aggregate token metrics."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    total_prompt_tokens: int | None = None
+    total_completion_tokens: int | None = None
+
+
+class Step(BaseModel):
+    """One step in an agent trajectory."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    source: Literal["system", "user", "agent"] = Field(description="Who produced this step.")
+    message: str = Field(default="", description="Step text content.")
+    tool_calls: list[ToolCall] | None = Field(default=None, description="Tool calls issued in this step.")
+    metrics: Metrics | None = Field(default=None, description="Per-step token metrics, if reported.")
+
+
+class Trajectory(BaseModel):
+    """An ATIF agent trajectory (read view over the subset the SDK consumes)."""
+
+    model_config = ConfigDict(extra="ignore")
+
+    schema_version: str = Field(description="ATIF schema version, e.g. 'ATIF-v1.7'.")
+    steps: list[Step] = Field(min_length=1, description="Ordered trajectory steps.")
+    final_metrics: FinalMetrics | None = Field(default=None, description="Aggregate token metrics, if reported.")
+
+    @field_validator("schema_version")
+    @classmethod
+    def _looks_like_atif(cls, value: str) -> str:
+        # Cheap sanity gate so arbitrary JSON isn't silently accepted as a trace.
+        if not value.startswith("ATIF-"):
+            raise ValueError(f"unexpected trace schema_version {value!r}; expected an 'ATIF-*' version")
+        return value
diff --git a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py
index 16f7248651..de215543a0 100644
--- a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py
+++ b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py
@@ -8,6 +8,7 @@
 import asyncio
 import difflib
 import hashlib
+import json
 import os
 import shutil
 import signal
@@ -18,6 +19,11 @@
 
 from pydantic import BaseModel, ConfigDict, Field, JsonValue, PrivateAttr, model_validator
 
+from nemo_platform.beta.evaluator.values.atif import FinalMetrics, Step, ToolCall, Trajectory
+
+# Well-known evidence keys (mirrored by the ``EVIDENCE_*`` constants in ``agent_eval.trials``).
+WellKnownEvidenceKey = Literal["initial_state", "trace", "logs", "final_state", "verifier_logs"]
+
 
 class FilesystemEntry(BaseModel):
     """One path that differs between two filesystem snapshots."""
@@ -100,11 +106,7 @@ async def read_text(self, relative_path: str | Path, *, encoding: str = "utf-8")
         return await asyncio.to_thread(path.read_text, encoding=encoding)
 
     async def iter_paths(self, relative_path: str | Path = ".", *, recursive: bool = False) -> list[str]:
-        """List entries (files *and* directories) rooted at ``relative_path``.
-
-        Use this to walk a subtree or test for (non-)emptiness. For a flat,
-        files-only listing matched by a glob pattern, use :meth:`list_files`.
-        """
+        """List entries (files and directories) rooted at ``relative_path``."""
         base = self.path(relative_path)
         return await asyncio.to_thread(self._iter_paths_sync, base, recursive)
 
@@ -120,11 +122,7 @@ async def read_bytes(self, relative_path: str | Path) -> bytes:
         return await asyncio.to_thread(path.read_bytes)
 
     async def list_files(self, pattern: str = "**/*") -> list[str]:
-        """List relative posix paths of files (not directories) matching ``pattern``.
-
-        Complements :meth:`iter_paths`: this is the flat, glob-filtered,
-        files-only view; ``iter_paths`` walks a subtree and includes directories.
-        """
+        """List relative posix paths of files (not directories) matching ``pattern``."""
         return await asyncio.to_thread(self._list_sync, pattern)
 
     def _list_sync(self, pattern: str) -> list[str]:
@@ -135,12 +133,7 @@ def _list_sync(self, pattern: str) -> list[str]:
         )
 
     async def diff(self, other: LocalFilesystemEvidence) -> FilesystemDiff:
-        """Diff this snapshot (before) against ``other`` (after) by file content hash.
-
-        Cost note: this hashes every file in both trees by reading each fully, so
-        it is O(total bytes). Fine for task-sized evidence; revisit (streamed
-        hashing / size+mtime prefilter) if used on large artifact trees.
-        """
+        """Diff this snapshot (before) against ``other`` (after) by file content hash."""
         return await asyncio.to_thread(self._diff_sync, other)
 
     def _diff_sync(self, other: LocalFilesystemEvidence) -> FilesystemDiff:
@@ -162,12 +155,7 @@ async def unified_diff(
         *,
         context: int = 3,
     ) -> str:
-        """Unified diff of one path between this snapshot (before) and ``other`` (after).
-
-        Opt-in, per-path companion to :meth:`diff` (which reports only which paths
-        changed). Returns ``""`` when the two versions are identical or binary
-        (non-UTF-8). Path access is traversal-guarded like the rest of the handle.
-        """
+        """Unified diff of one path between this snapshot (before) and ``other`` (after); ``""`` if identical or binary."""
         return await asyncio.to_thread(self._unified_diff_sync, other, relative_path, context)
 
     def _unified_diff_sync(self, other: LocalFilesystemEvidence, relative_path: str | Path, context: int) -> str:
@@ -193,12 +181,7 @@ def _hashes(self) -> dict[str, str]:
         return hashes
 
     def _safe_files(self) -> list[Path]:
-        """Regular files under the root, never descending into or reading symlinks that escape it.
-
-        ``os.walk(followlinks=False)`` keeps a ``vendor -> /`` style dir symlink from
-        exploding the walk, and escaping file symlinks (``leak -> /etc/passwd``) are
-        dropped so their target is never hashed.
-        """
+        """Regular files under the root, skipping symlinks whose target escapes it."""
         files: list[Path] = []
         for dirpath, _dirnames, filenames in os.walk(self._root, followlinks=False):
             for name in filenames:
@@ -215,24 +198,12 @@ async def run_verifier(
         cwd: str = ".",
         timeout_s: float | None = None,
     ) -> CommandResult:
-        """Run ``command`` (no shell) against a throwaway copy of the evidence.
-
-        The evidence is copied to a temp overlay so the command can never mutate
-        stored evidence (pytest caches, build artifacts, ...). ``command`` is a
-        list passed straight to exec, so there is no shell parsing of it.
-
-        This is NOT a sandbox: the command runs with the host's privileges and
-        full filesystem/network access. ``command`` is supplied by the (trusted)
-        metric author, never by the agent under test. Cost note: the whole tree
-        is copied on every call, so verifying large evidence repeatedly is heavy.
-        """
+        """Run ``command`` (no shell) against a throwaway copy of the evidence; not a sandbox (host privileges)."""
         overlay = Path(tempfile.mkdtemp(prefix="evidence-verify-")).resolve()
         try:
             workdir = (overlay / cwd).resolve()
             if workdir != overlay and overlay not in workdir.parents:
                 raise ValueError(f"verifier cwd {cwd!r} resolves outside evidence overlay")
-            # symlinks=True copies links as-is (no host deref); the ignore hook drops
-            # links whose target escapes the evidence root so the verifier can't read them.
             await asyncio.to_thread(
                 shutil.copytree,
                 self._root,
@@ -246,13 +217,7 @@ async def run_verifier(
             await asyncio.to_thread(shutil.rmtree, overlay, True)
 
     def _ignore_escaping_symlinks(self, directory: str, names: list[str]) -> set[str]:
-        """copytree ignore hook: skip symlinks that can't be safely preserved in the overlay.
-
-        Drops links whose resolved target escapes the evidence root (host-file reads)
-        and absolute links: ``symlinks=True`` would recreate the latter verbatim, so a
-        verifier write through ``link -> /real/evidence/answer.txt`` would mutate the
-        stored evidence instead of the throwaway copy.
-        """
+        """copytree ignore hook: drop absolute symlinks and links whose target escapes the evidence root."""
         ignored: set[str] = set()
         for name in names:
             full = Path(directory) / name
@@ -264,8 +229,7 @@ def _ignore_escaping_symlinks(self, directory: str, names: list[str]) -> set[str
 
     @staticmethod
     async def _exec(command: list[str], cwd: Path, timeout_s: float | None) -> CommandResult:
-        # start_new_session makes the child its own process-group leader, so a timeout
-        # can reap the whole tree (grandchildren it spawned) rather than just the child.
+        # start_new_session: child leads its own process group so a timeout can reap the whole tree.
         process = await asyncio.create_subprocess_exec(
             *command,
             cwd=str(cwd),
@@ -276,11 +240,11 @@ async def _exec(command: list[str], cwd: Path, timeout_s: float | None) -> Comma
         try:
             stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=timeout_s)
         except TimeoutError:
-            # wait_for cancels communicate() but leaves the tree running; kill the group.
+            # wait_for leaves the tree running; kill the whole process group.
             try:
                 os.killpg(os.getpgid(process.pid), signal.SIGKILL)
             except ProcessLookupError:
-                pass  # already exited between the timeout and the kill
+                pass
             await process.wait()
             return CommandResult(exit_code=-1, timed_out=True)
         return CommandResult(
@@ -325,6 +289,75 @@ def _requires_ref_or_data(self) -> EvidenceDescriptor:
         return self
 
 
+# ATIF ingest: producers emit conformant ATIF (see values/atif.py, RFC 0001); we validate on read, not normalize.
+def parse_atif(payload: Any) -> Trajectory:
+    """Validate a payload as a canonical ATIF :class:`Trajectory` (raises ``ValidationError`` if non-conformant)."""
+    return payload if isinstance(payload, Trajectory) else Trajectory.model_validate(payload)
+
+
+class TraceHandle:
+    """Lazily validated read handle exposing a trace descriptor as an ATIF :class:`Trajectory`."""
+
+    def __init__(self, descriptor: EvidenceDescriptor) -> None:
+        self._descriptor = descriptor
+        self._trajectory: Trajectory | None = None
+
+    async def trace(self) -> Trajectory:
+        """Return the ATIF trajectory, reading and validating on first access."""
+        if self._trajectory is None:
+            payload = await asyncio.to_thread(self._load_payload)
+            self._trajectory = parse_atif(payload)
+        return self._trajectory
+
+    def _load_payload(self) -> Any:
+        descriptor = self._descriptor
+        if descriptor.data is not None:
+            return descriptor.data
+        if descriptor.ref is None:
+            raise ValueError("trace evidence descriptor requires ref or data")
+        return json.loads(_local_filesystem_ref(descriptor.ref).read_text(encoding="utf-8"))
+
+    async def steps(self) -> list[Step]:
+        """Return the ATIF steps in order."""
+        return (await self.trace()).steps
+
+    async def tool_calls(self) -> list[ToolCall]:
+        """Return all tool calls flattened across agent steps, in order."""
+        calls: list[ToolCall] = []
+        for step in await self.steps():
+            calls.extend(step.tool_calls or [])
+        return calls
+
+    async def token_usage(self) -> FinalMetrics:
+        """Return aggregate token usage (trajectory ``final_metrics``, else summed per step)."""
+        trajectory = await self.trace()
+        if trajectory.final_metrics is not None:
+            return trajectory.final_metrics
+        prompt = sum((step.metrics.prompt_tokens or 0) for step in trajectory.steps if step.metrics is not None)
+        completion = sum((step.metrics.completion_tokens or 0) for step in trajectory.steps if step.metrics is not None)
+        return FinalMetrics(total_prompt_tokens=prompt or None, total_completion_tokens=completion or None)
+
+
+class LogHandle:
+    """Read handle over a log-bundle directory; all names are paths relative to the bundle root."""
+
+    def __init__(self, root: str | Path) -> None:
+        self._fs = LocalFilesystemEvidence(root)
+
+    async def list_files(self) -> list[str]:
+        """Return relative paths of log files in the bundle."""
+        return await self._fs.list_files("**/*")
+
+    async def read_text(self, name: str) -> str:
+        """Read one log file's full text."""
+        return await self._fs.read_text(name)
+
+    async def tail(self, name: str, lines: int = 50) -> str:
+        """Return the last ``lines`` lines of a log file (no trailing newline); ``""`` when ``lines <= 0``."""
+        text = await self._fs.read_text(name)  # read first so a missing file still raises for any lines value
+        return "\n".join(text.splitlines()[-lines:]) if lines > 0 else ""  # [-0:] would be the whole file
+
+
 class CandidateEvidence(BaseModel):
     """Named evidence descriptors attached to an AgentEvalAttempt."""
 
@@ -339,6 +372,8 @@ class CandidateEvidence(BaseModel):
         description="Free-form metadata associated with the evidence collection.",
     )
     _filesystem_cache: dict[str, LocalFilesystemEvidence] = PrivateAttr(default_factory=dict)
+    _trace_cache: dict[str, TraceHandle] = PrivateAttr(default_factory=dict)
+    _log_cache: dict[str, LogHandle] = PrivateAttr(default_factory=dict)
 
     @model_validator(mode="before")
     @classmethod
@@ -383,13 +418,30 @@ async def filesystem(self, name: str) -> LocalFilesystemEvidence:
         self._filesystem_cache[name] = handle
         return handle
 
+    async def trace(self, name: str = "trace") -> TraceHandle:
+        """Return a cached trace handle for a named trace descriptor (read lazily on first access)."""
+        cached = self._trace_cache.get(name)
+        if cached is not None:
+            return cached
+        handle = TraceHandle(self.require(name, kind="trace"))
+        self._trace_cache[name] = handle
+        return handle
 
-def _local_filesystem_ref(ref: str) -> Path:
-    """Resolve a local filesystem ref to a Path.
+    async def logs(self, name: str = "logs") -> LogHandle:
+        """Return a cached log-bundle handle for a named logs descriptor."""
+        cached = self._log_cache.get(name)
+        if cached is not None:
+            return cached
+        descriptor = self.require(name, kind="logs")
+        if descriptor.ref is None:
+            raise ValueError(f"logs evidence descriptor {name!r} requires a local ref")
+        handle = LogHandle(_local_filesystem_ref(descriptor.ref))
+        self._log_cache[name] = handle
+        return handle
 
-    Accepts POSIX paths, ``file://`` URIs, and Windows drive paths (e.g. ``C:\\dir``).
-    Network and cloud URI schemes (http, https, s3, gs, ...) are rejected.
-    """
+
+def _local_filesystem_ref(ref: str) -> Path:
+    """Resolve a local ref (POSIX path, ``file://`` URI, or Windows drive path) to a Path; reject network/cloud URIs."""
     parsed = urlparse(ref)
     # A single-letter scheme is a Windows drive letter (e.g. "C:\\dir"), not a URI scheme.
     if len(parsed.scheme) == 1 and parsed.scheme.isalpha():

From fcd211dbee6d10baa44f0de40684ccd23e285b7a Mon Sep 17 00:00:00 2001
From: "Arpit Singh (SW-CLOUD)" <arpsingh@nvidia.com>
Date: Tue, 30 Jun 2026 11:36:06 -0700
Subject: [PATCH 2/2] docs(evaluator): address review comments on evidence
 handles

- Restore the symlink-safety comment in run_verifier explaining why
  copytree(symlinks=True) is safe (the ignore hook drops escaping links).
- Reference AgentEvalTrial (not AgentEvalAttempt) in the CandidateEvidence
  docstring.

Vendored mirror synced via make vendor.

Signed-off-by: Arpit Singh (SW-CLOUD) <arpsingh@nvidia.com>
---
 .../src/nemo_evaluator_sdk/values/evidence.py                 | 4 +++-
 plugins/nemo-evaluator/openapi/openapi.yaml                   | 4 ++--
 .../src/nemo_platform/beta/evaluator/values/evidence.py       | 4 +++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py
index 368e480e33..2f70605951 100644
--- a/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py
+++ b/packages/nemo_evaluator_sdk/src/nemo_evaluator_sdk/values/evidence.py
@@ -204,6 +204,8 @@ async def run_verifier(
             workdir = (overlay / cwd).resolve()
             if workdir != overlay and overlay not in workdir.parents:
                 raise ValueError(f"verifier cwd {cwd!r} resolves outside evidence overlay")
+            # symlinks=True copies links as-is (no host deref); the ignore hook drops absolute links and
+            # links whose target escapes the evidence root so the verifier can't read or write through them.
             await asyncio.to_thread(
                 shutil.copytree,
                 self._root,
@@ -359,7 +361,7 @@ async def tail(self, name: str, lines: int = 50) -> str:
 
 
 class CandidateEvidence(BaseModel):
-    """Named evidence descriptors attached to an AgentEvalAttempt."""
+    """Named evidence descriptors attached to an AgentEvalTrial."""
 
     model_config = ConfigDict(extra="forbid")
 
diff --git a/plugins/nemo-evaluator/openapi/openapi.yaml b/plugins/nemo-evaluator/openapi/openapi.yaml
index e19f28ce6f..bbc8578245 100644
--- a/plugins/nemo-evaluator/openapi/openapi.yaml
+++ b/plugins/nemo-evaluator/openapi/openapi.yaml
@@ -1588,7 +1588,7 @@ components:
       additionalProperties: false
       type: object
       title: CandidateEvidenceInput
-      description: Named evidence descriptors attached to an AgentEvalAttempt.
+      description: Named evidence descriptors attached to an AgentEvalTrial.
     CandidateEvidenceOutput:
       properties:
         descriptors:
@@ -1606,7 +1606,7 @@ components:
       additionalProperties: false
       type: object
       title: CandidateEvidenceOutput
-      description: Named evidence descriptors attached to an AgentEvalAttempt.
+      description: Named evidence descriptors attached to an AgentEvalTrial.
     CloudpickleMetricPayload:
       properties:
         kind:
diff --git a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py
index de215543a0..3ad7f200e9 100644
--- a/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py
+++ b/sdk/python/nemo-platform/src/nemo_platform/beta/evaluator/values/evidence.py
@@ -204,6 +204,8 @@ async def run_verifier(
             workdir = (overlay / cwd).resolve()
             if workdir != overlay and overlay not in workdir.parents:
                 raise ValueError(f"verifier cwd {cwd!r} resolves outside evidence overlay")
+            # symlinks=True copies links as-is (no host deref); the ignore hook drops absolute links and
+            # links whose target escapes the evidence root so the verifier can't read or write through them.
             await asyncio.to_thread(
                 shutil.copytree,
                 self._root,
@@ -359,7 +361,7 @@ async def tail(self, name: str, lines: int = 50) -> str:
 
 
 class CandidateEvidence(BaseModel):
-    """Named evidence descriptors attached to an AgentEvalAttempt."""
+    """Named evidence descriptors attached to an AgentEvalTrial."""
 
     model_config = ConfigDict(extra="forbid")