Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,22 @@

"""Reference metrics-over-evidence for this example (not SDK API).

These show how to score from the SDK's filesystem evidence handle instead of a
stamped verifier reward:
These show how to score from the SDK's evidence handles instead of a stamped
verifier reward:

* :class:`TestsPassMetric` runs a command against ``final_state`` filesystem
evidence (in a throwaway overlay) and scores on exit 0.
* :class:`NoTestCheatingMetric` diffs ``initial_state`` against ``final_state``
and fails if the agent touched protected (e.g. test) paths.
* :class:`InefficientRetryLoopMetric` reads the normalized ``trace`` and fails
when the same tool call repeats past a threshold.
"""

from __future__ import annotations

from collections.abc import Sequence

from nemo_evaluator_sdk.agent_eval.trials import EVIDENCE_FINAL_STATE, EVIDENCE_INITIAL_STATE, EVIDENCE_TRACE
Comment on lines 19 to +21

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🎯 Functional Correctness | 🟠 Major | ⚡ Quick win

Count retry streaks, not global frequency.

Lines 118-121 count identical calls across the entire trace. That flags legitimate reuse separated by other work as a retry loop, and the current key still splits semantically identical nested args when dict insertion order differs. Track the longest consecutive run from a canonicalized call payload instead.

Suggested fix
+import json
 from collections.abc import Sequence
@@
     async def compute_scores(self, input: MetricInput) -> MetricResult:
         max_repeats = 0
         evidence = input.candidate.evidence
         if evidence is not None and evidence.get(self._evidence_name) is not None:
             calls = await (await evidence.trace(self._evidence_name)).tool_calls()
-            counts: dict[str, int] = {}
+            previous_key: str | None = None
+            current_repeats = 0
             for call in calls:
-                key = f"{call.name}:{sorted((call.arguments or {}).items())}"
-                counts[key] = counts.get(key, 0) + 1
-            max_repeats = max(counts.values(), default=0)
+                key = json.dumps(
+                    {"name": call.name, "arguments": call.arguments or {}},
+                    sort_keys=True,
+                    separators=(",", ":"),
+                )
+                current_repeats = current_repeats + 1 if key == previous_key else 1
+                previous_key = key
+                max_repeats = max(max_repeats, current_repeats)
         return MetricResult(
             outputs=[
                 MetricOutput(name="efficient_tool_use", value=max_repeats <= self._threshold),
                 MetricOutput(name="max_repeated_tool_calls", value=max_repeats),
             ]

Also applies to: 112-125

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/nemo_evaluator_sdk/examples/run_agent_eval/example_metrics.py`
around lines 19 - 21, The retry detection in the example metrics logic is
counting global frequency instead of consecutive repeats, so update the relevant
code in example_metrics.py to measure the longest streak of identical calls in
the trace rather than total occurrences. In the call-counting path around the
metrics computation (the logic that uses EVIDENCE_TRACE), canonicalize nested
args before comparing so semantically equivalent dicts with different insertion
order map to the same payload, and keep the streak bounded to adjacent entries
only. Use the existing symbols EVIDENCE_TRACE, EVIDENCE_INITIAL_STATE, and
EVIDENCE_FINAL_STATE to locate the trace-processing code and adjust the retry
metric accordingly.

from nemo_evaluator_sdk.metrics.protocol import MetricInput, MetricOutput, MetricOutputSpec, MetricResult


Expand All @@ -26,7 +29,7 @@ def __init__(
self,
command: Sequence[str],
*,
evidence_name: str = "final_state",
evidence_name: str = EVIDENCE_FINAL_STATE,

@ngoncharenko ngoncharenko Jun 26, 2026

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to add a type hint for the well-known literals, e.g. `Literal[EVIDENCE_FINAL_STATE, ...] | str`?

cwd: str = ".",
timeout_s: float = 300.0,
) -> None:
Expand Down Expand Up @@ -60,8 +63,8 @@ def __init__(
*,
protected: Sequence[str] = ("tests/",),
change_types: Sequence[str] = ("added", "modified", "deleted"),
initial_name: str = "initial_state",
final_name: str = "final_state",
initial_name: str = EVIDENCE_INITIAL_STATE,
final_name: str = EVIDENCE_FINAL_STATE,
Comment on lines +66 to +67

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question, do you think these need to be injected here? When would the initial/final states have a different name?

) -> None:
self._protected = tuple(protected)
self._change_types = set(change_types)
Expand All @@ -87,3 +90,38 @@ async def compute_scores(self, input: MetricInput) -> MetricResult:
]
clean = not violations
return MetricResult(outputs=[MetricOutput(name="no_test_cheating", value=clean)])


class InefficientRetryLoopMetric:
"""Score ``False`` when the same tool call repeats more than ``threshold`` times."""

def __init__(self, *, threshold: int = 2, evidence_name: str = EVIDENCE_TRACE) -> None:
    """Store the repeat threshold and the evidence key to read the trace from.

    ``threshold`` is the highest repeat count of a single tool call that
    still scores ``efficient_tool_use`` as ``True``. ``evidence_name`` is
    the key looked up in the candidate's evidence when scoring.
    """
    self._evidence_name = evidence_name
    self._threshold = threshold

@property
def type(self) -> str:
    """Return the identifier string for this metric."""
    metric_type = "inefficient_retry_loop"
    return metric_type

def output_spec(self) -> list[MetricOutputSpec]:
    """Declare the two outputs this metric emits.

    The boolean ``efficient_tool_use`` and the discrete
    ``max_repeated_tool_calls`` are listed in that order.
    """
    efficient_spec = MetricOutputSpec.boolean("efficient_tool_use")
    repeats_spec = MetricOutputSpec.discrete_score("max_repeated_tool_calls")
    return [efficient_spec, repeats_spec]

async def compute_scores(self, input: MetricInput) -> MetricResult:
max_repeats = 0
evidence = input.candidate.evidence
if evidence is not None and evidence.get(self._evidence_name) is not None:
calls = await (await evidence.trace(self._evidence_name)).tool_calls()
Comment on lines +115 to +116

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why would the user choose to get different evidence types in the metric? It seems like this always relies on it being EVIDENCE_TRACE, right?

counts: dict[str, int] = {}
for call in calls:
key = f"{call.function_name}:{sorted((call.arguments or {}).items())}"

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we sort the call arguments? Wouldn't that change the order of the args, which is important?

counts[key] = counts.get(key, 0) + 1
max_repeats = max(counts.values(), default=0)
return MetricResult(
outputs=[
MetricOutput(name="efficient_tool_use", value=max_repeats <= self._threshold),
MetricOutput(name="max_repeated_tool_calls", value=max_repeats),
]
)
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from nemo_evaluator_sdk.metrics.protocol import Metric, validate_metric_result
from nemo_evaluator_sdk.metrics.utils import metric_type_name
from nemo_evaluator_sdk.values import Agent, Model, RunConfig, RunConfigOnline, RunConfigOnlineModel
from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor
from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor, normalize_trace_descriptor
from openai import AsyncOpenAI

log = getLogger(__name__)
Expand Down Expand Up @@ -327,7 +327,9 @@ def _trial_from_sample(task: AgentEvalTask, target: Model | Agent, sample: dict[
# trial stays scorable instead of being dropped as empty output.
output_text = _reasoning_content_fallback(sample.get("response"))
if "trajectory" in sample:
trace = EvidenceDescriptor(kind="trace", format="json", data=sample["trajectory"])
# Normalize to ATIF before the trial is persisted so the stored shape is
# source-agnostic (sources in, ATIF out); TraceHandle then reads it uniformly.
trace = normalize_trace_descriptor(EvidenceDescriptor(kind="trace", format="json", data=sample["trajectory"]))

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is the right approach, tbh. I think persisting the raw trace and converting on metric usage is the right approach. The problem with converting before persisting is that if there's a bug in our conversion code, or we don't handle the source format fully, the conversion may be lossy. At least if we convert on read, users can improve the conversion function without loss.

else:
trace = EvidenceDescriptor(kind="sdk_online_generation", data={"task_id": task.id, "target": target.name})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from nemo_evaluator_sdk.agent_eval.tasks import AgentEvalRunConfig, AgentEvalTask
from nemo_evaluator_sdk.values import Agent, Model
from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor
from nemo_evaluator_sdk.values.evidence import CandidateEvidence, EvidenceDescriptor, normalize_trace_descriptor
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

# Well-known evidence keys produced by ``standard_evidence_descriptors``. Harness
Expand Down Expand Up @@ -158,10 +158,14 @@ def standard_evidence_descriptors(
if trace_path is not None:
trace_name = Path(trace_path).name.lower()
is_atif = trace_name.startswith("atif") or ".atif." in trace_name
descriptors[EVIDENCE_TRACE] = EvidenceDescriptor(
kind="trace",
format="atif" if is_atif else "json",
ref=str(trace_path),
# Normalize the source trace into a sibling ATIF file before persistence so the
# stored descriptor is ATIF regardless of producer (no-op if already ATIF/missing).
descriptors[EVIDENCE_TRACE] = normalize_trace_descriptor(
EvidenceDescriptor(
kind="trace",
format="atif" if is_atif else "json",
ref=str(trace_path),
)
)

logs_metadata = {"primary_log": primary_log} if primary_log else {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@
"""Public value types for evaluator SDK runtime."""

from nemo_evaluator_sdk.values.agents import Agent
from nemo_evaluator_sdk.values.atif import (
FinalMetrics,
Metrics,
Observation,
ObservationResult,
Step,
ToolCall,
Trajectory,
)
from nemo_evaluator_sdk.values.common import SecretRef, SupportedJobTypes
from nemo_evaluator_sdk.values.dataset_schemas import (
FieldMapping,
Expand All @@ -17,6 +26,11 @@
FilesystemDiff,
FilesystemEntry,
LocalFilesystemEvidence,
LogHandle,
TraceHandle,
WellKnownEvidenceKey,
normalize_candidate_evidence,
normalize_trace_descriptor,
)
from nemo_evaluator_sdk.values.metrics import (
BLEU,
Expand Down Expand Up @@ -109,6 +123,18 @@
"ContinuousScore",
"FilesystemDiff",
"FilesystemEntry",
"FinalMetrics",
"LogHandle",
"Metrics",
"Observation",
"ObservationResult",
"Step",
"ToolCall",
"Trajectory",
"TraceHandle",
"WellKnownEvidenceKey",
"normalize_candidate_evidence",
"normalize_trace_descriptor",
"DatasetRow",
"DatasetRows",
"DefaultAggregateFieldName",
Expand Down
Loading
Loading