Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/guides/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,16 @@ These metrics provide a breakdown of the overall request statuses, helping users
- **Definition**: The average time taken to generate each output token, including the first token.
- **Use Case**: Provides a detailed view of the model's token generation efficiency.

### Time To Last Round Trip

- **Definition**: For the realtime WebSocket backend (`openai_websocket`), the elapsed time, in milliseconds, from the last sent packet to the last received token.
- **Use Case**: Measures tail latency of a streaming exchange (how long the final output lags the final input).

### Average Round-Trip Time (Avg RTT)

- **Definition**: For the realtime WebSocket backend (`openai_websocket`), the mean of received-token timestamps minus the mean of sent-packet timestamps, reported in milliseconds.
- **Use Case**: Estimates the average send-to-receive lag across a request. It is approximate, since it assumes sent packets and received tokens line up evenly in time.

## Statistical Summaries

GuideLLM provides detailed statistical summaries for each of the above metrics using the `StatusDistributionSummary` and `DistributionSummary` models. These summaries include the following statistics:
Expand Down
24 changes: 22 additions & 2 deletions src/guidellm/backends/openai/websocket.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,9 @@ def _record_content_tokens(
if content_tokens <= 0:
return False

request_info.timings.token_received_sum += iter_time
request_info.timings.token_received_count += 1

if request_info.timings.first_token_iteration is None:
request_info.timings.first_token_iteration = iter_time
request_info.timings.token_iterations = 0
Expand All @@ -92,6 +95,18 @@ def _record_content_tokens(
return False


def _record_request_sent(request_info: RequestInfo) -> None:
    """
    Stamp a single outbound WebSocket frame onto the request's timing state.

    Captures the current wall-clock time once and uses it to refresh the
    last-sent timestamp and to extend the running sum and count of send
    timestamps that the round-trip metrics are computed from.

    :param request_info: Mutable timing state for the in-flight request.
    """
    now = time.time()
    timings = request_info.timings
    # Running aggregates: only the sum and count are kept, not each timestamp.
    timings.request_sent_count += 1
    timings.request_sent_sum += now
    timings.last_request_sent = now


def _load_ws_event(raw: str) -> dict[str, Any]:
"""Parse a JSON WebSocket text frame; raise RuntimeError on invalid JSON."""
try:
Expand Down Expand Up @@ -374,8 +389,9 @@ async def resolve( # type: ignore[override, misc] # noqa: C901, PLR0912, PLR09
self,
request: GenerationRequest,
request_info: RequestInfo,
history: list[tuple[GenerationRequest, GenerationResponse | None]]
| None = None,
history: (
list[tuple[GenerationRequest, GenerationResponse | None]] | None
) = None,
) -> AsyncIterator[tuple[GenerationResponse | None, RequestInfo]]:
"""
Stream one realtime transcription over WebSocket for a single audio column.
Expand Down Expand Up @@ -451,18 +467,22 @@ async def resolve( # type: ignore[override, misc] # noqa: C901, PLR0912, PLR09
f"Expected session.created, got {first_event.get('type')!r}"
)
await ws.send(_json_text(session_update))
_record_request_sent(request_info)
for b64_chunk in chunks:
await ws.send(
_json_text(
{"type": "input_audio_buffer.append", "audio": b64_chunk}
)
)
_record_request_sent(request_info)
await ws.send(
_json_text({"type": "input_audio_buffer.commit", "final": False})
)
_record_request_sent(request_info)
await ws.send(
_json_text({"type": "input_audio_buffer.commit", "final": True})
)
_record_request_sent(request_info)

ignored_events = 0
while True:
Expand Down
10 changes: 10 additions & 0 deletions src/guidellm/benchmark/outputs/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,6 +480,16 @@ def print_request_latency_table(self, report: GenerativeBenchmarksReport):
group="TPOT",
name="ms",
)
columns.add_stats(
benchmark.metrics.time_to_last_round_trip_ms,
group="Last RT",
name="ms",
)
columns.add_stats(
benchmark.metrics.avg_round_trip_time_ms,
group="Avg RTT",
name="ms",
)

headers, values = columns.get_table_data()
self.console.print("\n")
Expand Down
14 changes: 14 additions & 0 deletions src/guidellm/benchmark/outputs/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,20 @@ def _add_request_latency_metrics(
"Inter Token Latency",
"ms",
)
self._add_stats_for_metric(
headers,
values,
benchmark.metrics.time_to_last_round_trip_ms,
"Time To Last Round Trip",
"ms",
)
self._add_stats_for_metric(
headers,
values,
benchmark.metrics.avg_round_trip_time_ms,
"Avg Round Trip Time",
"ms",
)

def _add_server_throughput_metrics(
self,
Expand Down
14 changes: 14 additions & 0 deletions src/guidellm/benchmark/schemas/accumulator.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,14 @@ class GenerativeMetricsAccumulator(StandardBaseModel):
default_factory=RunningMetricStats,
description="Accumulated time to first token statistics in milliseconds",
)
time_to_last_round_trip_ms: RunningMetricStats = Field(
default_factory=RunningMetricStats,
description="Accumulated websocket last round-trip latency in milliseconds",
)
avg_round_trip_time_ms: RunningMetricStats = Field(
default_factory=RunningMetricStats,
description="Accumulated websocket average round-trip time in milliseconds",
)
time_to_first_output_token_ms: RunningMetricStats = Field(
default_factory=RunningMetricStats,
description="Accumulated time to first content token stats in ms",
Expand Down Expand Up @@ -539,6 +547,12 @@ def update_estimate(self, stats: GenerativeRequestStats, duration: float):
self.time_to_first_token_ms.update_estimate(
stats.time_to_first_token_ms, duration=duration
)
self.time_to_last_round_trip_ms.update_estimate(
stats.time_to_last_round_trip_ms, duration=duration
)
self.avg_round_trip_time_ms.update_estimate(
stats.avg_round_trip_time_ms, duration=duration
)
self.time_to_first_output_token_ms.update_estimate(
stats.time_to_first_output_token_ms, duration=duration
)
Expand Down
34 changes: 30 additions & 4 deletions src/guidellm/benchmark/schemas/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -813,6 +813,18 @@ class GenerativeMetrics(StandardBaseDict):
inter_token_latency_ms: StatusDistributionSummary = Field(
description="Distribution of inter-token latencies in milliseconds"
)
time_to_last_round_trip_ms: StatusDistributionSummary = Field(
description=(
"Distribution of websocket last-round-trip latencies in milliseconds "
"(last received token minus last sent packet)"
)
)
avg_round_trip_time_ms: StatusDistributionSummary = Field(
description=(
"Distribution of approximate websocket average round-trip times in "
"milliseconds (mean received minus mean sent)"
)
)
prompt_tokens_per_second: StatusDistributionSummary = Field(
description="Distribution of prompt token processing rates"
)
Expand Down Expand Up @@ -889,10 +901,12 @@ def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetri
),
request_concurrency=StatusDistributionSummary.concurrency_distribution_from_timings_function(
function=(
lambda req: (req.request_start_time, req.request_end_time)
if req.request_start_time is not None
and req.request_end_time is not None
else None
lambda req: (
(req.request_start_time, req.request_end_time)
if req.request_start_time is not None
and req.request_end_time is not None
else None
)
),
successful=successful,
incomplete=incomplete,
Expand Down Expand Up @@ -937,6 +951,18 @@ def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetri
incomplete=incomplete,
errored=errored,
),
time_to_last_round_trip_ms=StatusDistributionSummary.from_values_function(
function=lambda req: req.time_to_last_round_trip_ms or 0.0,
successful=successful,
incomplete=incomplete,
errored=errored,
),
avg_round_trip_time_ms=StatusDistributionSummary.from_values_function(
function=lambda req: req.avg_round_trip_time_ms or 0.0,
successful=successful,
incomplete=incomplete,
errored=errored,
),
time_to_first_output_token_ms=StatusDistributionSummary.from_values_function(
function=lambda req: req.time_to_first_output_token_ms or 0.0,
successful=successful,
Expand Down
29 changes: 29 additions & 0 deletions src/guidellm/schemas/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,35 @@ class RequestTimings(StandardBaseDict):
token_iterations: int = Field(
default=0,
)
last_request_sent: float | None = Field(
default=None,
description=(
"Unix timestamp of the last packet sent to the server, used for "
"round-trip metrics (openai_websocket backend)"
),
)
request_sent_sum: float = Field(
default=0.0,
description=(
"Sum of sent-packet timestamps for mean round-trip estimation "
"(openai_websocket backend)"
),
)
request_sent_count: int = Field(
default=0,
description="Number of packets sent to the server (openai_websocket backend)",
)
token_received_sum: float = Field(
default=0.0,
description=(
"Sum of received content-token timestamps for mean round-trip "
"estimation (openai_websocket backend)"
),
)
token_received_count: int = Field(
default=0,
description="Number of content tokens received (openai_websocket backend)",
)
request_end: float | None = Field(
default=None,
description="Unix timestamp when the backend completed processing the request",
Expand Down
39 changes: 39 additions & 0 deletions src/guidellm/schemas/request_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,45 @@ def time_to_first_token_ms(self) -> float | None:

return 1000 * (first_token - start)

@computed_field  # type: ignore[misc]
@property
def time_to_last_round_trip_ms(self) -> float | None:
    """
    Lag between the final sent packet and the final received token, in ms.

    Depends on send timestamps, which only the websocket backend records;
    backends that do not record sends yield None.

    :return: Last round-trip latency in milliseconds, or None if either the
        last-sent or last-received timestamp is missing
    """
    timings = self.info.timings
    sent_at = timings.last_request_sent
    received_at = timings.last_token_iteration
    if sent_at is None or received_at is None:
        return None

    # Timestamps are in seconds; scale the difference to milliseconds.
    return 1000 * (received_at - sent_at)

@computed_field  # type: ignore[misc]
@property
def avg_round_trip_time_ms(self) -> float | None:
    """
    Approximate mean send-to-receive lag for the request, in milliseconds.

    Takes the average received content-token timestamp and subtracts the
    average sent-packet timestamp. The result is an estimate: it assumes
    sent packets and received tokens are spread uniformly in time. Only the
    websocket backend populates the underlying counters; None otherwise.

    :return: Average round-trip time in milliseconds, or None if no packets
        were sent or no tokens were received
    """
    timings = self.info.timings
    sent_count = timings.request_sent_count
    received_count = timings.token_received_count
    # Without at least one sample on each side there is no mean to compare.
    if sent_count <= 0 or received_count <= 0:
        return None

    sent_mean = timings.request_sent_sum / sent_count
    received_mean = timings.token_received_sum / received_count
    return 1000 * (received_mean - sent_mean)

@computed_field # type: ignore[misc]
@property
def time_per_output_token_ms(self) -> float | None:
Expand Down
Loading