vllm-project · dreamer-89 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/docs/guides/metrics.md b/docs/guides/metrics.md
@@ -71,6 +71,16 @@ These metrics provide a breakdown of the overall request statuses, helping users
 - **Definition**: The average time taken to generate each output token, including the first token.
 - **Use Case**: Provides a detailed view of the model's token generation efficiency.
 
+### Time To Last Round Trip
+
+- **Definition**: For the realtime WebSocket backend (`openai_websocket`), the time from the last sent packet to the last received token.
+- **Use Case**: Measures tail latency of a streaming exchange (how long the final output lags the final input).
+
+### Average Round-Trip Time (Avg RTT)
+
+- **Definition**: For the realtime WebSocket backend (`openai_websocket`), the mean of the received-token timestamps minus the mean of the sent-packet timestamps.
+- **Use Case**: Estimates the average send-to-receive lag across a request. It is approximate, since it assumes sent packets and received tokens line up evenly in time.
+
 ## Statistical Summaries
 
 GuideLLM provides detailed statistical summaries for each of the above metrics using the `StatusDistributionSummary` and `DistributionSummary` models. These summaries include the following statistics:

diff --git a/src/guidellm/backends/openai/websocket.py b/src/guidellm/backends/openai/websocket.py
@@ -80,6 +80,9 @@ def _record_content_tokens(
     if content_tokens <= 0:
         return False
 
+    request_info.timings.token_received_sum += iter_time
+    request_info.timings.token_received_count += 1
+
     if request_info.timings.first_token_iteration is None:
         request_info.timings.first_token_iteration = iter_time
         request_info.timings.token_iterations = 0
@@ -92,6 +95,18 @@ def _record_content_tokens(
     return False
 
 
+def _record_request_sent(request_info: RequestInfo) -> None:
+    """
+    Stamp the send time of one outbound WebSocket frame for round-trip metrics.
+
+    :param request_info: Request whose timings receive the new send timestamp.
+    """
+    timings, now = request_info.timings, time.time()
+    timings.last_request_sent = now
+    timings.request_sent_sum += now
+    timings.request_sent_count += 1
+
+
 def _load_ws_event(raw: str) -> dict[str, Any]:
     """Parse a JSON WebSocket text frame; raise RuntimeError on invalid JSON."""
     try:
@@ -374,8 +389,9 @@ async def resolve(  # type: ignore[override, misc]  # noqa: C901, PLR0912, PLR09
         self,
         request: GenerationRequest,
         request_info: RequestInfo,
-        history: list[tuple[GenerationRequest, GenerationResponse | None]]
-        | None = None,
+        history: (
+            list[tuple[GenerationRequest, GenerationResponse | None]] | None
+        ) = None,
     ) -> AsyncIterator[tuple[GenerationResponse | None, RequestInfo]]:
         """
         Stream one realtime transcription over WebSocket for a single audio column.
@@ -451,18 +467,22 @@ async def resolve(  # type: ignore[override, misc]  # noqa: C901, PLR0912, PLR09
                         f"Expected session.created, got {first_event.get('type')!r}"
                     )
                 await ws.send(_json_text(session_update))
+                _record_request_sent(request_info)
                 for b64_chunk in chunks:
                     await ws.send(
                         _json_text(
                             {"type": "input_audio_buffer.append", "audio": b64_chunk}
                         )
                     )
+                    _record_request_sent(request_info)
                 await ws.send(
                     _json_text({"type": "input_audio_buffer.commit", "final": False})
                 )
+                _record_request_sent(request_info)
                 await ws.send(
                     _json_text({"type": "input_audio_buffer.commit", "final": True})
                 )
+                _record_request_sent(request_info)
 
                 ignored_events = 0
                 while True:

diff --git a/src/guidellm/benchmark/outputs/console.py b/src/guidellm/benchmark/outputs/console.py
@@ -480,6 +480,16 @@ def print_request_latency_table(self, report: GenerativeBenchmarksReport):
                 group="TPOT",
                 name="ms",
             )
+            columns.add_stats(
+                benchmark.metrics.time_to_last_round_trip_ms,
+                group="Last RT",
+                name="ms",
+            )
+            columns.add_stats(
+                benchmark.metrics.avg_round_trip_time_ms,
+                group="Avg RTT",
+                name="ms",
+            )
 
         headers, values = columns.get_table_data()
         self.console.print("\n")

diff --git a/src/guidellm/benchmark/outputs/csv.py b/src/guidellm/benchmark/outputs/csv.py
@@ -454,6 +454,20 @@ def _add_request_latency_metrics(
             "Inter Token Latency",
             "ms",
         )
+        self._add_stats_for_metric(
+            headers,
+            values,
+            benchmark.metrics.time_to_last_round_trip_ms,
+            "Time To Last Round Trip",
+            "ms",
+        )
+        self._add_stats_for_metric(
+            headers,
+            values,
+            benchmark.metrics.avg_round_trip_time_ms,
+            "Avg Round Trip Time",
+            "ms",
+        )
 
     def _add_server_throughput_metrics(
         self,

diff --git a/src/guidellm/benchmark/schemas/accumulator.py b/src/guidellm/benchmark/schemas/accumulator.py
@@ -495,6 +495,14 @@ class GenerativeMetricsAccumulator(StandardBaseModel):
         default_factory=RunningMetricStats,
         description="Accumulated time to first token statistics in milliseconds",
     )
+    time_to_last_round_trip_ms: RunningMetricStats = Field(
+        default_factory=RunningMetricStats,
+        description="Accumulated websocket last round-trip latency in milliseconds",
+    )
+    avg_round_trip_time_ms: RunningMetricStats = Field(
+        default_factory=RunningMetricStats,
+        description="Accumulated websocket average round-trip time in milliseconds",
+    )
     time_to_first_output_token_ms: RunningMetricStats = Field(
         default_factory=RunningMetricStats,
         description="Accumulated time to first content token stats in ms",
@@ -539,6 +547,12 @@ def update_estimate(self, stats: GenerativeRequestStats, duration: float):
         self.time_to_first_token_ms.update_estimate(
             stats.time_to_first_token_ms, duration=duration
         )
+        self.time_to_last_round_trip_ms.update_estimate(
+            stats.time_to_last_round_trip_ms, duration=duration
+        )
+        self.avg_round_trip_time_ms.update_estimate(
+            stats.avg_round_trip_time_ms, duration=duration
+        )
         self.time_to_first_output_token_ms.update_estimate(
             stats.time_to_first_output_token_ms, duration=duration
         )

diff --git a/src/guidellm/benchmark/schemas/metrics.py b/src/guidellm/benchmark/schemas/metrics.py
@@ -813,6 +813,18 @@ class GenerativeMetrics(StandardBaseDict):
     inter_token_latency_ms: StatusDistributionSummary = Field(
         description="Distribution of inter-token latencies in milliseconds"
     )
+    time_to_last_round_trip_ms: StatusDistributionSummary = Field(
+        description=(
+            "Distribution of websocket last-round-trip latencies in milliseconds "
+            "(last received token minus last sent packet)"
+        )
+    )
+    avg_round_trip_time_ms: StatusDistributionSummary = Field(
+        description=(
+            "Distribution of approximate websocket average round-trip times in "
+            "milliseconds (mean received minus mean sent)"
+        )
+    )
     prompt_tokens_per_second: StatusDistributionSummary = Field(
         description="Distribution of prompt token processing rates"
     )
@@ -889,10 +901,12 @@ def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetri
             ),
             request_concurrency=StatusDistributionSummary.concurrency_distribution_from_timings_function(
                 function=(
-                    lambda req: (req.request_start_time, req.request_end_time)
-                    if req.request_start_time is not None
-                    and req.request_end_time is not None
-                    else None
+                    lambda req: (
+                        (req.request_start_time, req.request_end_time)
+                        if req.request_start_time is not None
+                        and req.request_end_time is not None
+                        else None
+                    )
                 ),
                 successful=successful,
                 incomplete=incomplete,
@@ -937,6 +951,18 @@ def compile(cls, accumulator: GenerativeBenchmarkAccumulator) -> GenerativeMetri
                 incomplete=incomplete,
                 errored=errored,
             ),
+            time_to_last_round_trip_ms=StatusDistributionSummary.from_values_function(
+                function=lambda req: req.time_to_last_round_trip_ms or 0.0,
+                successful=successful,
+                incomplete=incomplete,
+                errored=errored,
+            ),
+            avg_round_trip_time_ms=StatusDistributionSummary.from_values_function(
+                function=lambda req: req.avg_round_trip_time_ms or 0.0,
+                successful=successful,
+                incomplete=incomplete,
+                errored=errored,
+            ),
             time_to_first_output_token_ms=StatusDistributionSummary.from_values_function(
                 function=lambda req: req.time_to_first_output_token_ms or 0.0,
                 successful=successful,

diff --git a/src/guidellm/schemas/info.py b/src/guidellm/schemas/info.py
@@ -78,6 +78,35 @@ class RequestTimings(StandardBaseDict):
     token_iterations: int = Field(
         default=0,
     )
+    last_request_sent: float | None = Field(
+        default=None,
+        description=(
+            "Unix timestamp of the last packet sent to the server, used for "
+            "round-trip metrics (openai_websocket backend)"
+        ),
+    )
+    request_sent_sum: float = Field(
+        default=0.0,
+        description=(
+            "Sum of sent-packet timestamps for mean round-trip estimation "
+            "(openai_websocket backend)"
+        ),
+    )
+    request_sent_count: int = Field(
+        default=0,
+        description="Number of packets sent to the server (openai_websocket backend)",
+    )
+    token_received_sum: float = Field(
+        default=0.0,
+        description=(
+            "Sum of received content-token timestamps for mean round-trip "
+            "estimation (openai_websocket backend)"
+        ),
+    )
+    token_received_count: int = Field(
+        default=0,
+        description="Number of content tokens received (openai_websocket backend)",
+    )
     request_end: float | None = Field(
         default=None,
         description="Unix timestamp when the backend completed processing the request",

diff --git a/src/guidellm/schemas/request_stats.py b/src/guidellm/schemas/request_stats.py
@@ -162,6 +162,45 @@ def time_to_first_token_ms(self) -> float | None:
 
         return 1000 * (first_token - start)
 
+    @computed_field  # type: ignore[misc]
+    @property
+    def time_to_last_round_trip_ms(self) -> float | None:
+        """
+        Latency between the final outbound packet and the final received token.
+
+        Requires both a recorded last-send timestamp and a recorded last-token
+        timestamp; when either one is missing the metric is None.
+
+        :return: Last round-trip latency in milliseconds, or None if unavailable
+        """
+        timings = self.info.timings
+        final_token_at = timings.last_token_iteration
+        final_send_at = timings.last_request_sent
+        if final_token_at is not None and final_send_at is not None:
+            return 1000 * (final_token_at - final_send_at)
+        return None
+
+    @computed_field  # type: ignore[misc]
+    @property
+    def avg_round_trip_time_ms(self) -> float | None:
+        """
+        Estimated mean lag between sending packets and receiving tokens.
+
+        Subtracts the average sent-packet timestamp from the average received
+        content-token timestamp and scales the difference to milliseconds.
+        The result is None unless at least one send and at least one received
+        token have been counted.
+
+        :return: Average round-trip time in milliseconds, or None if unavailable
+        """
+        t = self.info.timings
+        sends, tokens = t.request_sent_count, t.token_received_count
+        if sends > 0 and tokens > 0:
+            avg_sent_at = t.request_sent_sum / sends
+            avg_token_at = t.token_received_sum / tokens
+            return 1000 * (avg_token_at - avg_sent_at)
+        return None
+
     @computed_field  # type: ignore[misc]
     @property
     def time_per_output_token_ms(self) -> float | None: