
Commit ed146c9

Authored by CLFutureX, enyst, openhands-agent, and xingyaoww
Fix(llm): When LLM calls are retried, telemetry records can be inaccurate. (#949)
Signed-off-by: CLFutureX <[email protected]>
Co-authored-by: Engel Nyst <[email protected]>
Co-authored-by: openhands <[email protected]>
Co-authored-by: Xingyao Wang <[email protected]>
1 parent 921dc86 commit ed146c9

2 files changed: +307, -1 lines changed

openhands-sdk/openhands/sdk/llm/llm.py

Lines changed: 1 addition & 1 deletion
@@ -459,7 +459,6 @@ def completion(
         }
         if tools and not use_native_fc:
             log_ctx["raw_messages"] = original_fncall_msgs
-        self._telemetry.on_request(log_ctx=log_ctx)

         # 5) do the call with retries
         @self.retry_decorator(
@@ -472,6 +471,7 @@ def completion(
         )
         def _one_attempt(**retry_kwargs) -> ModelResponse:
             assert self._telemetry is not None
+            self._telemetry.on_request(log_ctx=log_ctx)
             # Merge retry-modified kwargs (like temperature) with call_kwargs
             final_kwargs = {**call_kwargs, **retry_kwargs}
             resp = self._transport_call(messages=formatted_messages, **final_kwargs)
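
To see why the call site matters, here is a minimal standalone sketch. It is an illustration under stated assumptions, not the SDK's code: tenacity stands in for self.retry_decorator, and FakeTelemetry is a hypothetical stand-in whose on_request()/on_response() timing contract mirrors what the tests below describe (_req_start is set on request, latency is measured on response). With on_request() inside the retried function, the start timestamp is reset on every attempt, so the latency computed at on_response() covers only the attempt that succeeded.

# Minimal, self-contained sketch of the timing bug and fix. NOT the OpenHands
# SDK implementation: tenacity stands in for self.retry_decorator, and
# FakeTelemetry is a hypothetical timer with an on_request/on_response
# contract like the one the tests below rely on.
import time

from tenacity import retry, stop_after_attempt, wait_fixed


class FakeTelemetry:
    """Stores a start timestamp on request; records a latency on response."""

    def __init__(self) -> None:
        self._req_start = 0.0
        self.latencies: list[float] = []

    def on_request(self) -> None:
        self._req_start = time.time()

    def on_response(self) -> None:
        self.latencies.append(time.time() - self._req_start)


telemetry = FakeTelemetry()
attempt_count = 0


@retry(stop=stop_after_attempt(3), wait=wait_fixed(1), reraise=True)
def one_attempt() -> str:
    global attempt_count
    telemetry.on_request()  # the fix: reset the timer for *this* attempt
    attempt_count += 1
    if attempt_count < 3:
        raise ConnectionError("transient failure")  # first two attempts fail
    return "ok"


result = one_attempt()
telemetry.on_response()
# Because on_request() runs inside the retried function, the recorded latency
# covers only the successful third attempt, not the ~2s of retry waits before
# it. Calling on_request() once before the retry loop (the old behavior) would
# inflate it to the full wall-clock time of all attempts plus waits.
print(result, telemetry.latencies)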
Lines changed: 306 additions & 0 deletions
@@ -0,0 +1,306 @@
"""
Test that telemetry records are accurate when LLM calls are retried.

This test ensures that when an LLM call is retried, the telemetry only
records the latency and metrics for the successful attempt, not the
combined time of all failed attempts plus the successful one.
"""

import time
from unittest.mock import patch

from litellm.exceptions import APIConnectionError
from litellm.types.utils import Choices, Message as LiteLLMMessage, ModelResponse, Usage
from pydantic import SecretStr

from openhands.sdk.llm import LLM, Message, TextContent


def create_mock_response(
    content: str = "Test response",
    response_id: str = "test-id",
    prompt_tokens: int = 10,
    completion_tokens: int = 5,
):
    """Helper function to create properly structured mock responses."""
    return ModelResponse(
        id=response_id,
        choices=[
            Choices(
                finish_reason="stop",
                index=0,
                message=LiteLLMMessage(content=content, role="assistant"),
            )
        ],
        created=1234567890,
        model="gpt-4o",
        object="chat.completion",
        system_fingerprint="test",
        usage=Usage(
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=prompt_tokens + completion_tokens,
        ),
    )


@patch("openhands.sdk.llm.llm.litellm_completion")
def test_telemetry_records_only_successful_attempt_latency(mock_litellm_completion):
    """
    Test that when LLM calls are retried, telemetry only records the latency
    of the successful attempt, not the cumulative time of all attempts.

    Before the fix, on_request was called once before retry logic, causing
    the latency to include all failed attempts + wait times. After the fix,
    on_request is called for each retry attempt, so only the successful
    attempt's latency is recorded.
    """
    # Create mock responses for failed and successful attempts
    mock_response = create_mock_response("Success after retry")

    # Simulate 2 failures followed by success
    mock_litellm_completion.side_effect = [
        APIConnectionError(
            message="Connection failed 1",
            llm_provider="test_provider",
            model="test_model",
        ),
        APIConnectionError(
            message="Connection failed 2",
            llm_provider="test_provider",
            model="test_model",
        ),
        mock_response,  # Third attempt succeeds
    ]

    # Create LLM with retry configuration and minimal wait times for faster test
    llm = LLM(
        model="gpt-4o",
        api_key=SecretStr("test_key"),
        num_retries=3,
        retry_min_wait=1,  # 1 second minimum wait
        retry_max_wait=1,  # 1 second maximum wait (same as min for consistent timing)
        usage_id="test-service",
    )

    # Record the start time of the entire operation
    operation_start = time.time()

    # Make the completion call (will retry twice, then succeed)
    response = llm.completion(
        messages=[Message(role="user", content=[TextContent(text="Hello!")])],
    )

    # Record the total operation time
    total_operation_time = time.time() - operation_start

    # Verify the call succeeded
    assert response.raw_response == mock_response
    assert mock_litellm_completion.call_count == 3

    # Get the metrics to check recorded latency
    metrics = llm.metrics

    # The recorded latency should be much less than the total operation time
    # because it should only include the successful attempt, not the failed ones
    recorded_latencies = [latency.latency for latency in metrics.response_latencies]

    # There should be exactly one latency record (for the successful attempt)
    assert len(recorded_latencies) == 1

    recorded_latency = recorded_latencies[0]

    # The recorded latency should be significantly less than total operation time
    # Total operation time includes:
    # - First attempt (failed) + wait time
    # - Second attempt (failed) + wait time
    # - Third attempt (successful)
    #
    # The recorded latency should only include the third attempt
    assert recorded_latency < total_operation_time * 0.5, (
        f"Recorded latency ({recorded_latency:.3f}s) should be much less "
        f"than total operation time ({total_operation_time:.3f}s)"
    )

    # The recorded latency should be relatively small (just the mock call time)
    # Since we're mocking, it should be very quick (< 100ms typically)
    assert recorded_latency < 0.5, (
        f"Recorded latency ({recorded_latency:.3f}s) should be < 0.5s for a mocked call"
    )


@patch("openhands.sdk.llm.llm.litellm_completion")
def test_telemetry_on_request_called_per_retry(mock_litellm_completion):
    """
    Test that telemetry.on_request() is called for each retry attempt.

    This ensures that each retry resets the request timer, so only the
    successful attempt's latency is recorded.

    We verify this by checking the _req_start timestamps which are set
    by on_request(). With the fix, _req_start should be reset for each retry.
    """
    # Track _req_start values to see when on_request is called
    req_start_values = []

    mock_response = create_mock_response("Success after one retry")

    # Create a side effect function that captures _req_start after each attempt
    def mock_transport_call_side_effect(*args, **kwargs):
        # Capture the current _req_start value (set by on_request)
        # This runs inside _one_attempt, after on_request is called
        nonlocal req_start_values
        req_start_values.append(time.time())

        # First call fails, second succeeds
        if len(req_start_values) == 1:
            raise APIConnectionError(
                message="Connection failed",
                llm_provider="test_provider",
                model="test_model",
            )
        return mock_response

    mock_litellm_completion.side_effect = mock_transport_call_side_effect

    # Create LLM instance
    llm = LLM(
        model="gpt-4o",
        api_key=SecretStr("test_key"),
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=1,
        usage_id="test-service",
    )

    # Make the completion call
    response = llm.completion(
        messages=[Message(role="user", content=[TextContent(text="Test")])],
    )

    # Verify the call succeeded
    assert response.raw_response == mock_response

    # Should have attempted twice (one failure, one success)
    assert len(req_start_values) == 2, (
        f"Expected 2 attempts, got {len(req_start_values)}"
    )

    # Verify there was a time gap between the attempts (retry wait time)
    # This proves on_request was called for each attempt
    time_gap = req_start_values[1] - req_start_values[0]
    assert time_gap >= 0.5, (
        "There should be a wait time between retry attempts "
        f"(gap: {time_gap:.3f}s, expected >= 0.5s due to 1 second retry wait)"
    )


@patch("openhands.sdk.llm.llm.litellm_completion")
def test_telemetry_metrics_accurate_with_retries(mock_litellm_completion):
    """
    Test that all telemetry metrics (tokens, cost, latency) are accurate
    when retries occur.
    """
    # Create a response with specific token counts
    mock_response = create_mock_response(
        "Success", prompt_tokens=100, completion_tokens=50
    )

    # Simulate one failure then success
    mock_litellm_completion.side_effect = [
        APIConnectionError(
            message="Connection failed",
            llm_provider="test_provider",
            model="test_model",
        ),
        mock_response,
    ]

    # Create LLM with cost tracking
    llm = LLM(
        model="gpt-4o",
        api_key=SecretStr("test_key"),
        num_retries=2,
        retry_min_wait=1,
        retry_max_wait=1,
        usage_id="test-service",
        input_cost_per_token=0.001,
        output_cost_per_token=0.002,
    )

    # Make the completion call
    response = llm.completion(
        messages=[Message(role="user", content=[TextContent(text="Test")])],
    )

    # Verify the call succeeded
    assert response.raw_response == mock_response

    # Get metrics
    metrics = llm.metrics

    # Token usage should only reflect the successful attempt
    assert len(metrics.token_usages) == 1
    token_usage = metrics.token_usages[0]
    assert token_usage.prompt_tokens == 100
    assert token_usage.completion_tokens == 50

    # Cost should only reflect the successful attempt
    # Note: Cost calculation depends on litellm, so we just verify it's positive
    assert metrics.accumulated_cost > 0

    # Latency should only reflect the successful attempt (should be small)
    assert len(metrics.response_latencies) == 1
    assert metrics.response_latencies[0].latency < 0.5


@patch("openhands.sdk.llm.llm.litellm_completion")
def test_telemetry_no_multiple_records_on_retry(mock_litellm_completion):
    """
    Test that telemetry doesn't create multiple records for failed attempts.

    Only the successful attempt should result in telemetry records.
    """
    mock_response = create_mock_response("Success")

    # Simulate multiple failures then success
    mock_litellm_completion.side_effect = [
        APIConnectionError(
            message="Fail 1", llm_provider="test_provider", model="test_model"
        ),
        APIConnectionError(
            message="Fail 2", llm_provider="test_provider", model="test_model"
        ),
        APIConnectionError(
            message="Fail 3", llm_provider="test_provider", model="test_model"
        ),
        mock_response,
    ]

    llm = LLM(
        model="gpt-4o",
        api_key=SecretStr("test_key"),
        num_retries=5,
        retry_min_wait=1,
        retry_max_wait=1,
        usage_id="test-service",
    )

    # Make the completion call
    response = llm.completion(
        messages=[Message(role="user", content=[TextContent(text="Test")])],
    )

    assert response.raw_response == mock_response

    metrics = llm.metrics

    # Should only have ONE latency record (for the successful attempt)
    assert len(metrics.response_latencies) == 1

    # Should only have ONE token usage record (for the successful attempt)
    assert len(metrics.token_usages) == 1

    # Should only have ONE cost record (for the successful attempt)
    # Cost is accumulated, so we just check it's positive
    assert metrics.accumulated_cost > 0
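
For reference, a short caller-side usage sketch built only from the public surface these tests exercise (LLM, Message, TextContent, and llm.metrics). It is illustrative, not part of the commit: the model name and placeholder API key are assumptions, and a real provider key would be needed to run it. After this fix, each entry in metrics.response_latencies reflects a single successful attempt even when transient errors forced retries.

# Illustrative caller-side view; mirrors the metric fields asserted in the
# tests above (response_latencies, token_usages, accumulated_cost).
from pydantic import SecretStr

from openhands.sdk.llm import LLM, Message, TextContent

llm = LLM(
    model="gpt-4o",               # assumed model name
    api_key=SecretStr("sk-..."),  # placeholder, not a real credential
    num_retries=3,
    retry_min_wait=1,
    retry_max_wait=10,
    usage_id="docs-example",
)

response = llm.completion(
    messages=[Message(role="user", content=[TextContent(text="Hello!")])],
)

# Each latency entry covers a single successful attempt, even if transient
# provider errors forced retries before it.
for record in llm.metrics.response_latencies:
    print(f"latency: {record.latency:.3f}s")

for usage in llm.metrics.token_usages:
    print(f"prompt={usage.prompt_tokens} completion={usage.completion_tokens}")

print(f"accumulated cost: {llm.metrics.accumulated_cost}")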

0 commit comments
