feat(anthropic): Add proper tool calling data to Anthropic integration (#4769)

antonpirker · web-flow · commit 398b7c4f4486 · 2025-09-12T11:31:44.000+02:00
- Format the response of the LLM (`gen_ai.response.text`) correctly: instead of the JSON-serialized content blocks, only the actual text that was returned is used.
- Add responses for tool calls (`gen_ai.response.tool_calls`) to the LLM spans.
- Add results of tool calls to the request (`gen_ai.request.messages`).

Before: <img width="1120" height="570" alt="Screenshot 2025-09-12 at 10 43 32" src="https://github.com/user-attachments/assets/3c9aa656-b7d8-4520-9220-87dad45e49fb" />

After: <img width="1120" height="690" alt="Screenshot 2025-09-12 at 10 45 11" src="https://github.com/user-attachments/assets/3d33b27a-f1aa-4467-b2f3-cb16ce1de31e" />
diff --git a/sentry_sdk/integrations/anthropic.py b/sentry_sdk/integrations/anthropic.py
@@ -1,5 +1,4 @@
 from functools import wraps
-import json
 from typing import TYPE_CHECKING
 
 import sentry_sdk
@@ -117,8 +116,29 @@ def _set_input_data(span, kwargs, integration):
         and should_send_default_pii()
         and integration.include_prompts
     ):
+        normalized_messages = []
+        for message in messages:
+            if (
+                message.get("role") == "user"
+                and "content" in message
+                and isinstance(message["content"], (list, tuple))
+            ):
+                for item in message["content"]:
+                    if item.get("type") == "tool_result":
+                        normalized_messages.append(
+                            {
+                                "role": "tool",
+                                "content": {
+                                    "tool_use_id": item.get("tool_use_id"),
+                                    "output": item.get("content"),
+                                },
+                            }
+                        )
+            else:
+                normalized_messages.append(message)
+
         set_data_normalized(
-            span, SPANDATA.GEN_AI_REQUEST_MESSAGES, safe_serialize(messages)
+            span, SPANDATA.GEN_AI_REQUEST_MESSAGES, normalized_messages, unpack=False
         )
 
     set_data_normalized(
@@ -159,21 +179,36 @@ def _set_output_data(
     Set output data for the span based on the AI response."""
     span.set_data(SPANDATA.GEN_AI_RESPONSE_MODEL, model)
     if should_send_default_pii() and integration.include_prompts:
-        set_data_normalized(
-            span,
-            SPANDATA.GEN_AI_RESPONSE_TEXT,
-            json.dumps(content_blocks),
-            unpack=False,
-        )
+        output_messages = {
+            "response": [],
+            "tool": [],
+        }  # type: (dict[str, list[Any]])
+
+        for output in content_blocks:
+            if output["type"] == "text":
+                output_messages["response"].append(output["text"])
+            elif output["type"] == "tool_use":
+                output_messages["tool"].append(output)
+
+        if len(output_messages["tool"]) > 0:
+            set_data_normalized(
+                span,
+                SPANDATA.GEN_AI_RESPONSE_TOOL_CALLS,
+                output_messages["tool"],
+                unpack=False,
+            )
+
+        if len(output_messages["response"]) > 0:
+            set_data_normalized(
+                span, SPANDATA.GEN_AI_RESPONSE_TEXT, output_messages["response"]
+            )
 
     record_token_usage(
         span,
         input_tokens=input_tokens,
         output_tokens=output_tokens,
     )
 
-    # TODO: GEN_AI_RESPONSE_TOOL_CALLS ?
-
     if finish_span:
         span.__exit__(None, None, None)
 
diff --git a/tests/integrations/anthropic/test_anthropic.py b/tests/integrations/anthropic/test_anthropic.py
@@ -1,6 +1,6 @@
+import pytest
 from unittest import mock
 
-
 try:
     from unittest.mock import AsyncMock
 except ImportError:
@@ -10,7 +10,6 @@ async def __call__(self, *args, **kwargs):
             return super(AsyncMock, self).__call__(*args, **kwargs)
 
 
-import pytest
 from anthropic import Anthropic, AnthropicError, AsyncAnthropic, AsyncStream, Stream
 from anthropic.types import MessageDeltaUsage, TextDelta, Usage
 from anthropic.types.content_block_delta_event import ContentBlockDeltaEvent
@@ -20,9 +19,6 @@ async def __call__(self, *args, **kwargs):
 from anthropic.types.message_delta_event import MessageDeltaEvent
 from anthropic.types.message_start_event import MessageStartEvent
 
-from sentry_sdk.integrations.anthropic import _set_output_data, _collect_ai_data
-from sentry_sdk.utils import package_version
-
 try:
     from anthropic.types import InputJSONDelta
 except ImportError:
@@ -46,9 +42,16 @@ async def __call__(self, *args, **kwargs):
 
 from sentry_sdk import start_transaction, start_span
 from sentry_sdk.consts import OP, SPANDATA
-from sentry_sdk.integrations.anthropic import AnthropicIntegration
+from sentry_sdk.integrations.anthropic import (
+    AnthropicIntegration,
+    _set_output_data,
+    _collect_ai_data,
+)
+from sentry_sdk.utils import package_version
+
 
 ANTHROPIC_VERSION = package_version("anthropic")
+
 EXAMPLE_MESSAGE = Message(
     id="id",
     model="model",
@@ -121,10 +124,7 @@ def test_nonstreaming_create_message(
             span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]
             == '[{"role": "user", "content": "Hello, Claude"}]'
         )
-        assert (
-            span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
-            == '[{"text": "Hi, I\'m Claude.", "type": "text"}]'
-        )
+        assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude."
     else:
         assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"]
         assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"]
@@ -193,10 +193,7 @@ async def test_nonstreaming_create_message_async(
             span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]
             == '[{"role": "user", "content": "Hello, Claude"}]'
         )
-        assert (
-            span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
-            == '[{"text": "Hi, I\'m Claude.", "type": "text"}]'
-        )
+        assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi, I'm Claude."
     else:
         assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"]
         assert SPANDATA.GEN_AI_RESPONSE_TEXT not in span["data"]
@@ -296,10 +293,7 @@ def test_streaming_create_message(
             span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]
             == '[{"role": "user", "content": "Hello, Claude"}]'
         )
-        assert (
-            span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
-            == '[{"text": "Hi! I\'m Claude!", "type": "text"}]'
-        )
+        assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!"
 
     else:
         assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"]
@@ -403,10 +397,7 @@ async def test_streaming_create_message_async(
             span["data"][SPANDATA.GEN_AI_REQUEST_MESSAGES]
             == '[{"role": "user", "content": "Hello, Claude"}]'
         )
-        assert (
-            span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
-            == '[{"text": "Hi! I\'m Claude!", "type": "text"}]'
-        )
+        assert span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT] == "Hi! I'm Claude!"
 
     else:
         assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"]
@@ -539,7 +530,7 @@ def test_streaming_create_message_with_input_json_delta(
         )
         assert (
             span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
-            == '[{"text": "{\'location\': \'San Francisco, CA\'}", "type": "text"}]'
+            == "{'location': 'San Francisco, CA'}"
         )
     else:
         assert SPANDATA.GEN_AI_REQUEST_MESSAGES not in span["data"]
@@ -679,7 +670,7 @@ async def test_streaming_create_message_with_input_json_delta_async(
         )
         assert (
             span["data"][SPANDATA.GEN_AI_RESPONSE_TEXT]
-            == '[{"text": "{\'location\': \'San Francisco, CA\'}", "type": "text"}]'
+            == "{'location': 'San Francisco, CA'}"
         )
 
     else:
@@ -835,7 +826,7 @@ def test_set_output_data_with_input_json_delta(sentry_init):
 
         assert (
             span._data.get(SPANDATA.GEN_AI_RESPONSE_TEXT)
-            == "[{\"text\": \"{'test': 'data','more': 'json'}\", \"type\": \"text\"}]"
+            == "{'test': 'data','more': 'json'}"
         )
         assert span._data.get(SPANDATA.GEN_AI_USAGE_INPUT_TOKENS) == 10
         assert span._data.get(SPANDATA.GEN_AI_USAGE_OUTPUT_TOKENS) == 20