From 33dfca16da7515edc9db9fe16e1a170ff42aed9d Mon Sep 17 00:00:00 2001 From: zhsnddn <2027606134@qq.com> Date: Mon, 13 Apr 2026 21:07:17 +0800 Subject: [PATCH 1/2] fix(core): preserve multimodal content parts in DashScope tool results instead of merging into text --- .../dashscope/DashScopeMessageConverter.java | 81 +++++++++++++++++-- ...DashScopeChatFormatterGroundTruthTest.java | 19 +++-- ...opeMultiAgentFormatterGroundTruthTest.java | 35 ++++---- .../DashScopeMessageConverterTest.java | 47 +++++++++++ 4 files changed, 148 insertions(+), 34 deletions(-) diff --git a/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java b/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java index 20f362da3..96e179207 100644 --- a/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java +++ b/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java @@ -168,9 +168,12 @@ private DashScopeMessage convertToMultimodalContent(Msg msg) { private DashScopeMessage convertToolRoleMessage(Msg msg) { ToolResultBlock toolResult = msg.getFirstContentBlock(ToolResultBlock.class); if (toolResult != null) { - String toolResultText = toolResultConverter.apply(toolResult.getOutput()); - List content = new ArrayList<>(); - content.add(DashScopeContentPart.text(toolResultText)); + List content = + hasMediaContent(toolResult.getOutput()) + ? convertContentBlocks(toolResult.getOutput()) + : List.of( + DashScopeContentPart.text( + toolResultConverter.apply(toolResult.getOutput()))); return DashScopeMessage.builder() .role("tool") @@ -181,9 +184,10 @@ private DashScopeMessage convertToolRoleMessage(Msg msg) { } // Fallback: no ToolResultBlock found, use text content - List content = new ArrayList<>(); - content.add(DashScopeContentPart.text(extractTextContent(msg))); - return DashScopeMessage.builder().role("tool").content(content).build(); + return DashScopeMessage.builder() + .role("tool") + .content(List.of(DashScopeContentPart.text(extractTextContent(msg)))) + .build(); } /** @@ -260,4 +264,69 @@ private void applyCacheControlFromMetadata(Msg msg, DashScopeMessage result) { result.setCacheControl(DashScopeChatFormatter.getEphemeralCacheControl()); } } + + /** + * Check if blocks contain media content (image, audio, video). + * + * @param blocks the list of content blocks to check + * @return true if any block is ImageBlock, AudioBlock, or VideoBlock + */ + private boolean hasMediaContent(List blocks) { + if (blocks == null) { + return false; + } + for (ContentBlock block : blocks) { + if (block instanceof ImageBlock + || block instanceof AudioBlock + || block instanceof VideoBlock) { + return true; + } + } + return false; + } + + /** + * Convert content blocks to DashScope content parts for multimodal messages. + * + * @param blocks the list of content blocks to convert + * @return the converted list of DashScopeContentPart + */ + private List convertContentBlocks(List blocks) { + List content = new ArrayList<>(); + for (ContentBlock block : blocks) { + if (block instanceof TextBlock tb) { + content.add(DashScopeContentPart.text(tb.getText())); + } else if (block instanceof ImageBlock ib) { + try { + content.add(mediaConverter.convertImageBlockToContentPart(ib)); + } catch (Exception e) { + log.warn("Failed to process ImageBlock in tool result: {}", e.getMessage()); + content.add( + DashScopeContentPart.text( + "[Image - processing failed: " + e.getMessage() + "]")); + } + } else if (block instanceof AudioBlock ab) { + try { + content.add(mediaConverter.convertAudioBlockToContentPart(ab)); + } catch (Exception e) { + log.warn("Failed to process AudioBlock in tool result: {}", e.getMessage()); + content.add( + DashScopeContentPart.text( + "[Audio - processing failed: " + e.getMessage() + "]")); + } + } else if (block instanceof VideoBlock vb) { + try { + content.add(mediaConverter.convertVideoBlockToContentPart(vb)); + } catch (Exception e) { + log.warn("Failed to process VideoBlock in tool result: {}", e.getMessage()); + content.add( + DashScopeContentPart.text( + "[Video - processing failed: " + e.getMessage() + "]")); + } + } else if (block instanceof ThinkingBlock) { + // Skip thinking blocks + } + } + return content; + } } diff --git a/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeChatFormatterGroundTruthTest.java b/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeChatFormatterGroundTruthTest.java index c46656597..992ee27e0 100644 --- a/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeChatFormatterGroundTruthTest.java +++ b/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeChatFormatterGroundTruthTest.java @@ -341,13 +341,8 @@ private static void buildGroundTruth() { .build()); // Message 8: Tool result - String toolResultContent = - "- The capital of Japan is Tokyo.\n" - + "- The returned image can be found at: " - + imagePath - + "\n" - + "- The returned audio can be found at: " - + mockAudioPath; + File imageFile2 = new File(imagePath); + String absoluteImagePath2 = "file://" + imageFile2.getAbsolutePath(); groundTruthChat.add( DashScopeMessage.builder() .role("tool") @@ -356,7 +351,15 @@ private static void buildGroundTruth() { .content( List.of( DashScopeContentPart.builder() - .text(toolResultContent) + .text("The capital of Japan is Tokyo.") + .build(), + DashScopeContentPart.builder() + .image(absoluteImagePath2) + .build(), + DashScopeContentPart.builder() + .audio( + "data:audio/wav;base64," + + "ZmFrZSBhdWRpbyBjb250ZW50") .build())) .build()); diff --git a/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeMultiAgentFormatterGroundTruthTest.java b/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeMultiAgentFormatterGroundTruthTest.java index ce104fca3..b3c8e92a5 100644 --- a/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeMultiAgentFormatterGroundTruthTest.java +++ b/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeMultiAgentFormatterGroundTruthTest.java @@ -391,20 +391,18 @@ private static void buildGroundTruth() { .build()); // Message 4: Tool result - String toolResultContent = - "- The capital of Japan is Tokyo.\n" - + "- The returned image can be found at: " - + imagePath - + "\n" - + "- The returned audio can be found at: " - + mockAudioPath; - DashScopeContentPart toolContent = DashScopeContentPart.text(toolResultContent); + List toolResultParts = + List.of( + DashScopeContentPart.text("The capital of Japan is Tokyo."), + DashScopeContentPart.image(absoluteImagePath), + DashScopeContentPart.audio( + "data:audio/wav;base64," + "ZmFrZSBhdWRpbyBjb250ZW50")); groundTruthMultiagent.add( DashScopeMessage.builder() .role("tool") .toolCallId("1") .name("get_capital") - .content(List.of(toolContent)) + .content(toolResultParts) .build()); // Message 5: User with assistant response in history @@ -435,7 +433,7 @@ private static void buildGroundTruth() { .role("tool") .toolCallId("1") .name("get_capital") - .content(List.of(toolContent)) + .content(toolResultParts) .build()); // Message 4: User with history @@ -475,7 +473,7 @@ private static void buildGroundTruth() { .role("tool") .toolCallId("1") .name("get_capital") - .content(List.of(toolContent)) + .content(toolResultParts) .build()); // Message 5: User with updated history including second conversation @@ -508,20 +506,17 @@ private static void buildGroundTruth() { .build()); // Message 7: Second tool result (note: tool_call_id is "2") - String toolResultContent2 = - "- The capital of South Korea is Seoul.\n" - + "- The returned image can be found at: " - + imagePath - + "\n" - + "- The returned audio can be found at: " - + mockAudioPath; - DashScopeContentPart toolContent2 = DashScopeContentPart.text(toolResultContent2); + DashScopeContentPart toolContent2 = + DashScopeContentPart.text("The capital of South Korea is Seoul."); + DashScopeContentPart toolImageContent2 = DashScopeContentPart.image(absoluteImagePath); + DashScopeContentPart toolAudioContent2 = + DashScopeContentPart.audio("data:audio/wav;base64," + "ZmFrZSBhdWRpbyBjb250ZW50"); groundTruthMultiagent2.add( DashScopeMessage.builder() .role("tool") .toolCallId("2") .name("get_capital") - .content(List.of(toolContent2)) + .content(List.of(toolContent2, toolImageContent2, toolAudioContent2)) .build()); // Message 8: Final user with last response in history diff --git a/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java b/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java index 5087826c0..d1cf5994c 100644 --- a/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java +++ b/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java @@ -563,4 +563,51 @@ void testToolCallFallbackToInputMapWhenContentEmpty() { assertTrue(args.contains("city")); assertTrue(args.contains("Shanghai")); } + + @Test + void testToolResultWithImageBlock() { + DashScopeMessageConverter conv = + new DashScopeMessageConverter( + blocks -> { + StringBuilder sb = new StringBuilder(); + for (ContentBlock block : blocks) { + if (block instanceof TextBlock tb) { + if (!sb.isEmpty()) { + sb.append("\n"); + } + sb.append(tb.getText()); + } else if (block instanceof ImageBlock) { + if (!sb.isEmpty()) { + sb.append("\n"); + } + sb.append("The returned image can be found at: /tmp/test.png"); + } + } + return sb.toString(); + }); + + ToolResultBlock toolResult = + ToolResultBlock.builder() + .id("call_123") + .name("get_image") + .output( + List.of( + TextBlock.builder().text("Here is a cat image").build(), + ImageBlock.builder() + .source( + URLSource.builder() + .url( + "https://agentscope-test.oss-cn-beijing.aliyuncs.com/Cat03.jpg") + .build()) + .build())) + .build(); + + Msg msg = Msg.builder().role(MsgRole.TOOL).content(List.of(toolResult)).build(); + DashScopeMessage dsMsg = conv.convertToMessage(msg, true); + + assertEquals("tool", dsMsg.getRole()); + assertEquals("call_123", dsMsg.getToolCallId()); + assertTrue(dsMsg.isMultimodal()); + assertEquals(2, dsMsg.getContentAsList().size()); + } } From 8d6c548eb2d60b58581fa546f6cc58e3193497b1 Mon Sep 17 00:00:00 2001 From: zhsnddn <2027606134@qq.com> Date: Mon, 13 Apr 2026 22:33:33 +0800 Subject: [PATCH 2/2] fix(core): preserve multimodal content parts in DashScope tool results instead of merging into text --- .../dashscope/DashScopeMessageConverter.java | 11 --- .../DashScopeMessageConverterTest.java | 67 +++++++++++++++++++ 2 files changed, 67 insertions(+), 11 deletions(-) diff --git a/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java b/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java index 96e179207..a3f3b82d3 100644 --- a/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java +++ b/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java @@ -301,30 +301,19 @@ private List convertContentBlocks(List block content.add(mediaConverter.convertImageBlockToContentPart(ib)); } catch (Exception e) { log.warn("Failed to process ImageBlock in tool result: {}", e.getMessage()); - content.add( - DashScopeContentPart.text( - "[Image - processing failed: " + e.getMessage() + "]")); } } else if (block instanceof AudioBlock ab) { try { content.add(mediaConverter.convertAudioBlockToContentPart(ab)); } catch (Exception e) { log.warn("Failed to process AudioBlock in tool result: {}", e.getMessage()); - content.add( - DashScopeContentPart.text( - "[Audio - processing failed: " + e.getMessage() + "]")); } } else if (block instanceof VideoBlock vb) { try { content.add(mediaConverter.convertVideoBlockToContentPart(vb)); } catch (Exception e) { log.warn("Failed to process VideoBlock in tool result: {}", e.getMessage()); - content.add( - DashScopeContentPart.text( - "[Video - processing failed: " + e.getMessage() + "]")); } - } else if (block instanceof ThinkingBlock) { - // Skip thinking blocks } } return content; diff --git a/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java b/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java index d1cf5994c..934171d8f 100644 --- a/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java +++ b/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java @@ -610,4 +610,71 @@ void testToolResultWithImageBlock() { assertTrue(dsMsg.isMultimodal()); assertEquals(2, dsMsg.getContentAsList().size()); } + + @Test + void testToolResultWithTextImageAudioBlocks() { + // Test that tool result with TextBlock + ImageBlock + AudioBlock returns 3 content parts + DashScopeMessageConverter conv = + new DashScopeMessageConverter( + blocks -> { + StringBuilder sb = new StringBuilder(); + for (ContentBlock block : blocks) { + if (block instanceof TextBlock tb) { + if (!sb.isEmpty()) { + sb.append("\n"); + } + sb.append(tb.getText()); + } else if (block instanceof ImageBlock) { + if (!sb.isEmpty()) { + sb.append("\n"); + } + sb.append("The returned image can be found at: /tmp/test.png"); + } else if (block instanceof AudioBlock) { + if (!sb.isEmpty()) { + sb.append("\n"); + } + sb.append("The returned audio can be found at: /tmp/test.wav"); + } + } + return sb.toString(); + }); + + ToolResultBlock toolResult = + ToolResultBlock.builder() + .id("call_multi_media") + .name("get_multimodal") + .output( + List.of( + TextBlock.builder() + .text("The capital of Japan is Tokyo.") + .build(), + ImageBlock.builder() + .source( + URLSource.builder() + .url( + "https://example.com/image.png") + .build()) + .build(), + AudioBlock.builder() + .source( + URLSource.builder() + .url( + "https://example.com/audio.wav") + .build()) + .build())) + .build(); + + Msg msg = Msg.builder().role(MsgRole.TOOL).content(List.of(toolResult)).build(); + DashScopeMessage dsMsg = conv.convertToMessage(msg, true); + + assertEquals("tool", dsMsg.getRole()); + assertEquals("call_multi_media", dsMsg.getToolCallId()); + assertEquals("get_multimodal", dsMsg.getName()); + assertTrue(dsMsg.isMultimodal()); + // Should preserve all 3 content parts: text, image, audio + assertEquals(3, dsMsg.getContentAsList().size()); + assertEquals("The capital of Japan is Tokyo.", dsMsg.getContentAsList().get(0).getText()); + assertEquals("https://example.com/image.png", dsMsg.getContentAsList().get(1).getImage()); + assertEquals("https://example.com/audio.wav", dsMsg.getContentAsList().get(2).getAudio()); + } }