diff --git a/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java b/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java
index 20f362da3..a3f3b82d3 100644
--- a/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java
+++ b/agentscope-core/src/main/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverter.java
@@ -168,9 +168,7 @@ private DashScopeMessage convertToMultimodalContent(Msg msg) {
     private DashScopeMessage convertToolRoleMessage(Msg msg) {
         ToolResultBlock toolResult = msg.getFirstContentBlock(ToolResultBlock.class);
         if (toolResult != null) {
-            String toolResultText = toolResultConverter.apply(toolResult.getOutput());
-            List content = new ArrayList<>();
-            content.add(DashScopeContentPart.text(toolResultText));
+            List<DashScopeContentPart> content = convertToolResultContent(toolResult.getOutput());
 
             return DashScopeMessage.builder()
                     .role("tool")
@@ -181,9 +179,10 @@ private DashScopeMessage convertToolRoleMessage(Msg msg) {
         }
 
         // Fallback: no ToolResultBlock found, use text content
-        List content = new ArrayList<>();
-        content.add(DashScopeContentPart.text(extractTextContent(msg)));
-        return DashScopeMessage.builder().role("tool").content(content).build();
+        return DashScopeMessage.builder()
+                .role("tool")
+                .content(List.of(DashScopeContentPart.text(extractTextContent(msg))))
+                .build();
     }
 
     /**
@@ -260,4 +259,81 @@ private void applyCacheControlFromMetadata(Msg msg, DashScopeMessage result) {
             result.setCacheControl(DashScopeChatFormatter.getEphemeralCacheControl());
         }
     }
+
+    /**
+     * Build the content parts for a tool result.
+     *
+     * <p>Media blocks (image, audio, video) are converted to multimodal parts. When the output
+     * has no media, or when none of its blocks could be converted, the output is rendered to a
+     * single text part with the tool result converter, so the content list is never empty.
+     *
+     * @param output the content blocks produced by the tool
+     * @return a non-empty list of content parts
+     */
+    private List<DashScopeContentPart> convertToolResultContent(List<ContentBlock> output) {
+        if (hasMediaContent(output)) {
+            List<DashScopeContentPart> parts = convertContentBlocks(output);
+            if (!parts.isEmpty()) {
+                return parts;
+            }
+            log.warn("No media block in tool result could be converted, falling back to text");
+        }
+        return List.of(DashScopeContentPart.text(toolResultConverter.apply(output)));
+    }
+
+    /**
+     * Check if blocks contain media content (image, audio, video).
+     *
+     * @param blocks the list of content blocks to check
+     * @return true if any block is ImageBlock, AudioBlock, or VideoBlock
+     */
+    private boolean hasMediaContent(List<ContentBlock> blocks) {
+        if (blocks == null) {
+            return false;
+        }
+        for (ContentBlock block : blocks) {
+            if (block instanceof ImageBlock
+                    || block instanceof AudioBlock
+                    || block instanceof VideoBlock) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Convert content blocks to DashScope content parts for multimodal messages.
+     *
+     * <p>Blocks that fail to convert are logged with their exception and skipped.
+     *
+     * @param blocks the list of content blocks to convert
+     * @return the converted list of DashScopeContentPart
+     */
+    private List<DashScopeContentPart> convertContentBlocks(List<ContentBlock> blocks) {
+        List<DashScopeContentPart> content = new ArrayList<>();
+        for (ContentBlock block : blocks) {
+            if (block instanceof TextBlock tb) {
+                content.add(DashScopeContentPart.text(tb.getText()));
+            } else if (block instanceof ImageBlock ib) {
+                try {
+                    content.add(mediaConverter.convertImageBlockToContentPart(ib));
+                } catch (Exception e) {
+                    log.warn("Failed to process ImageBlock in tool result: {}", e.getMessage(), e);
+                }
+            } else if (block instanceof AudioBlock ab) {
+                try {
+                    content.add(mediaConverter.convertAudioBlockToContentPart(ab));
+                } catch (Exception e) {
+                    log.warn("Failed to process AudioBlock in tool result: {}", e.getMessage(), e);
+                }
+            } else if (block instanceof VideoBlock vb) {
+                try {
+                    content.add(mediaConverter.convertVideoBlockToContentPart(vb));
+                } catch (Exception e) {
+                    log.warn("Failed to process VideoBlock in tool result: {}", e.getMessage(), e);
+                }
+            }
+        }
+        return content;
+    }
 }
diff --git a/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeChatFormatterGroundTruthTest.java b/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeChatFormatterGroundTruthTest.java
index c46656597..992ee27e0 100644
--- a/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeChatFormatterGroundTruthTest.java
+++ 
b/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeChatFormatterGroundTruthTest.java @@ -341,13 +341,8 @@ private static void buildGroundTruth() { .build()); // Message 8: Tool result - String toolResultContent = - "- The capital of Japan is Tokyo.\n" - + "- The returned image can be found at: " - + imagePath - + "\n" - + "- The returned audio can be found at: " - + mockAudioPath; + File imageFile2 = new File(imagePath); + String absoluteImagePath2 = "file://" + imageFile2.getAbsolutePath(); groundTruthChat.add( DashScopeMessage.builder() .role("tool") @@ -356,7 +351,15 @@ private static void buildGroundTruth() { .content( List.of( DashScopeContentPart.builder() - .text(toolResultContent) + .text("The capital of Japan is Tokyo.") + .build(), + DashScopeContentPart.builder() + .image(absoluteImagePath2) + .build(), + DashScopeContentPart.builder() + .audio( + "data:audio/wav;base64," + + "ZmFrZSBhdWRpbyBjb250ZW50") .build())) .build()); diff --git a/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeMultiAgentFormatterGroundTruthTest.java b/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeMultiAgentFormatterGroundTruthTest.java index ce104fca3..b3c8e92a5 100644 --- a/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeMultiAgentFormatterGroundTruthTest.java +++ b/agentscope-core/src/test/java/io/agentscope/core/formatter/DashScopeMultiAgentFormatterGroundTruthTest.java @@ -391,20 +391,18 @@ private static void buildGroundTruth() { .build()); // Message 4: Tool result - String toolResultContent = - "- The capital of Japan is Tokyo.\n" - + "- The returned image can be found at: " - + imagePath - + "\n" - + "- The returned audio can be found at: " - + mockAudioPath; - DashScopeContentPart toolContent = DashScopeContentPart.text(toolResultContent); + List toolResultParts = + List.of( + DashScopeContentPart.text("The capital of Japan is Tokyo."), + DashScopeContentPart.image(absoluteImagePath), + 
DashScopeContentPart.audio( + "data:audio/wav;base64," + "ZmFrZSBhdWRpbyBjb250ZW50")); groundTruthMultiagent.add( DashScopeMessage.builder() .role("tool") .toolCallId("1") .name("get_capital") - .content(List.of(toolContent)) + .content(toolResultParts) .build()); // Message 5: User with assistant response in history @@ -435,7 +433,7 @@ private static void buildGroundTruth() { .role("tool") .toolCallId("1") .name("get_capital") - .content(List.of(toolContent)) + .content(toolResultParts) .build()); // Message 4: User with history @@ -475,7 +473,7 @@ private static void buildGroundTruth() { .role("tool") .toolCallId("1") .name("get_capital") - .content(List.of(toolContent)) + .content(toolResultParts) .build()); // Message 5: User with updated history including second conversation @@ -508,20 +506,17 @@ private static void buildGroundTruth() { .build()); // Message 7: Second tool result (note: tool_call_id is "2") - String toolResultContent2 = - "- The capital of South Korea is Seoul.\n" - + "- The returned image can be found at: " - + imagePath - + "\n" - + "- The returned audio can be found at: " - + mockAudioPath; - DashScopeContentPart toolContent2 = DashScopeContentPart.text(toolResultContent2); + DashScopeContentPart toolContent2 = + DashScopeContentPart.text("The capital of South Korea is Seoul."); + DashScopeContentPart toolImageContent2 = DashScopeContentPart.image(absoluteImagePath); + DashScopeContentPart toolAudioContent2 = + DashScopeContentPart.audio("data:audio/wav;base64," + "ZmFrZSBhdWRpbyBjb250ZW50"); groundTruthMultiagent2.add( DashScopeMessage.builder() .role("tool") .toolCallId("2") .name("get_capital") - .content(List.of(toolContent2)) + .content(List.of(toolContent2, toolImageContent2, toolAudioContent2)) .build()); // Message 8: Final user with last response in history diff --git a/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java 
b/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java
index 5087826c0..934171d8f 100644
--- a/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java
+++ b/agentscope-core/src/test/java/io/agentscope/core/formatter/dashscope/DashScopeMessageConverterTest.java
@@ -563,4 +563,82 @@ void testToolCallFallbackToInputMapWhenContentEmpty() {
         assertTrue(args.contains("city"));
         assertTrue(args.contains("Shanghai"));
     }
+
+    @Test
+    void testToolResultWithImageBlock() {
+        // The assertions below never read the text converter's output, so a placeholder is enough.
+        DashScopeMessageConverter conv = new DashScopeMessageConverter(blocks -> "text fallback");
+
+        ToolResultBlock toolResult =
+                ToolResultBlock.builder()
+                        .id("call_123")
+                        .name("get_image")
+                        .output(
+                                List.of(
+                                        TextBlock.builder().text("Here is a cat image").build(),
+                                        ImageBlock.builder()
+                                                .source(
+                                                        URLSource.builder()
+                                                                .url(
+                                                                        "https://agentscope-test.oss-cn-beijing.aliyuncs.com/Cat03.jpg")
+                                                                .build())
+                                                .build()))
+                        .build();
+
+        Msg msg = Msg.builder().role(MsgRole.TOOL).content(List.of(toolResult)).build();
+        DashScopeMessage dsMsg = conv.convertToMessage(msg, true);
+
+        assertEquals("tool", dsMsg.getRole());
+        assertEquals("call_123", dsMsg.getToolCallId());
+        assertTrue(dsMsg.isMultimodal());
+        assertEquals(2, dsMsg.getContentAsList().size());
+        assertEquals("Here is a cat image", dsMsg.getContentAsList().get(0).getText());
+        assertEquals(
+                "https://agentscope-test.oss-cn-beijing.aliyuncs.com/Cat03.jpg",
+                dsMsg.getContentAsList().get(1).getImage());
+    }
+
+    @Test
+    void testToolResultWithTextImageAudioBlocks() {
+        // Test that tool result with TextBlock + ImageBlock + AudioBlock returns 3 content parts
+        DashScopeMessageConverter conv = new DashScopeMessageConverter(blocks -> "text fallback");
+
+        ToolResultBlock toolResult =
+                ToolResultBlock.builder()
+                        .id("call_multi_media")
+                        .name("get_multimodal")
+                        .output(
+                                List.of(
+                                        TextBlock.builder()
+                                                .text("The capital of Japan is Tokyo.")
+                                                .build(),
+                                        ImageBlock.builder()
+                                                .source(
+                                                        URLSource.builder()
+                                                                .url(
+                                                                        "https://example.com/image.png")
+                                                                .build())
+                                                .build(),
+                                        AudioBlock.builder()
+                                                .source(
+                                                        URLSource.builder()
+                                                                .url(
+                                                                        "https://example.com/audio.wav")
+                                                                .build())
+                                                .build()))
+                        .build();
+
+        Msg msg = Msg.builder().role(MsgRole.TOOL).content(List.of(toolResult)).build();
+        DashScopeMessage dsMsg = conv.convertToMessage(msg, true);
+
+        assertEquals("tool", dsMsg.getRole());
+        assertEquals("call_multi_media", dsMsg.getToolCallId());
+        assertEquals("get_multimodal", dsMsg.getName());
+        assertTrue(dsMsg.isMultimodal());
+        // Should preserve all 3 content parts: text, image, audio
+        assertEquals(3, dsMsg.getContentAsList().size());
+        assertEquals("The capital of Japan is Tokyo.", dsMsg.getContentAsList().get(0).getText());
+        assertEquals("https://example.com/image.png", dsMsg.getContentAsList().get(1).getImage());
+        assertEquals("https://example.com/audio.wav", dsMsg.getContentAsList().get(2).getAudio());
+    }
 }