Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,12 @@ private DashScopeMessage convertToMultimodalContent(Msg msg) {
private DashScopeMessage convertToolRoleMessage(Msg msg) {
ToolResultBlock toolResult = msg.getFirstContentBlock(ToolResultBlock.class);
if (toolResult != null) {
String toolResultText = toolResultConverter.apply(toolResult.getOutput());
List<DashScopeContentPart> content = new ArrayList<>();
content.add(DashScopeContentPart.text(toolResultText));
List<DashScopeContentPart> content =
hasMediaContent(toolResult.getOutput())
? convertContentBlocks(toolResult.getOutput())
: List.of(
DashScopeContentPart.text(
toolResultConverter.apply(toolResult.getOutput())));

return DashScopeMessage.builder()
.role("tool")
Expand All @@ -181,9 +184,10 @@ private DashScopeMessage convertToolRoleMessage(Msg msg) {
}

// Fallback: no ToolResultBlock found, use text content
List<DashScopeContentPart> content = new ArrayList<>();
content.add(DashScopeContentPart.text(extractTextContent(msg)));
return DashScopeMessage.builder().role("tool").content(content).build();
return DashScopeMessage.builder()
.role("tool")
.content(List.of(DashScopeContentPart.text(extractTextContent(msg))))
.build();
}

/**
Expand Down Expand Up @@ -260,4 +264,58 @@ private void applyCacheControlFromMetadata(Msg msg, DashScopeMessage result) {
result.setCacheControl(DashScopeChatFormatter.getEphemeralCacheControl());
}
}

/**
 * Tell whether a block list carries at least one media block.
 *
 * <p>A block counts as media when it is an ImageBlock, an AudioBlock or a VideoBlock.
 *
 * @param blocks the content blocks to inspect; may be {@code null}
 * @return {@code true} if any element is an ImageBlock, AudioBlock or VideoBlock;
 *     {@code false} for a {@code null} list or a list without such a block
 */
private boolean hasMediaContent(List<ContentBlock> blocks) {
    return blocks != null
            && blocks.stream()
                    .anyMatch(
                            candidate ->
                                    candidate instanceof ImageBlock
                                            || candidate instanceof AudioBlock
                                            || candidate instanceof VideoBlock);
}

/**
 * Convert content blocks to DashScope content parts for multimodal messages.
 *
 * <p>Text blocks become text parts; image, audio and video blocks are delegated to the media
 * converter. Conversion is best-effort: a media block whose conversion throws is logged (with
 * its stack trace) and skipped, so the result can contain fewer parts than there were blocks.
 * Blocks of any other type are ignored.
 *
 * @param blocks the list of content blocks to convert; {@code null} yields an empty list
 * @return a new list with the converted DashScopeContentPart instances, in input order
 */
private List<DashScopeContentPart> convertContentBlocks(List<ContentBlock> blocks) {
    List<DashScopeContentPart> content = new ArrayList<>();
    if (blocks == null) {
        // Mirror hasMediaContent, which also treats a null list as "nothing to convert".
        return content;
    }
    for (ContentBlock block : blocks) {
        if (block instanceof TextBlock tb) {
            content.add(DashScopeContentPart.text(tb.getText()));
        } else if (block instanceof ImageBlock ib) {
            try {
                content.add(mediaConverter.convertImageBlockToContentPart(ib));
            } catch (Exception e) {
                // Trailing throwable argument keeps the cause and stack trace in the log.
                log.warn("Failed to process ImageBlock in tool result: {}", e.getMessage(), e);
            }
        } else if (block instanceof AudioBlock ab) {
            try {
                content.add(mediaConverter.convertAudioBlockToContentPart(ab));
            } catch (Exception e) {
                log.warn("Failed to process AudioBlock in tool result: {}", e.getMessage(), e);
            }
        } else if (block instanceof VideoBlock vb) {
            try {
                content.add(mediaConverter.convertVideoBlockToContentPart(vb));
            } catch (Exception e) {
                log.warn("Failed to process VideoBlock in tool result: {}", e.getMessage(), e);
            }
        }
    }
    return content;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -341,13 +341,8 @@ private static void buildGroundTruth() {
.build());

// Message 8: Tool result
String toolResultContent =
"- The capital of Japan is Tokyo.\n"
+ "- The returned image can be found at: "
+ imagePath
+ "\n"
+ "- The returned audio can be found at: "
+ mockAudioPath;
File imageFile2 = new File(imagePath);
String absoluteImagePath2 = "file://" + imageFile2.getAbsolutePath();
groundTruthChat.add(
DashScopeMessage.builder()
.role("tool")
Expand All @@ -356,7 +351,15 @@ private static void buildGroundTruth() {
.content(
List.of(
DashScopeContentPart.builder()
.text(toolResultContent)
.text("The capital of Japan is Tokyo.")
.build(),
DashScopeContentPart.builder()
.image(absoluteImagePath2)
.build(),
DashScopeContentPart.builder()
.audio(
"data:audio/wav;base64,"
+ "ZmFrZSBhdWRpbyBjb250ZW50")
.build()))
.build());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -391,20 +391,18 @@ private static void buildGroundTruth() {
.build());

// Message 4: Tool result
String toolResultContent =
"- The capital of Japan is Tokyo.\n"
+ "- The returned image can be found at: "
+ imagePath
+ "\n"
+ "- The returned audio can be found at: "
+ mockAudioPath;
DashScopeContentPart toolContent = DashScopeContentPart.text(toolResultContent);
List<DashScopeContentPart> toolResultParts =
List.of(
DashScopeContentPart.text("The capital of Japan is Tokyo."),
DashScopeContentPart.image(absoluteImagePath),
DashScopeContentPart.audio(
"data:audio/wav;base64," + "ZmFrZSBhdWRpbyBjb250ZW50"));
groundTruthMultiagent.add(
DashScopeMessage.builder()
.role("tool")
.toolCallId("1")
.name("get_capital")
.content(List.of(toolContent))
.content(toolResultParts)
.build());

// Message 5: User with assistant response in history
Expand Down Expand Up @@ -435,7 +433,7 @@ private static void buildGroundTruth() {
.role("tool")
.toolCallId("1")
.name("get_capital")
.content(List.of(toolContent))
.content(toolResultParts)
.build());

// Message 4: User with history
Expand Down Expand Up @@ -475,7 +473,7 @@ private static void buildGroundTruth() {
.role("tool")
.toolCallId("1")
.name("get_capital")
.content(List.of(toolContent))
.content(toolResultParts)
.build());

// Message 5: User with updated history including second conversation
Expand Down Expand Up @@ -508,20 +506,17 @@ private static void buildGroundTruth() {
.build());

// Message 7: Second tool result (note: tool_call_id is "2")
String toolResultContent2 =
"- The capital of South Korea is Seoul.\n"
+ "- The returned image can be found at: "
+ imagePath
+ "\n"
+ "- The returned audio can be found at: "
+ mockAudioPath;
DashScopeContentPart toolContent2 = DashScopeContentPart.text(toolResultContent2);
DashScopeContentPart toolContent2 =
DashScopeContentPart.text("The capital of South Korea is Seoul.");
DashScopeContentPart toolImageContent2 = DashScopeContentPart.image(absoluteImagePath);
DashScopeContentPart toolAudioContent2 =
DashScopeContentPart.audio("data:audio/wav;base64," + "ZmFrZSBhdWRpbyBjb250ZW50");
groundTruthMultiagent2.add(
DashScopeMessage.builder()
.role("tool")
.toolCallId("2")
.name("get_capital")
.content(List.of(toolContent2))
.content(List.of(toolContent2, toolImageContent2, toolAudioContent2))
.build());

// Message 8: Final user with last response in history
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -563,4 +563,118 @@ void testToolCallFallbackToInputMapWhenContentEmpty() {
assertTrue(args.contains("city"));
assertTrue(args.contains("Shanghai"));
}

/**
 * A tool result holding one text block and one image block must convert to a multimodal
 * "tool" message with two content parts.
 */
@Test
void testToolResultWithImageBlock() {
    // Text-flattening converter handed to the constructor: text blocks contribute their text,
    // image blocks a fixed path hint, lines joined by '\n'.
    DashScopeMessageConverter converter =
            new DashScopeMessageConverter(
                    blocks -> {
                        StringBuilder joined = new StringBuilder();
                        for (ContentBlock item : blocks) {
                            String line;
                            if (item instanceof TextBlock textBlock) {
                                line = textBlock.getText();
                            } else if (item instanceof ImageBlock) {
                                line = "The returned image can be found at: /tmp/test.png";
                            } else {
                                continue;
                            }
                            if (joined.length() > 0) {
                                joined.append("\n");
                            }
                            joined.append(line);
                        }
                        return joined.toString();
                    });

    List<ContentBlock> output =
            List.of(
                    TextBlock.builder().text("Here is a cat image").build(),
                    ImageBlock.builder()
                            .source(
                                    URLSource.builder()
                                            .url(
                                                    "https://agentscope-test.oss-cn-beijing.aliyuncs.com/Cat03.jpg")
                                            .build())
                            .build());
    ToolResultBlock imageResult =
            ToolResultBlock.builder().id("call_123").name("get_image").output(output).build();
    Msg toolMsg = Msg.builder().role(MsgRole.TOOL).content(List.of(imageResult)).build();

    DashScopeMessage converted = converter.convertToMessage(toolMsg, true);

    assertEquals("tool", converted.getRole());
    assertEquals("call_123", converted.getToolCallId());
    assertTrue(converted.isMultimodal());
    // One part for the text block, one for the image block.
    assertEquals(2, converted.getContentAsList().size());
}

/**
 * A tool result holding a text, an image and an audio block must convert to a multimodal
 * "tool" message that keeps all three parts, in order, with their text and URLs intact.
 */
@Test
void testToolResultWithTextImageAudioBlocks() {
    // Text-flattening converter handed to the constructor: text blocks contribute their text,
    // image/audio blocks a fixed path hint, lines joined by '\n'.
    DashScopeMessageConverter converter =
            new DashScopeMessageConverter(
                    blocks -> {
                        StringBuilder joined = new StringBuilder();
                        for (ContentBlock item : blocks) {
                            String line;
                            if (item instanceof TextBlock textBlock) {
                                line = textBlock.getText();
                            } else if (item instanceof ImageBlock) {
                                line = "The returned image can be found at: /tmp/test.png";
                            } else if (item instanceof AudioBlock) {
                                line = "The returned audio can be found at: /tmp/test.wav";
                            } else {
                                continue;
                            }
                            if (joined.length() > 0) {
                                joined.append("\n");
                            }
                            joined.append(line);
                        }
                        return joined.toString();
                    });

    List<ContentBlock> output =
            List.of(
                    TextBlock.builder().text("The capital of Japan is Tokyo.").build(),
                    ImageBlock.builder()
                            .source(
                                    URLSource.builder()
                                            .url("https://example.com/image.png")
                                            .build())
                            .build(),
                    AudioBlock.builder()
                            .source(
                                    URLSource.builder()
                                            .url("https://example.com/audio.wav")
                                            .build())
                            .build());
    ToolResultBlock mediaResult =
            ToolResultBlock.builder()
                    .id("call_multi_media")
                    .name("get_multimodal")
                    .output(output)
                    .build();
    Msg toolMsg = Msg.builder().role(MsgRole.TOOL).content(List.of(mediaResult)).build();

    DashScopeMessage converted = converter.convertToMessage(toolMsg, true);

    assertEquals("tool", converted.getRole());
    assertEquals("call_multi_media", converted.getToolCallId());
    assertEquals("get_multimodal", converted.getName());
    assertTrue(converted.isMultimodal());
    // All three parts must survive: text, image, audio.
    var parts = converted.getContentAsList();
    assertEquals(3, parts.size());
    assertEquals("The capital of Japan is Tokyo.", parts.get(0).getText());
    assertEquals("https://example.com/image.png", parts.get(1).getImage());
    assertEquals("https://example.com/audio.wav", parts.get(2).getAudio());
}
}
Loading