Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 0 additions & 137 deletions vite-app/dist/assets/index-10cZ11iB.js

This file was deleted.

1 change: 0 additions & 1 deletion vite-app/dist/assets/index-10cZ11iB.js.map

This file was deleted.

75 changes: 75 additions & 0 deletions vite-app/dist/assets/index-DFeF7AG_.js

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions vite-app/dist/assets/index-DFeF7AG_.js.map

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion vite-app/dist/assets/index-DOD73Wyg.css

This file was deleted.

1 change: 1 addition & 0 deletions vite-app/dist/assets/index-DvKW7FQL.css

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions vite-app/dist/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>EP | Log Viewer</title>
<link rel="icon" href="/assets/favicon-BkAAWQga.png" />
<script type="module" crossorigin src="/assets/index-10cZ11iB.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-DOD73Wyg.css">
<script type="module" crossorigin src="/assets/index-DFeF7AG_.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-DvKW7FQL.css">
</head>
<body>
<div id="root"></div>
Expand Down
74 changes: 71 additions & 3 deletions vite-app/src/components/EvaluationRow.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -337,9 +337,77 @@ const ToolsSection = observer(
)
);

function buildToolDeclareContent(tools: EvaluationRowType["tools"]): string {
if (!tools?.length) return "";
const blocks = tools
.map((tool) => {
const fn = (tool as any)?.function || {};
const properties = fn.parameters?.properties || {};
const actionEnum = Array.isArray(properties.action?.enum)
? properties.action.enum.map((value: string) => `"${value}"`).join(" | ")
: "string";
return [
`// ${fn.description || "Tool declaration."}`,
`type ${fn.name || "tool"} = (_: {`,
` // ${properties.action?.description || "Tool argument."}`,
` action: ${actionEnum},`,
" [k: string]: never",
Comment on lines +353 to +354

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Generate tool declaration from actual function parameters

The synthesized tool signature is hard-coded to an `action` argument and then forbids all other keys via `[k: string]: never`, regardless of each tool’s real schema. For tools whose parameters are not `action`-based (e.g. `get_weather(location, unit)` in `tests/pytest/data/function_calling.jsonl`), the displayed declaration is incorrect and can’t represent valid calls, which defeats the new “prompt-faithful” transcript behavior.

Useful? React with 👍 / 👎.

"}) => any;",
].join("\n");
})
.join("\n");

return `# Tools\n\n## functions\nnamespace functions {\n${blocks}\n}`;
}

/**
 * Produces the message list shown in the chat view with the synthesized tool
 * declaration text placed in front of the conversation.
 *
 * - When `buildToolDeclareContent(tools)` yields an empty string, the original
 *   `messages` value is returned untouched (same reference).
 * - When no message has role "system", a new system message carrying only the
 *   declaration is inserted at the start of a shallow copy.
 * - Otherwise the first system message in the copy is replaced by a clone
 *   whose content is the declaration, a blank line, and the previous content
 *   flattened to a string.
 *
 * The input array itself is never mutated.
 *
 * @param messages Conversation messages; may be null/undefined.
 * @param tools Tool definitions forwarded to `buildToolDeclareContent`.
 * @returns The original `messages` or a new array containing the declaration.
 */
function buildPromptFaithfulMessages(
  messages: EvaluationRowType["messages"],
  tools: EvaluationRowType["tools"]
): EvaluationRowType["messages"] {
  const declaration = buildToolDeclareContent(tools);
  if (!declaration) return messages;

  // Work on a shallow copy so the caller's array is left alone.
  const copy = [...(messages || [])];
  const systemIdx = copy.findIndex((entry) => entry?.role === "system");

  if (systemIdx === -1) {
    copy.unshift({ role: "system", content: declaration } as any);
    return copy;
  }

  const target = copy[systemIdx] as any;
  const raw = target?.content;

  // Flatten whatever shape the existing content has into plain text.
  let flattened: any = "";
  if (typeof raw === "string") {
    flattened = raw;
  } else if (Array.isArray(raw)) {
    const pieces = raw.map((part: any) => {
      switch (part?.type) {
        case "text":
          return part.text || "";
        case "image_url":
          return "[Image]";
        default:
          // Unknown part kinds are shown as their JSON form.
          return JSON.stringify(part);
      }
    });
    flattened = pieces.join("");
  } else if (raw != null) {
    flattened = JSON.stringify(raw);
  }

  const mergedContent = flattened
    ? `${declaration}\n\n${flattened}`
    : declaration;

  copy[systemIdx] = { ...target, content: mergedContent } as any;
  return copy;
}

const ChatInterfaceSection = observer(
({ messages }: { messages: EvaluationRowType["messages"] }) => (
<ChatInterface messages={messages} />
({
messages,
tools,
}: {
messages: EvaluationRowType["messages"];
tools: EvaluationRowType["tools"];
}) => (
<ChatInterface messages={buildPromptFaithfulMessages(messages, tools)} />
)
);

Expand Down Expand Up @@ -376,7 +444,7 @@ const ExpandedContent = observer(
<div className="flex gap-3 w-fit">
{/* Left Column - Chat Interface */}
<div className="min-w-0">
<ChatInterfaceSection messages={messages} />
<ChatInterfaceSection messages={messages} tools={tools} />
</div>

{/* Token Debug Column */}
Expand Down
10 changes: 8 additions & 2 deletions vite-app/src/components/MessageBubble.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,12 @@ export const MessageBubble = ({ message }: { message: Message }) => {
const isTool = message.role === "tool";
const hasToolCalls = message.tool_calls && message.tool_calls.length > 0;
const hasFunctionCall = message.function_call;
const hideMessageContent = message.role === "assistant" && hasToolCalls;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Preserve assistant text when tool_calls are present

This unconditionally suppresses assistant content for any message that has `tool_calls`, so `renderContent()` returns `null` even when the content contains meaningful text (for example, many recorded trajectories include `<think>...</think>` in assistant turns alongside tool calls, such as `examples/cliff_walking_mcp/tests/recordings/production_trajectory.jsonl`). In those cases the chat transcript loses the assistant’s reasoning/context entirely, which makes rollout review inaccurate; the hide logic should only apply to payload-only duplicates, not all tool-call turns.

Useful? React with 👍 / 👎.


// Get the message content as a string
const reasoning = (message as any).reasoning_content as string | undefined;
const titleLabel =
message.role === "system" && message.name ? message.name : message.role;
const getMessageContent = () => {
if (typeof message.content === "string") {
return message.content;
Expand All @@ -33,11 +36,14 @@ export const MessageBubble = ({ message }: { message: Message }) => {
}
};

const messageContent = getMessageContent();
const messageContent = hideMessageContent ? "" : getMessageContent();
const hasMessageContent = messageContent.trim().length > 0;
const isLongMessage = messageContent.length > 200; // Threshold for considering a message "long"

const renderContent = () => {
if (hideMessageContent) {
return null;
}
if (typeof message.content === "string") {
return isLongMessage && !isExpanded
? message.content.substring(0, 200) + "..."
Expand Down Expand Up @@ -161,7 +167,7 @@ export const MessageBubble = ({ message }: { message: Message }) => {
hasMessageContent ? "pr-8" : ""
}`}
>
{message.role}
{titleLabel}
</div>
<div className="whitespace-pre-wrap break-words overflow-hidden text-xs">
{renderContent()}
Expand Down
Loading