diff --git a/packages/core/src/adapters/__tests__/session-runtime.test.ts b/packages/core/src/adapters/__tests__/session-runtime.test.ts index 647f014..b5d962b 100644 --- a/packages/core/src/adapters/__tests__/session-runtime.test.ts +++ b/packages/core/src/adapters/__tests__/session-runtime.test.ts @@ -224,7 +224,7 @@ describe('session-runtime adapters', () => { topologyContextProvider: provider, }); await runtime.send('hi'); - const [, opts] = session.send.mock.calls[0]; + const [, opts] = session.send.mock.calls[0]!; expect(opts).not.toHaveProperty('topologyContext'); }); @@ -240,7 +240,7 @@ describe('session-runtime adapters', () => { topologyContextProvider: async () => '', }); await runtime.send('hi'); - const [, opts] = session.send.mock.calls[0]; + const [, opts] = session.send.mock.calls[0]!; expect(opts).not.toHaveProperty('topologyContext'); }); diff --git a/packages/core/src/adapters/session-runtime.ts b/packages/core/src/adapters/session-runtime.ts index a104ef1..e873891 100644 --- a/packages/core/src/adapters/session-runtime.ts +++ b/packages/core/src/adapters/session-runtime.ts @@ -1,4 +1,4 @@ -import type { ForkContextFn, LLMAdapter, LLMCompleteOptions } from '@stello-ai/session'; +import type { ForkContextFn, LLMAdapter, LLMCompleteOptions, SessionInput } from '@stello-ai/session'; import type { EngineRuntimeSession } from '../engine/stello-engine'; import type { ToolCallParser } from '../engine/turn-runner'; @@ -24,6 +24,8 @@ export interface SessionCompatibleSendResult { }; } +export type SessionCompatibleInput = string | SessionInput; + /** 结构兼容 @stello-ai/session 的 consolidate 函数签名 */ export type SessionCompatibleConsolidateFn = ( currentMemory: string | null, @@ -67,11 +69,11 @@ export interface SessionCompatible { status: 'active' | 'archived'; }; send( - content: string, + content: SessionCompatibleInput, options?: SessionCompatibleSendOptions, ): Promise; stream?( - content: string, + content: SessionCompatibleInput, options?: SessionCompatibleSendOptions, ): AsyncIterable & { result: Promise }; messages(): Promise>; @@ -170,7 +172,7 @@ export async function adaptSessionToEngineRuntime( get turnCount() { return turnCount; }, - async send(input: string, sendOptions?: SessionCompatibleSendOptions): Promise { + async send(input: SessionCompatibleInput, sendOptions?: SessionCompatibleSendOptions): Promise { const sharedMemoryContext = await options.sharedMemoryContextProvider?.(); const topologyContext = await options.topologyContextProvider?.(session.meta.id); const mergedOptions: SessionCompatibleSendOptions = { @@ -193,7 +195,7 @@ export async function adaptSessionToEngineRuntime( }, ...(session.stream ? { - stream(input: string, sendOptions?: SessionCompatibleSendOptions) { + stream(input: SessionCompatibleInput, sendOptions?: SessionCompatibleSendOptions) { const contextPromise = options.sharedMemoryContextProvider?.() ?? Promise.resolve(undefined); const topologyPromise = options.topologyContextProvider?.(session.meta.id) ?? Promise.resolve(undefined); const source = (async () => { @@ -234,4 +236,3 @@ export async function adaptSessionToEngineRuntime( : {}), }; } - diff --git a/packages/core/src/agent/stello-agent.ts b/packages/core/src/agent/stello-agent.ts index 349f165..37d27d8 100644 --- a/packages/core/src/agent/stello-agent.ts +++ b/packages/core/src/agent/stello-agent.ts @@ -1,5 +1,5 @@ import type { BootstrapResult } from '../types/lifecycle'; -import { TurnRunner, type ToolCallParser, type TurnRunnerOptions } from '../engine/turn-runner'; +import { TurnRunner, type ToolCallParser, type TurnInput, type TurnRunnerOptions } from '../engine/turn-runner'; import type { EngineTurnResult } from '../engine/stello-engine'; import type { EngineStreamResult } from '../engine/stello-engine'; import { @@ -447,7 +447,7 @@ export class StelloAgent { /** 在指定 session 上运行一轮对话 */ turn( sessionId: string, - input: string, + input: TurnInput, options?: TurnRunnerOptions, ): Promise { return this.orchestrator.turn(sessionId, input, options); @@ -456,7 +456,7 @@ export class StelloAgent { /** 在指定 session 上流式运行一轮对话 */ stream( sessionId: string, - input: string, + input: TurnInput, options?: TurnRunnerOptions, ): Promise { return this.orchestrator.stream(sessionId, input, options); diff --git a/packages/core/src/engine/__tests__/topology-render.test.ts b/packages/core/src/engine/__tests__/topology-render.test.ts index b6ad5f0..4a79aed 100644 --- a/packages/core/src/engine/__tests__/topology-render.test.ts +++ b/packages/core/src/engine/__tests__/topology-render.test.ts @@ -3,7 +3,7 @@ import type { SessionTreeNode } from '../../types/session.js'; import { renderTopologyMarkdown } from '../topology-render.js'; function node(id: string, label: string, children: SessionTreeNode[] = [], status: 'active' | 'archived' = 'active'): SessionTreeNode { - return { id, label, parentId: null, status, children } as SessionTreeNode; + return { id, label, parentId: null, status, children, turnCount: 0 } as SessionTreeNode; } describe('renderTopologyMarkdown', () => { diff --git a/packages/core/src/engine/stello-engine.ts b/packages/core/src/engine/stello-engine.ts index d3ba168..916b176 100644 --- a/packages/core/src/engine/stello-engine.ts +++ b/packages/core/src/engine/stello-engine.ts @@ -33,8 +33,14 @@ import { type TurnRunnerOptions, type TurnRunnerResult, type TurnRunnerStreamResult, + type TurnInput, } from './turn-runner'; + +function turnInputText(input: TurnInput): string { + return typeof input === 'string' ? input : input.text; +} + /** Engine 调用 session.send/stream 时的运行时选项 */ export interface EngineRuntimeSessionCallOptions { /** AbortSignal — 透传给底层 session.send/stream 与 LLM 调用 */ @@ -54,10 +60,10 @@ export interface EngineRuntimeSession { /** 当前已完成轮次 */ turnCount: number; /** 运行一次单条对话 */ - send(input: string, options?: EngineRuntimeSessionCallOptions): Promise; + send(input: TurnInput, options?: EngineRuntimeSessionCallOptions): Promise; /** 可选:流式运行一次单条对话 */ stream?( - input: string, + input: TurnInput, options?: EngineRuntimeSessionCallOptions, ): AsyncIterable & { result: Promise }; /** fork 子 session,返回子 session 的 runtime */ @@ -216,9 +222,10 @@ export class StelloEngineImpl implements StelloEngine { } /** 处理一轮编排:当前 session send + tool loop + 调度 */ - async turn(input: string, options?: TurnRunnerOptions): Promise { - this.fireHook('onMessageReceived', { sessionId: this.session.id, input }); - this.fireHook('onRoundStart', { sessionId: this.session.id, input }); + async turn(input: TurnInput, options?: TurnRunnerOptions): Promise { + const inputText = turnInputText(input); + this.fireHook('onMessageReceived', { sessionId: this.session.id, input: inputText }); + this.fireHook('onRoundStart', { sessionId: this.session.id, input: inputText }); let turn: TurnRunnerResult; try { turn = await this.turnRunner.run(this.session, input, this, { @@ -238,20 +245,21 @@ export class StelloEngineImpl implements StelloEngine { } this.fireHook('onAssistantReply', { sessionId: this.session.id, - input, + input: inputText, content: turn.finalContent, rawResponse: turn.rawResponse, }); this.fireHook('onRoundEnd', { sessionId: this.session.id, - input, + input: inputText, turn, }); return { turn }; } /** 流式处理一轮编排:先输出增量文本,完成后再返回完整 turn */ - stream(input: string, options?: TurnRunnerOptions): EngineStreamResult { + stream(input: TurnInput, options?: TurnRunnerOptions): EngineStreamResult { + const inputText = turnInputText(input); const source: TurnRunnerStreamResult = this.turnRunner.runStream(this.session, input, this, { ...options, onToolCall: (toolCall) => { @@ -265,8 +273,8 @@ export class StelloEngineImpl implements StelloEngine { }); const result = (async () => { - this.fireHook('onMessageReceived', { sessionId: this.session.id, input }); - this.fireHook('onRoundStart', { sessionId: this.session.id, input }); + this.fireHook('onMessageReceived', { sessionId: this.session.id, input: inputText }); + this.fireHook('onRoundStart', { sessionId: this.session.id, input: inputText }); let turn: TurnRunnerResult; try { @@ -278,13 +286,13 @@ export class StelloEngineImpl implements StelloEngine { this.fireHook('onAssistantReply', { sessionId: this.session.id, - input, + input: inputText, content: turn.finalContent, rawResponse: turn.rawResponse, }); this.fireHook('onRoundEnd', { sessionId: this.session.id, - input, + input: inputText, turn, }); return { turn }; diff --git a/packages/core/src/engine/turn-runner.ts b/packages/core/src/engine/turn-runner.ts index 85ac3e3..e085a8d 100644 --- a/packages/core/src/engine/turn-runner.ts +++ b/packages/core/src/engine/turn-runner.ts @@ -1,5 +1,8 @@ +import type { SessionInput } from '@stello-ai/session'; import type { ToolExecutionResult } from '../types/lifecycle'; +export type TurnInput = string | SessionInput; + /** * 在单轮内并行执行所有 tool call,按输入顺序整理结果并按序触发事件。 * @@ -90,10 +93,10 @@ export interface TurnRunnerSession { /** Session 标识 */ id: string; /** 执行一次单条对话 */ - send(input: string, options?: TurnRunnerSessionCallOptions): Promise; + send(input: TurnInput, options?: TurnRunnerSessionCallOptions): Promise; /** 可选:流式执行一次单条对话 */ stream?( - input: string, + input: TurnInput, options?: TurnRunnerSessionCallOptions, ): AsyncIterable & { result: Promise }; } @@ -185,12 +188,12 @@ export class TurnRunner { */ async run( session: TurnRunnerSession, - input: string, + input: TurnInput, tools: TurnRunnerToolExecutor, options: TurnRunnerOptions = {}, ): Promise { const maxToolRounds = options.maxToolRounds ?? 5; - let currentInput = input; + let currentInput: TurnInput = input; let toolRoundCount = 0; let toolCallsExecuted = 0; let lastRawResponse = ''; @@ -231,7 +234,7 @@ export class TurnRunner { */ runStream( session: TurnRunnerSession, - input: string, + input: TurnInput, tools: TurnRunnerToolExecutor, options: TurnRunnerOptions = {}, ): TurnRunnerStreamResult { diff --git a/packages/core/src/orchestrator/session-orchestrator.ts b/packages/core/src/orchestrator/session-orchestrator.ts index b7a7cc0..7b75cda 100644 --- a/packages/core/src/orchestrator/session-orchestrator.ts +++ b/packages/core/src/orchestrator/session-orchestrator.ts @@ -3,15 +3,15 @@ import type { BootstrapResult } from '../types/lifecycle'; import type { StelloEngine, EngineForkOptions } from '../types/engine'; import type { EngineTurnResult } from '../engine/stello-engine'; import type { EngineStreamResult } from '../engine/stello-engine'; -import type { TurnRunnerOptions } from '../engine/turn-runner'; +import type { TurnInput, TurnRunnerOptions } from '../engine/turn-runner'; import type { EngineRuntimeManager } from './engine-runtime-manager'; /** Orchestrator 对 Engine 的最小依赖 */ export interface OrchestratorEngine extends StelloEngine { /** 运行当前 session 的一轮对话 */ - turn(input: string, options?: TurnRunnerOptions): Promise; + turn(input: TurnInput, options?: TurnRunnerOptions): Promise; /** 流式运行当前 session 的一轮对话 */ - stream(input: string, options?: TurnRunnerOptions): EngineStreamResult; + stream(input: TurnInput, options?: TurnRunnerOptions): EngineStreamResult; /** 归档当前绑定 session */ archiveSession(): Promise<{ sessionId: string }>; /** 从当前绑定 session 发起 fork */ @@ -59,7 +59,7 @@ export class SessionOrchestrator { /** 在指定 session 上运行一轮对话 */ async turn( sessionId: string, - input: string, + input: TurnInput, options?: TurnRunnerOptions, ): Promise { return this.runSerial(sessionId, async () => { @@ -71,7 +71,7 @@ export class SessionOrchestrator { /** 在指定 session 上流式运行一轮对话 */ async stream( sessionId: string, - input: string, + input: TurnInput, options?: TurnRunnerOptions, ): Promise { await this.requireSession(sessionId) diff --git a/packages/core/src/types/engine.ts b/packages/core/src/types/engine.ts index d485e00..8919d56 100644 --- a/packages/core/src/types/engine.ts +++ b/packages/core/src/types/engine.ts @@ -13,7 +13,7 @@ import type { ConfirmProtocol, } from './lifecycle'; import type { EngineRuntimeSession, EngineStreamResult, EngineTurnResult } from '../engine/stello-engine'; -import type { TurnRunnerOptions } from '../engine/turn-runner'; +import type { TurnInput, TurnRunnerOptions } from '../engine/turn-runner'; import type { ForkContextFn } from '@stello-ai/session'; import type { SessionConfig } from './session-config'; @@ -85,9 +85,9 @@ export interface StelloEngine { /** 离开当前绑定 Session 的整轮对话 */ leaveSession(): Promise<{ sessionId: string }>; /** 流式处理当前绑定 Session 的一轮对话 */ - stream(input: string, options?: TurnRunnerOptions): EngineStreamResult; + stream(input: TurnInput, options?: TurnRunnerOptions): EngineStreamResult; /** 非流式处理当前绑定 Session 的一轮对话 */ - turn?(input: string, options?: TurnRunnerOptions): Promise; + turn?(input: TurnInput, options?: TurnRunnerOptions): Promise; /** 导出 Agent Tool 定义(兼容 OpenAI / Claude tool use) */ getToolDefinitions(): ToolDefinition[]; /** 执行 Agent Tool */ diff --git a/packages/session/src/__tests__/openai-compatible.test.ts b/packages/session/src/__tests__/openai-compatible.test.ts index 4355a38..6b7644d 100644 --- a/packages/session/src/__tests__/openai-compatible.test.ts +++ b/packages/session/src/__tests__/openai-compatible.test.ts @@ -166,6 +166,113 @@ describe('createOpenAICompatibleAdapter', () => { expect(result.reasoningContent).toBeUndefined() }) + it('StepFun 3.7 将 image/video parts 转成 Chat Completions 多模态 content', async () => { + const adapter = createOpenAICompatibleAdapter({ + apiKey: 'test-key', + baseURL: 'https://api.stepfun.com/v1', + model: 'step-3.7-flash', + maxContextTokens: 128_000, + }) + + await adapter.complete([{ role: 'user', content: '看一下', parts: [ + { kind: 'image', source: { type: 'url', url: 'https://example.com/a.png' }, detail: 'high' }, + { kind: 'video', source: { type: 'url', url: 'https://example.com/a.mp4' } }, + ] }]) + + const sentMessages = createCompletion.mock.calls[0]![0].messages + expect(sentMessages[0]).toEqual({ + role: 'user', + content: [ + { type: 'text', text: '看一下' }, + { type: 'image_url', image_url: { url: 'https://example.com/a.png', detail: 'high' } }, + { type: 'video_url', video_url: { url: 'https://example.com/a.mp4' } }, + ], + }) + }) + it('StepFun 3.7 多模态能力不绑定固定 baseURL', async () => { + const adapter = createOpenAICompatibleAdapter({ + apiKey: 'test-key', + baseURL: 'https://api.stepfun.com/step_plan/v1', + model: 'step-3.7-flash', + maxContextTokens: 128_000, + }) + + await adapter.complete([{ role: 'user', content: '描述图片', parts: [ + { kind: 'image', source: { type: 'url', url: 'https://example.com/a.png' } }, + ] }]) + + expect(createCompletion.mock.calls[0]![0].messages[0].content).toEqual([ + { type: 'text', text: '描述图片' }, + { type: 'image_url', image_url: { url: 'https://example.com/a.png' } }, + ]) + }) + it('StepFun 3.7 支持 data URL 与 stepfile provider_file', async () => { + const adapter = createOpenAICompatibleAdapter({ + apiKey: 'test-key', + baseURL: 'https://api.stepfun.com/step_plan/v1', + model: 'step-3.7-flash', + maxContextTokens: 128_000, + }) + + await adapter.complete([{ role: 'user', content: '比较', parts: [ + { kind: 'image', source: { type: 'data', mediaType: 'image/png', data: 'abc123' } }, + { kind: 'video', source: { type: 'provider_file', provider: 'stepfun', fileId: 'file_123' } }, + ] }]) + + expect(createCompletion.mock.calls[0]![0].messages[0].content).toEqual([ + { type: 'text', text: '比较' }, + { type: 'image_url', image_url: { url: 'data:image/png;base64,abc123' } }, + { type: 'video_url', video_url: { url: 'stepfile://file_123' } }, + ]) + }) + + it('StepFun 3.7 将已解析 file part 转成文本块并保留原始用户问题', async () => { + const adapter = createOpenAICompatibleAdapter({ + apiKey: 'test-key', + baseURL: 'https://api.stepfun.com/step_plan/v1', + model: 'step-3.7-flash', + maxContextTokens: 128_000, + }) + + await adapter.complete([{ role: 'user', content: '总结重点', parts: [ + { + kind: 'file', + source: { type: 'kitkit_file', fileId: 'mmf_1', objectKey: 'multimodal/doc.pdf', backend: 's3' }, + filename: 'report.pdf', + mediaType: 'application/pdf', + extraction: { provider: 'stepfun', fileId: 'file-C0DD', status: 'success', content: '第一章:项目概况' }, + }, + ] }]) + + const sentMessages = createCompletion.mock.calls[0]![0].messages + expect(sentMessages[0].content).toEqual([ + { type: 'text', text: '总结重点' }, + { + type: 'text', + text: [ + '用户上传了文档:report.pdf', + '', + '第一章:项目概况', + '', + ].join('\n'), + }, + ]) + }) + + it('非 StepFun 3.7 模型收到 parts 时明确报错', async () => { + const adapter = createOpenAICompatibleAdapter({ + apiKey: 'test-key', + baseURL: 'https://api.example.com/v1', + model: 'other-model', + maxContextTokens: 128_000, + }) + + await expect(adapter.complete([{ role: 'user', content: '看图', parts: [ + { kind: 'image', source: { type: 'url', url: 'https://example.com/a.png' } }, + ] }])).rejects.toThrow('Multimodal content parts are only supported for StepFun step-3.7-flash') + expect(createCompletion).not.toHaveBeenCalled() + }) + it('signal 透传到 SDK request options', async () => { const adapter = createOpenAICompatibleAdapter({ apiKey: 'test-key', diff --git a/packages/session/src/__tests__/turn.test.ts b/packages/session/src/__tests__/turn.test.ts index 1a445c3..e8c9ebc 100644 --- a/packages/session/src/__tests__/turn.test.ts +++ b/packages/session/src/__tests__/turn.test.ts @@ -73,6 +73,37 @@ describe('send() 契约', () => { expect(secondCall[4]!.content).toBe('问题2') }) + it('send() 支持当前 turn 多模态 parts,持久化但不在后续历史中重复回放', async () => { + const capturedMessages: Message[][] = [] + const llm = createMockLLM([{ content: '看到了' }, { content: '继续' }]) + const originalComplete = llm.complete.bind(llm) + llm.complete = async (msgs, options) => { + capturedMessages.push(msgs.map((msg) => ({ ...msg, parts: msg.parts ? [...msg.parts] : undefined }))) + return originalComplete(msgs, options) + } + const { session } = await makeSession({ llm }) + const parts: Message['parts'] = [ + { kind: 'image', source: { type: 'url', url: 'https://example.com/a.png' }, detail: 'high' }, + ] + + await session.send({ text: '描述这张图', parts }) + + const firstUserMessage = capturedMessages[0]!.find((message) => message.role === 'user')! + expect(firstUserMessage.content).toBe('描述这张图') + expect(firstUserMessage.parts).toEqual(parts) + + const persistedAfterFirstTurn = await session.messages() + expect(persistedAfterFirstTurn[0]).toMatchObject({ role: 'user', content: '描述这张图', parts }) + + await session.send('继续分析') + + const secondCall = capturedMessages[1]! + const historicalUser = secondCall.find((message) => message.role === 'user' && message.content === '描述这张图')! + const currentUser = secondCall.find((message) => message.role === 'user' && message.content === '继续分析')! + expect(historicalUser.parts).toBeUndefined() + expect(currentUser.parts).toBeUndefined() + }) + it('send() 返回 toolCalls 时透传', async () => { const responseWithTools: LLMResult = { content: null, diff --git a/packages/session/src/adapters/openai-compatible.ts b/packages/session/src/adapters/openai-compatible.ts index 22ad1ee..e109da0 100644 --- a/packages/session/src/adapters/openai-compatible.ts +++ b/packages/session/src/adapters/openai-compatible.ts @@ -1,7 +1,7 @@ import OpenAI from 'openai' import type { ChatCompletion, ChatCompletionChunk } from 'openai/resources/chat/completions' import type { Stream } from 'openai/streaming' -import type { LLMAdapter, LLMResult, Message, LLMCompleteOptions } from '../types/llm.js' +import type { ContentPart, LLMAdapter, LLMResult, Message, LLMCompleteOptions } from '../types/llm.js' type ChatToolCallDelta = NonNullable< NonNullable[number] @@ -23,6 +23,8 @@ export interface OpenAICompatibleOptions { * 在中途被截断,引发上层 JSON 解析失败。 */ maxOutputTokens?: number + /** 将 KitKit 托管的多模态文件转成模型服务可访问的 URL。 */ + resolveMediaUrl?: (source: Extract['source'], { type: 'kitkit_file' }>) => string | Promise } /** 合并连续的 system 消息,兼容只接受单条 system 的提供方。 */ @@ -41,6 +43,89 @@ function mergeConsecutiveSystemMessages(messages: Message[]): Message[] { return merged } +function isStepFun37Flash(options: OpenAICompatibleOptions): boolean { + // StepFun 的不同套餐/入口可能使用不同 baseURL(如 /v1 与 /step_plan/v1)。 + // 多模态能力跟随模型名判断,不能把能力限定死在某一个 endpoint。 + return options.model === 'step-3.7-flash' +} + +function escapeDocumentAttribute(value: string): string { + return value.replace(/&/g, '&').replace(/"/g, '"').replace(//g, '>') +} + +function renderExtractedFilePart(part: Extract): string { + const filename = part.filename || '未命名文件' + const mediaType = part.mediaType || 'application/octet-stream' + const content = part.extraction?.content + if (!content) { + throw new Error('StepFun file content part requires extracted text content before reaching the OpenAI-compatible adapter') + } + return [ + `用户上传了文档:${filename}`, + ``, + content, + '', + ].join('\n') +} + +async function sourceToURL(part: Extract, options: OpenAICompatibleOptions): Promise { + const source = part.source + if (source.type === 'url') return source.url + if (source.type === 'data') return `data:${source.mediaType};base64,${source.data}` + if (source.type === 'provider_file') { + if (source.uri) return source.uri + if (source.provider === 'stepfun') return `stepfile://${source.fileId}` + throw new Error(`Unsupported provider_file source provider: ${source.provider}`) + } + if (source.type === 'kitkit_file') { + if (options.resolveMediaUrl) return options.resolveMediaUrl(source) + if (source.url && /^https?:\/\//.test(source.url)) return source.url + throw new Error('kitkit_file source must be converted to a model-readable URL before reaching the OpenAI-compatible adapter') + } + const unreachable = source as never + throw new Error(`Unsupported media source: ${JSON.stringify(unreachable)}`) +} + +async function toOpenAIContent(message: Message, allowMultimodal: boolean, options: OpenAICompatibleOptions): Promise>> { + if (!message.parts || message.parts.length === 0) return message.content + if (!allowMultimodal) { + throw new Error('Multimodal content parts are only supported for StepFun step-3.7-flash in this adapter') + } + if (message.role !== 'user') { + throw new Error(`Multimodal content parts are only supported on user messages, got role=${message.role}`) + } + + const blocks: Array> = [] + const hasTextPart = message.parts.some((part) => part.kind === 'text') + if (message.content && !hasTextPart) { + blocks.push({ type: 'text', text: message.content }) + } + + for (const part of message.parts) { + if (part.kind === 'text') { + blocks.push({ type: 'text', text: part.text }) + continue + } + if (part.kind === 'image') { + const imageUrl: Record = { url: await sourceToURL(part, options) } + if (part.detail && part.detail !== 'auto') imageUrl.detail = part.detail + blocks.push({ type: 'image_url', image_url: imageUrl }) + continue + } + if (part.kind === 'video') { + blocks.push({ type: 'video_url', video_url: { url: await sourceToURL(part, options) } }) + continue + } + if (part.kind === 'file') { + blocks.push({ type: 'text', text: renderExtractedFilePart(part) }) + continue + } + throw new Error(`Unsupported multimodal content part kind: ${part.kind}`) + } + + return blocks.length > 0 ? blocks : message.content +} + /** 创建 OpenAI 兼容协议的 LLMAdapter,可对接 MiniMax / DeepSeek / OpenAI 等 */ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions): LLMAdapter { const client = new OpenAI({ @@ -49,8 +134,9 @@ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions): }) /** 构建公共请求参数 */ - function buildParams(messages: Message[], completeOptions?: LLMCompleteOptions) { + async function buildParams(messages: Message[], completeOptions?: LLMCompleteOptions) { const normalizedMessages = mergeConsecutiveSystemMessages(messages) + const allowMultimodal = isStepFun37Flash(options) return { model: options.model, max_tokens: completeOptions?.maxTokens ?? options.maxOutputTokens ?? 4096, @@ -67,9 +153,9 @@ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions): })), } : {}), - messages: normalizedMessages.map((m) => ({ + messages: await Promise.all(normalizedMessages.map(async (m) => ({ role: m.role as 'system' | 'user' | 'assistant' | 'tool', - content: m.content, + content: await toOpenAIContent(m, allowMultimodal, options), ...(m.role === 'tool' && m.toolCallId ? { tool_call_id: m.toolCallId } : {}), ...(m.role === 'assistant' && m.reasoningContent ? { reasoning_content: m.reasoningContent } @@ -86,7 +172,7 @@ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions): })), } : {}), - })), + }))), } } @@ -95,7 +181,7 @@ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions): async complete(messages: Message[], completeOptions?: LLMCompleteOptions): Promise { const response = await client.chat.completions.create( { - ...buildParams(messages, completeOptions), + ...(await buildParams(messages, completeOptions)), ...(options.extraBody ?? {}), stream: false, } as Parameters[0], @@ -131,7 +217,7 @@ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions): async *stream(messages: Message[], completeOptions?: LLMCompleteOptions) { const stream = await client.chat.completions.create( { - ...buildParams(messages, completeOptions), + ...(await buildParams(messages, completeOptions)), ...(options.extraBody ?? {}), stream: true, } as Parameters[0], diff --git a/packages/session/src/context-utils.ts b/packages/session/src/context-utils.ts index b7eb86b..f017770 100644 --- a/packages/session/src/context-utils.ts +++ b/packages/session/src/context-utils.ts @@ -45,6 +45,14 @@ export function removeIncompleteToolCallGroups(records: Message[]): Message[] { return result } +function stripMultimodalParts(messages: Message[]): Message[] { + return messages.map((message) => { + if (!message.parts) return message + const { parts: _parts, ...rest } = message + return rest + }) +} + /** 内置默认压缩提示词 */ const BUILTIN_COMPRESS_PROMPT = `你是对话压缩助手。请将以下对话历史压缩为一段简洁的摘要,保留关键上下文信息。 要求: @@ -181,6 +189,7 @@ export async function assembleSessionContext( label?: string, sharedMemoryContext?: string, topologyContext?: string, + userParts?: Message['parts'], ): Promise { const prefixMessages: Message[] = [] let insightConsumed = false @@ -212,10 +221,15 @@ export async function assembleSessionContext( } const userTimestamp = new Date().toISOString() - const userMessage: Message = { role: 'user', content: userContent, timestamp: userTimestamp } + const userMessage: Message = { + role: 'user', + content: userContent, + timestamp: userTimestamp, + ...(userParts && userParts.length > 0 ? { parts: userParts } : {}), + } // 净化历史:移除中断/崩溃残留的不完整 tool call 组,保证送给 LLM 的 prompt 协议合法 - const history = removeIncompleteToolCallGroups(await storage.listRecords(sessionId)) + const history = stripMultimodalParts(removeIncompleteToolCallGroups(await storage.listRecords(sessionId))) // 估算全量 token 数 const fullMessages = [...prefixMessages, ...history, userMessage] diff --git a/packages/session/src/create-session.ts b/packages/session/src/create-session.ts index 0fd5a55..304ebb0 100644 --- a/packages/session/src/create-session.ts +++ b/packages/session/src/create-session.ts @@ -1,5 +1,5 @@ import { randomUUID } from 'node:crypto' -import type { Session, MessageQueryOptions, SessionSendOptions } from './types/session-api.js' +import type { Session, MessageQueryOptions, SessionInput, SessionSendOptions } from './types/session-api.js' import { SessionArchivedError } from './types/session-api.js' import type { SessionMeta, SessionMetaUpdate, ForkOptions } from './types/session.js' import type { Message } from './types/llm.js' @@ -49,6 +49,14 @@ function attachTurnMetadata(records: Message[], turnId: string): Message[] { })) } +function normalizeSessionInput(input: string | SessionInput): { text: string; parts?: Message['parts'] } { + if (typeof input === 'string') return { text: input } + return { + text: input.text, + ...(input.parts && input.parts.length > 0 ? { parts: input.parts } : {}), + } +} + /** 把 tool 执行结果序列化为 tool message content,对齐 OpenAI/Anthropic 标准(只含结果数据)。 */ function serializeToolResultContent(result: ToolResultEnvelope['toolResults'][number]): string { if (!result.success) { @@ -59,6 +67,14 @@ function serializeToolResultContent(result: ToolResultEnvelope['toolResults'][nu return JSON.stringify(result.data) } +function stripMultimodalParts(records: Message[]): Message[] { + return records.map((record) => { + if (!record.parts) return record + const { parts: _parts, ...rest } = record + return rest + }) +} + /** 为 toolResults continuation 组装固定上下文与历史。 */ async function assembleSessionReplayContext( sessionId: string, @@ -99,7 +115,7 @@ async function assembleSessionReplayContext( // 注意:此处刻意不调用 removeIncompleteToolCallGroups。 // replay 路径会把"assistant(toolCalls) + 由 envelope 合成的 tool 消息"拼接成完整组, // 在加载阶段过早裁剪反而会把回灌目标删掉。完整组校验放在拼接后由调用方做。 - const history = await storage.listRecords(sessionId) + const history = stripMultimodalParts(await storage.listRecords(sessionId)) messages.push(...history) return { messages, insightConsumed } } @@ -204,7 +220,7 @@ function buildSession( return currentMeta }, - async send(content: string, sendOptions?: SessionSendOptions): Promise { + async send(input: string | SessionInput, sendOptions?: SessionSendOptions): Promise { if (currentMeta.status === 'archived') { throw new SessionArchivedError(currentMeta.id) } @@ -213,6 +229,8 @@ function buildSession( } // pre-flight:已 abort 的 signal 立即抛出,不发起任何 LLM 请求 sendOptions?.signal?.throwIfAborted() + const normalizedInput = normalizeSessionInput(input) + const content = normalizedInput.text // 组装上下文(自动压缩) const assembled = await assembleSessionContext( @@ -221,6 +239,7 @@ function buildSession( currentMeta.label, sendOptions?.sharedMemoryContext, sendOptions?.topologyContext, + normalizedInput.parts, ) persistAndApplyCompressionCache(assembled.compressionCache) @@ -230,7 +249,12 @@ function buildSession( } let promptMessages = assembled.messages - let recordsToPersist: Message[] = [{ role: 'user', content, timestamp: assembled.userTimestamp }] + let recordsToPersist: Message[] = [{ + role: 'user', + content, + timestamp: assembled.userTimestamp, + ...(normalizedInput.parts ? { parts: normalizedInput.parts } : {}), + }] const toolEnvelope = parseToolResultEnvelope(content) if (toolEnvelope) { const replayContext = await assembleSessionReplayContext(currentMeta.id, storage, currentMeta.label, sendOptions?.sharedMemoryContext, sendOptions?.topologyContext) @@ -279,7 +303,7 @@ function buildSession( } }, - stream(content: string, sendOptions?: SessionSendOptions): StreamResult { + stream(input: string | SessionInput, sendOptions?: SessionSendOptions): StreamResult { if (currentMeta.status === 'archived') { throw new SessionArchivedError(currentMeta.id) } @@ -290,6 +314,8 @@ function buildSession( return createStreamResult(async (push) => { // pre-flight:已 abort 的 signal 立即让 result reject,processor 不进入下游 sendOptions?.signal?.throwIfAborted() + const normalizedInput = normalizeSessionInput(input) + const content = normalizedInput.text // 组装上下文(自动压缩) const assembled = await assembleSessionContext( @@ -298,6 +324,7 @@ function buildSession( currentMeta.label, sendOptions?.sharedMemoryContext, sendOptions?.topologyContext, + normalizedInput.parts, ) persistAndApplyCompressionCache(assembled.compressionCache) @@ -307,7 +334,12 @@ function buildSession( } let promptMessages = assembled.messages - let recordsToPersist: Message[] = [{ role: 'user', content, timestamp: assembled.userTimestamp }] + let recordsToPersist: Message[] = [{ + role: 'user', + content, + timestamp: assembled.userTimestamp, + ...(normalizedInput.parts ? { parts: normalizedInput.parts } : {}), + }] const toolEnvelope = parseToolResultEnvelope(content) if (toolEnvelope) { const replayContext = await assembleSessionReplayContext(currentMeta.id, storage, currentMeta.label, sendOptions?.sharedMemoryContext, sendOptions?.topologyContext) diff --git a/packages/session/src/index.ts b/packages/session/src/index.ts index 1e545f7..382fd3a 100644 --- a/packages/session/src/index.ts +++ b/packages/session/src/index.ts @@ -2,10 +2,12 @@ export type { SessionMeta, SessionMetaUpdate, SessionFilter, ForkOptions, ForkContextFn } from './types/session.js' export type { SessionStorage, ListRecordsOptions, CompressionCacheSnapshot } from './types/storage.js' export type { - Message, ToolCall, LLMCompleteOptions, LLMResult, LLMChunk, LLMAdapter, + Message, ContentPart, TextPart, ImagePart, VideoPart, FilePart, AudioPart, MediaSource, + ToolCall, LLMCompleteOptions, LLMResult, LLMChunk, LLMAdapter, } from './types/llm.js' export type { Session, + SessionInput, MessageQueryOptions, SessionSendOptions, } from './types/session-api.js' diff --git a/packages/session/src/types/llm.ts b/packages/session/src/types/llm.ts index 8871fea..7e45afa 100644 --- a/packages/session/src/types/llm.ts +++ b/packages/session/src/types/llm.ts @@ -2,6 +2,8 @@ export interface Message { role: 'system' | 'user' | 'assistant' | 'tool' content: string + /** Provider 中性的多模态内容块。content 仍是文本投影;adapter 可按需使用 parts。 */ + parts?: ContentPart[] /** 推理模型的思考内容(stepFun/DeepSeek 等),仅 role=assistant 时有效 */ reasoningContent?: string /** assistant 发起的工具调用列表,仅 role=assistant 时有效 */ @@ -14,6 +16,65 @@ export interface Message { metadata?: Record } +export type ContentPart = TextPart | ImagePart | VideoPart | FilePart | AudioPart + +export interface TextPart { + kind: 'text' + text: string +} + +export interface ImagePart { + kind: 'image' + source: MediaSource + detail?: 'low' | 'high' | 'auto' + altText?: string + filename?: string + mediaType?: string + sizeBytes?: number +} + +export interface VideoPart { + kind: 'video' + source: MediaSource + filename?: string + mediaType?: string + durationSeconds?: number + sizeBytes?: number +} + +export interface FilePart { + kind: 'file' + source: MediaSource + filename?: string + mediaType?: string + sizeBytes?: number + /** Parsed document text supplied by an upstream document extraction service. */ + extraction?: DocumentExtraction +} + +export interface DocumentExtraction { + provider: 'stepfun' + fileId: string + status?: 'processed' | 'success' | 'failed' + content?: string + contentChars?: number +} + +export interface AudioPart { + kind: 'audio' + source: MediaSource + filename?: string + mediaType?: string + durationSeconds?: number + sizeBytes?: number +} + +export type MediaSource = + | { type: 'url'; url: string } + | { type: 'data'; mediaType: string; data: string } + | { type: 'provider_file'; provider: string; fileId: string; uri?: string } + | { type: 'kitkit_file'; fileId: string; objectKey: string; backend: 'local' | 's3'; bucket?: string; url?: string } + /** LLM 返回的工具调用请求 */ export interface ToolCall { id: string diff --git a/packages/session/src/types/session-api.ts b/packages/session/src/types/session-api.ts index 7713b2a..bfb0299 100644 --- a/packages/session/src/types/session-api.ts +++ b/packages/session/src/types/session-api.ts @@ -1,5 +1,5 @@ import type { SessionMeta, SessionMetaUpdate, ForkOptions } from './session.js' -import type { Message, LLMAdapter, LLMCompleteOptions } from './llm.js' +import type { ContentPart, Message, LLMAdapter, LLMCompleteOptions } from './llm.js' import type { SendResult, StreamResult } from './functions.js' @@ -31,6 +31,12 @@ export interface SessionSendOptions { topologyContext?: string } +/** Session.send / Session.stream 的对象输入。text 是持久文本投影,parts 是当前 turn 的多模态源数据。 */ +export interface SessionInput { + text: string + parts?: ContentPart[] +} + /** Session 错误:操作归档中的 Session */ export class SessionArchivedError extends Error { constructor(sessionId: string) { @@ -56,10 +62,10 @@ export interface Session { readonly meta: Readonly /** 发送一条消息:组装上下文 → 调 LLM → 存 L3(用户消息 + LLM 响应)→ 返回结果 */ - send(content: string, options?: SessionSendOptions): Promise + send(input: string | SessionInput, options?: SessionSendOptions): Promise /** 流式发送:同 send() 但逐 chunk 输出,流结束后自动存 L3 */ - stream(content: string, options?: SessionSendOptions): StreamResult + stream(input: string | SessionInput, options?: SessionSendOptions): StreamResult /** 读取 L3 对话记录 */ messages(options?: MessageQueryOptions): Promise