diff --git a/packages/core/src/adapters/__tests__/session-runtime.test.ts b/packages/core/src/adapters/__tests__/session-runtime.test.ts
index 647f014..b5d962b 100644
--- a/packages/core/src/adapters/__tests__/session-runtime.test.ts
+++ b/packages/core/src/adapters/__tests__/session-runtime.test.ts
@@ -224,7 +224,7 @@ describe('session-runtime adapters', () => {
         topologyContextProvider: provider,
       });
       await runtime.send('hi');
-      const [, opts] = session.send.mock.calls[0];
+      const [, opts] = session.send.mock.calls[0]!;
       expect(opts).not.toHaveProperty('topologyContext');
     });
 
@@ -240,7 +240,7 @@ describe('session-runtime adapters', () => {
         topologyContextProvider: async () => '',
       });
       await runtime.send('hi');
-      const [, opts] = session.send.mock.calls[0];
+      const [, opts] = session.send.mock.calls[0]!;
       expect(opts).not.toHaveProperty('topologyContext');
     });
 
diff --git a/packages/core/src/adapters/session-runtime.ts b/packages/core/src/adapters/session-runtime.ts
index a104ef1..e873891 100644
--- a/packages/core/src/adapters/session-runtime.ts
+++ b/packages/core/src/adapters/session-runtime.ts
@@ -1,4 +1,4 @@
-import type { ForkContextFn, LLMAdapter, LLMCompleteOptions } from '@stello-ai/session';
+import type { ForkContextFn, LLMAdapter, LLMCompleteOptions, SessionInput } from '@stello-ai/session';
 import type { EngineRuntimeSession } from '../engine/stello-engine';
 import type { ToolCallParser } from '../engine/turn-runner';
 
@@ -24,6 +24,8 @@ export interface SessionCompatibleSendResult {
   };
 }
 
+export type SessionCompatibleInput = string | SessionInput;
+
 /** 结构兼容 @stello-ai/session 的 consolidate 函数签名 */
 export type SessionCompatibleConsolidateFn = (
   currentMemory: string | null,
@@ -67,11 +69,11 @@ export interface SessionCompatible {
     status: 'active' | 'archived';
   };
   send(
-    content: string,
+    content: SessionCompatibleInput,
     options?: SessionCompatibleSendOptions,
   ): Promise<SessionCompatibleSendResult>;
   stream?(
-    content: string,
+    content: SessionCompatibleInput,
     options?: SessionCompatibleSendOptions,
   ): AsyncIterable<string> & { result: Promise<SessionCompatibleSendResult> };
   messages(): Promise<Array<{ role: string; content: string; timestamp?: string }>>;
@@ -170,7 +172,7 @@ export async function adaptSessionToEngineRuntime(
     get turnCount() {
       return turnCount;
     },
-    async send(input: string, sendOptions?: SessionCompatibleSendOptions): Promise<string> {
+    async send(input: SessionCompatibleInput, sendOptions?: SessionCompatibleSendOptions): Promise<string> {
       const sharedMemoryContext = await options.sharedMemoryContextProvider?.();
       const topologyContext = await options.topologyContextProvider?.(session.meta.id);
       const mergedOptions: SessionCompatibleSendOptions = {
@@ -193,7 +195,7 @@ export async function adaptSessionToEngineRuntime(
     },
     ...(session.stream
       ? {
-          stream(input: string, sendOptions?: SessionCompatibleSendOptions) {
+          stream(input: SessionCompatibleInput, sendOptions?: SessionCompatibleSendOptions) {
             const contextPromise = options.sharedMemoryContextProvider?.() ?? Promise.resolve(undefined);
             const topologyPromise = options.topologyContextProvider?.(session.meta.id) ?? Promise.resolve(undefined);
             const source = (async () => {
@@ -234,4 +236,3 @@ export async function adaptSessionToEngineRuntime(
       : {}),
   };
 }
-
diff --git a/packages/core/src/agent/stello-agent.ts b/packages/core/src/agent/stello-agent.ts
index 349f165..37d27d8 100644
--- a/packages/core/src/agent/stello-agent.ts
+++ b/packages/core/src/agent/stello-agent.ts
@@ -1,5 +1,5 @@
 import type { BootstrapResult } from '../types/lifecycle';
-import { TurnRunner, type ToolCallParser, type TurnRunnerOptions } from '../engine/turn-runner';
+import { TurnRunner, type ToolCallParser, type TurnInput, type TurnRunnerOptions } from '../engine/turn-runner';
 import type { EngineTurnResult } from '../engine/stello-engine';
 import type { EngineStreamResult } from '../engine/stello-engine';
 import {
@@ -447,7 +447,7 @@ export class StelloAgent {
   /** 在指定 session 上运行一轮对话 */
   turn(
     sessionId: string,
-    input: string,
+    input: TurnInput,
     options?: TurnRunnerOptions,
   ): Promise<EngineTurnResult> {
     return this.orchestrator.turn(sessionId, input, options);
@@ -456,7 +456,7 @@ export class StelloAgent {
   /** 在指定 session 上流式运行一轮对话 */
   stream(
     sessionId: string,
-    input: string,
+    input: TurnInput,
     options?: TurnRunnerOptions,
   ): Promise<EngineStreamResult> {
     return this.orchestrator.stream(sessionId, input, options);
diff --git a/packages/core/src/engine/__tests__/topology-render.test.ts b/packages/core/src/engine/__tests__/topology-render.test.ts
index b6ad5f0..4a79aed 100644
--- a/packages/core/src/engine/__tests__/topology-render.test.ts
+++ b/packages/core/src/engine/__tests__/topology-render.test.ts
@@ -3,7 +3,7 @@ import type { SessionTreeNode } from '../../types/session.js';
 import { renderTopologyMarkdown } from '../topology-render.js';
 
 function node(id: string, label: string, children: SessionTreeNode[] = [], status: 'active' | 'archived' = 'active'): SessionTreeNode {
-  return { id, label, parentId: null, status, children } as SessionTreeNode;
+  return { id, label, parentId: null, status, children, turnCount: 0 } as SessionTreeNode;
 }
 
 describe('renderTopologyMarkdown', () => {
diff --git a/packages/core/src/engine/stello-engine.ts b/packages/core/src/engine/stello-engine.ts
index d3ba168..916b176 100644
--- a/packages/core/src/engine/stello-engine.ts
+++ b/packages/core/src/engine/stello-engine.ts
@@ -33,8 +33,14 @@ import {
   type TurnRunnerOptions,
   type TurnRunnerResult,
   type TurnRunnerStreamResult,
+  type TurnInput,
 } from './turn-runner';
 
+
+function turnInputText(input: TurnInput): string {
+  return typeof input === 'string' ? input : input.text;
+}
+
 /** Engine 调用 session.send/stream 时的运行时选项 */
 export interface EngineRuntimeSessionCallOptions {
   /** AbortSignal — 透传给底层 session.send/stream 与 LLM 调用 */
@@ -54,10 +60,10 @@ export interface EngineRuntimeSession {
   /** 当前已完成轮次 */
   turnCount: number;
   /** 运行一次单条对话 */
-  send(input: string, options?: EngineRuntimeSessionCallOptions): Promise<string>;
+  send(input: TurnInput, options?: EngineRuntimeSessionCallOptions): Promise<string>;
   /** 可选：流式运行一次单条对话 */
   stream?(
-    input: string,
+    input: TurnInput,
     options?: EngineRuntimeSessionCallOptions,
   ): AsyncIterable<string> & { result: Promise<string> };
   /** fork 子 session，返回子 session 的 runtime */
@@ -216,9 +222,10 @@ export class StelloEngineImpl implements StelloEngine {
   }
 
   /** 处理一轮编排：当前 session send + tool loop + 调度 */
-  async turn(input: string, options?: TurnRunnerOptions): Promise<EngineTurnResult> {
-    this.fireHook('onMessageReceived', { sessionId: this.session.id, input });
-    this.fireHook('onRoundStart', { sessionId: this.session.id, input });
+  async turn(input: TurnInput, options?: TurnRunnerOptions): Promise<EngineTurnResult> {
+    const inputText = turnInputText(input);
+    this.fireHook('onMessageReceived', { sessionId: this.session.id, input: inputText });
+    this.fireHook('onRoundStart', { sessionId: this.session.id, input: inputText });
     let turn: TurnRunnerResult;
     try {
       turn = await this.turnRunner.run(this.session, input, this, {
@@ -238,20 +245,21 @@ export class StelloEngineImpl implements StelloEngine {
     }
     this.fireHook('onAssistantReply', {
       sessionId: this.session.id,
-      input,
+      input: inputText,
       content: turn.finalContent,
       rawResponse: turn.rawResponse,
     });
     this.fireHook('onRoundEnd', {
       sessionId: this.session.id,
-      input,
+      input: inputText,
       turn,
     });
     return { turn };
   }
 
   /** 流式处理一轮编排：先输出增量文本，完成后再返回完整 turn */
-  stream(input: string, options?: TurnRunnerOptions): EngineStreamResult {
+  stream(input: TurnInput, options?: TurnRunnerOptions): EngineStreamResult {
+    const inputText = turnInputText(input);
     const source: TurnRunnerStreamResult = this.turnRunner.runStream(this.session, input, this, {
       ...options,
       onToolCall: (toolCall) => {
@@ -265,8 +273,8 @@ export class StelloEngineImpl implements StelloEngine {
     });
 
     const result = (async () => {
-      this.fireHook('onMessageReceived', { sessionId: this.session.id, input });
-      this.fireHook('onRoundStart', { sessionId: this.session.id, input });
+      this.fireHook('onMessageReceived', { sessionId: this.session.id, input: inputText });
+      this.fireHook('onRoundStart', { sessionId: this.session.id, input: inputText });
 
       let turn: TurnRunnerResult;
       try {
@@ -278,13 +286,13 @@ export class StelloEngineImpl implements StelloEngine {
 
       this.fireHook('onAssistantReply', {
         sessionId: this.session.id,
-        input,
+        input: inputText,
         content: turn.finalContent,
         rawResponse: turn.rawResponse,
       });
       this.fireHook('onRoundEnd', {
         sessionId: this.session.id,
-        input,
+        input: inputText,
         turn,
       });
       return { turn };
diff --git a/packages/core/src/engine/turn-runner.ts b/packages/core/src/engine/turn-runner.ts
index 85ac3e3..e085a8d 100644
--- a/packages/core/src/engine/turn-runner.ts
+++ b/packages/core/src/engine/turn-runner.ts
@@ -1,5 +1,8 @@
+import type { SessionInput } from '@stello-ai/session';
 import type { ToolExecutionResult } from '../types/lifecycle';
 
+export type TurnInput = string | SessionInput;
+
 /**
  * 在单轮内并行执行所有 tool call，按输入顺序整理结果并按序触发事件。
  *
@@ -90,10 +93,10 @@ export interface TurnRunnerSession {
   /** Session 标识 */
   id: string;
   /** 执行一次单条对话 */
-  send(input: string, options?: TurnRunnerSessionCallOptions): Promise<string>;
+  send(input: TurnInput, options?: TurnRunnerSessionCallOptions): Promise<string>;
   /** 可选：流式执行一次单条对话 */
   stream?(
-    input: string,
+    input: TurnInput,
     options?: TurnRunnerSessionCallOptions,
   ): AsyncIterable<string> & { result: Promise<string> };
 }
@@ -185,12 +188,12 @@ export class TurnRunner {
    */
   async run(
     session: TurnRunnerSession,
-    input: string,
+    input: TurnInput,
     tools: TurnRunnerToolExecutor,
     options: TurnRunnerOptions = {},
   ): Promise<TurnRunnerResult> {
     const maxToolRounds = options.maxToolRounds ?? 5;
-    let currentInput = input;
+    let currentInput: TurnInput = input;
     let toolRoundCount = 0;
     let toolCallsExecuted = 0;
     let lastRawResponse = '';
@@ -231,7 +234,7 @@ export class TurnRunner {
    */
   runStream(
     session: TurnRunnerSession,
-    input: string,
+    input: TurnInput,
     tools: TurnRunnerToolExecutor,
     options: TurnRunnerOptions = {},
   ): TurnRunnerStreamResult {
diff --git a/packages/core/src/orchestrator/session-orchestrator.ts b/packages/core/src/orchestrator/session-orchestrator.ts
index b7a7cc0..7b75cda 100644
--- a/packages/core/src/orchestrator/session-orchestrator.ts
+++ b/packages/core/src/orchestrator/session-orchestrator.ts
@@ -3,15 +3,15 @@ import type { BootstrapResult } from '../types/lifecycle';
 import type { StelloEngine, EngineForkOptions } from '../types/engine';
 import type { EngineTurnResult } from '../engine/stello-engine';
 import type { EngineStreamResult } from '../engine/stello-engine';
-import type { TurnRunnerOptions } from '../engine/turn-runner';
+import type { TurnInput, TurnRunnerOptions } from '../engine/turn-runner';
 import type { EngineRuntimeManager } from './engine-runtime-manager';
 
 /** Orchestrator 对 Engine 的最小依赖 */
 export interface OrchestratorEngine extends StelloEngine {
   /** 运行当前 session 的一轮对话 */
-  turn(input: string, options?: TurnRunnerOptions): Promise<EngineTurnResult>;
+  turn(input: TurnInput, options?: TurnRunnerOptions): Promise<EngineTurnResult>;
   /** 流式运行当前 session 的一轮对话 */
-  stream(input: string, options?: TurnRunnerOptions): EngineStreamResult;
+  stream(input: TurnInput, options?: TurnRunnerOptions): EngineStreamResult;
   /** 归档当前绑定 session */
   archiveSession(): Promise<{ sessionId: string }>;
   /** 从当前绑定 session 发起 fork */
@@ -59,7 +59,7 @@ export class SessionOrchestrator {
   /** 在指定 session 上运行一轮对话 */
   async turn(
     sessionId: string,
-    input: string,
+    input: TurnInput,
     options?: TurnRunnerOptions,
   ): Promise<EngineTurnResult> {
     return this.runSerial(sessionId, async () => {
@@ -71,7 +71,7 @@ export class SessionOrchestrator {
   /** 在指定 session 上流式运行一轮对话 */
   async stream(
     sessionId: string,
-    input: string,
+    input: TurnInput,
     options?: TurnRunnerOptions,
   ): Promise<EngineStreamResult> {
     await this.requireSession(sessionId)
diff --git a/packages/core/src/types/engine.ts b/packages/core/src/types/engine.ts
index d485e00..8919d56 100644
--- a/packages/core/src/types/engine.ts
+++ b/packages/core/src/types/engine.ts
@@ -13,7 +13,7 @@ import type {
   ConfirmProtocol,
 } from './lifecycle';
 import type { EngineRuntimeSession, EngineStreamResult, EngineTurnResult } from '../engine/stello-engine';
-import type { TurnRunnerOptions } from '../engine/turn-runner';
+import type { TurnInput, TurnRunnerOptions } from '../engine/turn-runner';
 import type { ForkContextFn } from '@stello-ai/session';
 import type { SessionConfig } from './session-config';
 
@@ -85,9 +85,9 @@ export interface StelloEngine {
   /** 离开当前绑定 Session 的整轮对话 */
   leaveSession(): Promise<{ sessionId: string }>;
   /** 流式处理当前绑定 Session 的一轮对话 */
-  stream(input: string, options?: TurnRunnerOptions): EngineStreamResult;
+  stream(input: TurnInput, options?: TurnRunnerOptions): EngineStreamResult;
   /** 非流式处理当前绑定 Session 的一轮对话 */
-  turn?(input: string, options?: TurnRunnerOptions): Promise<EngineTurnResult>;
+  turn?(input: TurnInput, options?: TurnRunnerOptions): Promise<EngineTurnResult>;
   /** 导出 Agent Tool 定义（兼容 OpenAI / Claude tool use） */
   getToolDefinitions(): ToolDefinition[];
   /** 执行 Agent Tool */
diff --git a/packages/session/src/__tests__/openai-compatible.test.ts b/packages/session/src/__tests__/openai-compatible.test.ts
index 4355a38..6b7644d 100644
--- a/packages/session/src/__tests__/openai-compatible.test.ts
+++ b/packages/session/src/__tests__/openai-compatible.test.ts
@@ -166,6 +166,113 @@ describe('createOpenAICompatibleAdapter', () => {
     expect(result.reasoningContent).toBeUndefined()
   })
 
+  it('StepFun 3.7 将 image/video parts 转成 Chat Completions 多模态 content', async () => {
+    const adapter = createOpenAICompatibleAdapter({
+      apiKey: 'test-key',
+      baseURL: 'https://api.stepfun.com/v1',
+      model: 'step-3.7-flash',
+      maxContextTokens: 128_000,
+    })
+
+    await adapter.complete([{ role: 'user', content: '看一下', parts: [
+      { kind: 'image', source: { type: 'url', url: 'https://example.com/a.png' }, detail: 'high' },
+      { kind: 'video', source: { type: 'url', url: 'https://example.com/a.mp4' } },
+    ] }])
+
+    const sentMessages = createCompletion.mock.calls[0]![0].messages
+    expect(sentMessages[0]).toEqual({
+      role: 'user',
+      content: [
+        { type: 'text', text: '看一下' },
+        { type: 'image_url', image_url: { url: 'https://example.com/a.png', detail: 'high' } },
+        { type: 'video_url', video_url: { url: 'https://example.com/a.mp4' } },
+      ],
+    })
+  })
+  it('StepFun 3.7 多模态能力不绑定固定 baseURL', async () => {
+    const adapter = createOpenAICompatibleAdapter({
+      apiKey: 'test-key',
+      baseURL: 'https://api.stepfun.com/step_plan/v1',
+      model: 'step-3.7-flash',
+      maxContextTokens: 128_000,
+    })
+
+    await adapter.complete([{ role: 'user', content: '描述图片', parts: [
+      { kind: 'image', source: { type: 'url', url: 'https://example.com/a.png' } },
+    ] }])
+
+    expect(createCompletion.mock.calls[0]![0].messages[0].content).toEqual([
+      { type: 'text', text: '描述图片' },
+      { type: 'image_url', image_url: { url: 'https://example.com/a.png' } },
+    ])
+  })
+  it('StepFun 3.7 支持 data URL 与 stepfile provider_file', async () => {
+    const adapter = createOpenAICompatibleAdapter({
+      apiKey: 'test-key',
+      baseURL: 'https://api.stepfun.com/step_plan/v1',
+      model: 'step-3.7-flash',
+      maxContextTokens: 128_000,
+    })
+
+    await adapter.complete([{ role: 'user', content: '比较', parts: [
+      { kind: 'image', source: { type: 'data', mediaType: 'image/png', data: 'abc123' } },
+      { kind: 'video', source: { type: 'provider_file', provider: 'stepfun', fileId: 'file_123' } },
+    ] }])
+
+    expect(createCompletion.mock.calls[0]![0].messages[0].content).toEqual([
+      { type: 'text', text: '比较' },
+      { type: 'image_url', image_url: { url: 'data:image/png;base64,abc123' } },
+      { type: 'video_url', video_url: { url: 'stepfile://file_123' } },
+    ])
+  })
+
+  it('StepFun 3.7 将已解析 file part 转成文本块并保留原始用户问题', async () => {
+    const adapter = createOpenAICompatibleAdapter({
+      apiKey: 'test-key',
+      baseURL: 'https://api.stepfun.com/step_plan/v1',
+      model: 'step-3.7-flash',
+      maxContextTokens: 128_000,
+    })
+
+    await adapter.complete([{ role: 'user', content: '总结重点', parts: [
+      {
+        kind: 'file',
+        source: { type: 'kitkit_file', fileId: 'mmf_1', objectKey: 'multimodal/doc.pdf', backend: 's3' },
+        filename: 'report.pdf',
+        mediaType: 'application/pdf',
+        extraction: { provider: 'stepfun', fileId: 'file-C0DD', status: 'success', content: '第一章：项目概况' },
+      },
+    ] }])
+
+    const sentMessages = createCompletion.mock.calls[0]![0].messages
+    expect(sentMessages[0].content).toEqual([
+      { type: 'text', text: '总结重点' },
+      {
+        type: 'text',
+        text: [
+          '用户上传了文档：report.pdf',
+          '<document filename="report.pdf" media_type="application/pdf">',
+          '第一章：项目概况',
+          '</document>',
+        ].join('\n'),
+      },
+    ])
+  })
+
+  it('非 StepFun 3.7 模型收到 parts 时明确报错', async () => {
+    const adapter = createOpenAICompatibleAdapter({
+      apiKey: 'test-key',
+      baseURL: 'https://api.example.com/v1',
+      model: 'other-model',
+      maxContextTokens: 128_000,
+    })
+
+    await expect(adapter.complete([{ role: 'user', content: '看图', parts: [
+      { kind: 'image', source: { type: 'url', url: 'https://example.com/a.png' } },
+    ] }])).rejects.toThrow('Multimodal content parts are only supported for StepFun step-3.7-flash')
+    expect(createCompletion).not.toHaveBeenCalled()
+  })
+
   it('signal 透传到 SDK request options', async () => {
     const adapter = createOpenAICompatibleAdapter({
       apiKey: 'test-key',
diff --git a/packages/session/src/__tests__/turn.test.ts b/packages/session/src/__tests__/turn.test.ts
index 1a445c3..e8c9ebc 100644
--- a/packages/session/src/__tests__/turn.test.ts
+++ b/packages/session/src/__tests__/turn.test.ts
@@ -73,6 +73,37 @@ describe('send() 契约', () => {
     expect(secondCall[4]!.content).toBe('问题2')
   })
 
+  it('send() 支持当前 turn 多模态 parts，持久化但不在后续历史中重复回放', async () => {
+    const capturedMessages: Message[][] = []
+    const llm = createMockLLM([{ content: '看到了' }, { content: '继续' }])
+    const originalComplete = llm.complete.bind(llm)
+    llm.complete = async (msgs, options) => {
+      capturedMessages.push(msgs.map((msg) => ({ ...msg, parts: msg.parts ? [...msg.parts] : undefined })))
+      return originalComplete(msgs, options)
+    }
+    const { session } = await makeSession({ llm })
+    const parts: Message['parts'] = [
+      { kind: 'image', source: { type: 'url', url: 'https://example.com/a.png' }, detail: 'high' },
+    ]
+
+    await session.send({ text: '描述这张图', parts })
+
+    const firstUserMessage = capturedMessages[0]!.find((message) => message.role === 'user')!
+    expect(firstUserMessage.content).toBe('描述这张图')
+    expect(firstUserMessage.parts).toEqual(parts)
+
+    const persistedAfterFirstTurn = await session.messages()
+    expect(persistedAfterFirstTurn[0]).toMatchObject({ role: 'user', content: '描述这张图', parts })
+
+    await session.send('继续分析')
+
+    const secondCall = capturedMessages[1]!
+    const historicalUser = secondCall.find((message) => message.role === 'user' && message.content === '描述这张图')!
+    const currentUser = secondCall.find((message) => message.role === 'user' && message.content === '继续分析')!
+    expect(historicalUser.parts).toBeUndefined()
+    expect(currentUser.parts).toBeUndefined()
+  })
+
   it('send() 返回 toolCalls 时透传', async () => {
     const responseWithTools: LLMResult = {
       content: null,
diff --git a/packages/session/src/adapters/openai-compatible.ts b/packages/session/src/adapters/openai-compatible.ts
index 22ad1ee..e109da0 100644
--- a/packages/session/src/adapters/openai-compatible.ts
+++ b/packages/session/src/adapters/openai-compatible.ts
@@ -1,7 +1,7 @@
 import OpenAI from 'openai'
 import type { ChatCompletion, ChatCompletionChunk } from 'openai/resources/chat/completions'
 import type { Stream } from 'openai/streaming'
-import type { LLMAdapter, LLMResult, Message, LLMCompleteOptions } from '../types/llm.js'
+import type { ContentPart, LLMAdapter, LLMResult, Message, LLMCompleteOptions } from '../types/llm.js'
 
 type ChatToolCallDelta = NonNullable<
   NonNullable<ChatCompletionChunk['choices'][number]['delta']['tool_calls']>[number]
@@ -23,6 +23,8 @@ export interface OpenAICompatibleOptions {
    * 在中途被截断，引发上层 JSON 解析失败。
    */
   maxOutputTokens?: number
+  /** 将 KitKit 托管的多模态文件转成模型服务可访问的 URL。 */
+  resolveMediaUrl?: (source: Extract<Extract<ContentPart, { kind: 'image' | 'video' }>['source'], { type: 'kitkit_file' }>) => string | Promise<string>
 }
 
 /** 合并连续的 system 消息，兼容只接受单条 system 的提供方。 */
@@ -41,6 +43,89 @@ function mergeConsecutiveSystemMessages(messages: Message[]): Message[] {
   return merged
 }
 
+function isStepFun37Flash(options: OpenAICompatibleOptions): boolean {
+  // StepFun 的不同套餐/入口可能使用不同 baseURL（如 /v1 与 /step_plan/v1）。
+  // 多模态能力跟随模型名判断，不能把能力限定死在某一个 endpoint。
+  return options.model === 'step-3.7-flash'
+}
+
+function escapeDocumentAttribute(value: string): string {
+  return value.replace(/&/g, '&amp;').replace(/"/g, '&quot;').replace(/</g, '&lt;').replace(/>/g, '&gt;')
+}
+
+function renderExtractedFilePart(part: Extract<ContentPart, { kind: 'file' }>): string {
+  const filename = part.filename || '未命名文件'
+  const mediaType = part.mediaType || 'application/octet-stream'
+  const content = part.extraction?.content
+  if (!content) {
+    throw new Error('StepFun file content part requires extracted text content before reaching the OpenAI-compatible adapter')
+  }
+  return [
+    `用户上传了文档：${filename}`,
+    `<document filename="${escapeDocumentAttribute(filename)}" media_type="${escapeDocumentAttribute(mediaType)}">`,
+    content,
+    '</document>',
+  ].join('\n')
+}
+
+async function sourceToURL(part: Extract<ContentPart, { kind: 'image' | 'video' }>, options: OpenAICompatibleOptions): Promise<string> {
+  const source = part.source
+  if (source.type === 'url') return source.url
+  if (source.type === 'data') return `data:${source.mediaType};base64,${source.data}`
+  if (source.type === 'provider_file') {
+    if (source.uri) return source.uri
+    if (source.provider === 'stepfun') return `stepfile://${source.fileId}`
+    throw new Error(`Unsupported provider_file source provider: ${source.provider}`)
+  }
+  if (source.type === 'kitkit_file') {
+    if (options.resolveMediaUrl) return options.resolveMediaUrl(source)
+    if (source.url && /^https?:\/\//.test(source.url)) return source.url
+    throw new Error('kitkit_file source must be converted to a model-readable URL before reaching the OpenAI-compatible adapter')
+  }
+  const unreachable = source as never
+  throw new Error(`Unsupported media source: ${JSON.stringify(unreachable)}`)
+}
+
+async function toOpenAIContent(message: Message, allowMultimodal: boolean, options: OpenAICompatibleOptions): Promise<string | Array<Record<string, unknown>>> {
+  if (!message.parts || message.parts.length === 0) return message.content
+  if (!allowMultimodal) {
+    throw new Error('Multimodal content parts are only supported for StepFun step-3.7-flash in this adapter')
+  }
+  if (message.role !== 'user') {
+    throw new Error(`Multimodal content parts are only supported on user messages, got role=${message.role}`)
+  }
+
+  const blocks: Array<Record<string, unknown>> = []
+  const hasTextPart = message.parts.some((part) => part.kind === 'text')
+  if (message.content && !hasTextPart) {
+    blocks.push({ type: 'text', text: message.content })
+  }
+
+  for (const part of message.parts) {
+    if (part.kind === 'text') {
+      blocks.push({ type: 'text', text: part.text })
+      continue
+    }
+    if (part.kind === 'image') {
+      const imageUrl: Record<string, unknown> = { url: await sourceToURL(part, options) }
+      if (part.detail && part.detail !== 'auto') imageUrl.detail = part.detail
+      blocks.push({ type: 'image_url', image_url: imageUrl })
+      continue
+    }
+    if (part.kind === 'video') {
+      blocks.push({ type: 'video_url', video_url: { url: await sourceToURL(part, options) } })
+      continue
+    }
+    if (part.kind === 'file') {
+      blocks.push({ type: 'text', text: renderExtractedFilePart(part) })
+      continue
+    }
+    throw new Error(`Unsupported multimodal content part kind: ${part.kind}`)
+  }
+
+  return blocks.length > 0 ? blocks : message.content
+}
+
 /** 创建 OpenAI 兼容协议的 LLMAdapter，可对接 MiniMax / DeepSeek / OpenAI 等 */
 export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions): LLMAdapter {
   const client = new OpenAI({
@@ -49,8 +134,9 @@ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions):
   })
 
   /** 构建公共请求参数 */
-  function buildParams(messages: Message[], completeOptions?: LLMCompleteOptions) {
+  async function buildParams(messages: Message[], completeOptions?: LLMCompleteOptions) {
     const normalizedMessages = mergeConsecutiveSystemMessages(messages)
+    const allowMultimodal = isStepFun37Flash(options)
     return {
       model: options.model,
       max_tokens: completeOptions?.maxTokens ?? options.maxOutputTokens ?? 4096,
@@ -67,9 +153,9 @@ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions):
             })),
           }
         : {}),
-      messages: normalizedMessages.map((m) => ({
+      messages: await Promise.all(normalizedMessages.map(async (m) => ({
         role: m.role as 'system' | 'user' | 'assistant' | 'tool',
-        content: m.content,
+        content: await toOpenAIContent(m, allowMultimodal, options),
         ...(m.role === 'tool' && m.toolCallId ? { tool_call_id: m.toolCallId } : {}),
         ...(m.role === 'assistant' && m.reasoningContent
           ? { reasoning_content: m.reasoningContent }
@@ -86,7 +172,7 @@ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions):
               })),
             }
           : {}),
-      })),
+      }))),
     }
   }
 
@@ -95,7 +181,7 @@ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions):
     async complete(messages: Message[], completeOptions?: LLMCompleteOptions): Promise<LLMResult> {
       const response = await client.chat.completions.create(
         {
-          ...buildParams(messages, completeOptions),
+          ...(await buildParams(messages, completeOptions)),
           ...(options.extraBody ?? {}),
           stream: false,
         } as Parameters<typeof client.chat.completions.create>[0],
@@ -131,7 +217,7 @@ export function createOpenAICompatibleAdapter(options: OpenAICompatibleOptions):
     async *stream(messages: Message[], completeOptions?: LLMCompleteOptions) {
       const stream = await client.chat.completions.create(
         {
-          ...buildParams(messages, completeOptions),
+          ...(await buildParams(messages, completeOptions)),
           ...(options.extraBody ?? {}),
           stream: true,
         } as Parameters<typeof client.chat.completions.create>[0],
diff --git a/packages/session/src/context-utils.ts b/packages/session/src/context-utils.ts
index b7eb86b..f017770 100644
--- a/packages/session/src/context-utils.ts
+++ b/packages/session/src/context-utils.ts
@@ -45,6 +45,14 @@ export function removeIncompleteToolCallGroups(records: Message[]): Message[] {
   return result
 }
 
+function stripMultimodalParts(messages: Message[]): Message[] {
+  return messages.map((message) => {
+    if (!message.parts) return message
+    const { parts: _parts, ...rest } = message
+    return rest
+  })
+}
+
 /** 内置默认压缩提示词 */
 const BUILTIN_COMPRESS_PROMPT = `你是对话压缩助手。请将以下对话历史压缩为一段简洁的摘要，保留关键上下文信息。
 要求：
@@ -181,6 +189,7 @@ export async function assembleSessionContext(
   label?: string,
   sharedMemoryContext?: string,
   topologyContext?: string,
+  userParts?: Message['parts'],
 ): Promise<AssembleResult> {
   const prefixMessages: Message[] = []
   let insightConsumed = false
@@ -212,10 +221,15 @@ export async function assembleSessionContext(
   }
 
   const userTimestamp = new Date().toISOString()
-  const userMessage: Message = { role: 'user', content: userContent, timestamp: userTimestamp }
+  const userMessage: Message = {
+    role: 'user',
+    content: userContent,
+    timestamp: userTimestamp,
+    ...(userParts && userParts.length > 0 ? { parts: userParts } : {}),
+  }
 
   // 净化历史：移除中断/崩溃残留的不完整 tool call 组，保证送给 LLM 的 prompt 协议合法
-  const history = removeIncompleteToolCallGroups(await storage.listRecords(sessionId))
+  const history = stripMultimodalParts(removeIncompleteToolCallGroups(await storage.listRecords(sessionId)))
 
   // 估算全量 token 数
   const fullMessages = [...prefixMessages, ...history, userMessage]
diff --git a/packages/session/src/create-session.ts b/packages/session/src/create-session.ts
index 0fd5a55..304ebb0 100644
--- a/packages/session/src/create-session.ts
+++ b/packages/session/src/create-session.ts
@@ -1,5 +1,5 @@
 import { randomUUID } from 'node:crypto'
-import type { Session, MessageQueryOptions, SessionSendOptions } from './types/session-api.js'
+import type { Session, MessageQueryOptions, SessionInput, SessionSendOptions } from './types/session-api.js'
 import { SessionArchivedError } from './types/session-api.js'
 import type { SessionMeta, SessionMetaUpdate, ForkOptions } from './types/session.js'
 import type { Message } from './types/llm.js'
@@ -49,6 +49,14 @@ function attachTurnMetadata(records: Message[], turnId: string): Message[] {
   }))
 }
 
+function normalizeSessionInput(input: string | SessionInput): { text: string; parts?: Message['parts'] } {
+  if (typeof input === 'string') return { text: input }
+  return {
+    text: input.text,
+    ...(input.parts && input.parts.length > 0 ? { parts: input.parts } : {}),
+  }
+}
+
 /** 把 tool 执行结果序列化为 tool message content，对齐 OpenAI/Anthropic 标准（只含结果数据）。 */
 function serializeToolResultContent(result: ToolResultEnvelope['toolResults'][number]): string {
   if (!result.success) {
@@ -59,6 +67,14 @@ function serializeToolResultContent(result: ToolResultEnvelope['toolResults'][nu
   return JSON.stringify(result.data)
 }
 
+function stripMultimodalParts(records: Message[]): Message[] {
+  return records.map((record) => {
+    if (!record.parts) return record
+    const { parts: _parts, ...rest } = record
+    return rest
+  })
+}
+
 /** 为 toolResults continuation 组装固定上下文与历史。 */
 async function assembleSessionReplayContext(
   sessionId: string,
@@ -99,7 +115,7 @@ async function assembleSessionReplayContext(
   // 注意：此处刻意不调用 removeIncompleteToolCallGroups。
   // replay 路径会把"assistant(toolCalls) + 由 envelope 合成的 tool 消息"拼接成完整组，
   // 在加载阶段过早裁剪反而会把回灌目标删掉。完整组校验放在拼接后由调用方做。
-  const history = await storage.listRecords(sessionId)
+  const history = stripMultimodalParts(await storage.listRecords(sessionId))
   messages.push(...history)
   return { messages, insightConsumed }
 }
@@ -204,7 +220,7 @@ function buildSession(
       return currentMeta
     },
 
-    async send(content: string, sendOptions?: SessionSendOptions): Promise<SendResult> {
+    async send(input: string | SessionInput, sendOptions?: SessionSendOptions): Promise<SendResult> {
       if (currentMeta.status === 'archived') {
         throw new SessionArchivedError(currentMeta.id)
       }
@@ -213,6 +229,8 @@ function buildSession(
       }
       // pre-flight：已 abort 的 signal 立即抛出，不发起任何 LLM 请求
       sendOptions?.signal?.throwIfAborted()
+      const normalizedInput = normalizeSessionInput(input)
+      const content = normalizedInput.text
 
       // 组装上下文（自动压缩）
       const assembled = await assembleSessionContext(
@@ -221,6 +239,7 @@ function buildSession(
         currentMeta.label,
         sendOptions?.sharedMemoryContext,
         sendOptions?.topologyContext,
+        normalizedInput.parts,
       )
       persistAndApplyCompressionCache(assembled.compressionCache)
 
@@ -230,7 +249,12 @@ function buildSession(
       }
 
       let promptMessages = assembled.messages
-      let recordsToPersist: Message[] = [{ role: 'user', content, timestamp: assembled.userTimestamp }]
+      let recordsToPersist: Message[] = [{
+        role: 'user',
+        content,
+        timestamp: assembled.userTimestamp,
+        ...(normalizedInput.parts ? { parts: normalizedInput.parts } : {}),
+      }]
       const toolEnvelope = parseToolResultEnvelope(content)
       if (toolEnvelope) {
         const replayContext = await assembleSessionReplayContext(currentMeta.id, storage, currentMeta.label, sendOptions?.sharedMemoryContext, sendOptions?.topologyContext)
@@ -279,7 +303,7 @@ function buildSession(
       }
     },
 
-    stream(content: string, sendOptions?: SessionSendOptions): StreamResult {
+    stream(input: string | SessionInput, sendOptions?: SessionSendOptions): StreamResult {
       if (currentMeta.status === 'archived') {
         throw new SessionArchivedError(currentMeta.id)
       }
@@ -290,6 +314,8 @@ function buildSession(
       return createStreamResult(async (push) => {
         // pre-flight：已 abort 的 signal 立即让 result reject，processor 不进入下游
         sendOptions?.signal?.throwIfAborted()
+        const normalizedInput = normalizeSessionInput(input)
+        const content = normalizedInput.text
 
         // 组装上下文（自动压缩）
         const assembled = await assembleSessionContext(
@@ -298,6 +324,7 @@ function buildSession(
           currentMeta.label,
           sendOptions?.sharedMemoryContext,
           sendOptions?.topologyContext,
+          normalizedInput.parts,
         )
         persistAndApplyCompressionCache(assembled.compressionCache)
 
@@ -307,7 +334,12 @@ function buildSession(
         }
 
         let promptMessages = assembled.messages
-        let recordsToPersist: Message[] = [{ role: 'user', content, timestamp: assembled.userTimestamp }]
+        let recordsToPersist: Message[] = [{
+          role: 'user',
+          content,
+          timestamp: assembled.userTimestamp,
+          ...(normalizedInput.parts ? { parts: normalizedInput.parts } : {}),
+        }]
         const toolEnvelope = parseToolResultEnvelope(content)
         if (toolEnvelope) {
           const replayContext = await assembleSessionReplayContext(currentMeta.id, storage, currentMeta.label, sendOptions?.sharedMemoryContext, sendOptions?.topologyContext)
diff --git a/packages/session/src/index.ts b/packages/session/src/index.ts
index 1e545f7..382fd3a 100644
--- a/packages/session/src/index.ts
+++ b/packages/session/src/index.ts
@@ -2,10 +2,12 @@
 export type { SessionMeta, SessionMetaUpdate, SessionFilter, ForkOptions, ForkContextFn } from './types/session.js'
 export type { SessionStorage, ListRecordsOptions, CompressionCacheSnapshot } from './types/storage.js'
 export type {
-  Message, ToolCall, LLMCompleteOptions, LLMResult, LLMChunk, LLMAdapter,
+  Message, ContentPart, TextPart, ImagePart, VideoPart, FilePart, AudioPart, MediaSource,
+  ToolCall, LLMCompleteOptions, LLMResult, LLMChunk, LLMAdapter,
 } from './types/llm.js'
 export type {
   Session,
+  SessionInput,
   MessageQueryOptions,
   SessionSendOptions,
 } from './types/session-api.js'
diff --git a/packages/session/src/types/llm.ts b/packages/session/src/types/llm.ts
index 8871fea..7e45afa 100644
--- a/packages/session/src/types/llm.ts
+++ b/packages/session/src/types/llm.ts
@@ -2,6 +2,8 @@
 export interface Message {
   role: 'system' | 'user' | 'assistant' | 'tool'
   content: string
+  /** Provider 中性的多模态内容块。content 仍是文本投影；adapter 可按需使用 parts。 */
+  parts?: ContentPart[]
   /** 推理模型的思考内容（stepFun/DeepSeek 等），仅 role=assistant 时有效 */
   reasoningContent?: string
   /** assistant 发起的工具调用列表，仅 role=assistant 时有效 */
@@ -14,6 +16,65 @@ export interface Message {
   metadata?: Record<string, unknown>
 }
 
+export type ContentPart = TextPart | ImagePart | VideoPart | FilePart | AudioPart
+
+export interface TextPart {
+  kind: 'text'
+  text: string
+}
+
+export interface ImagePart {
+  kind: 'image'
+  source: MediaSource
+  detail?: 'low' | 'high' | 'auto'
+  altText?: string
+  filename?: string
+  mediaType?: string
+  sizeBytes?: number
+}
+
+export interface VideoPart {
+  kind: 'video'
+  source: MediaSource
+  filename?: string
+  mediaType?: string
+  durationSeconds?: number
+  sizeBytes?: number
+}
+
+export interface FilePart {
+  kind: 'file'
+  source: MediaSource
+  filename?: string
+  mediaType?: string
+  sizeBytes?: number
+  /** Parsed document text supplied by an upstream document extraction service. */
+  extraction?: DocumentExtraction
+}
+
+export interface DocumentExtraction {
+  provider: 'stepfun'
+  fileId: string
+  status?: 'processed' | 'success' | 'failed'
+  content?: string
+  contentChars?: number
+}
+
+export interface AudioPart {
+  kind: 'audio'
+  source: MediaSource
+  filename?: string
+  mediaType?: string
+  durationSeconds?: number
+  sizeBytes?: number
+}
+
+export type MediaSource =
+  | { type: 'url'; url: string }
+  | { type: 'data'; mediaType: string; data: string }
+  | { type: 'provider_file'; provider: string; fileId: string; uri?: string }
+  | { type: 'kitkit_file'; fileId: string; objectKey: string; backend: 'local' | 's3'; bucket?: string; url?: string }
+
 /** LLM 返回的工具调用请求 */
 export interface ToolCall {
   id: string
diff --git a/packages/session/src/types/session-api.ts b/packages/session/src/types/session-api.ts
index 7713b2a..bfb0299 100644
--- a/packages/session/src/types/session-api.ts
+++ b/packages/session/src/types/session-api.ts
@@ -1,5 +1,5 @@
 import type { SessionMeta, SessionMetaUpdate, ForkOptions } from './session.js'
-import type { Message, LLMAdapter, LLMCompleteOptions } from './llm.js'
+import type { ContentPart, Message, LLMAdapter, LLMCompleteOptions } from './llm.js'
 import type { SendResult, StreamResult } from './functions.js'
 
 
@@ -31,6 +31,12 @@ export interface SessionSendOptions {
   topologyContext?: string
 }
 
+/** Session.send / Session.stream 的对象输入。text 是持久文本投影，parts 是当前 turn 的多模态源数据。 */
+export interface SessionInput {
+  text: string
+  parts?: ContentPart[]
+}
+
 /** Session 错误：操作归档中的 Session */
 export class SessionArchivedError extends Error {
   constructor(sessionId: string) {
@@ -56,10 +62,10 @@ export interface Session {
   readonly meta: Readonly<SessionMeta>
 
   /** 发送一条消息：组装上下文 → 调 LLM → 存 L3（用户消息 + LLM 响应）→ 返回结果 */
-  send(content: string, options?: SessionSendOptions): Promise<SendResult>
+  send(input: string | SessionInput, options?: SessionSendOptions): Promise<SendResult>
 
   /** 流式发送：同 send() 但逐 chunk 输出，流结束后自动存 L3 */
-  stream(content: string, options?: SessionSendOptions): StreamResult
+  stream(input: string | SessionInput, options?: SessionSendOptions): StreamResult
 
   /** 读取 L3 对话记录 */
   messages(options?: MessageQueryOptions): Promise<Message[]>