From 498f5952602e676cef2bd5a70602fd540cdc8485 Mon Sep 17 00:00:00 2001 From: Richard Das Date: Tue, 2 Jun 2026 15:46:51 +0100 Subject: [PATCH] Add Realtime Reasoning API parity Includes Realtime Reasoning session and response-create types for reasoning effort and parallel tool calls while preserving existing Performance Realtime call sites. Decodes phased Realtime output for commentary and final answer items across response completion, output item, and conversation item events. Documents the current Realtime schema mapping and README examples, and removes obsolete Realtime GA/beta terminology. Adds focused encoding and decoding tests for the new wire shapes and compatibility behavior. Co-authored-by: Cursor --- Documentation/OpenAI/RealtimeSchemaMatrix.md | 92 +++++++++ README.md | 74 ++++++-- .../OpenAI/OpenAIRealtimeMessage.swift | 42 +++++ ...OpenAIRealtimeReasoningConfiguration.swift | 24 +++ ...penAIRealtimeReasoningResponseCreate.swift | 78 ++++++++ ...ealtimeReasoningSessionConfiguration.swift | 25 +++ .../OpenAI/OpenAIRealtimeSession.swift | 19 +- .../OpenAIRealtimeSessionConfiguration.swift | 30 ++- .../OpenAI/OpenAIRealtimeSessionUpdate.swift | 22 +++ Sources/AIProxy/OpenAI/OpenAIService.swift | 29 +++ .../OpenAIRealtimeMessageTests.swift | 175 ++++++++++++++++++ .../OpenAIRealtimeSessionEncodingTests.swift | 82 ++++++++ 12 files changed, 676 insertions(+), 16 deletions(-) create mode 100644 Documentation/OpenAI/RealtimeSchemaMatrix.md create mode 100644 Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningConfiguration.swift create mode 100644 Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningResponseCreate.swift create mode 100644 Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningSessionConfiguration.swift diff --git a/Documentation/OpenAI/RealtimeSchemaMatrix.md b/Documentation/OpenAI/RealtimeSchemaMatrix.md new file mode 100644 index 00000000..0c825182 --- /dev/null +++ b/Documentation/OpenAI/RealtimeSchemaMatrix.md @@ -0,0 +1,92 @@ +# Realtime API Schema Matrix + 
+This matrix maps the current OpenAI Realtime `session.update.session` and `response.create.response` +fields to AIProxySwift types and wire encoding behavior. + +Reference: https://developers.openai.com/api/reference/resources/realtime + +## Shared Realtime Session + +These fields are used by Performance Realtime models, such as `gpt-realtime-1.5`, and are also the +base session shape composed by Realtime Reasoning models. + +| Wire field | AIProxySwift API | Wire shape emitted | +| --- | --- | --- | +| `type` | `OpenAIRealtimeSessionConfiguration.type` | string | +| `include` | `OpenAIRealtimeSessionConfiguration.include` | string array | +| `model` | `OpenAIRealtimeSessionConfiguration.model` | string | +| `instructions` | `OpenAIRealtimeSessionConfiguration.instructions` | string | +| `max_output_tokens` | `OpenAIRealtimeSessionConfiguration.maxOutputTokens` | int or `"inf"` | +| `output_modalities` | `OpenAIRealtimeSessionConfiguration.outputModalities` | enum string array | +| `prompt` | `OpenAIRealtimeSessionConfiguration.prompt` | object (`id`, optional `variables`, optional `version`) | +| `tracing` | `OpenAIRealtimeSessionConfiguration.tracing` | string `"auto"` or object (`group_id`, `metadata`, `workflow_name`) | +| `truncation` | `OpenAIRealtimeSessionConfiguration.truncation` | string (`"auto"`/`"disabled"`) or retention-ratio object | +| `tools` | `OpenAIRealtimeSessionConfiguration.tools` | union array (`function`, `mcp`, `web_search`) | +| `tool_choice` | `OpenAIRealtimeSessionConfiguration.toolChoice` | string (`auto`/`none`/`required`) or typed selector object | +| `audio.input.format` | `OpenAIRealtimeSessionConfiguration.inputAudioFormat` | object (`type`, optional `rate`) | +| `audio.input.noise_reduction` | `OpenAIRealtimeSessionConfiguration.inputAudioNoiseReduction` | object (`type`) | +| `audio.input.transcription` | `OpenAIRealtimeSessionConfiguration.inputAudioTranscription` | object (`language`, `model`, `prompt`) | +| 
`audio.input.turn_detection` | `OpenAIRealtimeSessionConfiguration.turnDetection` | typed object union (`server_vad` / `semantic_vad`) | +| `audio.output.format` | `OpenAIRealtimeSessionConfiguration.outputAudioFormat` | object (`type`, optional `rate`) | +| `audio.output.speed` | `OpenAIRealtimeSessionConfiguration.speed` | number (range 0.25...1.5) | +| `audio.output.voice` | `OpenAIRealtimeSessionConfiguration.voice` | string or object (`id`) | + +## Realtime Reasoning Session + +Realtime Reasoning models, such as `gpt-realtime-2`, compose the shared session fields above and add +Reasoning-only fields to the same `session.update.session` object. + +| Wire field | AIProxySwift API | Wire shape emitted | +| --- | --- | --- | +| `reasoning` | `OpenAIRealtimeReasoningSessionConfiguration.reasoning` | object | +| `reasoning.effort` | `OpenAIRealtimeReasoningConfiguration.effort` | `minimal`, `low`, `medium`, `high`, or `xhigh` | +| `parallel_tool_calls` | `OpenAIRealtimeReasoningSessionConfiguration.parallelToolCalls` | boolean | + +## Shared `response.create` + +| Wire field | AIProxySwift API | Wire shape emitted | +| --- | --- | --- | +| `type` | `OpenAIRealtimeResponseCreate.type` | `"response.create"` | +| `event_id` | `OpenAIRealtimeResponseCreate.eventID` | optional string | +| `response.instructions` | `OpenAIRealtimeResponseCreate.Response.instructions` | optional string | +| `response.output_modalities` | `OpenAIRealtimeResponseCreate.Response.outputModalities` | optional enum string array | +| `response.tools` | `OpenAIRealtimeResponseCreate.Response.tools` | optional tool union array (`function`, `mcp`, `web_search`) | +| `response.tool_choice` | `OpenAIRealtimeResponseCreate.Response.toolChoice` | optional string/object union | + +## Realtime Reasoning `response.create` + +| Wire field | AIProxySwift API | Wire shape emitted | +| --- | --- | --- | +| `type` | `OpenAIRealtimeReasoningResponseCreate.type` | `"response.create"` | +| `event_id` | 
`OpenAIRealtimeReasoningResponseCreate.eventID` | optional string | +| `response.reasoning` | `OpenAIRealtimeReasoningResponseCreate.Response.reasoning` | object | +| `response.reasoning.effort` | `OpenAIRealtimeReasoningConfiguration.effort` | `minimal`, `low`, `medium`, `high`, or `xhigh` | +| `response.parallel_tool_calls` | `OpenAIRealtimeReasoningResponseCreate.Response.parallelToolCalls` | boolean | + +## Realtime Reasoning Output Phases + +Realtime Reasoning output can be split into commentary and final answer phases. + +| Wire field | AIProxySwift API | Wire shape decoded | +| --- | --- | --- | +| `response.output[].phase` | `OpenAIRealtimeResponseOutputItem.phase` | `commentary` or `final_answer` | +| `response.output_item.*.item.phase` | `OpenAIRealtimeResponseOutputItemAddedEvent.phase` / `OpenAIRealtimeResponseOutputItemDoneEvent.phase` | `commentary` or `final_answer` | +| `conversation.item.*.item.phase` | `OpenAIRealtimeConversationItemCreatedEvent.phase` | `commentary` or `final_answer` | + +## `conversation.item.create` + +Reference: https://platform.openai.com/docs/api-reference/realtime-client-events/conversation/item/create + +| Wire field | AIProxySwift API | Wire shape emitted | +| --- | --- | --- | +| `type` | `OpenAIRealtimeConversationItemCreate.type` | `"conversation.item.create"` | +| `item.type` | `OpenAIRealtimeConversationItemCreate.Item` | `"message"`, `"function_call"`, `"function_call_output"` | +| `item.role` | `OpenAIRealtimeConversationItemCreate.Item.role` | optional string for message items | +| `item.content[].type` | `OpenAIRealtimeConversationItemCreate.Item.Content.type` | `input_text`, `output_text`, `input_audio`, `item_reference`, `input_image` | +| `item.content[].text` | `OpenAIRealtimeConversationItemCreate.Item.Content.text` | optional string | +| `item.content[].audio` | `OpenAIRealtimeConversationItemCreate.Item.Content.audio` | optional string | +| `item.content[].item_id` | 
`OpenAIRealtimeConversationItemCreate.Item.Content.itemID` | optional string | +| `item.call_id` | `OpenAIRealtimeConversationItemCreate.Item.callID` | optional string | +| `item.name` | `OpenAIRealtimeConversationItemCreate.Item.name` | optional string | +| `item.arguments` | `OpenAIRealtimeConversationItemCreate.Item.arguments` | optional string | +| `item.output` | `OpenAIRealtimeConversationItemCreate.Item.output` | optional string | diff --git a/README.md b/README.md index 3aea6cbf..71b85480 100644 --- a/README.md +++ b/README.md @@ -1384,13 +1384,10 @@ final class RealtimeManager { inputAudioFormat: .pcm16, inputAudioTranscription: .init(model: "whisper-1"), instructions: "You are a tour guide of Yosemite national park", - maxResponseOutputTokens: .int(4096), - modalities: [.audio], + maxOutputTokens: .int(4096), + outputModalities: [.audio], outputAudioFormat: .pcm16, - temperature: 0.7, - turnDetection: .init( - type: .semanticVAD(eagerness: .medium) - ), + turnDetection: .semanticVAD(.init(eagerness: .medium)), voice: "shimmer" ) @@ -1449,14 +1446,15 @@ final class RealtimeManager { } ``` -#### General Availability (GA) Realtime migration notes +#### Current Realtime API notes -- OpenAI has announced Realtime beta (`OpenAI-Beta: realtime=v1`) deprecation and shutdown on 2026-05-07. -- For `response.create`, GA uses `output_modalities` (not `modalities`). -- The new `output_modalities` for OpenAI realtime GA (general availability) is as follows: +- For a field-by-field mapping of the Realtime wire shape to AIProxySwift types, see + [Realtime schema matrix](Documentation/OpenAI/RealtimeSchemaMatrix.md). +- For `response.create`, the current Realtime API uses `output_modalities` (not `modalities`). +- `output_modalities` accepts the following values: - `["audio"]` returns audio with transcript. - `["text"]` returns text only. -- For voice mode with built-in web search, use GA tool (`.webSearch`) and specify `.auto` for toolChoice to let the model decide when to use it.
+- For voice mode with built-in web search, use the `.webSearch` tool and specify `.auto` for `toolChoice` to let the model decide when to use it. ```swift let configuration = OpenAIRealtimeSessionConfiguration( @@ -1473,6 +1471,60 @@ let session = try await openAIService.realtimeSession( ) ``` +#### Realtime Reasoning models + +OpenAI's Realtime Reasoning models, such as `gpt-realtime-2`, use the same Realtime WebSocket +transport and shared session fields as Performance models like `gpt-realtime-1.5`, plus +Reasoning-only configuration for effort and parallel tool calls. + +```swift +let configuration = OpenAIRealtimeReasoningSessionConfiguration( + session: OpenAIRealtimeSessionConfiguration( + outputModalities: [.audio], + voice: .builtin("alloy"), + tools: [.webSearch(.init(searchContextSize: .medium))], + toolChoice: .auto + ), + reasoning: .init(effort: .low), + parallelToolCalls: true +) + +let session = try await openAIService.realtimeSession( + model: "gpt-realtime-2", + configuration: configuration, + logLevel: .info +) +``` + +You can also override Reasoning settings for a single response: + +```swift +await session.sendMessage( + OpenAIRealtimeReasoningResponseCreate( + response: .init( + base: .init( + instructions: "Use the lowest sufficient reasoning effort.", + outputModalities: [.audio] + ), + reasoning: .init(effort: .minimal), + parallelToolCalls: false + ) + ) +) +``` + +Realtime Reasoning responses can include phased output. Use `phase` to separate short commentary +from the final answer when the model emits both in a turn: + +```swift +for await message in session.receiver { + if case .responseDone(let event) = message { + let commentary = event.output?.filter { $0.phase == .commentary } + let finalAnswer = event.output?.filter { $0.phase == .finalAnswer } + } +} +``` + ### How to make a basic request using OpenAI's Responses API Note: there is also a streaming version of this snippet below. 
diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeMessage.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeMessage.swift index 55a77d0d..6f67ea7d 100644 --- a/Sources/AIProxy/OpenAI/OpenAIRealtimeMessage.swift +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeMessage.swift @@ -277,15 +277,46 @@ public struct OpenAIRealtimeInputAudioBufferDTMFEventReceivedEvent: Decodable, S } } +public enum OpenAIRealtimeResponsePhase: String, Decodable, Sendable { + case commentary + case finalAnswer = "final_answer" +} + +public struct OpenAIRealtimeResponseOutputItem: Decodable, Sendable { + public let id: String? + public let phase: OpenAIRealtimeResponsePhase? + public let content: [Content]? + + public var transcript: String? { + content?.first(where: { ($0.transcript?.isEmpty == false) })?.transcript + } + + private enum CodingKeys: String, CodingKey { + case id + case phase + case content + } +} + +extension OpenAIRealtimeResponseOutputItem { + public struct Content: Decodable, Sendable { + public let type: String? + public let text: String? + public let transcript: String? + } +} + public struct OpenAIRealtimeConversationItemCreatedEvent: Decodable, Sendable { public let itemID: String? public let previousItemID: String? public let role: String? + public let phase: OpenAIRealtimeResponsePhase? public let eventID: String? private struct ItemBody: Decodable { let id: String? let role: String? + let phase: OpenAIRealtimeResponsePhase? } private enum CodingKeys: String, CodingKey { @@ -302,6 +333,7 @@ public struct OpenAIRealtimeConversationItemCreatedEvent: Decodable, Sendable { self.itemID = item?.id ?? fallbackItemID self.previousItemID = try container.decodeIfPresent(String.self, forKey: .previousItemID) self.role = item?.role + self.phase = item?.phase self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID) } } @@ -325,10 +357,12 @@ public struct OpenAIRealtimeResponseOutputItemAddedEvent: Decodable, Sendable { public let responseID: String? 
public let itemID: String? public let outputIndex: Int? + public let phase: OpenAIRealtimeResponsePhase? public let eventID: String? private struct ItemBody: Decodable { let id: String? + let phase: OpenAIRealtimeResponsePhase? } private enum CodingKeys: String, CodingKey { @@ -346,6 +380,7 @@ public struct OpenAIRealtimeResponseOutputItemAddedEvent: Decodable, Sendable { let fallbackItemID = try container.decodeIfPresent(String.self, forKey: .itemID) self.itemID = item?.id ?? fallbackItemID self.outputIndex = container.decodeFlexibleIntIfPresent(forKey: .outputIndex) + self.phase = item?.phase self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID) } } @@ -354,6 +389,7 @@ public struct OpenAIRealtimeResponseOutputItemDoneEvent: Decodable, Sendable { public let responseID: String? public let itemID: String? public let outputIndex: Int? + public let phase: OpenAIRealtimeResponsePhase? public let transcript: String? public let eventID: String? @@ -362,6 +398,7 @@ public struct OpenAIRealtimeResponseOutputItemDoneEvent: Decodable, Sendable { let transcript: String? } let id: String? + let phase: OpenAIRealtimeResponsePhase? let content: [ContentBody]? } @@ -380,6 +417,7 @@ public struct OpenAIRealtimeResponseOutputItemDoneEvent: Decodable, Sendable { let fallbackItemID = try container.decodeIfPresent(String.self, forKey: .itemID) self.itemID = item?.id ?? fallbackItemID self.outputIndex = container.decodeFlexibleIntIfPresent(forKey: .outputIndex) + self.phase = item?.phase self.transcript = item?.content?.first(where: { ($0.transcript?.isEmpty == false) })?.transcript self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID) } @@ -473,6 +511,7 @@ public struct OpenAIRealtimeResponseDoneEvent: Decodable, Sendable { public let responseID: String? public let conversationID: String? public let status: String? + public let output: [OpenAIRealtimeResponseOutputItem]? public let usage: OpenAIRealtimeResponseUsage? 
public let eventID: String? @@ -480,12 +519,14 @@ public struct OpenAIRealtimeResponseDoneEvent: Decodable, Sendable { let id: String? let conversationID: String? let status: String? + let output: [OpenAIRealtimeResponseOutputItem]? let usage: OpenAIRealtimeResponseUsage? private enum CodingKeys: String, CodingKey { case id case conversationID = "conversation_id" case status + case output case usage } } @@ -503,6 +544,7 @@ public struct OpenAIRealtimeResponseDoneEvent: Decodable, Sendable { self.responseID = response?.id ?? fallbackResponseID self.conversationID = response?.conversationID self.status = response?.status + self.output = response?.output self.usage = response?.usage self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID) } diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningConfiguration.swift new file mode 100644 index 00000000..e049cb10 --- /dev/null +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningConfiguration.swift @@ -0,0 +1,24 @@ +// +// OpenAIRealtimeReasoningConfiguration.swift +// AIProxy +// + +/// Configuration for OpenAI Realtime Reasoning models such as `gpt-realtime-2`. +nonisolated public struct OpenAIRealtimeReasoningConfiguration: Encodable, Sendable { + /// Constrains effort on Realtime Reasoning models. + public let effort: Effort? + + public init(effort: Effort? 
= nil) { + self.effort = effort + } +} + +extension OpenAIRealtimeReasoningConfiguration { + nonisolated public enum Effort: String, Encodable, Sendable { + case minimal + case low + case medium + case high + case xhigh + } +} diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningResponseCreate.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningResponseCreate.swift new file mode 100644 index 00000000..60086893 --- /dev/null +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningResponseCreate.swift @@ -0,0 +1,78 @@ +// +// OpenAIRealtimeReasoningResponseCreate.swift +// AIProxy +// + +/// `response.create` for Realtime Reasoning models. +nonisolated public struct OpenAIRealtimeReasoningResponseCreate: Encodable { + public let type = "response.create" + public let eventID: String? + public let response: Response? + + private enum CodingKeys: String, CodingKey { + case type + case eventID = "event_id" + case response + } + + public init(eventID: String? = nil, response: Response? = nil) { + self.eventID = eventID + self.response = response + } +} + +extension OpenAIRealtimeReasoningResponseCreate { + nonisolated public struct Response: Encodable { + public let conversation: String? + public let instructions: String? + public let outputModalities: [OpenAIRealtimeSessionConfiguration.Modality]? + public let tools: [OpenAIRealtimeResponseCreate.Response.Tool]? + public let toolChoice: OpenAIRealtimeSessionConfiguration.ToolChoice? + public let reasoning: OpenAIRealtimeReasoningConfiguration? + public let parallelToolCalls: Bool? + + private enum CodingKeys: String, CodingKey { + case conversation + case instructions + case outputModalities = "output_modalities" + case tools + case toolChoice = "tool_choice" + case reasoning + case parallelToolCalls = "parallel_tool_calls" + } + + public init( + conversation: String? = nil, + instructions: String? = nil, + outputModalities: [OpenAIRealtimeSessionConfiguration.Modality]? 
= nil, + tools: [OpenAIRealtimeResponseCreate.Response.Tool]? = nil, + toolChoice: OpenAIRealtimeSessionConfiguration.ToolChoice? = nil, + reasoning: OpenAIRealtimeReasoningConfiguration? = nil, + parallelToolCalls: Bool? = nil + ) { + self.conversation = conversation + self.instructions = instructions + self.outputModalities = outputModalities + self.tools = tools + self.toolChoice = toolChoice + self.reasoning = reasoning + self.parallelToolCalls = parallelToolCalls + } + + public init( + base: OpenAIRealtimeResponseCreate.Response, + reasoning: OpenAIRealtimeReasoningConfiguration? = nil, + parallelToolCalls: Bool? = nil + ) { + self.init( + conversation: base.conversation, + instructions: base.instructions, + outputModalities: base.outputModalities, + tools: base.tools, + toolChoice: base.toolChoice, + reasoning: reasoning, + parallelToolCalls: parallelToolCalls + ) + } + } +} diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningSessionConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningSessionConfiguration.swift new file mode 100644 index 00000000..39025277 --- /dev/null +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningSessionConfiguration.swift @@ -0,0 +1,25 @@ +// +// OpenAIRealtimeReasoningSessionConfiguration.swift +// AIProxy +// + +/// Session configuration for Realtime Reasoning models. +/// +/// The Realtime API still expects one `session.update.session` object. This type composes +/// the shared Realtime session configuration with Reasoning-only fields and flattens them +/// into that single wire object when encoded. +nonisolated public struct OpenAIRealtimeReasoningSessionConfiguration: Encodable, Sendable { + public let session: OpenAIRealtimeSessionConfiguration + public let reasoning: OpenAIRealtimeReasoningConfiguration? + public let parallelToolCalls: Bool? + + public init( + session: OpenAIRealtimeSessionConfiguration, + reasoning: OpenAIRealtimeReasoningConfiguration? = nil, + parallelToolCalls: Bool? 
= nil + ) { + self.session = session + self.reasoning = reasoning + self.parallelToolCalls = parallelToolCalls + } +} diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeSession.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeSession.swift index 27f33e9c..eb699a14 100644 --- a/Sources/AIProxy/OpenAI/OpenAIRealtimeSession.swift +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeSession.swift @@ -17,6 +17,7 @@ nonisolated private let kWebsocketDisconnectedEarlyThreshold: TimeInterval = 3 private var continuation: AsyncStream.Continuation? private let setupTime = Date() let sessionConfiguration: OpenAIRealtimeSessionConfiguration + private let initialSessionUpdate: OpenAIRealtimeSessionUpdate init( webSocketTask: URLSessionWebSocketTask, @@ -24,9 +25,25 @@ nonisolated private let kWebsocketDisconnectedEarlyThreshold: TimeInterval = 3 ) { self.webSocketTask = webSocketTask self.sessionConfiguration = sessionConfiguration + self.initialSessionUpdate = OpenAIRealtimeSessionUpdate(session: sessionConfiguration) Task { - await self.sendMessage(OpenAIRealtimeSessionUpdate(session: self.sessionConfiguration)) + await self.sendMessage(self.initialSessionUpdate) + } + self.webSocketTask.resume() + self.receiveMessage() + } + + init( + webSocketTask: URLSessionWebSocketTask, + sessionConfiguration: OpenAIRealtimeReasoningSessionConfiguration + ) { + self.webSocketTask = webSocketTask + self.sessionConfiguration = sessionConfiguration.session + self.initialSessionUpdate = OpenAIRealtimeSessionUpdate(session: sessionConfiguration) + + Task { + await self.sendMessage(self.initialSessionUpdate) } self.webSocketTask.resume() self.receiveMessage() diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift index 4502773e..e78d7aad 100644 --- a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift @@ -45,7 +45,7 @@ nonisolated public 
struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl outputModalities: [OpenAIRealtimeSessionConfiguration.Modality]? = nil, outputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil, speed: Float? = 1.0, - temperature: Double? = nil, // Deprecated in realtime GA + temperature: Double? = nil, // Deprecated in the current Realtime API tools: [Tool]? = nil, toolChoice: ToolChoice? = nil, turnDetection: TurnDetection? = nil, @@ -549,7 +549,7 @@ extension OpenAIRealtimeSessionConfiguration { } -// MARK: - Legacy fixes for pre-GA callsites +// MARK: - Legacy callsite compatibility extension OpenAIRealtimeSessionConfiguration { public typealias MaxResponseOutputTokens = MaxOutputTokens } @@ -561,7 +561,7 @@ extension OpenAIRealtimeSessionConfiguration.Voice: ExpressibleByStringLiteral { } extension OpenAIRealtimeSessionConfiguration.TurnDetection { - /// Pre-GA initializer kept for source compatibility with call sites that + /// Legacy initializer kept for source compatibility with call sites that /// build `TurnDetection(type: .semanticVAD(eagerness: ...))`. public init(type: DetectionType) { switch type { @@ -639,8 +639,14 @@ private struct OpenAIRealtimeSessionConfigurationWire: Encodable, Sendable { let prompt: OpenAIRealtimeSessionConfiguration.Prompt? let tracing: OpenAIRealtimeSessionConfiguration.Tracing? let truncation: OpenAIRealtimeSessionConfiguration.Truncation? + let reasoning: OpenAIRealtimeReasoningConfiguration? + let parallelToolCalls: Bool? - init(_ configuration: OpenAIRealtimeSessionConfiguration) { + init( + _ configuration: OpenAIRealtimeSessionConfiguration, + reasoning: OpenAIRealtimeReasoningConfiguration? = nil, + parallelToolCalls: Bool? 
= nil + ) { self.include = configuration.include self.type = configuration.type self.inputAudioFormat = configuration.inputAudioFormat @@ -659,6 +665,8 @@ private struct OpenAIRealtimeSessionConfigurationWire: Encodable, Sendable { self.prompt = configuration.prompt self.tracing = configuration.tracing self.truncation = configuration.truncation + self.reasoning = reasoning + self.parallelToolCalls = parallelToolCalls } private enum CodingKeys: String, CodingKey { @@ -670,6 +678,8 @@ private struct OpenAIRealtimeSessionConfigurationWire: Encodable, Sendable { case model case outputModalities = "output_modalities" case prompt + case reasoning + case parallelToolCalls = "parallel_tool_calls" case tracing case truncation case tools @@ -722,6 +732,8 @@ private struct OpenAIRealtimeSessionConfigurationWire: Encodable, Sendable { try container.encodeIfPresent(model, forKey: .model) try container.encodeIfPresent(outputModalities, forKey: .outputModalities) try container.encodeIfPresent(prompt, forKey: .prompt) + try container.encodeIfPresent(reasoning, forKey: .reasoning) + try container.encodeIfPresent(parallelToolCalls, forKey: .parallelToolCalls) try container.encodeIfPresent(tracing, forKey: .tracing) try container.encodeIfPresent(truncation, forKey: .truncation) try container.encodeIfPresent(tools, forKey: .tools) @@ -775,3 +787,13 @@ extension OpenAIRealtimeSessionConfiguration { try OpenAIRealtimeSessionConfigurationWire(self).encode(to: encoder) } } + +extension OpenAIRealtimeReasoningSessionConfiguration { + public func encode(to encoder: Encoder) throws { + try OpenAIRealtimeSessionConfigurationWire( + session, + reasoning: reasoning, + parallelToolCalls: parallelToolCalls + ).encode(to: encoder) + } +} diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionUpdate.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionUpdate.swift index c22121dc..8134bc0f 100644 --- a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionUpdate.swift +++ 
b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionUpdate.swift @@ -8,6 +8,7 @@ nonisolated public struct OpenAIRealtimeSessionUpdate: Encodable { /// Session configuration to update public let session: OpenAIRealtimeSessionConfiguration + private let reasoningSession: OpenAIRealtimeReasoningSessionConfiguration? /// The event type, must be "session.update". public let type = "session.update" @@ -24,5 +25,26 @@ nonisolated public struct OpenAIRealtimeSessionUpdate: Encodable { ) { self.eventId = eventId self.session = session + self.reasoningSession = nil + } + + public init( + eventId: String? = nil, + session: OpenAIRealtimeReasoningSessionConfiguration + ) { + self.eventId = eventId + self.session = session.session + self.reasoningSession = session + } + + public func encode(to encoder: Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + try container.encodeIfPresent(eventId, forKey: .eventId) + if let reasoningSession { + try container.encode(reasoningSession, forKey: .session) + } else { + try container.encode(session, forKey: .session) + } + try container.encode(type, forKey: .type) } } diff --git a/Sources/AIProxy/OpenAI/OpenAIService.swift b/Sources/AIProxy/OpenAI/OpenAIService.swift index e7adfb8e..5184ad91 100644 --- a/Sources/AIProxy/OpenAI/OpenAIService.swift +++ b/Sources/AIProxy/OpenAI/OpenAIService.swift @@ -285,6 +285,35 @@ import Foundation ) } + /// Starts a realtime session for Realtime Reasoning models such as `gpt-realtime-2`. + /// + /// This uses the same Realtime WebSocket transport as Performance models, but sends + /// Reasoning-only session fields such as `reasoning` and `parallel_tool_calls` in + /// the initial `session.update`. + /// + /// - Parameters: + /// - model: The Realtime Reasoning model to use, for example `gpt-realtime-2`. + /// - configuration: The Reasoning session configuration object. + /// - logLevel: The threshold level at which this library begins emitting log messages.
+ /// + /// - Returns: A realtime session manager that the caller can send and receive messages with. + public func realtimeSession( + model: String, + configuration: OpenAIRealtimeReasoningSessionConfiguration, + logLevel: AIProxyLogLevel + ) async throws -> OpenAIRealtimeSession { + AIProxyLogLevel.callerDesiredLogLevel = logLevel + let request = try await self.requestBuilder.plainGET( + path: "/v1/realtime?model=\(model)", + secondsToWait: 60, + additionalHeaders: [:] + ) + return OpenAIRealtimeSession( + webSocketTask: self.serviceNetworker.urlSession.webSocketTask(with: request), + sessionConfiguration: configuration + ) + } + /// Uploads a file to OpenAI for use in a future tool call /// https://platform.openai.com/docs/api-reference/files/create /// diff --git a/Tests/AIProxyTests/OpenAIRealtimeMessageTests.swift b/Tests/AIProxyTests/OpenAIRealtimeMessageTests.swift index d5990071..016a653d 100644 --- a/Tests/AIProxyTests/OpenAIRealtimeMessageTests.swift +++ b/Tests/AIProxyTests/OpenAIRealtimeMessageTests.swift @@ -214,6 +214,181 @@ struct OpenAIRealtimeMessageTests { #expect(payload.usage?.totalTokens == 225) } + @Test + func testResponseDoneDecodesPhasedOutput() throws { + let event = try decode( + #""" + { + "type": "response.done", + "event_id": "event_40", + "response": { + "id": "resp_40", + "conversation_id": "conv_40", + "status": "completed", + "output": [ + { + "id": "msg_commentary", + "phase": "commentary", + "content": [ + { + "type": "output_audio", + "transcript": "I'll check that now." + } + ] + }, + { + "id": "msg_final", + "phase": "final_answer", + "content": [ + { + "type": "output_audio", + "transcript": "The appointment is confirmed." 
+ } + ] + } + ] + } + } + """# + ) + + guard case .responseDone(let payload) = event else { + Issue.record("Expected responseDone") + return + } + #expect(payload.output?.count == 2) + #expect(payload.output?.first?.phase == .commentary) + #expect(payload.output?.first?.transcript == "I'll check that now.") + #expect(payload.output?.last?.phase == .finalAnswer) + #expect(payload.output?.last?.transcript == "The appointment is confirmed.") + } + + @Test + func testResponseOutputItemDoneDecodesPhase() throws { + let event = try decode( + #""" + { + "type": "response.output_item.done", + "event_id": "event_41", + "response_id": "resp_41", + "output_index": 0, + "item": { + "id": "msg_41", + "phase": "final_answer", + "content": [ + { + "type": "output_audio", + "transcript": "Done." + } + ] + } + } + """# + ) + + guard case .responseOutputItemDone(let payload) = event else { + Issue.record("Expected responseOutputItemDone") + return + } + #expect(payload.itemID == "msg_41") + #expect(payload.phase == .finalAnswer) + #expect(payload.transcript == "Done.") + } + + @Test + func testResponseOutputItemAddedDecodesPhase() throws { + let event = try decode( + #""" + { + "type": "response.output_item.added", + "event_id": "event_42", + "response_id": "resp_42", + "output_index": 0, + "item": { + "id": "msg_42", + "phase": "commentary" + } + } + """# + ) + + guard case .responseOutputItemAdded(let payload) = event else { + Issue.record("Expected responseOutputItemAdded") + return + } + #expect(payload.itemID == "msg_42") + #expect(payload.phase == .commentary) + } + + @Test + func testConversationItemAddedDecodesPhase() throws { + let event = try decode( + #""" + { + "type": "conversation.item.added", + "event_id": "event_43", + "item": { + "id": "msg_43", + "role": "assistant", + "phase": "final_answer" + } + } + """# + ) + + guard case .conversationItemAdded(let payload) = event else { + Issue.record("Expected conversationItemAdded") + return + } + #expect(payload.itemID == 
"msg_43") + #expect(payload.role == "assistant") + #expect(payload.phase == .finalAnswer) + } + + @Test + func testConversationItemCreatedAndDoneDecodePhase() throws { + let created = try decode( + #""" + { + "type": "conversation.item.created", + "event_id": "event_44", + "item": { + "id": "msg_44", + "role": "assistant", + "phase": "commentary" + } + } + """# + ) + let done = try decode( + #""" + { + "type": "conversation.item.done", + "event_id": "event_45", + "item": { + "id": "msg_45", + "role": "assistant", + "phase": "final_answer" + } + } + """# + ) + + guard case .conversationItemCreated(let createdPayload) = created else { + Issue.record("Expected conversationItemCreated") + return + } + #expect(createdPayload.itemID == "msg_44") + #expect(createdPayload.phase == .commentary) + + guard case .conversationItemDone(let donePayload) = done else { + Issue.record("Expected conversationItemDone") + return + } + #expect(donePayload.itemID == "msg_45") + #expect(donePayload.phase == .finalAnswer) + } + @Test func testInputAudioTranscriptionDeltaLogprobsAreDecodable() throws { let event = try decode( diff --git a/Tests/AIProxyTests/OpenAIRealtimeSessionEncodingTests.swift b/Tests/AIProxyTests/OpenAIRealtimeSessionEncodingTests.swift index 7d3b8d1e..e4a6eadc 100644 --- a/Tests/AIProxyTests/OpenAIRealtimeSessionEncodingTests.swift +++ b/Tests/AIProxyTests/OpenAIRealtimeSessionEncodingTests.swift @@ -16,6 +16,17 @@ struct OpenAIRealtimeSessionEncodingTests { return e }() + @AIProxyActor + private func compilePerformanceRealtimeSessionCall( + service: OpenAIService + ) async throws { + _ = try await service.realtimeSession( + model: "gpt-realtime-1.5", + configuration: .init(), + logLevel: .debug + ) + } + @Test func sessionUpdateEncodesNestedAudioAndOutputModalities() throws { let update = OpenAIRealtimeSessionUpdate( @@ -38,6 +49,8 @@ struct OpenAIRealtimeSessionEncodingTests { #expect(session["output_modalities"] as? 
[String] == ["audio"]) #expect(session["modalities"] == nil) #expect(session["max_response_output_tokens"] == nil) + #expect(session["reasoning"] == nil) + #expect(session["parallel_tool_calls"] == nil) let audio = session["audio"] as! [String: Any] let input = audio["input"] as! [String: Any] let inputFormat = input["format"] as! [String: Any] @@ -45,6 +58,48 @@ struct OpenAIRealtimeSessionEncodingTests { #expect(inputFormat["rate"] as? Int == 24000) } + @Test + func sessionUpdateAcceptsInlineDefaultPerformanceConfiguration() throws { + let update = OpenAIRealtimeSessionUpdate(session: .init()) + let encoded = try encoder.encode(update) + let root = try Self.jsonObject(encoded) as! [String: Any] + let session = root["session"] as! [String: Any] + + #expect(session["type"] as? String == "realtime") + #expect(session["reasoning"] == nil) + #expect(session["parallel_tool_calls"] == nil) + } + + @Test + func reasoningSessionUpdateMergesBaseAndReasoningFields() throws { + let update = OpenAIRealtimeSessionUpdate( + session: OpenAIRealtimeReasoningSessionConfiguration( + session: OpenAIRealtimeSessionConfiguration( + inputAudioFormat: .pcm16, + instructions: "Solve carefully.", + outputModalities: [.audio], + voice: .builtin("alloy") + ), + reasoning: .init(effort: .low), + parallelToolCalls: true + ) + ) + let encoded = try encoder.encode(update) + let root = try Self.jsonObject(encoded) as! [String: Any] + #expect(root["type"] as? String == "session.update") + let session = root["session"] as! [String: Any] + #expect(session["instructions"] as? String == "Solve carefully.") + #expect(session["output_modalities"] as? [String] == ["audio"]) + #expect(session["parallel_tool_calls"] as? Bool == true) + let reasoning = session["reasoning"] as! [String: Any] + #expect(reasoning["effort"] as? String == "low") + let audio = session["audio"] as! [String: Any] + let inputFormat = (audio["input"] as! [String: Any])["format"] as! [String: Any] + #expect(inputFormat["type"] as? 
String == "audio/pcm") + let output = audio["output"] as! [String: Any] + #expect(output["voice"] as? String == "alloy") + } + @Test func sessionUpdateEncodesG711AudioFormatObjects() throws { let update = OpenAIRealtimeSessionUpdate( @@ -153,6 +208,33 @@ struct OpenAIRealtimeSessionEncodingTests { #expect(response["modalities"] == nil) } + @Test + func reasoningResponseCreateEncodesReasoning() throws { + let event = OpenAIRealtimeReasoningResponseCreate( + eventID: "evt_reasoning", + response: .init( + base: .init( + instructions: "Use the lowest sufficient reasoning effort.", + outputModalities: [.audio], + toolChoice: .auto + ), + reasoning: .init(effort: .minimal), + parallelToolCalls: false + ) + ) + let encoded = try encoder.encode(event) + let root = try Self.jsonObject(encoded) as! [String: Any] + #expect(root["type"] as? String == "response.create") + #expect(root["event_id"] as? String == "evt_reasoning") + let response = root["response"] as! [String: Any] + #expect(response["instructions"] as? String == "Use the lowest sufficient reasoning effort.") + #expect(response["output_modalities"] as? [String] == ["audio"]) + #expect(response["tool_choice"] as? String == "auto") + #expect(response["parallel_tool_calls"] as? Bool == false) + let reasoning = response["reasoning"] as! [String: Any] + #expect(reasoning["effort"] as? String == "minimal") + } + @Test func responseCreateToolChoiceMCPEncodesObjectShape() throws { let event = OpenAIRealtimeResponseCreate(