diff --git a/Documentation/OpenAI/RealtimeSchemaMatrix.md b/Documentation/OpenAI/RealtimeSchemaMatrix.md new file mode 100644 index 00000000..0c825182 --- /dev/null +++ b/Documentation/OpenAI/RealtimeSchemaMatrix.md @@ -0,0 +1,92 @@ +# Realtime API Schema Matrix + +This matrix maps the current OpenAI Realtime `session.update.session` and `response.create.response` +fields to AIProxySwift types and wire encoding behavior. + +Reference: https://developers.openai.com/api/reference/resources/realtime + +## Shared Realtime Session + +These fields are used by Performance Realtime models, such as `gpt-realtime-1.5`, and are also the +base session shape composed by Realtime Reasoning models. + +| Wire field | AIProxySwift API | Wire shape emitted | +| --- | --- | --- | +| `type` | `OpenAIRealtimeSessionConfiguration.type` | string | +| `include` | `OpenAIRealtimeSessionConfiguration.include` | string array | +| `model` | `OpenAIRealtimeSessionConfiguration.model` | string | +| `instructions` | `OpenAIRealtimeSessionConfiguration.instructions` | string | +| `max_output_tokens` | `OpenAIRealtimeSessionConfiguration.maxOutputTokens` | int or `"inf"` | +| `output_modalities` | `OpenAIRealtimeSessionConfiguration.outputModalities` | enum string array | +| `prompt` | `OpenAIRealtimeSessionConfiguration.prompt` | object (`id`, optional `variables`, optional `version`) | +| `tracing` | `OpenAIRealtimeSessionConfiguration.tracing` | string `"auto"` or object (`group_id`, `metadata`, `workflow_name`) | +| `truncation` | `OpenAIRealtimeSessionConfiguration.truncation` | string (`"auto"`/`"disabled"`) or retention-ratio object | +| `tools` | `OpenAIRealtimeSessionConfiguration.tools` | union array (`function`, `mcp`, `web_search`) | +| `tool_choice` | `OpenAIRealtimeSessionConfiguration.toolChoice` | string (`auto`/`none`/`required`) or typed selector object | +| `audio.input.format` | `OpenAIRealtimeSessionConfiguration.inputAudioFormat` | object (`type`, optional `rate`) | +| 
`audio.input.noise_reduction` | `OpenAIRealtimeSessionConfiguration.inputAudioNoiseReduction` | object (`type`) | +| `audio.input.transcription` | `OpenAIRealtimeSessionConfiguration.inputAudioTranscription` | object (`language`, `model`, `prompt`) | +| `audio.input.turn_detection` | `OpenAIRealtimeSessionConfiguration.turnDetection` | typed object union (`server_vad` / `semantic_vad`) | +| `audio.output.format` | `OpenAIRealtimeSessionConfiguration.outputAudioFormat` | object (`type`, optional `rate`) | +| `audio.output.speed` | `OpenAIRealtimeSessionConfiguration.speed` | number (range 0.25...1.5) | +| `audio.output.voice` | `OpenAIRealtimeSessionConfiguration.voice` | string or object (`id`) | + +## Realtime Reasoning Session + +Realtime Reasoning models, such as `gpt-realtime-2`, compose the shared session fields above and add +Reasoning-only fields to the same `session.update.session` object. + +| Wire field | AIProxySwift API | Wire shape emitted | +| --- | --- | --- | +| `reasoning` | `OpenAIRealtimeReasoningSessionConfiguration.reasoning` | object | +| `reasoning.effort` | `OpenAIRealtimeReasoningConfiguration.effort` | `minimal`, `low`, `medium`, `high`, or `xhigh` | +| `parallel_tool_calls` | `OpenAIRealtimeReasoningSessionConfiguration.parallelToolCalls` | boolean | + +## Shared `response.create` + +| Wire field | AIProxySwift API | Wire shape emitted | +| --- | --- | --- | +| `type` | `OpenAIRealtimeResponseCreate.type` | `"response.create"` | +| `event_id` | `OpenAIRealtimeResponseCreate.eventID` | optional string | +| `response.instructions` | `OpenAIRealtimeResponseCreate.Response.instructions` | optional string | +| `response.output_modalities` | `OpenAIRealtimeResponseCreate.Response.outputModalities` | optional enum string array | +| `response.tools` | `OpenAIRealtimeResponseCreate.Response.tools` | optional tool union array (`function`, `mcp`, `web_search`) | +| `response.tool_choice` | `OpenAIRealtimeResponseCreate.Response.toolChoice` | 
optional string/object union | + +## Realtime Reasoning `response.create` + +| Wire field | AIProxySwift API | Wire shape emitted | +| --- | --- | --- | +| `type` | `OpenAIRealtimeReasoningResponseCreate.type` | `"response.create"` | +| `event_id` | `OpenAIRealtimeReasoningResponseCreate.eventID` | optional string | +| `response.reasoning` | `OpenAIRealtimeReasoningResponseCreate.Response.reasoning` | object | +| `response.reasoning.effort` | `OpenAIRealtimeReasoningConfiguration.effort` | `minimal`, `low`, `medium`, `high`, or `xhigh` | +| `response.parallel_tool_calls` | `OpenAIRealtimeReasoningResponseCreate.Response.parallelToolCalls` | boolean | + +## Realtime Reasoning Output Phases + +Realtime Reasoning output can be split into commentary and final answer phases. + +| Wire field | AIProxySwift API | Wire shape decoded | +| --- | --- | --- | +| `response.output[].phase` | `OpenAIRealtimeResponseOutputItem.phase` | `commentary` or `final_answer` | +| `response.output_item.*.item.phase` | `OpenAIRealtimeResponseOutputItemAddedEvent.phase` / `OpenAIRealtimeResponseOutputItemDoneEvent.phase` | `commentary` or `final_answer` | +| `conversation.item.*.item.phase` | `OpenAIRealtimeConversationItemCreatedEvent.phase` | `commentary` or `final_answer` | + +## `conversation.item.create` + +Reference: https://platform.openai.com/docs/api-reference/realtime-client-events/conversation/item/create + +| Wire field | AIProxySwift API | Wire shape emitted | +| --- | --- | --- | +| `type` | `OpenAIRealtimeConversationItemCreate.type` | `"conversation.item.create"` | +| `item.type` | `OpenAIRealtimeConversationItemCreate.Item` | `"message"`, `"function_call"`, `"function_call_output"` | +| `item.role` | `OpenAIRealtimeConversationItemCreate.Item.role` | optional string for message items | +| `item.content[].type` | `OpenAIRealtimeConversationItemCreate.Item.Content.type` | `input_text`, `output_text`, `input_audio`, `item_reference`, `input_image` | +| `item.content[].text` | 
`OpenAIRealtimeConversationItemCreate.Item.Content.text` | optional string | +| `item.content[].audio` | `OpenAIRealtimeConversationItemCreate.Item.Content.audio` | optional string | +| `item.content[].item_id` | `OpenAIRealtimeConversationItemCreate.Item.Content.itemID` | optional string | +| `item.call_id` | `OpenAIRealtimeConversationItemCreate.Item.callID` | optional string | +| `item.name` | `OpenAIRealtimeConversationItemCreate.Item.name` | optional string | +| `item.arguments` | `OpenAIRealtimeConversationItemCreate.Item.arguments` | optional string | +| `item.output` | `OpenAIRealtimeConversationItemCreate.Item.output` | optional string | diff --git a/README.md b/README.md index 3aea6cbf..71b85480 100644 --- a/README.md +++ b/README.md @@ -1384,13 +1384,10 @@ final class RealtimeManager { inputAudioFormat: .pcm16, inputAudioTranscription: .init(model: "whisper-1"), instructions: "You are a tour guide of Yosemite national park", - maxResponseOutputTokens: .int(4096), - modalities: [.audio], + maxOutputTokens: .int(4096), + outputModalities: [.audio], outputAudioFormat: .pcm16, - temperature: 0.7, - turnDetection: .init( - type: .semanticVAD(eagerness: .medium) - ), + turnDetection: .semanticVAD(.init(eagerness: .medium)), voice: "shimmer" ) @@ -1449,14 +1446,15 @@ final class RealtimeManager { } ``` -#### General Availability (GA) Realtime migration notes +#### Current Realtime API notes -- OpenAI has announced Realtime beta (`OpenAI-Beta: realtime=v1`) deprecation and shutdown on 2026-05-07. -- For `response.create`, GA uses `output_modalities` (not `modalities`). -- The new `output_modalities` for OpenAI realtime GA (general availability) is as follows: +- For a field-by-field mapping of the Realtime wire shape to AIProxySwift types, see + [Realtime schema matrix](Documentation/OpenAI/RealtimeSchemaMatrix.md). +- For `response.create`, the current Realtime API uses `output_modalities` (not `modalities`). 
+- Supported `output_modalities` values: - `["audio"]` returns audio with transcript. - `["text"]` returns text only. -- For voice mode with built-in web search, use GA tool (`.webSearch`) and specify `.auto` for toolChoice to let the model decide when to use it. +- For voice mode with built-in web search, use the `.webSearch` tool and specify `.auto` for `toolChoice` to let the model decide when to use it. ```swift let configuration = OpenAIRealtimeSessionConfiguration( @@ -1473,6 +1471,60 @@ let session = try await openAIService.realtimeSession( ) ``` +#### Realtime Reasoning models + +OpenAI's Realtime Reasoning models, such as `gpt-realtime-2`, use the same Realtime WebSocket +transport and shared session fields as Performance models like `gpt-realtime-1.5`, plus +Reasoning-only configuration for effort and parallel tool calls. + +```swift +let configuration = OpenAIRealtimeReasoningSessionConfiguration( + session: OpenAIRealtimeSessionConfiguration( + outputModalities: [.audio], + tools: [.webSearch(.init(searchContextSize: .medium))], + toolChoice: .auto, + voice: .builtin("alloy") + ), + reasoning: .init(effort: .low), + parallelToolCalls: true +) + +let session = try await openAIService.realtimeSession( + model: "gpt-realtime-2", + configuration: configuration, + logLevel: .info +) +``` + +You can also override Reasoning settings for a single response: + +```swift +await session.sendMessage( + OpenAIRealtimeReasoningResponseCreate( + response: .init( + base: .init( + instructions: "Use the lowest sufficient reasoning effort.", + outputModalities: [.audio] + ), + reasoning: .init(effort: .minimal), + parallelToolCalls: false + ) + ) +) +``` + +Realtime Reasoning responses can include phased output. 
Use `phase` to separate short commentary +from the final answer when the model emits both in a turn: + +```swift +for await message in session.receiver { + if case .responseDone(let event) = message { + let commentary = event.output?.filter { $0.phase == .commentary } + let finalAnswer = event.output?.filter { $0.phase == .finalAnswer } + } +} +``` + ### How to make a basic request using OpenAI's Responses API Note: there is also a streaming version of this snippet below. diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeMessage.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeMessage.swift index 55a77d0d..6f67ea7d 100644 --- a/Sources/AIProxy/OpenAI/OpenAIRealtimeMessage.swift +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeMessage.swift @@ -277,15 +277,46 @@ public struct OpenAIRealtimeInputAudioBufferDTMFEventReceivedEvent: Decodable, S } } +public enum OpenAIRealtimeResponsePhase: String, Decodable, Sendable { + case commentary + case finalAnswer = "final_answer" +} + +public struct OpenAIRealtimeResponseOutputItem: Decodable, Sendable { + public let id: String? + public let phase: OpenAIRealtimeResponsePhase? + public let content: [Content]? + + public var transcript: String? { + content?.first(where: { ($0.transcript?.isEmpty == false) })?.transcript + } + + private enum CodingKeys: String, CodingKey { + case id + case phase + case content + } +} + +extension OpenAIRealtimeResponseOutputItem { + public struct Content: Decodable, Sendable { + public let type: String? + public let text: String? + public let transcript: String? + } +} + public struct OpenAIRealtimeConversationItemCreatedEvent: Decodable, Sendable { public let itemID: String? public let previousItemID: String? public let role: String? + public let phase: OpenAIRealtimeResponsePhase? public let eventID: String? private struct ItemBody: Decodable { let id: String? let role: String? + let phase: OpenAIRealtimeResponsePhase? 
} private enum CodingKeys: String, CodingKey { @@ -302,6 +333,7 @@ public struct OpenAIRealtimeConversationItemCreatedEvent: Decodable, Sendable { self.itemID = item?.id ?? fallbackItemID self.previousItemID = try container.decodeIfPresent(String.self, forKey: .previousItemID) self.role = item?.role + self.phase = item?.phase self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID) } } @@ -325,10 +357,12 @@ public struct OpenAIRealtimeResponseOutputItemAddedEvent: Decodable, Sendable { public let responseID: String? public let itemID: String? public let outputIndex: Int? + public let phase: OpenAIRealtimeResponsePhase? public let eventID: String? private struct ItemBody: Decodable { let id: String? + let phase: OpenAIRealtimeResponsePhase? } private enum CodingKeys: String, CodingKey { @@ -346,6 +380,7 @@ public struct OpenAIRealtimeResponseOutputItemAddedEvent: Decodable, Sendable { let fallbackItemID = try container.decodeIfPresent(String.self, forKey: .itemID) self.itemID = item?.id ?? fallbackItemID self.outputIndex = container.decodeFlexibleIntIfPresent(forKey: .outputIndex) + self.phase = item?.phase self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID) } } @@ -354,6 +389,7 @@ public struct OpenAIRealtimeResponseOutputItemDoneEvent: Decodable, Sendable { public let responseID: String? public let itemID: String? public let outputIndex: Int? + public let phase: OpenAIRealtimeResponsePhase? public let transcript: String? public let eventID: String? @@ -362,6 +398,7 @@ public struct OpenAIRealtimeResponseOutputItemDoneEvent: Decodable, Sendable { let transcript: String? } let id: String? + let phase: OpenAIRealtimeResponsePhase? let content: [ContentBody]? } @@ -380,6 +417,7 @@ public struct OpenAIRealtimeResponseOutputItemDoneEvent: Decodable, Sendable { let fallbackItemID = try container.decodeIfPresent(String.self, forKey: .itemID) self.itemID = item?.id ?? 
fallbackItemID self.outputIndex = container.decodeFlexibleIntIfPresent(forKey: .outputIndex) + self.phase = item?.phase self.transcript = item?.content?.first(where: { ($0.transcript?.isEmpty == false) })?.transcript self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID) } @@ -473,6 +511,7 @@ public struct OpenAIRealtimeResponseDoneEvent: Decodable, Sendable { public let responseID: String? public let conversationID: String? public let status: String? + public let output: [OpenAIRealtimeResponseOutputItem]? public let usage: OpenAIRealtimeResponseUsage? public let eventID: String? @@ -480,12 +519,14 @@ public struct OpenAIRealtimeResponseDoneEvent: Decodable, Sendable { let id: String? let conversationID: String? let status: String? + let output: [OpenAIRealtimeResponseOutputItem]? let usage: OpenAIRealtimeResponseUsage? private enum CodingKeys: String, CodingKey { case id case conversationID = "conversation_id" case status + case output case usage } } @@ -503,6 +544,7 @@ public struct OpenAIRealtimeResponseDoneEvent: Decodable, Sendable { self.responseID = response?.id ?? fallbackResponseID self.conversationID = response?.conversationID self.status = response?.status + self.output = response?.output self.usage = response?.usage self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID) } diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningConfiguration.swift new file mode 100644 index 00000000..e049cb10 --- /dev/null +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningConfiguration.swift @@ -0,0 +1,24 @@ +// +// OpenAIRealtimeReasoningConfiguration.swift +// AIProxy +// + +/// Configuration for OpenAI Realtime Reasoning models such as `gpt-realtime-2`. +nonisolated public struct OpenAIRealtimeReasoningConfiguration: Encodable, Sendable { + /// Constrains effort on Realtime Reasoning models. + public let effort: Effort? + + public init(effort: Effort? 
= nil) { + self.effort = effort + } +} + +extension OpenAIRealtimeReasoningConfiguration { + nonisolated public enum Effort: String, Encodable, Sendable { + case minimal + case low + case medium + case high + case xhigh + } +} diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningResponseCreate.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningResponseCreate.swift new file mode 100644 index 00000000..60086893 --- /dev/null +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningResponseCreate.swift @@ -0,0 +1,78 @@ +// +// OpenAIRealtimeReasoningResponseCreate.swift +// AIProxy +// + +/// `response.create` for Realtime Reasoning models. +nonisolated public struct OpenAIRealtimeReasoningResponseCreate: Encodable { + public let type = "response.create" + public let eventID: String? + public let response: Response? + + private enum CodingKeys: String, CodingKey { + case type + case eventID = "event_id" + case response + } + + public init(eventID: String? = nil, response: Response? = nil) { + self.eventID = eventID + self.response = response + } +} + +extension OpenAIRealtimeReasoningResponseCreate { + nonisolated public struct Response: Encodable { + public let conversation: String? + public let instructions: String? + public let outputModalities: [OpenAIRealtimeSessionConfiguration.Modality]? + public let tools: [OpenAIRealtimeResponseCreate.Response.Tool]? + public let toolChoice: OpenAIRealtimeSessionConfiguration.ToolChoice? + public let reasoning: OpenAIRealtimeReasoningConfiguration? + public let parallelToolCalls: Bool? + + private enum CodingKeys: String, CodingKey { + case conversation + case instructions + case outputModalities = "output_modalities" + case tools + case toolChoice = "tool_choice" + case reasoning + case parallelToolCalls = "parallel_tool_calls" + } + + public init( + conversation: String? = nil, + instructions: String? = nil, + outputModalities: [OpenAIRealtimeSessionConfiguration.Modality]? 
= nil, + tools: [OpenAIRealtimeResponseCreate.Response.Tool]? = nil, + toolChoice: OpenAIRealtimeSessionConfiguration.ToolChoice? = nil, + reasoning: OpenAIRealtimeReasoningConfiguration? = nil, + parallelToolCalls: Bool? = nil + ) { + self.conversation = conversation + self.instructions = instructions + self.outputModalities = outputModalities + self.tools = tools + self.toolChoice = toolChoice + self.reasoning = reasoning + self.parallelToolCalls = parallelToolCalls + } + + public init( + base: OpenAIRealtimeResponseCreate.Response, + reasoning: OpenAIRealtimeReasoningConfiguration? = nil, + parallelToolCalls: Bool? = nil + ) { + self.init( + conversation: base.conversation, + instructions: base.instructions, + outputModalities: base.outputModalities, + tools: base.tools, + toolChoice: base.toolChoice, + reasoning: reasoning, + parallelToolCalls: parallelToolCalls + ) + } + } +} diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningSessionConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningSessionConfiguration.swift new file mode 100644 index 00000000..39025277 --- /dev/null +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningSessionConfiguration.swift @@ -0,0 +1,25 @@ +// +// OpenAIRealtimeReasoningSessionConfiguration.swift +// AIProxy +// + +/// Session configuration for Realtime Reasoning models. +/// +/// The Realtime API still expects one `session.update.session` object. This type composes +/// the shared Realtime session configuration with Reasoning-only fields and flattens them +/// into that single wire object when encoded. +nonisolated public struct OpenAIRealtimeReasoningSessionConfiguration: Encodable, Sendable { + public let session: OpenAIRealtimeSessionConfiguration + public let reasoning: OpenAIRealtimeReasoningConfiguration? + public let parallelToolCalls: Bool? + + public init( + session: OpenAIRealtimeSessionConfiguration, + reasoning: OpenAIRealtimeReasoningConfiguration? = nil, + parallelToolCalls: Bool? 
= nil + ) { + self.session = session + self.reasoning = reasoning + self.parallelToolCalls = parallelToolCalls + } +} diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeSession.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeSession.swift index 27f33e9c..eb699a14 100644 --- a/Sources/AIProxy/OpenAI/OpenAIRealtimeSession.swift +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeSession.swift @@ -17,6 +17,7 @@ nonisolated private let kWebsocketDisconnectedEarlyThreshold: TimeInterval = 3 private var continuation: AsyncStream.Continuation? private let setupTime = Date() let sessionConfiguration: OpenAIRealtimeSessionConfiguration + private let initialSessionUpdate: OpenAIRealtimeSessionUpdate init( webSocketTask: URLSessionWebSocketTask, @@ -24,9 +25,25 @@ nonisolated private let kWebsocketDisconnectedEarlyThreshold: TimeInterval = 3 ) { self.webSocketTask = webSocketTask self.sessionConfiguration = sessionConfiguration + self.initialSessionUpdate = OpenAIRealtimeSessionUpdate(session: sessionConfiguration) Task { - await self.sendMessage(OpenAIRealtimeSessionUpdate(session: self.sessionConfiguration)) + await self.sendMessage(self.initialSessionUpdate) + } + self.webSocketTask.resume() + self.receiveMessage() + } + + init( + webSocketTask: URLSessionWebSocketTask, + sessionConfiguration: OpenAIRealtimeReasoningSessionConfiguration + ) { + self.webSocketTask = webSocketTask + self.sessionConfiguration = sessionConfiguration.session + self.initialSessionUpdate = OpenAIRealtimeSessionUpdate(session: sessionConfiguration) + + Task { + await self.sendMessage(self.initialSessionUpdate) } self.webSocketTask.resume() self.receiveMessage() diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift index 4502773e..e78d7aad 100644 --- a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift +++ b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionConfiguration.swift @@ -45,7 +45,7 @@ nonisolated public 
struct OpenAIRealtimeSessionConfiguration: Encodable, Sendabl outputModalities: [OpenAIRealtimeSessionConfiguration.Modality]? = nil, outputAudioFormat: OpenAIRealtimeSessionConfiguration.AudioFormat? = nil, speed: Float? = 1.0, - temperature: Double? = nil, // Deprecated in realtime GA + temperature: Double? = nil, // Deprecated in the current Realtime API tools: [Tool]? = nil, toolChoice: ToolChoice? = nil, turnDetection: TurnDetection? = nil, @@ -549,7 +549,7 @@ extension OpenAIRealtimeSessionConfiguration { } -// MARK: - Legacy fixes for pre-GA callsites +// MARK: - Legacy callsite compatibility extension OpenAIRealtimeSessionConfiguration { public typealias MaxResponseOutputTokens = MaxOutputTokens } @@ -561,7 +561,7 @@ extension OpenAIRealtimeSessionConfiguration.Voice: ExpressibleByStringLiteral { } extension OpenAIRealtimeSessionConfiguration.TurnDetection { - /// Pre-GA initializer kept for source compatibility with call sites that + /// Legacy initializer kept for source compatibility with call sites that /// build `TurnDetection(type: .semanticVAD(eagerness: ...))`. public init(type: DetectionType) { switch type { @@ -639,8 +639,14 @@ private struct OpenAIRealtimeSessionConfigurationWire: Encodable, Sendable { let prompt: OpenAIRealtimeSessionConfiguration.Prompt? let tracing: OpenAIRealtimeSessionConfiguration.Tracing? let truncation: OpenAIRealtimeSessionConfiguration.Truncation? + let reasoning: OpenAIRealtimeReasoningConfiguration? + let parallelToolCalls: Bool? - init(_ configuration: OpenAIRealtimeSessionConfiguration) { + init( + _ configuration: OpenAIRealtimeSessionConfiguration, + reasoning: OpenAIRealtimeReasoningConfiguration? = nil, + parallelToolCalls: Bool? 
= nil + ) { self.include = configuration.include self.type = configuration.type self.inputAudioFormat = configuration.inputAudioFormat @@ -659,6 +665,8 @@ private struct OpenAIRealtimeSessionConfigurationWire: Encodable, Sendable { self.prompt = configuration.prompt self.tracing = configuration.tracing self.truncation = configuration.truncation + self.reasoning = reasoning + self.parallelToolCalls = parallelToolCalls } private enum CodingKeys: String, CodingKey { @@ -670,6 +678,8 @@ private struct OpenAIRealtimeSessionConfigurationWire: Encodable, Sendable { case model case outputModalities = "output_modalities" case prompt + case reasoning + case parallelToolCalls = "parallel_tool_calls" case tracing case truncation case tools @@ -722,6 +732,8 @@ private struct OpenAIRealtimeSessionConfigurationWire: Encodable, Sendable { try container.encodeIfPresent(model, forKey: .model) try container.encodeIfPresent(outputModalities, forKey: .outputModalities) try container.encodeIfPresent(prompt, forKey: .prompt) + try container.encodeIfPresent(reasoning, forKey: .reasoning) + try container.encodeIfPresent(parallelToolCalls, forKey: .parallelToolCalls) try container.encodeIfPresent(tracing, forKey: .tracing) try container.encodeIfPresent(truncation, forKey: .truncation) try container.encodeIfPresent(tools, forKey: .tools) @@ -775,3 +787,13 @@ extension OpenAIRealtimeSessionConfiguration { try OpenAIRealtimeSessionConfigurationWire(self).encode(to: encoder) } } + +extension OpenAIRealtimeReasoningSessionConfiguration { + public func encode(to encoder: Encoder) throws { + try OpenAIRealtimeSessionConfigurationWire( + session, + reasoning: reasoning, + parallelToolCalls: parallelToolCalls + ).encode(to: encoder) + } +} diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionUpdate.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionUpdate.swift index c22121dc..8134bc0f 100644 --- a/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionUpdate.swift +++ 
b/Sources/AIProxy/OpenAI/OpenAIRealtimeSessionUpdate.swift @@ -8,6 +8,7 @@ nonisolated public struct OpenAIRealtimeSessionUpdate: Encodable { /// Session configuration to update public let session: OpenAIRealtimeSessionConfiguration + private let reasoningSession: OpenAIRealtimeReasoningSessionConfiguration? /// The event type, must be "session.update". public let type = "session.update" @@ -24,5 +25,26 @@ nonisolated public struct OpenAIRealtimeSessionUpdate: Encodable { ) { self.eventId = eventId self.session = session + self.reasoningSession = nil + } + + public init( + eventId: String? = nil, + session: OpenAIRealtimeReasoningSessionConfiguration + ) { + self.eventId = eventId + self.session = session.session + self.reasoningSession = session + } + + public func encode(to encoder: Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + try container.encodeIfPresent(eventId, forKey: .eventId) + if let reasoningSession { + try container.encode(reasoningSession, forKey: .session) + } else { + try container.encode(session, forKey: .session) + } + try container.encode(type, forKey: .type) } } diff --git a/Sources/AIProxy/OpenAI/OpenAIService.swift b/Sources/AIProxy/OpenAI/OpenAIService.swift index e7adfb8e..5184ad91 100644 --- a/Sources/AIProxy/OpenAI/OpenAIService.swift +++ b/Sources/AIProxy/OpenAI/OpenAIService.swift @@ -285,6 +285,35 @@ import Foundation ) } + /// Starts a realtime session for Realtime Reasoning models such as `gpt-realtime-2`. + /// + /// This uses the same Realtime WebSocket transport as Performance models, but sends + /// Reasoning-only session fields such as `reasoning` and `parallel_tool_calls` in + /// the initial `session.update`. + /// + /// - Parameters: + /// - model: The Realtime Reasoning model to use, for example `gpt-realtime-2`. + /// - configuration: The Reasoning session configuration object. + /// - logLevel: The threshold level at which this library begins emitting log messages. 
+ /// + /// - Returns: A realtime session manager that the caller can send and receive messages with. + public func realtimeSession( + model: String, + configuration: OpenAIRealtimeReasoningSessionConfiguration, + logLevel: AIProxyLogLevel + ) async throws -> OpenAIRealtimeSession { + AIProxyLogLevel.callerDesiredLogLevel = logLevel + let request = try await self.requestBuilder.plainGET( + path: "/v1/realtime?model=\(model)", + secondsToWait: 60, + additionalHeaders: [:] + ) + return OpenAIRealtimeSession( + webSocketTask: self.serviceNetworker.urlSession.webSocketTask(with: request), + sessionConfiguration: configuration + ) + } + /// Uploads a file to OpenAI for use in a future tool call /// https://platform.openai.com/docs/api-reference/files/create /// diff --git a/Tests/AIProxyTests/OpenAIRealtimeMessageTests.swift b/Tests/AIProxyTests/OpenAIRealtimeMessageTests.swift index d5990071..016a653d 100644 --- a/Tests/AIProxyTests/OpenAIRealtimeMessageTests.swift +++ b/Tests/AIProxyTests/OpenAIRealtimeMessageTests.swift @@ -214,6 +214,181 @@ struct OpenAIRealtimeMessageTests { #expect(payload.usage?.totalTokens == 225) } + @Test + func testResponseDoneDecodesPhasedOutput() throws { + let event = try decode( + #""" + { + "type": "response.done", + "event_id": "event_40", + "response": { + "id": "resp_40", + "conversation_id": "conv_40", + "status": "completed", + "output": [ + { + "id": "msg_commentary", + "phase": "commentary", + "content": [ + { + "type": "output_audio", + "transcript": "I'll check that now." + } + ] + }, + { + "id": "msg_final", + "phase": "final_answer", + "content": [ + { + "type": "output_audio", + "transcript": "The appointment is confirmed." 
+ } + ] + } + ] + } + } + """# + ) + + guard case .responseDone(let payload) = event else { + Issue.record("Expected responseDone") + return + } + #expect(payload.output?.count == 2) + #expect(payload.output?.first?.phase == .commentary) + #expect(payload.output?.first?.transcript == "I'll check that now.") + #expect(payload.output?.last?.phase == .finalAnswer) + #expect(payload.output?.last?.transcript == "The appointment is confirmed.") + } + + @Test + func testResponseOutputItemDoneDecodesPhase() throws { + let event = try decode( + #""" + { + "type": "response.output_item.done", + "event_id": "event_41", + "response_id": "resp_41", + "output_index": 0, + "item": { + "id": "msg_41", + "phase": "final_answer", + "content": [ + { + "type": "output_audio", + "transcript": "Done." + } + ] + } + } + """# + ) + + guard case .responseOutputItemDone(let payload) = event else { + Issue.record("Expected responseOutputItemDone") + return + } + #expect(payload.itemID == "msg_41") + #expect(payload.phase == .finalAnswer) + #expect(payload.transcript == "Done.") + } + + @Test + func testResponseOutputItemAddedDecodesPhase() throws { + let event = try decode( + #""" + { + "type": "response.output_item.added", + "event_id": "event_42", + "response_id": "resp_42", + "output_index": 0, + "item": { + "id": "msg_42", + "phase": "commentary" + } + } + """# + ) + + guard case .responseOutputItemAdded(let payload) = event else { + Issue.record("Expected responseOutputItemAdded") + return + } + #expect(payload.itemID == "msg_42") + #expect(payload.phase == .commentary) + } + + @Test + func testConversationItemAddedDecodesPhase() throws { + let event = try decode( + #""" + { + "type": "conversation.item.added", + "event_id": "event_43", + "item": { + "id": "msg_43", + "role": "assistant", + "phase": "final_answer" + } + } + """# + ) + + guard case .conversationItemAdded(let payload) = event else { + Issue.record("Expected conversationItemAdded") + return + } + #expect(payload.itemID == 
"msg_43") + #expect(payload.role == "assistant") + #expect(payload.phase == .finalAnswer) + } + + @Test + func testConversationItemCreatedAndDoneDecodePhase() throws { + let created = try decode( + #""" + { + "type": "conversation.item.created", + "event_id": "event_44", + "item": { + "id": "msg_44", + "role": "assistant", + "phase": "commentary" + } + } + """# + ) + let done = try decode( + #""" + { + "type": "conversation.item.done", + "event_id": "event_45", + "item": { + "id": "msg_45", + "role": "assistant", + "phase": "final_answer" + } + } + """# + ) + + guard case .conversationItemCreated(let createdPayload) = created else { + Issue.record("Expected conversationItemCreated") + return + } + #expect(createdPayload.itemID == "msg_44") + #expect(createdPayload.phase == .commentary) + + guard case .conversationItemDone(let donePayload) = done else { + Issue.record("Expected conversationItemDone") + return + } + #expect(donePayload.itemID == "msg_45") + #expect(donePayload.phase == .finalAnswer) + } + @Test func testInputAudioTranscriptionDeltaLogprobsAreDecodable() throws { let event = try decode( diff --git a/Tests/AIProxyTests/OpenAIRealtimeSessionEncodingTests.swift b/Tests/AIProxyTests/OpenAIRealtimeSessionEncodingTests.swift index 7d3b8d1e..e4a6eadc 100644 --- a/Tests/AIProxyTests/OpenAIRealtimeSessionEncodingTests.swift +++ b/Tests/AIProxyTests/OpenAIRealtimeSessionEncodingTests.swift @@ -16,6 +16,17 @@ struct OpenAIRealtimeSessionEncodingTests { return e }() + @AIProxyActor + private func compilePerformanceRealtimeSessionCall( + service: OpenAIService + ) async throws { + _ = try await service.realtimeSession( + model: "gpt-realtime-1.5", + configuration: .init(), + logLevel: .debug + ) + } + @Test func sessionUpdateEncodesNestedAudioAndOutputModalities() throws { let update = OpenAIRealtimeSessionUpdate( @@ -38,6 +49,8 @@ struct OpenAIRealtimeSessionEncodingTests { #expect(session["output_modalities"] as? 
[String] == ["audio"]) #expect(session["modalities"] == nil) #expect(session["max_response_output_tokens"] == nil) + #expect(session["reasoning"] == nil) + #expect(session["parallel_tool_calls"] == nil) let audio = session["audio"] as! [String: Any] let input = audio["input"] as! [String: Any] let inputFormat = input["format"] as! [String: Any] @@ -45,6 +58,48 @@ struct OpenAIRealtimeSessionEncodingTests { #expect(inputFormat["rate"] as? Int == 24000) } + @Test + func sessionUpdateAcceptsInlineDefaultPerformanceConfiguration() throws { + let update = OpenAIRealtimeSessionUpdate(session: .init()) + let encoded = try encoder.encode(update) + let root = try Self.jsonObject(encoded) as! [String: Any] + let session = root["session"] as! [String: Any] + + #expect(session["type"] as? String == "realtime") + #expect(session["reasoning"] == nil) + #expect(session["parallel_tool_calls"] == nil) + } + + @Test + func reasoningSessionUpdateMergesBaseAndReasoningFields() throws { + let update = OpenAIRealtimeSessionUpdate( + session: OpenAIRealtimeReasoningSessionConfiguration( + session: OpenAIRealtimeSessionConfiguration( + inputAudioFormat: .pcm16, + instructions: "Solve carefully.", + outputModalities: [.audio], + voice: .builtin("alloy") + ), + reasoning: .init(effort: .low), + parallelToolCalls: true + ) + ) + let encoded = try encoder.encode(update) + let root = try Self.jsonObject(encoded) as! [String: Any] + #expect(root["type"] as? String == "session.update") + let session = root["session"] as! [String: Any] + #expect(session["instructions"] as? String == "Solve carefully.") + #expect(session["output_modalities"] as? [String] == ["audio"]) + #expect(session["parallel_tool_calls"] as? Bool == true) + let reasoning = session["reasoning"] as! [String: Any] + #expect(reasoning["effort"] as? String == "low") + let audio = session["audio"] as! [String: Any] + let inputFormat = (audio["input"] as! [String: Any])["format"] as! [String: Any] + #expect(inputFormat["type"] as? 
String == "audio/pcm") + let output = audio["output"] as! [String: Any] + #expect(output["voice"] as? String == "alloy") + } + @Test func sessionUpdateEncodesG711AudioFormatObjects() throws { let update = OpenAIRealtimeSessionUpdate( @@ -153,6 +208,33 @@ struct OpenAIRealtimeSessionEncodingTests { #expect(response["modalities"] == nil) } + @Test + func reasoningResponseCreateEncodesReasoning() throws { + let event = OpenAIRealtimeReasoningResponseCreate( + eventID: "evt_reasoning", + response: .init( + base: .init( + instructions: "Use the lowest sufficient reasoning effort.", + outputModalities: [.audio], + toolChoice: .auto + ), + reasoning: .init(effort: .minimal), + parallelToolCalls: false + ) + ) + let encoded = try encoder.encode(event) + let root = try Self.jsonObject(encoded) as! [String: Any] + #expect(root["type"] as? String == "response.create") + #expect(root["event_id"] as? String == "evt_reasoning") + let response = root["response"] as! [String: Any] + #expect(response["instructions"] as? String == "Use the lowest sufficient reasoning effort.") + #expect(response["output_modalities"] as? [String] == ["audio"]) + #expect(response["tool_choice"] as? String == "auto") + #expect(response["parallel_tool_calls"] as? Bool == false) + let reasoning = response["reasoning"] as! [String: Any] + #expect(reasoning["effort"] as? String == "minimal") + } + @Test func responseCreateToolChoiceMCPEncodesObjectShape() throws { let event = OpenAIRealtimeResponseCreate(