AIProxyTeam · richarddas · Jun 2, 2026 · lzell · Jun 10, 2026 · lzell
diff --git a/Documentation/OpenAI/RealtimeSchemaMatrix.md b/Documentation/OpenAI/RealtimeSchemaMatrix.md
@@ -0,0 +1,92 @@
+# Realtime API Schema Matrix
+
+This matrix maps the current OpenAI Realtime `session.update.session`, `response.create.response`, and
+`conversation.item.create` fields, plus decoded output phases, to AIProxySwift types and wire shapes.
+
+Reference: https://developers.openai.com/api/reference/resources/realtime
+
+## Shared Realtime Session
+
+These fields are used by Performance Realtime models, such as `gpt-realtime-1.5`, and are also the
+base session shape composed by Realtime Reasoning models.
+
+| Wire field | AIProxySwift API | Wire shape emitted |
+| --- | --- | --- |
+| `type` | `OpenAIRealtimeSessionConfiguration.type` | string |
+| `include` | `OpenAIRealtimeSessionConfiguration.include` | string array |
+| `model` | `OpenAIRealtimeSessionConfiguration.model` | string |
+| `instructions` | `OpenAIRealtimeSessionConfiguration.instructions` | string |
+| `max_output_tokens` | `OpenAIRealtimeSessionConfiguration.maxOutputTokens` | int or `"inf"` |
+| `output_modalities` | `OpenAIRealtimeSessionConfiguration.outputModalities` | enum string array |
+| `prompt` | `OpenAIRealtimeSessionConfiguration.prompt` | object (`id`, optional `variables`, optional `version`) |
+| `tracing` | `OpenAIRealtimeSessionConfiguration.tracing` | string `"auto"` or object (`group_id`, `metadata`, `workflow_name`) |
+| `truncation` | `OpenAIRealtimeSessionConfiguration.truncation` | string (`"auto"`/`"disabled"`) or retention-ratio object |
+| `tools` | `OpenAIRealtimeSessionConfiguration.tools` | union array (`function`, `mcp`, `web_search`) |
+| `tool_choice` | `OpenAIRealtimeSessionConfiguration.toolChoice` | string (`auto`/`none`/`required`) or typed selector object |
+| `audio.input.format` | `OpenAIRealtimeSessionConfiguration.inputAudioFormat` | object (`type`, optional `rate`) |
+| `audio.input.noise_reduction` | `OpenAIRealtimeSessionConfiguration.inputAudioNoiseReduction` | object (`type`) |
+| `audio.input.transcription` | `OpenAIRealtimeSessionConfiguration.inputAudioTranscription` | object (`language`, `model`, `prompt`) |
+| `audio.input.turn_detection` | `OpenAIRealtimeSessionConfiguration.turnDetection` | typed object union (`server_vad` / `semantic_vad`) |
+| `audio.output.format` | `OpenAIRealtimeSessionConfiguration.outputAudioFormat` | object (`type`, optional `rate`) |
+| `audio.output.speed` | `OpenAIRealtimeSessionConfiguration.speed` | number (range 0.25...1.5) |
+| `audio.output.voice` | `OpenAIRealtimeSessionConfiguration.voice` | string or object (`id`) |
+
+## Realtime Reasoning Session
+
+Realtime Reasoning models, such as `gpt-realtime-2`, compose the shared session fields above and add
+Reasoning-only fields to the same `session.update.session` object.
+
+| Wire field | AIProxySwift API | Wire shape emitted |
+| --- | --- | --- |
+| `reasoning` | `OpenAIRealtimeReasoningSessionConfiguration.reasoning` | object |
+| `reasoning.effort` | `OpenAIRealtimeReasoningConfiguration.effort` | `minimal`, `low`, `medium`, `high`, or `xhigh` |
+| `parallel_tool_calls` | `OpenAIRealtimeReasoningSessionConfiguration.parallelToolCalls` | boolean |
+
+## Shared `response.create`
+
+| Wire field | AIProxySwift API | Wire shape emitted |
+| --- | --- | --- |
+| `type` | `OpenAIRealtimeResponseCreate.type` | `"response.create"` |
+| `event_id` | `OpenAIRealtimeResponseCreate.eventID` | optional string |
+| `response.instructions` | `OpenAIRealtimeResponseCreate.Response.instructions` | optional string |
+| `response.output_modalities` | `OpenAIRealtimeResponseCreate.Response.outputModalities` | optional enum string array |
+| `response.tools` | `OpenAIRealtimeResponseCreate.Response.tools` | optional tool union array (`function`, `mcp`, `web_search`) |
+| `response.tool_choice` | `OpenAIRealtimeResponseCreate.Response.toolChoice` | optional string/object union |
+
+## Realtime Reasoning `response.create`
+
+| Wire field | AIProxySwift API | Wire shape emitted |
+| --- | --- | --- |
+| `type` | `OpenAIRealtimeReasoningResponseCreate.type` | `"response.create"` |
+| `event_id` | `OpenAIRealtimeReasoningResponseCreate.eventID` | optional string |
+| `response.reasoning` | `OpenAIRealtimeReasoningResponseCreate.Response.reasoning` | object |
+| `response.reasoning.effort` | `OpenAIRealtimeReasoningConfiguration.effort` | `minimal`, `low`, `medium`, `high`, or `xhigh` |
+| `response.parallel_tool_calls` | `OpenAIRealtimeReasoningResponseCreate.Response.parallelToolCalls` | boolean |
+
+## Realtime Reasoning Output Phases
+
+Realtime Reasoning output can be split into commentary and final answer phases.
+
+| Wire field | AIProxySwift API | Wire shape decoded |
+| --- | --- | --- |
+| `response.output[].phase` | `OpenAIRealtimeResponseOutputItem.phase` | `commentary` or `final_answer` |
+| `response.output_item.*.item.phase` | `OpenAIRealtimeResponseOutputItemAddedEvent.phase` / `OpenAIRealtimeResponseOutputItemDoneEvent.phase` | `commentary` or `final_answer` |
+| `conversation.item.*.item.phase` | `OpenAIRealtimeConversationItemCreatedEvent.phase` | `commentary` or `final_answer` |
+
+## `conversation.item.create`
+
+Reference: https://platform.openai.com/docs/api-reference/realtime-client-events/conversation/item/create
+
+| Wire field | AIProxySwift API | Wire shape emitted |
+| --- | --- | --- |
+| `type` | `OpenAIRealtimeConversationItemCreate.type` | `"conversation.item.create"` |
+| `item.type` | `OpenAIRealtimeConversationItemCreate.Item` | `"message"`, `"function_call"`, `"function_call_output"` |
+| `item.role` | `OpenAIRealtimeConversationItemCreate.Item.role` | optional string for message items |
+| `item.content[].type` | `OpenAIRealtimeConversationItemCreate.Item.Content.type` | `input_text`, `output_text`, `input_audio`, `item_reference`, `input_image` |
+| `item.content[].text` | `OpenAIRealtimeConversationItemCreate.Item.Content.text` | optional string |
+| `item.content[].audio` | `OpenAIRealtimeConversationItemCreate.Item.Content.audio` | optional string |
+| `item.content[].item_id` | `OpenAIRealtimeConversationItemCreate.Item.Content.itemID` | optional string |
+| `item.call_id` | `OpenAIRealtimeConversationItemCreate.Item.callID` | optional string |
+| `item.name` | `OpenAIRealtimeConversationItemCreate.Item.name` | optional string |
+| `item.arguments` | `OpenAIRealtimeConversationItemCreate.Item.arguments` | optional string |
+| `item.output` | `OpenAIRealtimeConversationItemCreate.Item.output` | optional string |
diff --git a/README.md b/README.md
@@ -1384,13 +1384,10 @@ final class RealtimeManager {
             inputAudioFormat: .pcm16,
             inputAudioTranscription: .init(model: "whisper-1"),
             instructions: "You are a tour guide of Yosemite national park",
-            maxResponseOutputTokens: .int(4096),
-            modalities: [.audio],
+            maxOutputTokens: .int(4096),
+            outputModalities: [.audio],
             outputAudioFormat: .pcm16,
-            temperature: 0.7,
-            turnDetection: .init(
-                type: .semanticVAD(eagerness: .medium)
-            ),
+            turnDetection: .semanticVAD(.init(eagerness: .medium)),
             voice: "shimmer"
         )
 
@@ -1449,14 +1446,15 @@ final class RealtimeManager {
 }
 ```
 
-#### General Availability (GA) Realtime migration notes
+#### Current Realtime API notes
 
-- OpenAI has announced Realtime beta (`OpenAI-Beta: realtime=v1`) deprecation and shutdown on 2026-05-07.
-- For `response.create`, GA uses `output_modalities` (not `modalities`).
-- The new `output_modalities` for OpenAI realtime GA (general availability) is as follows:
+- For a field-by-field mapping of the Realtime wire shape to AIProxySwift types, see
+  [Realtime schema matrix](Documentation/OpenAI/RealtimeSchemaMatrix.md).
+- For `response.create`, the current Realtime API uses `output_modalities` (not `modalities`).
+- Supported `output_modalities` values:
   - `["audio"]` returns audio with transcript.
   - `["text"]` returns text only.
-- For voice mode with built-in web search, use GA tool (`.webSearch`) and specify `.auto` for toolChoice to let the model decide when to use it.
+- For voice mode with built-in web search, use the `.webSearch` tool and specify `.auto` for `toolChoice` to let the model decide when to use it.
 
 ```swift
 let configuration = OpenAIRealtimeSessionConfiguration(
@@ -1473,6 +1471,60 @@ let session = try await openAIService.realtimeSession(
 )
 ```
 
+#### Realtime Reasoning models
+
+OpenAI's Realtime Reasoning models, such as `gpt-realtime-2`, use the same Realtime WebSocket
+transport and shared session fields as Performance models like `gpt-realtime-1.5`, plus
+Reasoning-only configuration for effort and parallel tool calls.
+
+```swift
+let configuration = OpenAIRealtimeReasoningSessionConfiguration(
+    session: OpenAIRealtimeSessionConfiguration(
+        outputModalities: [.audio],
+        voice: .builtin("alloy"),
+        tools: [.webSearch(.init(searchContextSize: .medium))],
+        toolChoice: .auto
+    ),
+    reasoning: .init(effort: .low),
+    parallelToolCalls: true
+)
+
+let session = try await openAIService.realtimeSession(
+    model: "gpt-realtime-2",
+    configuration: configuration,
+    logLevel: .info
+)
+```
+
+You can also override Reasoning settings for a single response:
+
+```swift
+await session.sendMessage(
+    OpenAIRealtimeReasoningResponseCreate(
+        response: .init(
+            base: .init(
+                instructions: "Use the lowest sufficient reasoning effort.",
+                outputModalities: [.audio]
+            ),
+            reasoning: .init(effort: .minimal),
+            parallelToolCalls: false
+        )
+    )
+)
+```
+
+Realtime Reasoning responses can include phased output. Use `phase` to separate short commentary
+from the final answer when the model emits both in a turn:
+
+```swift
+for await message in session.receiver {
+    if case .responseDone(let event) = message {
+        let commentary = event.output?.filter { $0.phase == .commentary }
+        let finalAnswer = event.output?.filter { $0.phase == .finalAnswer }
+    }
+}
+```
+
 ### How to make a basic request using OpenAI's Responses API
 Note: there is also a streaming version of this snippet below.
 

diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeMessage.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeMessage.swift
@@ -277,15 +277,46 @@ public struct OpenAIRealtimeInputAudioBufferDTMFEventReceivedEvent: Decodable, S
     }
 }
 
+public enum OpenAIRealtimeResponsePhase: String, Decodable, Sendable { // Phase tag decoded from Realtime output items.
+    case commentary // Raw value is "commentary" (implicit from the case name).
+    case finalAnswer = "final_answer" // Explicit raw value for the snake_case wire string.
+}
+
+public struct OpenAIRealtimeResponseOutputItem: Decodable, Sendable { // One element of a response's `output` array.
+    public let id: String?
+    public let phase: OpenAIRealtimeResponsePhase? // nil when the item has no `phase` value.
+    public let content: [Content]?
+
+    public var transcript: String? { // First non-empty transcript among the `content` parts, otherwise nil.
+        content?.first(where: { ($0.transcript?.isEmpty == false) })?.transcript
+    }
+
+    private enum CodingKeys: String, CodingKey { // Keys map one-to-one onto the stored property names.
+        case id
+        case phase
+        case content
+    }
+}
+
+extension OpenAIRealtimeResponseOutputItem { // Nested payload types for output items.
+    public struct Content: Decodable, Sendable { // One element of an item's `content` array; every field is optional.
+        public let type: String?
+        public let text: String?
+        public let transcript: String? // Read by `OpenAIRealtimeResponseOutputItem.transcript`.
+    }
+}
+
 public struct OpenAIRealtimeConversationItemCreatedEvent: Decodable, Sendable {
     public let itemID: String?
     public let previousItemID: String?
     public let role: String?
+    public let phase: OpenAIRealtimeResponsePhase?
     public let eventID: String?
 
     private struct ItemBody: Decodable {
         let id: String?
         let role: String?
+        let phase: OpenAIRealtimeResponsePhase?
     }
 
     private enum CodingKeys: String, CodingKey {
@@ -302,6 +333,7 @@ public struct OpenAIRealtimeConversationItemCreatedEvent: Decodable, Sendable {
         self.itemID = item?.id ?? fallbackItemID
         self.previousItemID = try container.decodeIfPresent(String.self, forKey: .previousItemID)
         self.role = item?.role
+        self.phase = item?.phase
         self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID)
     }
 }
@@ -325,10 +357,12 @@ public struct OpenAIRealtimeResponseOutputItemAddedEvent: Decodable, Sendable {
     public let responseID: String?
     public let itemID: String?
     public let outputIndex: Int?
+    public let phase: OpenAIRealtimeResponsePhase?
     public let eventID: String?
 
     private struct ItemBody: Decodable {
         let id: String?
+        let phase: OpenAIRealtimeResponsePhase?
     }
 
     private enum CodingKeys: String, CodingKey {
@@ -346,6 +380,7 @@ public struct OpenAIRealtimeResponseOutputItemAddedEvent: Decodable, Sendable {
         let fallbackItemID = try container.decodeIfPresent(String.self, forKey: .itemID)
         self.itemID = item?.id ?? fallbackItemID
         self.outputIndex = container.decodeFlexibleIntIfPresent(forKey: .outputIndex)
+        self.phase = item?.phase
         self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID)
     }
 }
@@ -354,6 +389,7 @@ public struct OpenAIRealtimeResponseOutputItemDoneEvent: Decodable, Sendable {
     public let responseID: String?
     public let itemID: String?
     public let outputIndex: Int?
+    public let phase: OpenAIRealtimeResponsePhase?
     public let transcript: String?
     public let eventID: String?
 
@@ -362,6 +398,7 @@ public struct OpenAIRealtimeResponseOutputItemDoneEvent: Decodable, Sendable {
             let transcript: String?
         }
         let id: String?
+        let phase: OpenAIRealtimeResponsePhase?
         let content: [ContentBody]?
     }
 
@@ -380,6 +417,7 @@ public struct OpenAIRealtimeResponseOutputItemDoneEvent: Decodable, Sendable {
         let fallbackItemID = try container.decodeIfPresent(String.self, forKey: .itemID)
         self.itemID = item?.id ?? fallbackItemID
         self.outputIndex = container.decodeFlexibleIntIfPresent(forKey: .outputIndex)
+        self.phase = item?.phase
         self.transcript = item?.content?.first(where: { ($0.transcript?.isEmpty == false) })?.transcript
         self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID)
     }
@@ -473,19 +511,22 @@ public struct OpenAIRealtimeResponseDoneEvent: Decodable, Sendable {
     public let responseID: String?
     public let conversationID: String?
     public let status: String?
+    public let output: [OpenAIRealtimeResponseOutputItem]?
     public let usage: OpenAIRealtimeResponseUsage?
     public let eventID: String?
 
     private struct ResponseBody: Decodable {
         let id: String?
         let conversationID: String?
         let status: String?
+        let output: [OpenAIRealtimeResponseOutputItem]?
         let usage: OpenAIRealtimeResponseUsage?
 
         private enum CodingKeys: String, CodingKey {
             case id
             case conversationID = "conversation_id"
             case status
+            case output
             case usage
         }
     }
@@ -503,6 +544,7 @@ public struct OpenAIRealtimeResponseDoneEvent: Decodable, Sendable {
         self.responseID = response?.id ?? fallbackResponseID
         self.conversationID = response?.conversationID
         self.status = response?.status
+        self.output = response?.output
         self.usage = response?.usage
         self.eventID = try container.decodeIfPresent(String.self, forKey: .eventID)
     }

diff --git a/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningConfiguration.swift b/Sources/AIProxy/OpenAI/OpenAIRealtimeReasoningConfiguration.swift
@@ -0,0 +1,24 @@
+//
+//  OpenAIRealtimeReasoningConfiguration.swift
+//  AIProxy
+//
+
+/// Configuration for OpenAI Realtime Reasoning models such as `gpt-realtime-2`.
+nonisolated public struct OpenAIRealtimeReasoningConfiguration: Encodable, Sendable {
+    /// Constrains reasoning effort on Realtime Reasoning models; `nil` when unset.
+    public let effort: Effort?
+
+    public init(effort: Effort? = nil) { // `effort` defaults to nil, so `.init()` is a valid call.
+        self.effort = effort
+    }
+}
+
+extension OpenAIRealtimeReasoningConfiguration { // Nested value types for the reasoning configuration.
+    nonisolated public enum Effort: String, Encodable, Sendable { // Raw values equal the case names.
+        case minimal
+        case low
+        case medium
+        case high
+        case xhigh
+    }
+}