Skip to content

Commit 8a3091c

Browse files
committed
feat(codex): full image-generation lifecycle (started/generating/keepalive/partial/completed)
The diagnostic trace from a real image-gen run revealed five SSE event types we were silently ignoring during the 30-50s render wait:

    +1.1s  response.image_generation_call.in_progress
    +31s   response.image_generation_call.generating
    +47s   keepalive
    +49s   response.image_generation_call.partial_image
    +49s   response.output_item.done (final image)

That's why the previous UI sat on an opaque typing indicator the whole time. Plumb every stage through:

- New CodexImageEvent enum with .started / .generating / .keepalive / .partial(CodexImage) / .completed(CodexImage). CodexStreamPart now carries .imageEvent(CodexImageEvent) instead of a single .generatedImage. (Pre-1.0 breaking — only consumer is the demo, updated in the same change.)
- CodexImage gains an `isPartial` flag.
- Parser handles the four new event types and decodes partial_image_b64 (fallback to b64_json / partial_image field names defensively).
- CodexTool.imageGeneration now takes partialImages: Int. Default CodexTool.imageGenerationPNG asks for 2 preview frames. Wire JSON serialises as `{"type": "image_generation", "output_format": "png", "partial_images": 2}`.

Strip the diagnostic prints — root cause is found, lifecycle is now plumbed cleanly. 46 kit tests still pass.
1 parent a2d965d commit 8a3091c

3 files changed

Lines changed: 108 additions & 36 deletions

File tree

Sources/CodingPlanCodex/CodexStreamPart.swift

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,38 +7,67 @@
77
import Foundation
88

99
/// One slice of a streaming Codex response. Either a text delta to be
10-
/// appended to the running reply, or a fully-generated image artifact.
10+
/// appended to the running reply, or an image-lifecycle event from the
11+
/// `image_generation` tool.
1112
public enum CodexStreamPart: Sendable, Equatable {
1213
/// One streaming text delta. Concatenate every `.textDelta` to
1314
/// reconstruct the full assistant reply.
1415
case textDelta(String)
15-
/// A fully-generated image emitted by the `image_generation` tool.
16-
case generatedImage(CodexImage)
16+
/// An image-generation lifecycle event — start, progress, partial
17+
/// preview, or the final completed image. See ``CodexImageEvent``.
18+
case imageEvent(CodexImageEvent)
1719
}
1820

19-
/// A finished image generated mid-turn by the `image_generation` tool.
21+
/// Stages an image generation passes through during a streaming turn.
2022
///
21-
/// `pngData` carries the decoded PNG bytes ready for
23+
/// On the wire the Codex backend emits `response.image_generation_call.*`
24+
/// events at different points. This enum collapses them into a small
25+
/// surface a chat UI can drive a placeholder + image swap from.
26+
public enum CodexImageEvent: Sendable, Equatable {
27+
/// Backend acknowledged the image-generation call and queued it.
28+
/// Show "Starting image generation…" or similar.
29+
case started(callId: String)
30+
/// Backend is actively rendering. Drives a shimmering placeholder.
31+
case generating(callId: String)
32+
/// A connection keep-alive pulse — no progress, but the server is
33+
/// still working. Useful for resetting client-side stalled timers.
34+
case keepalive
35+
/// A low-fidelity preview of the image arrived. Swap any placeholder
36+
/// for this PNG; expect a final ``completed(_:)`` shortly after.
37+
case partial(CodexImage)
38+
/// The final, fully-rendered image. Replace any partials with this.
39+
case completed(CodexImage)
40+
}
41+
42+
/// A finished (or partial) image emitted by the `image_generation` tool.
43+
///
44+
/// `pngData` carries decoded PNG bytes ready for
2245
/// `UIImage(data:)` / `NSImage(data:)`.
2346
public struct CodexImage: Sendable, Equatable {
2447
/// Backend identifier for this generation call (e.g. `"ig_123"`).
2548
public let id: String
26-
/// Status reported by the backend. Typically `"completed"`.
49+
/// Status string for this image. `"completed"` for the final image;
50+
/// partial previews carry the client-assigned label `"generating"`.
2751
public let status: String
2852
/// The prompt the model used after revision, when the backend reports it.
2953
public let revisedPrompt: String?
3054
/// Decoded PNG bytes.
3155
public let pngData: Data
56+
/// `true` when this is a partial preview; more frames may follow.
57+
/// `false` when this is the final image.
58+
public let isPartial: Bool
3259

3360
public init(
3461
id: String,
3562
status: String,
3663
revisedPrompt: String? = nil,
37-
pngData: Data
64+
pngData: Data,
65+
isPartial: Bool = false
3866
) {
3967
self.id = id
4068
self.status = status
4169
self.revisedPrompt = revisedPrompt
4270
self.pngData = pngData
71+
self.isPartial = isPartial
4372
}
4473
}

Sources/CodingPlanCodex/CodexTool.swift

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,23 +10,33 @@ import Foundation
1010
/// A tool the Codex backend can invoke as part of a response.
1111
public enum CodexTool: Sendable, Equatable, Hashable {
1212
/// Image generation. The model decides when to invoke it; the
13-
/// resulting image is yielded as ``CodexStreamPart/generatedImage(_:)``.
14-
/// `outputFormat` is the file format the backend should return
15-
/// (currently only `"png"` is documented upstream).
16-
case imageGeneration(outputFormat: String)
13+
/// resulting image is yielded as ``CodexImageEvent`` values wrapped in
14+
/// ``CodexStreamPart/imageEvent(_:)``.
15+
///
16+
/// - Parameters:
17+
/// - outputFormat: File format (currently only `"png"` is documented).
18+
/// - partialImages: How many low-fidelity preview frames the backend
19+
/// should stream before the final image. `0` disables previews.
20+
/// The Codex backend supports `0...3`; the `imageGenerationPNG`
21+
/// convenience requests `2` for a visible build-up effect.
22+
case imageGeneration(outputFormat: String, partialImages: Int)
1723

18-
/// Convenience: PNG image generation, the upstream default.
19-
public static let imageGenerationPNG: CodexTool = .imageGeneration(outputFormat: "png")
24+
/// Convenience: PNG image generation with two preview frames.
25+
public static let imageGenerationPNG: CodexTool = .imageGeneration(
26+
outputFormat: "png",
27+
partialImages: 2
28+
)
2029
}
2130

2231
extension CodexTool {
2332
/// Wire JSON object suitable for the Codex `/codex/responses` request body.
2433
var jsonObject: [String: Any] {
2534
switch self {
26-
case .imageGeneration(let outputFormat):
35+
case .imageGeneration(let outputFormat, let partialImages):
2736
return [
2837
"type": "image_generation",
2938
"output_format": outputFormat,
39+
"partial_images": partialImages,
3040
]
3141
}
3242
}

Sources/CodingPlanCodex/OpenAICodexClient.swift

Lines changed: 55 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -321,40 +321,43 @@ public struct OpenAICodexClient: Sendable {
321321
var current: [String] = []
322322
var anyDeltaYielded = false
323323
var pendingItemText: String?
324-
let traceImageEvents = !tools.isEmpty
325-
let startedAt = Date()
326-
var eventCount = 0
327324

328325
func flushEvent() throws {
329326
defer { current.removeAll(keepingCapacity: true) }
330327
guard let event = Self.decodeSSEEvent(dataLines: current) else { return }
331-
eventCount += 1
332-
let type = event["type"] as? String ?? "<no-type>"
333-
if traceImageEvents {
334-
let elapsed = String(format: "%.1f", Date().timeIntervalSince(startedAt))
335-
// For image_generation_call sub-events, include the
336-
// status so we can see in_progress / completed.
337-
let status = (event["item"] as? [String: Any])?["status"] as? String
338-
?? event["status"] as? String
339-
?? ""
340-
print("[CodexStream +\(elapsed)s] event #\(eventCount) type=\(type) status=\(status)")
341-
}
342328
switch event["type"] as? String {
343329
case "response.output_text.delta":
344330
if let delta = event["delta"] as? String, !delta.isEmpty {
345331
continuation.yield(.textDelta(delta))
346332
anyDeltaYielded = true
347333
}
334+
case "response.output_item.added":
335+
// The backend adds an image_generation_call item when
336+
// the tool starts. Surface that as `.started`.
337+
if let item = event["item"] as? [String: Any],
338+
(item["type"] as? String) == "image_generation_call",
339+
let id = item["id"] as? String {
340+
continuation.yield(.imageEvent(.started(callId: id)))
341+
}
342+
case "response.image_generation_call.in_progress":
343+
if let id = Self.itemId(in: event) {
344+
continuation.yield(.imageEvent(.started(callId: id)))
345+
}
346+
case "response.image_generation_call.generating":
347+
if let id = Self.itemId(in: event) {
348+
continuation.yield(.imageEvent(.generating(callId: id)))
349+
}
350+
case "keepalive":
351+
continuation.yield(.imageEvent(.keepalive))
352+
case "response.image_generation_call.partial_image":
353+
if let image = Self.parsePartialImage(from: event) {
354+
continuation.yield(.imageEvent(.partial(image)))
355+
}
348356
case "response.output_item.done":
349357
guard let item = event["item"] as? [String: Any] else { return }
350-
// image_generation_call items carry a generated image
351-
// regardless of whether text deltas were also present.
358+
// image_generation_call items carry the final image.
352359
if let image = Self.parseImageGenerationCall(from: item) {
353-
if traceImageEvents {
354-
let elapsed = String(format: "%.1f", Date().timeIntervalSince(startedAt))
355-
print("[CodexStream +\(elapsed)s] yielding image, pngBytes=\(image.pngData.count)")
356-
}
357-
continuation.yield(.generatedImage(image))
360+
continuation.yield(.imageEvent(.completed(image)))
358361
return
359362
}
360363
// Assistant message item: hold its text until response.completed
@@ -431,10 +434,40 @@ public struct OpenAICodexClient: Sendable {
431434
id: id,
432435
status: status,
433436
revisedPrompt: item["revised_prompt"] as? String,
434-
pngData: pngData
437+
pngData: pngData,
438+
isPartial: false
435439
)
436440
}
437441

442+
/// Parse a `response.image_generation_call.partial_image` event payload.
443+
/// Wire shape: `{ "type": "...partial_image", "item_id": "ig_…",
444+
/// "partial_image_b64": "…", "partial_image_index": 0 }`. Falls back
445+
/// to alternative field names defensively in case the upstream renames.
446+
private static func parsePartialImage(from event: [String: Any]) -> CodexImage? {
447+
let id = (event["item_id"] as? String) ?? (event["id"] as? String) ?? ""
448+
let base64 = (event["partial_image_b64"] as? String)
449+
?? (event["b64_json"] as? String)
450+
?? (event["partial_image"] as? String)
451+
guard let base64, let pngData = Data(base64Encoded: base64) else {
452+
return nil
453+
}
454+
return CodexImage(
455+
id: id,
456+
status: "generating",
457+
revisedPrompt: nil,
458+
pngData: pngData,
459+
isPartial: true
460+
)
461+
}
462+
463+
/// Pull the `item_id` from a `response.image_generation_call.*` event.
464+
private static func itemId(in event: [String: Any]) -> String? {
465+
if let id = event["item_id"] as? String { return id }
466+
if let item = event["item"] as? [String: Any],
467+
let id = item["id"] as? String { return id }
468+
return nil
469+
}
470+
438471
private static func decodeSSEEvent(dataLines: [String]) -> [String: Any]? {
439472
guard !dataLines.isEmpty else { return nil }
440473
let payload = dataLines.joined(separator: "\n")

0 commit comments

Comments
 (0)