michaelw9999 · michaelw9999 · Mar 30, 2026 · Mar 30, 2026 · chatgpt-codex-connector · May 3, 2026
diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp
@@ -1255,45 +1255,59 @@ json convert_responses_to_chatcmpl(const json & response_body) {
                 if (item.contains("status")) {
                     item.erase("status");
                 }
+                // Merge system/developer messages into the first system message.
+                // Many model templates (e.g. Qwen) require all system content at
+                // position 0 and reject system messages elsewhere in the conversation.
+                if (item.at("role") == "system" || item.at("role") == "developer") {
+                    if (!chatcmpl_messages.empty() && chatcmpl_messages[0].value("role", "") == "system") {
+                        auto & first_msg = chatcmpl_messages[0];
+                        // Convert string content to array format if needed
+                        if (first_msg["content"].is_string()) {
+                            std::string old_text = first_msg["content"].get<std::string>();
+                            first_msg["content"] = json::array({json{{"text", old_text}, {"type", "text"}}});
+                        }
+                        auto & first_content = first_msg["content"];
+                        for (const auto & part : chatcmpl_content) {
+                            first_content.push_back(part);
+                        }
+                        continue; // merged, don't push a separate message
+                    }
+                    item["role"] = "system";
+                }
                 item["content"] = chatcmpl_content;
 
                 chatcmpl_messages.push_back(item);
             } else if (exists_and_is_array(item, "content") &&
                 exists_and_is_string(item, "role") &&
                 item.at("role") == "assistant" &&
-                // exists_and_is_string(item, "status") &&
-                // (item.at("status") == "in_progress" ||
-                //     item.at("status") == "completed" ||
-                //     item.at("status") == "incomplete") &&
-                // item["status"] not sent by codex-cli
-                exists_and_is_string(item, "type") &&
-                item.at("type") == "message"
+                // status not checked (not always present, e.g. codex-cli omits it)
+                // type == "message" for OutputMessage, absent for EasyInputMessage
+                (!item.contains("type") || item.at("type") == "message")
             ) {
                 // #responses_create-input-input_item_list-item-output_message
-                auto chatcmpl_content = json::array();
+                // Also handles AssistantMessageItemParam / EasyInputMessage with role "assistant"
+                std::vector<json> chatcmpl_content;
 
                 for (const auto & output_text : item.at("content")) {
                     const std::string type = json_value(output_text, "type", std::string());
-                    if (type == "output_text") {
+                    if (type == "output_text" || type == "input_text") {
                         if (!exists_and_is_string(output_text, "text")) {
                             throw std::invalid_argument("'Output text' requires 'text'");
-                            // Ignore annotations and logprobs for now
-                            chatcmpl_content.push_back({
-                                {"text", output_text.at("text")},
-                                {"type", "text"},
-                            });
                         }
+                        chatcmpl_content.push_back({
+                            {"text", output_text.at("text")},
+                            {"type", "text"},
+                        });
                     } else if (type == "refusal") {
                         if (!exists_and_is_string(output_text, "refusal")) {
                             throw std::invalid_argument("'Refusal' requires 'refusal'");
-                            // Ignore annotations and logprobs for now
-                            chatcmpl_content.push_back({
-                                {"refusal", output_text.at("refusal")},
-                                {"type", "refusal"},
-                            });
                         }
+                        chatcmpl_content.push_back({
+                            {"refusal", output_text.at("refusal")},
+                            {"type", "refusal"},
+                        });
                     } else {
-                        throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'");
+                        throw std::invalid_argument("'type' must be 'output_text', 'input_text', or 'refusal'");
                     }
                 }
 
@@ -1303,7 +1317,9 @@ json convert_responses_to_chatcmpl(const json & response_body) {
                         prev_msg["content"] = json::array();
                     }
                     auto & prev_content = prev_msg["content"];
-                    prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end());
+                    for (const auto & part : chatcmpl_content) {
+                        prev_content.push_back(part);
+                    }
                 } else {
                     item.erase("status");
                     item.erase("type");
@@ -1407,11 +1423,17 @@ json convert_responses_to_chatcmpl(const json & response_body) {
         }
         std::vector<json> chatcmpl_tools;
         for (json resp_tool : response_body.at("tools")) {
-            json chatcmpl_tool;
+            const std::string tool_type = json_value(resp_tool, "type", std::string());
 
-            if (json_value(resp_tool, "type", std::string()) != "function") {
-                throw std::invalid_argument("'type' of tool must be 'function'");
+            // Skip non-function tools (e.g. web_search, code_interpreter)
+            // sent by clients like Codex CLI — these are provider-specific
+            // and cannot be converted to chat completions function tools
+            if (tool_type != "function") {
+                SRV_WRN("skipping unsupported tool type '%s' in Responses conversion\n", tool_type.c_str());
+                continue;
             }
+
+            json chatcmpl_tool;
             resp_tool.erase("type");
             chatcmpl_tool["type"] = "function";
 
@@ -1422,14 +1444,25 @@ json convert_responses_to_chatcmpl(const json & response_body) {
             chatcmpl_tools.push_back(chatcmpl_tool);
         }
         chatcmpl_body.erase("tools");
-        chatcmpl_body["tools"] = chatcmpl_tools;
+        if (!chatcmpl_tools.empty()) {
+            chatcmpl_body["tools"] = chatcmpl_tools;
+        }
     }
 
     if (response_body.contains("max_output_tokens")) {
         chatcmpl_body.erase("max_output_tokens");
         chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
     }
 
+    // Strip Responses-only keys that have no chat completions equivalent
+    // (e.g. Codex CLI sends store, include, prompt_cache_key, web_search)
+    for (const char * key : {
+        "store", "include", "prompt_cache_key", "web_search",
+        "text", "truncation", "metadata",
+    }) {
+        chatcmpl_body.erase(key);
+    }
+
     return chatcmpl_body;
 }
 

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
@@ -455,6 +455,127 @@ struct server_slot {
     }
 };
 
+//
+// checkpoint persistence helpers for hybrid/recurrent models
+//
+// Hybrid models (e.g. Qwen3.5, Jamba, Falcon-H1) use recurrent layers whose
+// state cannot be partially restored from the KV cache alone.  The server
+// creates "context checkpoints" during prompt processing that snapshot the
+// full recurrent state at regular intervals.  These checkpoints live in
+// server_prompt::checkpoints and are essential to avoid a full prompt
+// re-processing when the slot is reused.
+//
+// The built-in /slots save/restore API persists the raw KV+recurrent memory
+// via llama_state_seq_{save,load}_file, but does NOT persist the checkpoint
+// metadata.  The two helpers below fill that gap: they write/read a small
+// companion file (<filename>.checkpoints) next to the main slot save file.
+//
+// File format (binary, fields written back-to-back in host byte order):
+//   uint32  magic    = 0x4C4C4350  ("LLCP")
+//   uint32  version  = 1
+//   uint32  n_checkpoints
+//   For each checkpoint:
+//     int32   pos_min
+//     int32   pos_max
+//     int64   n_tokens
+//     uint64  data_size
+//     uint8   data[data_size]
+//
+
+// Write `checkpoints` to the companion file "<filepath>.checkpoints" (raw host byte order).
+// An empty list removes any stale companion file, so a later restore cannot pair old
+// checkpoints with a newer slot state. Returns false and deletes the file on I/O error.
+static bool slot_checkpoints_save(const std::string & filepath,
+                                  const std::list<server_prompt_checkpoint> & checkpoints) {
+    const std::string cp_path = filepath + ".checkpoints";
+    if (checkpoints.empty()) {
+        std::remove(cp_path.c_str()); // best effort: usually there is nothing to remove
+        return true;
+    }
+
+    FILE * fp = fopen(cp_path.c_str(), "wb");
+    if (!fp) {
+        SRV_WRN("failed to open checkpoint file for writing: %s\n", cp_path.c_str());
+        return false;
+    }
+
+    const uint32_t magic   = 0x4C4C4350;
+    const uint32_t version = 1;
+    const uint32_t n_cp    = (uint32_t) checkpoints.size();
+
+    bool ok = fwrite(&magic,   sizeof(magic),   1, fp) == 1 &&
+              fwrite(&version, sizeof(version), 1, fp) == 1 &&
+              fwrite(&n_cp,    sizeof(n_cp),    1, fp) == 1;
+
+    for (const auto & cp : checkpoints) {
+        const uint64_t data_size = cp.data.size();
+        ok = ok && fwrite(&cp.pos_min,  sizeof(cp.pos_min),  1, fp) == 1;
+        ok = ok && fwrite(&cp.pos_max,  sizeof(cp.pos_max),  1, fp) == 1;
+        ok = ok && fwrite(&cp.n_tokens, sizeof(cp.n_tokens), 1, fp) == 1;
+        ok = ok && fwrite(&data_size,   sizeof(data_size),   1, fp) == 1;
+        ok = ok && (data_size == 0 || fwrite(cp.data.data(), 1, data_size, fp) == data_size);
+    }
+
+    // buffered bytes may only reach the disk (and fail) when the stream is closed
+    ok = (fclose(fp) == 0) && ok;
+    if (!ok) {
+        SRV_WRN("failed to write checkpoint data to %s\n", cp_path.c_str());
+        std::remove(cp_path.c_str());
+        return false;
+    }
+    SRV_INF("saved %u context checkpoints to %s\n", n_cp, cp_path.c_str());
+    return true;
+}
+
+// Read "<filepath>.checkpoints" into `checkpoints`. The list is reset first, so it ends up
+// empty when the file is missing (returns true) or invalid/truncated (returns false).
+static bool slot_checkpoints_load(const std::string & filepath,
+                                  std::list<server_prompt_checkpoint> & checkpoints) {
+    checkpoints.clear(); // entries from the slot's previous prompt must not survive a restore
+
+    const std::string cp_path = filepath + ".checkpoints";
+    FILE * fp = fopen(cp_path.c_str(), "rb");
+    if (!fp) {
+        return true;  // no checkpoint file is not an error
+    }
+
+    uint32_t magic = 0, version = 0, n_cp = 0;
+    bool ok = fread(&magic,   sizeof(magic),   1, fp) == 1 &&
+              fread(&version, sizeof(version), 1, fp) == 1 &&
+              fread(&n_cp,    sizeof(n_cp),    1, fp) == 1;
+    if (!ok || magic != 0x4C4C4350 || version != 1) {
+        SRV_WRN("invalid checkpoint file header: %s\n", cp_path.c_str());
+        fclose(fp);
+        return false;
+    }
+
+    const uint64_t chunk = 16u << 20; // read step; bounds what a corrupt data_size can allocate
+    for (uint32_t i = 0; i < n_cp && ok; i++) {
+        server_prompt_checkpoint cp;
+        uint64_t data_size = 0;
+        ok = ok && fread(&cp.pos_min,  sizeof(cp.pos_min),  1, fp) == 1;
+        ok = ok && fread(&cp.pos_max,  sizeof(cp.pos_max),  1, fp) == 1;
+        ok = ok && fread(&cp.n_tokens, sizeof(cp.n_tokens), 1, fp) == 1;
+        ok = ok && fread(&data_size,   sizeof(data_size),   1, fp) == 1;
+        // grow step by step: a bogus size ends in a short read, not one huge up-front resize()
+        for (uint64_t done = 0; ok && done < data_size; done = cp.data.size()) {
+            const size_t n = (size_t) (data_size - done < chunk ? data_size - done : chunk);
+            cp.data.resize(done + n);
+            ok = fread(cp.data.data() + done, 1, n, fp) == n;
+        }
+        if (ok) {
+            checkpoints.push_back(std::move(cp));
+        }
+    }
+    fclose(fp);
+
+    if (!ok) {
+        SRV_WRN("failed to read checkpoint data from %s\n", cp_path.c_str());
+        checkpoints.clear();
+        return false;
+    }
+    SRV_INF("restored %u context checkpoints from %s\n", n_cp, cp_path.c_str());
+    return true;
+}
 
 
 //
@@ -1822,6 +1943,9 @@ struct server_context_impl {
                     const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens();
                     const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);
 
+                    // persist context checkpoints alongside the slot state
+                    slot_checkpoints_save(filepath, slot->prompt.checkpoints);
+
                     const int64_t t_end = ggml_time_us();
                     const double t_save_ms = (t_end - t_start) / 1000.0;
 
@@ -1869,6 +1993,9 @@ struct server_context_impl {
                     slot->prompt.tokens.clear();
                     slot->prompt.tokens.insert(tokens);
 
+                    // restore context checkpoints if a companion file exists
+                    slot_checkpoints_load(filepath, slot->prompt.checkpoints);
+
                     const int64_t t_end = ggml_time_us();
                     const double t_restore_ms = (t_end - t_start) / 1000.0;