Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 58 additions & 25 deletions tools/server/server-common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1255,45 +1255,59 @@ json convert_responses_to_chatcmpl(const json & response_body) {
if (item.contains("status")) {
item.erase("status");
}
// Merge system/developer messages into the first system message.
// Many model templates (e.g. Qwen) require all system content at
// position 0 and reject system messages elsewhere in the conversation.
if (item.at("role") == "system" || item.at("role") == "developer") {
if (!chatcmpl_messages.empty() && chatcmpl_messages[0].value("role", "") == "system") {
auto & first_msg = chatcmpl_messages[0];
// Convert string content to array format if needed
if (first_msg["content"].is_string()) {
std::string old_text = first_msg["content"].get<std::string>();
first_msg["content"] = json::array({json{{"text", old_text}, {"type", "text"}}});
}
auto & first_content = first_msg["content"];
for (const auto & part : chatcmpl_content) {
first_content.push_back(part);
}
continue; // merged, don't push a separate message
}
item["role"] = "system";
}
item["content"] = chatcmpl_content;

chatcmpl_messages.push_back(item);
} else if (exists_and_is_array(item, "content") &&
exists_and_is_string(item, "role") &&
item.at("role") == "assistant" &&
// exists_and_is_string(item, "status") &&
// (item.at("status") == "in_progress" ||
// item.at("status") == "completed" ||
// item.at("status") == "incomplete") &&
// item["status"] not sent by codex-cli
exists_and_is_string(item, "type") &&
item.at("type") == "message"
// status not checked (not always present, e.g. codex-cli omits it)
// type == "message" for OutputMessage, absent for EasyInputMessage
(!item.contains("type") || item.at("type") == "message")
) {
// #responses_create-input-input_item_list-item-output_message
auto chatcmpl_content = json::array();
// Also handles AssistantMessageItemParam / EasyInputMessage with role "assistant"
std::vector<json> chatcmpl_content;

for (const auto & output_text : item.at("content")) {
const std::string type = json_value(output_text, "type", std::string());
if (type == "output_text") {
if (type == "output_text" || type == "input_text") {
if (!exists_and_is_string(output_text, "text")) {
throw std::invalid_argument("'Output text' requires 'text'");
// Ignore annotations and logprobs for now
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
}
chatcmpl_content.push_back({
{"text", output_text.at("text")},
{"type", "text"},
});
} else if (type == "refusal") {
if (!exists_and_is_string(output_text, "refusal")) {
throw std::invalid_argument("'Refusal' requires 'refusal'");
// Ignore annotations and logprobs for now
chatcmpl_content.push_back({
{"refusal", output_text.at("refusal")},
{"type", "refusal"},
});
}
chatcmpl_content.push_back({
{"refusal", output_text.at("refusal")},
{"type", "refusal"},
});
} else {
throw std::invalid_argument("'type' must be one of 'output_text' or 'refusal'");
throw std::invalid_argument("'type' must be 'output_text', 'input_text', or 'refusal'");
}
}

Expand All @@ -1303,7 +1317,9 @@ json convert_responses_to_chatcmpl(const json & response_body) {
prev_msg["content"] = json::array();
}
auto & prev_content = prev_msg["content"];
prev_content.insert(prev_content.end(), chatcmpl_content.begin(), chatcmpl_content.end());
for (const auto & part : chatcmpl_content) {
prev_content.push_back(part);
}
} else {
item.erase("status");
item.erase("type");
Expand Down Expand Up @@ -1407,11 +1423,17 @@ json convert_responses_to_chatcmpl(const json & response_body) {
}
std::vector<json> chatcmpl_tools;
for (json resp_tool : response_body.at("tools")) {
json chatcmpl_tool;
const std::string tool_type = json_value(resp_tool, "type", std::string());

if (json_value(resp_tool, "type", std::string()) != "function") {
throw std::invalid_argument("'type' of tool must be 'function'");
// Skip non-function tools (e.g. web_search, code_interpreter)
// sent by clients like Codex CLI — these are provider-specific
// and cannot be converted to chat completions function tools
if (tool_type != "function") {
SRV_WRN("skipping unsupported tool type '%s' in Responses conversion\n", tool_type.c_str());
continue;
}

json chatcmpl_tool;
resp_tool.erase("type");
chatcmpl_tool["type"] = "function";

Expand All @@ -1422,14 +1444,25 @@ json convert_responses_to_chatcmpl(const json & response_body) {
chatcmpl_tools.push_back(chatcmpl_tool);
}
chatcmpl_body.erase("tools");
chatcmpl_body["tools"] = chatcmpl_tools;
if (!chatcmpl_tools.empty()) {
chatcmpl_body["tools"] = chatcmpl_tools;
}
}

if (response_body.contains("max_output_tokens")) {
chatcmpl_body.erase("max_output_tokens");
chatcmpl_body["max_tokens"] = response_body["max_output_tokens"];
}

// Strip Responses-only keys that have no chat completions equivalent
// (e.g. Codex CLI sends store, include, prompt_cache_key, web_search)
for (const char * key : {
"store", "include", "prompt_cache_key", "web_search",
"text", "truncation", "metadata",
}) {
chatcmpl_body.erase(key);
}

return chatcmpl_body;
}

Expand Down
127 changes: 127 additions & 0 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,127 @@ struct server_slot {
}
};

//
// checkpoint persistence helpers for hybrid/recurrent models
//
// Hybrid models (e.g. Qwen3.5, Jamba, Falcon-H1) use recurrent layers whose
// state cannot be partially restored from the KV cache alone. The server
// creates "context checkpoints" during prompt processing that snapshot the
// full recurrent state at regular intervals. These checkpoints live in
// server_prompt::checkpoints and are essential to avoid a full prompt
// re-processing when the slot is reused.
//
// The built-in /slots save/restore API persists the raw KV+recurrent memory
// via llama_state_seq_{save,load}_file, but does NOT persist the checkpoint
// metadata. The two helpers below fill that gap: they write/read a small
// companion file (<filename>.checkpoints) next to the main slot save file.
//
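// File format (binary, host byte order: fields are written with raw fwrite,
// so the file is little-endian only on little-endian hosts and is not portable
// across machines of different endianness):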
// uint32 magic = 0x4C4C4350 ("LLCP")
// uint32 version = 1
// uint32 n_checkpoints
// For each checkpoint:
// int32 pos_min
// int32 pos_max
// int64 n_tokens
// uint64 data_size
// uint8 data[data_size]
//

// Write the context checkpoints of a slot to the companion file
// "<filepath>.checkpoints" (header: magic, version, count; then per checkpoint:
// pos_min, pos_max, n_tokens, data_size and the raw data bytes).
//
// filepath    - path of the main slot save file; ".checkpoints" is appended to it
// checkpoints - checkpoints to persist, written in list order
//
// Returns true when the companion file matches `checkpoints` afterwards (this
// includes the "no checkpoints" case), false when the file could not be opened
// or fully written. A partially written file is removed before returning false.
static bool slot_checkpoints_save(const std::string & filepath,
                                  const std::list<server_prompt_checkpoint> & checkpoints) {
    const std::string cp_path = filepath + ".checkpoints";

    if (checkpoints.empty()) {
        // Nothing to persist. Still remove a sidecar left behind by an earlier
        // save to the same filename: otherwise a later restore would load stale
        // checkpoint metadata that belongs to a different KV snapshot.
        // The result is ignored on purpose - usually there is no file to remove.
        std::remove(cp_path.c_str());
        return true;
    }

    FILE * fp = fopen(cp_path.c_str(), "wb");
    if (!fp) {
        SRV_WRN("failed to open checkpoint file for writing: %s\n", cp_path.c_str());
        return false;
    }

    const uint32_t magic   = 0x4C4C4350;
    const uint32_t version = 1;
    const uint32_t n_cp    = (uint32_t) checkpoints.size();

    // `ok` short-circuits: after the first failed write nothing else is written
    bool ok = true;
    ok = ok && fwrite(&magic,   sizeof(magic),   1, fp) == 1;
    ok = ok && fwrite(&version, sizeof(version), 1, fp) == 1;
    ok = ok && fwrite(&n_cp,    sizeof(n_cp),    1, fp) == 1;

    for (const auto & cp : checkpoints) {
        const uint64_t data_size = cp.data.size();
        ok = ok && fwrite(&cp.pos_min,  sizeof(cp.pos_min),  1, fp) == 1;
        ok = ok && fwrite(&cp.pos_max,  sizeof(cp.pos_max),  1, fp) == 1;
        ok = ok && fwrite(&cp.n_tokens, sizeof(cp.n_tokens), 1, fp) == 1;
        ok = ok && fwrite(&data_size,   sizeof(data_size),   1, fp) == 1;
        if (data_size > 0) {
            ok = ok && fwrite(cp.data.data(), 1, data_size, fp) == data_size;
        }
    }

    // fclose flushes the stdio buffer, so a failure here (e.g. disk full) means
    // the file on disk may be incomplete even though every fwrite succeeded.
    // It must run unconditionally to release the handle.
    const bool closed = fclose(fp) == 0;
    ok = ok && closed;

    if (!ok) {
        SRV_WRN("failed to write checkpoint data to %s\n", cp_path.c_str());
        std::remove(cp_path.c_str());
        return false;
    }

    SRV_INF("saved %u context checkpoints to %s\n", n_cp, cp_path.c_str());
    return true;
}

static bool slot_checkpoints_load(const std::string & filepath,
std::list<server_prompt_checkpoint> & checkpoints) {
const std::string cp_path = filepath + ".checkpoints";
FILE * fp = fopen(cp_path.c_str(), "rb");
if (!fp) {
return true; // no checkpoint file is not an error
}

uint32_t magic = 0, version = 0, n_cp = 0;
bool ok = true;
ok = ok && fread(&magic, sizeof(magic), 1, fp) == 1;
ok = ok && fread(&version, sizeof(version), 1, fp) == 1;
ok = ok && fread(&n_cp, sizeof(n_cp), 1, fp) == 1;

if (!ok || magic != 0x4C4C4350 || version != 1) {
SRV_WRN("invalid checkpoint file header: %s\n", cp_path.c_str());
fclose(fp);
return false;
}

checkpoints.clear();

for (uint32_t i = 0; i < n_cp && ok; i++) {
server_prompt_checkpoint cp;
uint64_t data_size = 0;
ok = ok && fread(&cp.pos_min, sizeof(cp.pos_min), 1, fp) == 1;
ok = ok && fread(&cp.pos_max, sizeof(cp.pos_max), 1, fp) == 1;
ok = ok && fread(&cp.n_tokens, sizeof(cp.n_tokens), 1, fp) == 1;
ok = ok && fread(&data_size, sizeof(data_size), 1, fp) == 1;
if (ok && data_size > 0) {
cp.data.resize(data_size);
ok = ok && fread(cp.data.data(), 1, data_size, fp) == data_size;
}
if (ok) {
checkpoints.push_back(std::move(cp));
}
}

fclose(fp);

if (!ok) {
SRV_WRN("failed to read checkpoint data from %s\n", cp_path.c_str());
checkpoints.clear();
return false;
}

SRV_INF("restored %u context checkpoints from %s\n", n_cp, cp_path.c_str());
return true;
}


//
Expand Down Expand Up @@ -1822,6 +1943,9 @@ struct server_context_impl {
const llama_tokens & tokens = slot->prompt.tokens.get_text_tokens();
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id, tokens.data(), token_count);

// persist context checkpoints alongside the slot state
slot_checkpoints_save(filepath, slot->prompt.checkpoints);

const int64_t t_end = ggml_time_us();
const double t_save_ms = (t_end - t_start) / 1000.0;

Expand Down Expand Up @@ -1869,6 +1993,9 @@ struct server_context_impl {
slot->prompt.tokens.clear();
slot->prompt.tokens.insert(tokens);

// restore context checkpoints if a companion file exists
slot_checkpoints_load(filepath, slot->prompt.checkpoints);

const int64_t t_end = ggml_time_us();
const double t_restore_ms = (t_end - t_start) / 1000.0;

Expand Down
Loading