Skip to content
Open
35 changes: 33 additions & 2 deletions server/src/server/chat_template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,8 @@ std::string render_chat_template_jinja(
const std::string & eos_token,
bool add_generation_prompt,
bool enable_thinking,
const std::string & tools_json)
const std::string & tools_json,
ChatFormat arch_hint)
{
if (template_src.empty()) {
throw std::runtime_error("render_chat_template_jinja: template_src is empty");
Expand Down Expand Up @@ -411,7 +412,37 @@ std::string render_chat_template_jinja(
jinja::runtime rt(ctx);
jinja::value results = rt.execute(*prog);
auto parts = jinja::runtime::gather_string_parts(results);
return parts->as_string().str();
std::string rendered = parts->as_string().str();

// Qwen3/3.5/3.6 only: the hard-coded renderer appends a closed think
// prefill when thinking is disabled. Some Qwen3.6 Jinja templates omit
// that final assistant suffix, leaving the model in the wrong decoding
// state for tool use. Mirror the hard-coded behavior here when the
// rendered prompt ends with a bare assistant generation prompt.
// Other architectures (Laguna, Gemma4, ...) do not use ChatML tokens
// and must not be touched here.
if (arch_hint == ChatFormat::QWEN3 && !enable_thinking) {
// Tolerate template variants that emit extra trailing whitespace
// after the assistant marker (single \n, double \n\n, trailing
// space). Strategy: trim trailing whitespace, check for the BARE
// assistant marker (no newline), then re-emit marker + prefill.
static constexpr char kAssistantBare[] = "<|im_start|>assistant";
static constexpr char kAssistantPrefill[] = "<|im_start|>assistant\n<think>\n\n</think>\n\n";
size_t trim_end = rendered.size();
while (trim_end > 0) {
char c = rendered[trim_end - 1];
if (c != ' ' && c != '\t' && c != '\n' && c != '\r') break;
--trim_end;
}
const size_t blen = sizeof(kAssistantBare) - 1;
if (trim_end >= blen &&
rendered.compare(trim_end - blen, blen, kAssistantBare) == 0) {
rendered.resize(trim_end - blen);
rendered += kAssistantPrefill;
}
}

return rendered;
} catch (const std::exception & e) {
throw std::runtime_error(std::string("jinja runtime: ") + e.what());
}
Expand Down
5 changes: 4 additions & 1 deletion server/src/server/chat_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ ChatFormat chat_format_for_arch(const std::string & arch);
// {{bos_token}} / {{eos_token}}). Use empty strings if unknown.
// `tools_json` optional JSON array of tool definitions; when non-empty it
// is parsed and injected as `tools` into the template context.
// `arch_hint` model architecture (controls arch-specific post-processing;
// the closed-think prefill injection is Qwen3/3.5/3.6 only).
//
// Internally caches the most recently parsed program per thread (avoids
// re-parsing the template on every request). Throws std::runtime_error on
Expand All @@ -74,6 +76,7 @@ std::string render_chat_template_jinja(
const std::string & eos_token,
bool add_generation_prompt = true,
bool enable_thinking = false,
const std::string & tools_json = "");
const std::string & tools_json = "",
ChatFormat arch_hint = ChatFormat::QWEN3);

} // namespace dflash::common
156 changes: 153 additions & 3 deletions server/src/server/http_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,155 @@ std::string render_tool_call_xml(const std::string & name, const json & argument
return out;
}

// Keys that the Unsloth Jinja template's render_extra_keys macro would expand into
// XML tags, polluting the rendered prompt (e.g. <$schema>, <additionalProperties>).
// scrub_schema_metadata() erases these from every schema object it visits
// (it descends through "properties", "items", "oneOf"/"anyOf"/"allOf" and
// "not") before the template sees the tool definitions.
// The keys are erased outright: a "$ref" is dropped, not resolved against
// "$defs"/"definitions", so a sub-schema that consisted solely of a reference
// becomes an empty object.
static const std::vector<std::string> k_schema_metadata_keys = {
"$schema", "additionalProperties", "$defs", "$ref", "definitions"
};

// Strip JSON-Schema metadata keys from a schema node and from the sub-schemas
// nested inside it.
//
// Only the keys listed in k_schema_metadata_keys are removed; every other key
// (type, properties, required, enum, items, ...) survives. Non-object input is
// returned unchanged.
//
// Sub-schemas are looked for in:
//   - each value of "properties"
//   - "items" and "not" when they hold a single schema object
//   - each element of "oneOf" / "anyOf" / "allOf", of array-form "items"
//     (tuple validation) and of "prefixItems"
//
// `schema` is taken by value and returned, so callers may pass a const node.
static json scrub_schema_metadata(json schema) {
    if (!schema.is_object()) return schema;

    for (const auto & key : k_schema_metadata_keys) {
        schema.erase(key);
    }

    // Scrub a child node where it sits. Moving it through the by-value
    // parameter avoids deep-copying the whole sub-tree at every nesting level.
    const auto scrub_in_place = [](json & node) {
        node = scrub_schema_metadata(std::move(node));
    };

    // Object whose values are schemas.
    if (schema.contains("properties") && schema["properties"].is_object()) {
        for (auto & [prop_name, prop_schema] : schema["properties"].items()) {
            scrub_in_place(prop_schema);
        }
    }

    // Keywords that hold one schema object.
    for (const char * keyword : {"items", "not"}) {
        if (schema.contains(keyword) && schema[keyword].is_object()) {
            scrub_in_place(schema[keyword]);
        }
    }

    // Keywords that hold an array of schemas. Claude tool defs frequently use
    // the combinators for polymorphic parameter types; without recursion the
    // inner sub-schemas keep their $schema/additionalProperties noise.
    for (const char * keyword : {"oneOf", "anyOf", "allOf", "items", "prefixItems"}) {
        if (schema.contains(keyword) && schema[keyword].is_array()) {
            for (auto & sub : schema[keyword]) {
                scrub_in_place(sub);
            }
        }
    }

    return schema;
}

// Maximum bytes kept from any tool or parameter description before truncation.
// The appended ellipsis is not counted, so a truncated result may be up to
// three bytes longer than this.
static constexpr size_t kMaxToolDescriptionChars = 500;

// Shorten a description that is longer than kMaxToolDescriptionChars bytes;
// anything at or under the cap is returned unchanged.
//
// Cut point, in priority order:
//   1. the first paragraph break ("\n\n") that follows real content and lies
//      before the cap;
//   2. the last ". " inside the first kMaxToolDescriptionChars bytes (the
//      period is kept, the space is dropped);
//   3. a hard cut at the cap, moved back so a UTF-8 multibyte sequence is
//      never split.
// U+2026 (…, 3 UTF-8 bytes) is appended at the cut point.
static std::string truncate_description(const std::string & s) {
    if (s.size() <= kMaxToolDescriptionChars) return s;

    static constexpr char kEllipsis[] = "\xE2\x80\xA6";

    // 1. Paragraph break before the cap. The search starts at the first
    //    non-whitespace byte: a description that opens with a blank line has
    //    "\n\n" at offset 0, and cutting there would discard the entire text.
    //    (If the string is all whitespace the start is npos and find() simply
    //    reports no match.)
    const size_t content_start = s.find_first_not_of(" \t\r\n");
    const size_t para = s.find("\n\n", content_start);
    if (para != std::string::npos && para < kMaxToolDescriptionChars) {
        return s.substr(0, para) + kEllipsis;
    }

    // 2. Last sentence end inside the capped prefix.
    const std::string_view head(s.data(), kMaxToolDescriptionChars);
    const size_t dot = head.rfind(". ");
    if (dot != std::string_view::npos) {
        // Include the period; cut before the trailing space.
        return s.substr(0, dot + 1) + kEllipsis;
    }

    // 3. Hard cut. s[cut] is the first byte that would be dropped; while it is
    //    a UTF-8 continuation byte (0x80–0xBF) the cut sits inside a multibyte
    //    sequence, so step back until it lands on a lead or ASCII byte.
    //    s[cut] is always in range here because s.size() > cap.
    size_t cut = kMaxToolDescriptionChars;
    while (cut > 0 && (static_cast<unsigned char>(s[cut]) & 0xC0) == 0x80) {
        --cut;
    }
    return s.substr(0, cut) + kEllipsis;
}

// Run truncate_description over the "description" string of each direct entry
// in params["properties"] and return the adjusted copy.
//
// `params` is received by value, so the caller's object is left untouched.
// Input that is not an object, or whose "properties" is missing or not an
// object, comes back unchanged. Only the first level of properties is
// visited; non-string descriptions are left as they are.
static json truncate_parameter_descriptions(json params) {
    const bool has_property_map = params.is_object() &&
                                  params.contains("properties") &&
                                  params["properties"].is_object();
    if (has_property_map) {
        for (auto & [field_name, field_schema] : params["properties"].items()) {
            if (!field_schema.is_object()) continue;
            if (!field_schema.contains("description")) continue;

            json & text = field_schema["description"];
            if (text.is_string()) {
                text = truncate_description(text.get<std::string>());
            }
        }
    }
    return params;
}

// Normalize a tools array to the OpenAI/Qwen3 shape:
//   {"type":"function","function":{"name":..., "description":..., "parameters":...}}
//
// Recognized element shapes:
//   - OpenAI:    already wrapped ("type" == "function" plus "function"); kept,
//                with its description and parameters cleaned.
//   - Anthropic: top-level "input_schema", which becomes "parameters".
//   - Bare Qwen: top-level "name" + "parameters", no wrapper.
// Anything else (non-object elements, unknown shapes) is passed through
// unchanged, and a non-array `tools` is returned as-is.
//
// Cleaning means: scrub the JSON-Schema metadata keys that the Unsloth Jinja
// template would render as garbage XML tags (causing the model to hallucinate
// function names like <function=cls>), and truncate function and parameter
// descriptions to kMaxToolDescriptionChars so prescriptive recipes embedded in
// long descriptions do not leak into the prompt.
json normalize_tools_for_qwen(const json & tools) {
    if (!tools.is_array()) return tools;

    // "description" as a string, or "" when it is absent or not a string.
    // json::value("description", "") would throw a type_error for a present
    // but non-string value such as null, so the type is checked explicitly —
    // the same guard the OpenAI branch below applies.
    const auto description_or_empty = [](const json & tool) -> std::string {
        if (tool.contains("description") && tool["description"].is_string()) {
            return tool["description"].get<std::string>();
        }
        return std::string();
    };

    // Build the OpenAI envelope for a flat tool whose schema is stored under
    // `schema_key` ("input_schema" or "parameters").
    const auto wrap_as_function = [&](const json & tool, const char * schema_key) -> json {
        return json{
            {"type", "function"},
            {"function", {
                {"name", tool.value("name", "")},
                {"description", truncate_description(description_or_empty(tool))},
                {"parameters", truncate_parameter_descriptions(
                    scrub_schema_metadata(tool[schema_key]))}
            }}
        };
    };

    json out = json::array();
    for (const auto & elem : tools) {
        if (!elem.is_object()) {
            out.push_back(elem);
            continue;
        }

        // Already OpenAI shape: scrub metadata, truncate descriptions, pass through.
        if (elem.contains("type") && elem["type"] == "function" && elem.contains("function")) {
            json e = elem;
            json & fn = e["function"];
            if (fn.contains("description") && fn["description"].is_string()) {
                fn["description"] = truncate_description(fn["description"].get<std::string>());
            }
            if (fn.contains("parameters")) {
                fn["parameters"] = truncate_parameter_descriptions(
                    scrub_schema_metadata(fn["parameters"]));
            }
            out.push_back(std::move(e));
            continue;
        }

        // Anthropic shape: input_schema → parameters (scrubbed + truncated).
        if (elem.contains("input_schema")) {
            out.push_back(wrap_as_function(elem, "input_schema"));
            continue;
        }

        // Bare Qwen shape: top-level name + parameters, no wrapper.
        if (elem.contains("name") && elem.contains("parameters")) {
            out.push_back(wrap_as_function(elem, "parameters"));
            continue;
        }

        // Unknown shape: pass through unchanged.
        out.push_back(elem);
    }
    return out;
}

std::vector<ChatMessage> normalize_chat_messages(
const json & messages,
ApiFormat format,
Expand Down Expand Up @@ -777,9 +926,9 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
req.sampler.rep_window = body["rep_window"].get<int>();
}

// Tools.
// Tools — normalize Anthropic/bare-Qwen shape to OpenAI envelope.
if (body.contains("tools")) {
req.tools = body["tools"];
req.tools = normalize_tools_for_qwen(body["tools"]);
}
// Tool choice constraint for hint generation.
if (body.contains("tool_choice")) {
Expand Down Expand Up @@ -1000,7 +1149,8 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
eos_str,
/*add_generation_prompt=*/true,
enable_thinking,
tools_json);
tools_json,
chat_format_);
} catch (const std::exception & e) {
send_error(fd, 500,
std::string("chat template (jinja) render failed: ") + e.what());
Expand Down
18 changes: 17 additions & 1 deletion server/src/server/sse_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ static const char THINK_CLOSE[] = "</think>";
// Opening markers that find_tool_start() compares against the text at a
// candidate position; a match on any of them is reported as a tool start.
static const char TOOL_OPEN[] = "<tool_call>";
static const char FUNCTION_OPEN[] = "<function=";
static const char TOOL_CODE_OPEN[] = "<tool_code>";
// Native claude-code XML tool tags — the model uses these directly when
// its system prompt teaches the <bash>CMD</bash> format.
// NOTE(review): these are short, generic tag names matched by a plain prefix
// compare; confirm that ordinary output containing e.g. "<ls>" or "<read>"
// cannot be mistaken for a tool call.
static const char BASH_OPEN[] = "<bash>";
static const char READ_OPEN[] = "<read>";
static const char WRITE_OPEN[] = "<write>";
static const char EDIT_OPEN[] = "<edit>";
static const char LS_OPEN[] = "<ls>";
static const char GREP_OPEN[] = "<grep>";
static const char GLOB_OPEN[] = "<glob>";
// Byte lengths of the think tags without the terminating NUL: 8 matches
// "</think>"; 7 presumably matches "<think>" (THINK_OPEN is declared outside
// this view — confirm).
static constexpr size_t THINK_OPEN_LEN = 7;
static constexpr size_t THINK_CLOSE_LEN = 8;

Expand All @@ -28,7 +37,14 @@ static bool find_tool_start(const std::string & text, size_t & pos) {
while (idx != std::string::npos) {
if (text.compare(idx, sizeof(TOOL_OPEN) - 1, TOOL_OPEN) == 0 ||
text.compare(idx, sizeof(FUNCTION_OPEN) - 1, FUNCTION_OPEN) == 0 ||
text.compare(idx, sizeof(TOOL_CODE_OPEN) - 1, TOOL_CODE_OPEN) == 0) {
text.compare(idx, sizeof(TOOL_CODE_OPEN) - 1, TOOL_CODE_OPEN) == 0 ||
text.compare(idx, sizeof(BASH_OPEN) - 1, BASH_OPEN) == 0 ||
text.compare(idx, sizeof(READ_OPEN) - 1, READ_OPEN) == 0 ||
text.compare(idx, sizeof(WRITE_OPEN) - 1, WRITE_OPEN) == 0 ||
text.compare(idx, sizeof(EDIT_OPEN) - 1, EDIT_OPEN) == 0 ||
text.compare(idx, sizeof(LS_OPEN) - 1, LS_OPEN) == 0 ||
text.compare(idx, sizeof(GREP_OPEN) - 1, GREP_OPEN) == 0 ||
text.compare(idx, sizeof(GLOB_OPEN) - 1, GLOB_OPEN) == 0) {
pos = idx;
return true;
}
Expand Down
Loading
Loading