Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ assets/docker.png -filter -diff -merge -text
*.jpeg filter=lfs diff=lfs merge=lfs -text
*.mp4 filter=lfs diff=lfs merge=lfs -text
*.webm filter=lfs diff=lfs merge=lfs -text
*.gguf filter=lfs diff=lfs merge=lfs -text
24 changes: 24 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ jobs:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
with:
submodules: recursive
lfs: true
token: ${{ secrets.SUBMODULE_PAT || secrets.GITHUB_TOKEN }}

- uses: Jimver/cuda-toolkit@v0.2.35
Expand All @@ -48,6 +49,14 @@ jobs:
# uv reads .python-version (3.12, matching the previous CI) and downloads the matching
# interpreter; no separate setup-python step needed.

- name: Install system build deps
# libcurl4-openssl-dev is required by server/src/server/http_server.cpp
# which #includes <curl/curl.h>; the replay_http_server target links
# against libcurl for upstream proxy support.
run: |
sudo apt-get update
sudo apt-get install -y libcurl4-openssl-dev

- name: Build dflash (smoke + server)
run: |
cd server
Expand All @@ -58,6 +67,7 @@ jobs:
-DCMAKE_BUILD_TYPE=Release
cmake --build build --target \
test_dflash test_generate test_flash_attn_sparse \
replay_http_server \
-j$(nproc)

# Server unit tests require libcurl-dev; skipped when CURL is absent.
Expand All @@ -73,6 +83,20 @@ jobs:
# in the optional `megakernel` extra so its build does NOT run yet.
run: uv sync --frozen

- name: Run CPU integration tests (stub backend, no GPU)
# End-to-end exercise of HttpServer + render_chat_template +
# SseEmitter with a deterministic stub model backend. No GPU
# required: the replay driver runs under CUDA_VISIBLE_DEVICES=""
# and the tokenizer fixture is a stripped Qwen3.6 GGUF (metadata
# only). Covers streaming and non-streaming, OpenAI and
# Anthropic formats — the same regression class previously only
# caught by full-image smoke tests.
env:
CUDA_VISIBLE_DEVICES: ""
run: |
uv run --frozen --with pytest --with requests \
pytest -v server/test/test_stub_integration.py

- name: Build megakernel via uv sync (sm_75)
env:
CUDA_HOME: ${{ env.CUDA_PATH }}
Expand Down
44 changes: 44 additions & 0 deletions server/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -868,6 +868,50 @@ if(DFLASH27B_TESTS)
endif()
endif()

# ─── replay_http_server: CPU-only HttpServer test driver ────────────
# Wires Tokenizer + ScenarioStore + StubModelBackend + HttpServer to
# serve real HTTP requests on the wire, replaying scripted token
# streams from JSON scenario files. Links dflash_common (which
# includes CUDA-compiled TUs) but never instantiates a real
# ModelBackend, so CUDA_VISIBLE_DEVICES="" is supported. Driven by
# test_stub_integration.py.
#
# The target is only defined when ALL of the following hold:
#   * test/replay_http_server.cpp exists in this source tree,
#   * libcurl development files are found (CURL_FOUND),
#   * a threading library is found (Threads_FOUND).
# Otherwise the target is silently skipped so the rest of the build is
# not blocked by the test rig.
if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/replay_http_server.cpp")
  # http_server.cpp #includes <curl/curl.h> for its upstream-proxy
  # passthrough; replay_http_server compiles that TU so it must link
  # libcurl even though the stub backend itself doesn't use it. Skip
  # the target on hosts without libcurl-dev rather than fail configure
  # — the rest of the build (server unit tests, dflash_server, etc.)
  # has its own CURL gating and shouldn't be blocked by the test rig.
  find_package(CURL)
  # Resolve the platform thread library as an imported target instead
  # of hard-coding the `pthread` library name on the link line. Not
  # REQUIRED, for the same "skip rather than fail" reason as CURL.
  find_package(Threads)

  if(CURL_FOUND AND Threads_FOUND)
    add_executable(replay_http_server
      test/replay_http_server.cpp
      test/scenario_store.cpp
      test/stub_model_backend.cpp
      src/server/http_server.cpp
      src/server/model_card.cpp
      src/server/prompt_normalize.cpp)

    target_include_directories(replay_http_server PRIVATE
      ${DFLASH27B_SRC_INCLUDE_DIRS}
      ${CMAKE_CURRENT_SOURCE_DIR}/test
      ${CURL_INCLUDE_DIRS})

    # Backend selection macros: HIP when explicitly requested,
    # otherwise the CUDA definitions (including the minimum SM value).
    if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
      target_compile_definitions(replay_http_server PRIVATE
        DFLASH27B_BACKEND_HIP=1
        GGML_USE_HIP)
    else()
      target_compile_definitions(replay_http_server PRIVATE
        DFLASH27B_BACKEND_CUDA=1
        DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
    endif()

    target_link_libraries(replay_http_server PRIVATE
      dflash_common
      ggml
      ${DFLASH27B_GGML_BACKEND_TARGET}
      Threads::Threads
      CURL::libcurl)

    # GPU runtime library: cudart for the CUDA backend, the HIP host
    # runtime for anything else.
    if(DFLASH27B_GPU_BACKEND STREQUAL "cuda")
      find_package(CUDAToolkit REQUIRED)
      target_link_libraries(replay_http_server PRIVATE CUDA::cudart)
    else()
      target_link_libraries(replay_http_server PRIVATE hip::host)
    endif()
  endif()
endif()

# ─── Unit tests (no GPU, no model files) ────────────────────────────
enable_testing()

Expand Down
76 changes: 72 additions & 4 deletions server/src/server/chat_template.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,18 @@ ChatFormat chat_format_for_arch(const std::string & arch) {
return ChatFormat::QWEN3;
}

std::string render_chat_template(
PromptRenderResult render_chat_template(
const std::vector<ChatMessage> & messages,
ChatFormat format,
bool add_generation_prompt,
bool enable_thinking,
const std::string & tools_json)
{
std::string result;
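// `started_in_thinking` is derived deterministically from the template
// branch + render flags below. It defaults to false; only the formats
// whose generation prompt pre-opens `<think>` set it to true inside the
// switch. NOTE: a newly added format that forgets to set it will
// silently keep the default (false), so wire it explicitly per format.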
bool started_in_thinking = false;
bool has_tools = !tools_json.empty() && tools_json != "[]" && tools_json != "null";

switch (format) {
Expand Down Expand Up @@ -141,6 +145,14 @@ std::string render_chat_template(
// even when the client opts in, defeating the thinking-budget
// mechanism entirely.
result += "<think>\n";
// The prompt suffix pre-opens `<think>` — the model's very
// first generated token is reasoning, never preceded by an
// explicit `<think>` opener in the stream. Callers must
// start the SSE state machine in REASONING mode and pass
// `started_in_thinking=true` to parse_reasoning() so that
// reasoning text routes to reasoning_content instead of
// leaking into content.
started_in_thinking = true;
}
}
break;
Expand Down Expand Up @@ -224,6 +236,11 @@ std::string render_chat_template(
result += "<assistant>\n";
if (enable_thinking) {
result += "<think>";
// Same situation as Qwen3.6: Laguna XS.2's enable_thinking
// generation prompt ends with `<think>` so the model starts
// emitting reasoning tokens with no explicit opener in the
// stream. Route subsequent tokens to the reasoning channel.
started_in_thinking = true;
} else {
// Empty think block — model jumps straight to answer.
result += "</think>";
Expand Down Expand Up @@ -311,11 +328,17 @@ std::string render_chat_template(
result += "<|channel>thought\n<channel|>";
}
}
// Gemma4 does NOT pre-open `<think>` from the prompt; its
// reasoning channel is opened by the model emitting `<|channel>`
// which http_server forwards into the SseEmitter as the text
// `<think>` — so the emitter's existing CONTENT→REASONING
// transition fires on that synthesized opener. started_in_thinking
// stays false (initial CONTENT mode is correct).
break;
}
}

return result;
return PromptRenderResult{std::move(result), started_in_thinking};
}

// ─── Jinja path ─────────────────────────────────────────────────────────
Expand Down Expand Up @@ -353,7 +376,29 @@ static std::shared_ptr<jinja::program> get_or_parse(const std::string & template

} // namespace

std::string render_chat_template_jinja(
// Reports whether a rendered prompt finishes with a `<think>` opener.
//
// Trailing ASCII whitespace (space, '\n', '\r', '\t') is ignored, so a
// prompt ending in `<think>\n` still counts. Returns false for an empty
// or whitespace-only string, and for any string whose last
// non-whitespace characters are not exactly `<think>`.
//
// A true result tells the caller that the prompt itself has already
// opened the reasoning block, so the tokens that follow should be
// routed to the reasoning channel.
static bool prompt_ends_with_think_open(const std::string & s) {
    static const char kThinkOpen[] = "<think>";
    const size_t open_len = sizeof(kThinkOpen) - 1;  // exclude the NUL

    // Index of the last character that is not trailing whitespace.
    const size_t last = s.find_last_not_of(" \n\r\t");
    if (last == std::string::npos) {
        return false;  // empty, or nothing but whitespace
    }

    const size_t trimmed_len = last + 1;
    if (trimmed_len < open_len) {
        return false;  // too short to contain the opener
    }
    return s.compare(trimmed_len - open_len, open_len, kThinkOpen) == 0;
}

PromptRenderResult render_chat_template_jinja(
const std::string & template_src,
const std::vector<ChatMessage> & messages,
const std::string & bos_token,
Expand Down Expand Up @@ -407,14 +452,37 @@ std::string render_chat_template_jinja(
throw std::runtime_error(std::string("jinja global_from_json: ") + e.what());
}

std::string rendered;
try {
jinja::runtime rt(ctx);
jinja::value results = rt.execute(*prog);
auto parts = jinja::runtime::gather_string_parts(results);
return parts->as_string().str();
rendered = parts->as_string().str();
} catch (const std::exception & e) {
throw std::runtime_error(std::string("jinja runtime: ") + e.what());
}

// Jinja path: we don't know which template family the caller passed
// in, so derive `started_in_thinking` by sniffing the rendered tail
// for a `<think>` opener. This catches the common Qwen3.6 / Laguna
// chat templates that end with `<think>\n` when enable_thinking is
// honored, plus any custom template that follows the same convention.
//
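// Warn whenever sniffing decides true so the inferred reasoning mode
// is visible in server logs. Because the result is gated on
// enable_thinking && add_generation_prompt, a template that hard-codes
// `<think>` while enable_thinking=false yields false here and is NOT
// logged by this warning.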
bool started_in_thinking =
enable_thinking && add_generation_prompt &&
prompt_ends_with_think_open(rendered);
if (started_in_thinking) {
std::fprintf(stderr,
"[WARN] render_chat_template_jinja: rendered prompt ends with "
"`<think>` opener — treating as started_in_thinking=true. If "
"this is unexpected, check the template's enable_thinking "
"branch or the model card's reasoning configuration.\n");
}

return PromptRenderResult{std::move(rendered), started_in_thinking};
}

} // namespace dflash::common
21 changes: 19 additions & 2 deletions server/src/server/chat_template.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,23 @@ enum class ChatFormat {
GEMMA4, // <bos><|turn>role\n...<turn|>\n
};

// Provenance for a rendered prompt. `text` is the byte string that gets
// tokenized; `started_in_thinking` records whether the prompt suffix
// pre-opens a `<think>` block (or equivalent reasoning-channel marker)
// that the model is expected to continue into.
//
// Callers route this into the SseEmitter's initial mode and into
// parse_reasoning()'s `started_in_thinking` argument so reasoning text
// emitted before any explicit `<think>` opener is still attributed to
// the reasoning channel. Without this plumbing, Qwen3.6 / Laguna
// enable_thinking prompts (which pre-open `<think>\n` in the assistant
// turn) cause the model to emit reasoning straight into the content
// channel, leaving `reasoning_content` empty.
//
// `started_in_thinking` carries a default of false so that a
// default-constructed result (e.g. `PromptRenderResult r;` declared
// before being assigned) never holds an indeterminate flag. Brace
// initialization `PromptRenderResult{text, flag}` keeps working
// (the struct remains an aggregate under C++14 and later).
struct PromptRenderResult {
    std::string text;                  // rendered prompt text, ready to tokenize
    bool started_in_thinking = false;  // prompt suffix opens reasoning channel
};

// Render chat messages into the model-specific prompt string.
// The result is plain text ready to be tokenized.
//
Expand All @@ -40,7 +57,7 @@ enum class ChatFormat {
// `tools_json` is an optional JSON string containing the tool definitions
// array. When non-empty, the Qwen3/3.5 template injects a tool preamble
// into the system message instructing the model how to emit <tool_call> tags.
std::string render_chat_template(
PromptRenderResult render_chat_template(
const std::vector<ChatMessage> & messages,
ChatFormat format,
bool add_generation_prompt = true,
Expand All @@ -67,7 +84,7 @@ ChatFormat chat_format_for_arch(const std::string & arch);
// Internally caches the most recently parsed program per thread (avoids
// re-parsing the template on every request). Throws std::runtime_error on
// lexer/parser/runtime failure (caller should surface a 500 response).
std::string render_chat_template_jinja(
PromptRenderResult render_chat_template_jinja(
const std::string & template_src,
const std::vector<ChatMessage> & messages,
const std::string & bos_token,
Expand Down
34 changes: 25 additions & 9 deletions server/src/server/http_server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1591,7 +1591,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
tools_json = req.tools.dump();
}

std::string rendered;
PromptRenderResult render_result;
if (!config_.chat_template_src.empty()) {
// Jinja path: caller supplied a chat template file via
// --chat-template-file. Override the hardcoded QWEN3/LAGUNA
Expand All @@ -1608,7 +1608,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
? tokenizer_.raw_token(tokenizer_.eos_id())
: std::string();
try {
rendered = render_chat_template_jinja(
render_result = render_chat_template_jinja(
config_.chat_template_src,
chat_msgs,
bos_str,
Expand All @@ -1622,11 +1622,18 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
return true;
}
} else {
rendered = render_chat_template(chat_msgs, chat_format_,
true, enable_thinking,
tools_json);
}
req.prompt_tokens = tokenizer_.encode(rendered);
render_result = render_chat_template(chat_msgs, chat_format_,
true, enable_thinking,
tools_json);
}
// Propagate prompt provenance so the SseEmitter's initial mode
// matches the template's pre-opened reasoning channel (Qwen3.6 /
// Laguna enable_thinking case). Without this, reasoning text
// leaks into the content channel and `reasoning_content` stays
// empty — see fix(server): route Qwen3.6/Laguna think-mode
// reasoning to reasoning_content channel.
req.started_in_thinking = render_result.started_in_thinking;
req.prompt_tokens = tokenizer_.encode(render_result.text);

// count_tokens: short-circuit after tokenization. Skip generation
// entirely — Anthropic's contract is just `{"input_tokens": N}`.
Expand Down Expand Up @@ -1770,11 +1777,20 @@ void HttpServer::worker_loop() {
}
}

// Create SSE emitter for streaming state machine.
// Create SSE emitter for streaming state machine. `initial_mode`
// tracks whether the chat-template prompt pre-opened a `<think>`
// block (Qwen3.6 / Laguna enable_thinking path). When true, the
// emitter starts in REASONING so the model's first generated
// token routes to reasoning_content even though no explicit
// `<think>` opener appears in the token stream.
const StreamMode initial_mode = req.started_in_thinking
? StreamMode::REASONING
: StreamMode::CONTENT;
SseEmitter emitter(req.format, req.response_id, req.model,
(int)req.prompt_tokens.size(), req.tools,
&tool_memory_,
req.stop_sequences);
req.stop_sequences,
initial_mode);

// Emit initial SSE events (skip when proxying).
if (req.stream && config_.pflash_upstream_base.empty()) {
Expand Down
6 changes: 6 additions & 0 deletions server/src/server/http_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,12 @@ struct ParsedRequest {
// Bandit: per-session adaptive keep_ratio opt-in
std::string session_id;
DiskPrefixCachePolicy disk_cache_policy;
// Set by the chat-template renderer when the rendered prompt suffix
// pre-opens a `<think>` block (Qwen3.6 / Laguna enable_thinking path).
// Drives the SseEmitter's initial mode so reasoning tokens emitted
// before any explicit `<think>` opener route to reasoning_content
// instead of leaking into content.
bool started_in_thinking = false;
};

// Build the /props response body. Exposed (non-static) so unit tests
Expand Down
13 changes: 10 additions & 3 deletions server/src/server/sse_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,15 +76,16 @@ SseEmitter::SseEmitter(ApiFormat format,
int prompt_tokens,
const json & tools,
ToolMemory * tool_memory,
const std::vector<std::string> & stop_sequences)
const std::vector<std::string> & stop_sequences,
StreamMode initial_mode)
: format_(format)
, request_id_(request_id)
, model_name_(model_name)
, prompt_tokens_(prompt_tokens)
, tools_(tools)
, tool_memory_(tool_memory)
, mode_(StreamMode::CONTENT)
, active_kind_("text")
, mode_(initial_mode)
, active_kind_(initial_mode == StreamMode::REASONING ? "thinking" : "text")
, stop_sequences_(stop_sequences)
, created_at_(unix_timestamp())
, msg_item_id_(gen_item_id())
Expand All @@ -93,6 +94,12 @@ SseEmitter::SseEmitter(ApiFormat format,
for (const auto & s : stop_sequences_) {
if (s.size() > stop_holdback_) stop_holdback_ = s.size();
}
// NOTE on `checked_think_prefix_`: we deliberately leave the default
// (false) here even when initial_mode == REASONING. The emitter has a
// one-time guard in emit_token() that strips a redundantly-emitted
// leading `<think>` if the model emits one anyway (model-card /
// template-mismatch edge case). Pre-setting the flag to true would
// skip that strip and leak the duplicate opener into reasoning_text.
}

// ─── SSE formatting helpers ─────────────────────────────────────────────
Expand Down
Loading
Loading