Luce-Org · easel · Jun 4, 2026 · May 29, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.gitattributes b/.gitattributes
@@ -6,3 +6,4 @@ assets/docker.png -filter -diff -merge -text
 *.jpeg filter=lfs diff=lfs merge=lfs -text
 *.mp4 filter=lfs diff=lfs merge=lfs -text
 *.webm filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -29,6 +29,7 @@ jobs:
       - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
         with:
           submodules: recursive
+          lfs: true
           token: ${{ secrets.SUBMODULE_PAT || secrets.GITHUB_TOKEN }}
 
       - uses: Jimver/cuda-toolkit@v0.2.35
@@ -48,6 +49,14 @@ jobs:
           # uv reads .python-version (3.12, matching the previous CI) and downloads the matching
           # interpreter; no separate setup-python step needed.
 
+      - name: Install system build deps
+        # libcurl4-openssl-dev is required by server/src/server/http_server.cpp
+        # which #includes <curl/curl.h>; the replay_http_server target links
+        # against libcurl for upstream proxy support.
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y libcurl4-openssl-dev
+
       - name: Build dflash (smoke + server)
         run: |
           cd server
@@ -58,6 +67,7 @@ jobs:
             -DCMAKE_BUILD_TYPE=Release
           cmake --build build --target \
             test_dflash test_generate test_flash_attn_sparse \
+            replay_http_server \
             -j$(nproc)
 
       # Server unit tests require libcurl-dev; skipped when CURL is absent.
@@ -73,6 +83,20 @@ jobs:
         # in the optional `megakernel` extra so its build does NOT run yet.
         run: uv sync --frozen
 
+      - name: Run CPU integration tests (stub backend, no GPU)
+        # End-to-end exercise of HttpServer + render_chat_template +
+        # SseEmitter with a deterministic stub model backend. No GPU
+        # required: the replay driver runs under CUDA_VISIBLE_DEVICES=""
+        # and the tokenizer fixture is a stripped Qwen3.6 GGUF (metadata
+        # only). Covers streaming and non-streaming, OpenAI and
+        # Anthropic formats — the same regression class previously only
+        # caught by full-image smoke tests.
+        env:
+          CUDA_VISIBLE_DEVICES: ""
+        run: |
+          uv run --frozen --with pytest --with requests \
+            pytest -v server/test/test_stub_integration.py
+
       - name: Build megakernel via uv sync (sm_75)
         env:
           CUDA_HOME: ${{ env.CUDA_PATH }}

diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
@@ -868,6 +868,50 @@ if(DFLASH27B_TESTS)
         endif()
     endif()
 
+    # ─── replay_http_server: CPU-only HttpServer test driver ────────────
+    # Wires Tokenizer + ScenarioStore + StubModelBackend + HttpServer to
+    # serve real HTTP requests on the wire, replaying scripted token
+    # streams from JSON scenario files. Links dflash_common (which
+    # includes CUDA-compiled TUs) but never instantiates a real
+    # ModelBackend, so CUDA_VISIBLE_DEVICES="" is supported. Driven by
+    # test_stub_integration.py.
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/replay_http_server.cpp")
+        # http_server.cpp #includes <curl/curl.h> for its upstream-proxy
+        # passthrough; replay_http_server compiles that TU so it must link
+        # libcurl even though the stub backend itself doesn't use it. Skip
+        # the target on hosts without libcurl-dev rather than fail configure
+        # — the rest of the build (server unit tests, dflash_server, etc.)
+        # has its own CURL gating and shouldn't be blocked by the test rig.
+        find_package(CURL)
+    endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/replay_http_server.cpp" AND CURL_FOUND)
+        add_executable(replay_http_server
+            test/replay_http_server.cpp
+            test/scenario_store.cpp
+            test/stub_model_backend.cpp
+            src/server/http_server.cpp
+            src/server/model_card.cpp
+            src/server/prompt_normalize.cpp)
+        target_include_directories(replay_http_server PRIVATE
+            ${DFLASH27B_SRC_INCLUDE_DIRS}
+            ${CMAKE_CURRENT_SOURCE_DIR}/test
+            ${CURL_INCLUDE_DIRS})
+        target_link_libraries(replay_http_server PRIVATE
+            dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET} pthread CURL::libcurl)
+        # One backend test drives BOTH the compile definitions and the runtime
+        # library, so they cannot disagree: any value other than "hip" is CUDA.
+        if(DFLASH27B_GPU_BACKEND STREQUAL "hip")
+            target_compile_definitions(replay_http_server PRIVATE DFLASH27B_BACKEND_HIP=1 GGML_USE_HIP)
+            target_link_libraries(replay_http_server PRIVATE hip::host)
+        else()
+            target_compile_definitions(replay_http_server PRIVATE
+                DFLASH27B_BACKEND_CUDA=1
+                DFLASH27B_CUDA_MIN_SM=${_dflash_cuda_min_sm})
+            find_package(CUDAToolkit REQUIRED)
+            target_link_libraries(replay_http_server PRIVATE CUDA::cudart)
+        endif()
+    endif()
+
     # ─── Unit tests (no GPU, no model files) ────────────────────────────
     enable_testing()
 

diff --git a/server/src/server/chat_template.cpp b/server/src/server/chat_template.cpp
@@ -51,14 +51,18 @@ ChatFormat chat_format_for_arch(const std::string & arch) {
     return ChatFormat::QWEN3;
 }
 
-std::string render_chat_template(
+PromptRenderResult render_chat_template(
     const std::vector<ChatMessage> & messages,
     ChatFormat format,
     bool add_generation_prompt,
     bool enable_thinking,
     const std::string & tools_json)
 {
     std::string result;
+    // `started_in_thinking` is derived deterministically from the template
+    // branch + render flags below. It defaults to false, so every format
+    // that pre-opens `<think>` must set it to true inside its switch case.
+    bool started_in_thinking = false;
     bool has_tools = !tools_json.empty() && tools_json != "[]" && tools_json != "null";
 
     switch (format) {
@@ -141,6 +145,14 @@ std::string render_chat_template(
                 // even when the client opts in, defeating the thinking-budget
                 // mechanism entirely.
                 result += "<think>\n";
+                // The prompt suffix pre-opens `<think>` — the model's very
+                // first generated token is reasoning, never preceded by an
+                // explicit `<think>` opener in the stream. Callers must
+                // start the SSE state machine in REASONING mode and pass
+                // `started_in_thinking=true` to parse_reasoning() so that
+                // reasoning text routes to reasoning_content instead of
+                // leaking into content.
+                started_in_thinking = true;
             }
         }
         break;
@@ -224,6 +236,11 @@ std::string render_chat_template(
             result += "<assistant>\n";
             if (enable_thinking) {
                 result += "<think>";
+                // Same situation as Qwen3.6: Laguna XS.2's enable_thinking
+                // generation prompt ends with `<think>` so the model starts
+                // emitting reasoning tokens with no explicit opener in the
+                // stream. Route subsequent tokens to the reasoning channel.
+                started_in_thinking = true;
             } else {
                 // Empty think block — model jumps straight to answer.
                 result += "</think>";
@@ -311,11 +328,17 @@ std::string render_chat_template(
                 result += "<|channel>thought\n<channel|>";
             }
         }
+        // Gemma4 does NOT pre-open `<think>` from the prompt; its
+        // reasoning channel is opened by the model emitting `<|channel>`
+        // which http_server forwards into the SseEmitter as the text
+        // `<think>` — so the emitter's existing CONTENT→REASONING
+        // transition fires on that synthesized opener. started_in_thinking
+        // stays false (initial CONTENT mode is correct).
         break;
     }
     }
 
-    return result;
+    return PromptRenderResult{std::move(result), started_in_thinking};
 }
 
 // ─── Jinja path ─────────────────────────────────────────────────────────
@@ -353,7 +376,29 @@ static std::shared_ptr<jinja::program> get_or_parse(const std::string & template
 
 }  // namespace
 
-std::string render_chat_template_jinja(
+// Report whether `s`, ignoring trailing ASCII whitespace (space, \n, \r,
+// \t), ends with the literal `<think>` opener. Whitespace is skipped so
+// a prompt ending in `<think>\n` still matches. Returns false for an
+// empty or all-whitespace string and for any string shorter than the
+// opener. Pure string inspection: no logging and no side effects.
+static bool prompt_ends_with_think_open(const std::string & s) {
+    static const std::string OPEN = "<think>";
+    // Index of the last character that is not trailing whitespace.
+    const size_t last = s.find_last_not_of(" \n\r\t");
+    if (last == std::string::npos) {
+        // Nothing but whitespace (or empty): no opener can be present.
+        return false;
+    }
+    // One past the last significant character, i.e. the trimmed length.
+    const size_t trimmed_len = last + 1;
+    if (trimmed_len < OPEN.size()) {
+        return false;
+    }
+    // Compare the tail of the trimmed range against the opener.
+    return s.compare(trimmed_len - OPEN.size(), OPEN.size(), OPEN) == 0;
+}
+
+PromptRenderResult render_chat_template_jinja(
     const std::string & template_src,
     const std::vector<ChatMessage> & messages,
     const std::string & bos_token,
@@ -407,14 +452,37 @@ std::string render_chat_template_jinja(
         throw std::runtime_error(std::string("jinja global_from_json: ") + e.what());
     }
 
+    std::string rendered;
     try {
         jinja::runtime rt(ctx);
         jinja::value results = rt.execute(*prog);
         auto parts = jinja::runtime::gather_string_parts(results);
-        return parts->as_string().str();
+        rendered = parts->as_string().str();
     } catch (const std::exception & e) {
         throw std::runtime_error(std::string("jinja runtime: ") + e.what());
     }
+
+    // Jinja path: we don't know which template family the caller passed
+    // in, so derive `started_in_thinking` by sniffing the rendered tail
+    // for a `<think>` opener. This catches the common Qwen3.6 / Laguna
+    // chat templates that end with `<think>\n` when enable_thinking is
+    // honored, plus any custom template that follows the same convention.
+    //
+    // Sniffing is gated on enable_thinking && add_generation_prompt, so a
+    // template that hard-codes `<think>` while enable_thinking=false is
+    // NOT detected here; the warning below fires only when all three hold.
+    bool started_in_thinking =
+        enable_thinking && add_generation_prompt &&
+        prompt_ends_with_think_open(rendered);
+    if (started_in_thinking) {
+        std::fprintf(stderr,
+            "[WARN] render_chat_template_jinja: rendered prompt ends with "
+            "`<think>` opener — treating as started_in_thinking=true. If "
+            "this is unexpected, check the template's enable_thinking "
+            "branch or the model card's reasoning configuration.\n");
+    }
+
+    return PromptRenderResult{std::move(rendered), started_in_thinking};
 }
 
 }  // namespace dflash::common
diff --git a/server/src/server/chat_template.h b/server/src/server/chat_template.h
@@ -27,6 +27,23 @@ enum class ChatFormat {
     GEMMA4,    // <bos><|turn>role\n...<turn|>\n
 };
 
+// Provenance for a rendered prompt. `text` is the byte string that gets
+// tokenized; `started_in_thinking` records whether the prompt suffix
+// pre-opens a `<think>` block (or equivalent reasoning-channel marker)
+// that the model is expected to continue into.
+//
+// Callers route this into the SseEmitter's initial mode and into
+// parse_reasoning()'s `started_in_thinking` argument so reasoning text
+// emitted before any explicit `<think>` opener is still attributed to
+// the reasoning channel. Without this plumbing, Qwen3.6 / Laguna
+// enable_thinking prompts (which pre-open `<think>\n` in the assistant
+// turn) cause the model to emit reasoning straight into the content
+// channel, leaving `reasoning_content` empty.
+struct PromptRenderResult {
+    std::string text;                    // rendered prompt text, ready to tokenize
+    bool started_in_thinking = false;    // prompt suffix opens reasoning channel; false unless a renderer sets it
+};
+
 // Render chat messages into the model-specific prompt string.
 // The result is plain text ready to be tokenized.
 //
@@ -40,7 +57,7 @@ enum class ChatFormat {
 // `tools_json` is an optional JSON string containing the tool definitions
 // array. When non-empty, the Qwen3/3.5 template injects a tool preamble
 // into the system message instructing the model how to emit <tool_call> tags.
-std::string render_chat_template(
+PromptRenderResult render_chat_template(
     const std::vector<ChatMessage> & messages,
     ChatFormat format,
     bool add_generation_prompt = true,
@@ -67,7 +84,7 @@ ChatFormat chat_format_for_arch(const std::string & arch);
 // Internally caches the most recently parsed program per thread (avoids
 // re-parsing the template on every request). Throws std::runtime_error on
 // lexer/parser/runtime failure (caller should surface a 500 response).
-std::string render_chat_template_jinja(
+PromptRenderResult render_chat_template_jinja(
     const std::string & template_src,
     const std::vector<ChatMessage> & messages,
     const std::string & bos_token,

diff --git a/server/src/server/http_server.cpp b/server/src/server/http_server.cpp
@@ -1591,7 +1591,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
             tools_json = req.tools.dump();
         }
 
-        std::string rendered;
+        PromptRenderResult render_result;
         if (!config_.chat_template_src.empty()) {
             // Jinja path: caller supplied a chat template file via
             // --chat-template-file. Override the hardcoded QWEN3/LAGUNA
@@ -1608,7 +1608,7 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
                 ? tokenizer_.raw_token(tokenizer_.eos_id())
                 : std::string();
             try {
-                rendered = render_chat_template_jinja(
+                render_result = render_chat_template_jinja(
                     config_.chat_template_src,
                     chat_msgs,
                     bos_str,
@@ -1622,11 +1622,18 @@ bool HttpServer::route_request(int fd, const HttpRequest & hr) {
                 return true;
             }
         } else {
-            rendered = render_chat_template(chat_msgs, chat_format_,
-                                            true, enable_thinking,
-                                            tools_json);
-        }
-        req.prompt_tokens = tokenizer_.encode(rendered);
+            render_result = render_chat_template(chat_msgs, chat_format_,
+                                                 true, enable_thinking,
+                                                 tools_json);
+        }
+        // Propagate prompt provenance so the SseEmitter's initial mode
+        // matches the template's pre-opened reasoning channel (Qwen3.6 /
+        // Laguna enable_thinking case). Without this, reasoning text
+        // leaks into the content channel and `reasoning_content` stays
+        // empty — see fix(server): route Qwen3.6/Laguna think-mode
+        // reasoning to reasoning_content channel.
+        req.started_in_thinking = render_result.started_in_thinking;
+        req.prompt_tokens = tokenizer_.encode(render_result.text);
 
         // count_tokens: short-circuit after tokenization. Skip generation
         // entirely — Anthropic's contract is just `{"input_tokens": N}`.
@@ -1770,11 +1777,20 @@ void HttpServer::worker_loop() {
             }
         }
 
-        // Create SSE emitter for streaming state machine.
+        // Create SSE emitter for streaming state machine. `initial_mode`
+        // tracks whether the chat-template prompt pre-opened a `<think>`
+        // block (Qwen3.6 / Laguna enable_thinking path). When true, the
+        // emitter starts in REASONING so the model's first generated
+        // token routes to reasoning_content even though no explicit
+        // `<think>` opener appears in the token stream.
+        const StreamMode initial_mode = req.started_in_thinking
+            ? StreamMode::REASONING
+            : StreamMode::CONTENT;
         SseEmitter emitter(req.format, req.response_id, req.model,
                            (int)req.prompt_tokens.size(), req.tools,
                            &tool_memory_,
-                           req.stop_sequences);
+                           req.stop_sequences,
+                           initial_mode);
 
         // Emit initial SSE events (skip when proxying).
         if (req.stream && config_.pflash_upstream_base.empty()) {

diff --git a/server/src/server/http_server.h b/server/src/server/http_server.h
@@ -215,6 +215,12 @@ struct ParsedRequest {
     // Bandit: per-session adaptive keep_ratio opt-in
     std::string               session_id;
     DiskPrefixCachePolicy     disk_cache_policy;
+    // Set by the chat-template renderer when the rendered prompt suffix
+    // pre-opens a `<think>` block (Qwen3.6 / Laguna enable_thinking path).
+    // Drives the SseEmitter's initial mode so reasoning tokens emitted
+    // before any explicit `<think>` opener route to reasoning_content
+    // instead of leaking into content.
+    bool                      started_in_thinking = false;
 };
 
 // Build the /props response body. Exposed (non-static) so unit tests

diff --git a/server/src/server/sse_emitter.cpp b/server/src/server/sse_emitter.cpp
@@ -76,15 +76,16 @@ SseEmitter::SseEmitter(ApiFormat format,
                        int prompt_tokens,
                        const json & tools,
                        ToolMemory * tool_memory,
-                       const std::vector<std::string> & stop_sequences)
+                       const std::vector<std::string> & stop_sequences,
+                       StreamMode initial_mode)
     : format_(format)
     , request_id_(request_id)
     , model_name_(model_name)
     , prompt_tokens_(prompt_tokens)
     , tools_(tools)
     , tool_memory_(tool_memory)
-    , mode_(StreamMode::CONTENT)
-    , active_kind_("text")
+    , mode_(initial_mode)
+    , active_kind_(initial_mode == StreamMode::REASONING ? "thinking" : "text")
     , stop_sequences_(stop_sequences)
     , created_at_(unix_timestamp())
     , msg_item_id_(gen_item_id())
@@ -93,6 +94,12 @@ SseEmitter::SseEmitter(ApiFormat format,
     for (const auto & s : stop_sequences_) {
         if (s.size() > stop_holdback_) stop_holdback_ = s.size();
     }
+    // NOTE on `checked_think_prefix_`: we deliberately leave the default
+    // (false) here even when initial_mode == REASONING. The emitter has a
+    // one-time guard in emit_token() that strips a redundantly-emitted
+    // leading `<think>` if the model emits one anyway (model-card /
+    // template-mismatch edge case). Pre-setting the flag to true would
+    // skip that strip and leak the duplicate opener into reasoning_text.
 }
 
 // ─── SSE formatting helpers ─────────────────────────────────────────────