Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions config.default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,12 @@ embedding:
# ZETTELFORGE_LLM_MAX_RETRIES=2
# ZETTELFORGE_LLM_FALLBACK=ollama
# ZETTELFORGE_LLM_LOCAL_BACKEND=onnxruntime-genai # only when provider=local
# ZETTELFORGE_LLM_MAX_TOKENS_CAUSAL=8000
# ZETTELFORGE_LLM_MAX_TOKENS_SYNTHESIS=2500
# ZETTELFORGE_LLM_MAX_TOKENS_EXTRACTION=2500
# ZETTELFORGE_LLM_MAX_TOKENS_NER=2500
# ZETTELFORGE_LLM_MAX_TOKENS_EVOLVE=2500
# ZETTELFORGE_LLM_REASONING_MODEL=false
#
llm:
provider: ollama
Expand All @@ -218,6 +224,12 @@ llm:
max_retries: 2
fallback: ""
local_backend: llama-cpp-python # used when provider=local (RFC-011)
max_tokens_causal: 8000 # note_constructor.extract_causal_triples
max_tokens_synthesis: 2500 # synthesis_generator._generate_synthesis
max_tokens_extraction: 2500 # fact_extractor.extract
max_tokens_ner: 2500 # entity_indexer.extract_llm
max_tokens_evolve: 2500 # memory_evolver.evaluate_evolution
reasoning_model: false # auto-scale timeout + budgets for thinking models
extra: {}


Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ ignore = [
"src/zettelforge/__init__.py" = ["F401", "E402"] # re-exports and conditional imports
# S104: web host binding to 0.0.0.0 is intentional (RFC-015 web UI)
"src/zettelforge/config.py" = ["S104"]
# S101: assert is standard pytest convention in test files
# RUF012: mutable class attributes (POSITIVE/NEGATIVE lists) are test fixtures, not production bugs
"tests/*" = ["S101", "RUF012"]
# T20 (print) is forbidden in production code (GOV-003), but every CLI
# entrypoint legitimately uses print() for stdout output. The patterns
# below scope T20 to library code only.
Expand Down
89 changes: 88 additions & 1 deletion src/zettelforge/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,17 @@ class LLMConfig:
max_retries: int = 2
fallback: str = "" # empty preserves implicit local→ollama fallback
local_backend: str = "llama-cpp-python" # RFC-011: "llama-cpp-python" or "onnxruntime-genai"
# Per-call-site max_tokens budgets (RFC-125). Each maps to a specific
# generate() call site so operators can tune without editing source.
max_tokens_causal: int = 8000 # note_constructor.extract_causal_triples
max_tokens_synthesis: int = 2500 # synthesis_generator._generate_synthesis
max_tokens_extraction: int = 2500 # fact_extractor.extract
max_tokens_ner: int = 2500 # entity_indexer.extract_llm
max_tokens_evolve: int = 2500 # memory_evolver.evaluate_evolution
# RFC-125: reasoning-model auto-scaling flag. When True, timeout is
# auto-scaled to >= 180s and each per-call-site budget is bumped so
# thinking tokens do not exhaust the limit before JSON output.
reasoning_model: bool = False
extra: dict[str, Any] = field(default_factory=dict)

# Keys under ``extra`` that are commonly used for secrets. Matched
Expand All @@ -125,6 +136,33 @@ def _redact_extra(self) -> dict[str, Any]:
redacted[k] = v
return redacted

def _apply_reasoning_model_scaling(self) -> None:
    """Auto-scale timeout and token budgets when ``reasoning_model`` is set.

    Reasoning models (``deepseek-r1``, ``qwq-32b``, etc.) emit long
    thinking chains before JSON output, easily exhausting default
    budgets. When ``reasoning_model`` is True this bumps both
    ``timeout`` and every per-call-site ``max_tokens_*`` attribute to
    reasoning-safe tiers so callers don't have to hand-tune six knobs.
    """
    if not self.reasoning_model:
        return
    # Long thinking chains need at least a 180s request timeout.
    self.timeout = max(self.timeout, 180.0)
    # Each per-call-site budget is doubled, with a 4000-token floor.
    budget_attrs = (
        "max_tokens_causal",
        "max_tokens_synthesis",
        "max_tokens_extraction",
        "max_tokens_ner",
        "max_tokens_evolve",
    )
    for name in budget_attrs:
        setattr(self, name, max(2 * getattr(self, name, 4000), 4000))

def __repr__(self) -> str:
# Redact api_key plus any sensitive-looking keys inside ``extra`` so
# secrets resolved via ``${ENV_VAR}`` refs don't leak into structured
Expand All @@ -136,7 +174,13 @@ def __repr__(self) -> str:
f"temperature={self.temperature}, timeout={self.timeout}, "
f"max_retries={self.max_retries}, fallback={self.fallback!r}, "
f"local_backend={self.local_backend!r}, "
f"extra={self._redact_extra()!r})"
f"extra={self._redact_extra()!r}, "
f"max_tokens_causal={self.max_tokens_causal}, "
f"max_tokens_synthesis={self.max_tokens_synthesis}, "
f"max_tokens_extraction={self.max_tokens_extraction}, "
f"max_tokens_ner={self.max_tokens_ner}, "
f"max_tokens_evolve={self.max_tokens_evolve}, "
f"reasoning_model={self.reasoning_model})"
)


Expand Down Expand Up @@ -523,6 +567,46 @@ def _apply_env(cfg: ZettelForgeConfig):
if v := os.environ.get("ZETTELFORGE_LLM_LOCAL_BACKEND"):
cfg.llm.local_backend = v

# RFC-125: per-call-site max_tokens budgets
if v := os.environ.get("ZETTELFORGE_LLM_MAX_TOKENS_CAUSAL"):
try:
cfg.llm.max_tokens_causal = int(v)
except ValueError:
get_logger("zettelforge.config").warning(
"invalid_max_tokens_causal", value=v, hint="Must be an int"
)
if v := os.environ.get("ZETTELFORGE_LLM_MAX_TOKENS_SYNTHESIS"):
try:
cfg.llm.max_tokens_synthesis = int(v)
except ValueError:
get_logger("zettelforge.config").warning(
"invalid_max_tokens_synthesis", value=v, hint="Must be an int"
)
if v := os.environ.get("ZETTELFORGE_LLM_MAX_TOKENS_EXTRACTION"):
try:
cfg.llm.max_tokens_extraction = int(v)
except ValueError:
get_logger("zettelforge.config").warning(
"invalid_max_tokens_extraction", value=v, hint="Must be an int"
)
if v := os.environ.get("ZETTELFORGE_LLM_MAX_TOKENS_NER"):
try:
cfg.llm.max_tokens_ner = int(v)
except ValueError:
get_logger("zettelforge.config").warning(
"invalid_max_tokens_ner", value=v, hint="Must be an int"
)
if v := os.environ.get("ZETTELFORGE_LLM_MAX_TOKENS_EVOLVE"):
try:
cfg.llm.max_tokens_evolve = int(v)
except ValueError:
get_logger("zettelforge.config").warning(
"invalid_max_tokens_evolve", value=v, hint="Must be an int"
)
# RFC-125: reasoning_model flag
if v := os.environ.get("ZETTELFORGE_LLM_REASONING_MODEL"):
cfg.llm.reasoning_model = v.lower() in ("true", "1", "yes")

# LLM NER
if v := os.environ.get("ZETTELFORGE_LLM_NER_ENABLED"):
cfg.llm_ner.enabled = v.lower() in ("true", "1", "yes")
Expand Down Expand Up @@ -583,6 +667,9 @@ def get_config() -> ZettelForgeConfig:
# Layer 2: environment variables (override)
_apply_env(_config)

# Layer 3: reasoning-model auto-scaling (RFC-125)
_config.llm._apply_reasoning_model_scaling()

return _config


Expand Down
13 changes: 10 additions & 3 deletions src/zettelforge/entity_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,15 +268,20 @@ def extract_llm(self, text: str) -> dict[str, list[str]]:
return empty

try:
from zettelforge.config import get_config

cfg = get_config()
max_tokens = cfg.llm.max_tokens_ner

from zettelforge.llm_client import generate

prompt = f"Extract named entities from this text:\n\n{text[:2000]}\n\nJSON:"
# 2500-token budget for reasoning-model headroom (v2.5.2; pre-fix
# 300 was exhausted by qwen3.5+ <think> tokens, leaving the NER
# 300 was exhausted by qwen3.5+ thinking tokens, leaving the NER
# JSON empty and entity extraction silently no-opping).
output = generate(
prompt,
max_tokens=2500,
max_tokens=max_tokens,
temperature=0.0,
system=self.NER_SYSTEM_PROMPT,
)
Expand All @@ -285,7 +290,9 @@ def extract_llm(self, text: str) -> dict[str, list[str]]:
if parsed is None and output and output.strip():
_logger.info("retry_parse", site="entity_indexer_ner", attempt=2)
retry_prompt = prompt + "\n\nRespond with valid JSON only."
output = generate(retry_prompt, max_tokens=2500, temperature=0.3, json_mode=True)
output = generate(
retry_prompt, max_tokens=max_tokens, temperature=0.3, json_mode=True
)
parsed = extract_json(output, expect="object")
return self._parse_ner_output_from_parsed(parsed, output, conversational_types)

Expand Down
8 changes: 6 additions & 2 deletions src/zettelforge/fact_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,15 @@ def extract(
prompt = self._build_prompt(content, context)

try:
from zettelforge.config import get_config
from zettelforge.llm_client import generate

cfg = get_config()
max_tokens = cfg.llm.max_tokens_extraction

# 2500-token budget for reasoning-model headroom (see v2.5.2
# CHANGELOG; pre-fix 400 was exhausted by qwen3.5+ <think> tokens).
raw_output = generate(prompt, max_tokens=2500, temperature=0.1)
# CHANGELOG; pre-fix 400 was exhausted by qwen3.5+ thinking tokens).
raw_output = generate(prompt, max_tokens=max_tokens, temperature=0.1)
return self._parse_extraction_response(raw_output)
except Exception:
_logger.warning("llm_fact_extraction_failed", exc_info=True)
Expand Down
24 changes: 23 additions & 1 deletion src/zettelforge/json_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,25 @@
_parse_stats = {"success": 0, "failure": 0}


def strip_thinking_tags(text: str) -> str:
    """Strip ``<think>``/``<thinking>`` and ``**thinking**`` blocks from LLM output.

    Reasoning models wrap internal reasoning in ``<think> ... </think>``
    (or ``<thinking> ... </thinking>``) tags; some emit a markdown-style
    ``**thinking** ... **`` block instead. These must be removed before
    JSON extraction, otherwise the regex searches match the thinking
    text instead of the JSON payload.

    Args:
        text: Raw LLM output that may contain thinking blocks.

    Returns:
        Cleaned text with all thinking blocks removed. Text without any
        such blocks is returned unchanged.
    """
    # DOTALL so multi-line thinking chains are matched; non-greedy so the
    # first closing delimiter ends each block.
    return re.sub(
        r"(?:\*\*thinking\*\*.*?\*\*|<think(?:ing)?>.*?</think(?:ing)?>)", "", text, flags=re.DOTALL
    )


def extract_json(raw: str | None, expect: str = "object") -> dict | list | None:
"""Extract JSON from LLM output, handling code fences and surrounding text.

Expand All @@ -28,7 +47,10 @@ def extract_json(raw: str | None, expect: str = "object") -> dict | list | None:
_parse_stats["failure"] += 1
return None

text = _strip_code_fences(raw)
# Strip <think> ... </think> (and **thinking** ... **) blocks before
# fence-stripping so the regex searches operate on clean text (RFC-125).
text = strip_thinking_tags(raw)
text = _strip_code_fences(text)

# Try to find JSON in the text
if expect == "array":
Expand Down
9 changes: 7 additions & 2 deletions src/zettelforge/memory_evolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,18 @@ def evaluate_evolution(
when the LLM responds with valid JSON, or ``None`` on parse failure
(after one retry).
"""
from zettelforge.config import get_config

cfg = get_config()
max_tokens = cfg.llm.max_tokens_evolve

prompt = EVOLUTION_PROMPT.format(
neighbor_content=neighbor.content.raw[:1000],
new_content=new_note.content.raw[:1000],
)

# First attempt
output = generate(prompt, max_tokens=2500, temperature=0.2, json_mode=True)
output = generate(prompt, max_tokens=max_tokens, temperature=0.2, json_mode=True)
result = extract_json(output, expect="object")

# Single retry on parse failure (AD-2). Capture what the model
Expand All @@ -108,7 +113,7 @@ def evaluate_evolution(
raw_chars=len(output or ""),
prompt_preview=prompt[:240],
)
output = generate(prompt, max_tokens=2500, temperature=0.1, json_mode=True)
output = generate(prompt, max_tokens=max_tokens, temperature=0.1, json_mode=True)
result = extract_json(output, expect="object")

if result is None:
Expand Down
31 changes: 31 additions & 0 deletions src/zettelforge/memory_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,37 @@ def _drain_enrichment_queue(self) -> None:
except Exception:
self._logger.warning("enrichment_drain_failed", exc_info=True)

def flush(self) -> None:
    """Process all pending enrichment jobs synchronously (blocking).

    Intended for bulk-ingest callers that passed ``sync=False`` to
    :meth:`remember` and want to guarantee all queued enrichment work is
    complete before returning (e.g. before exiting a CLI command or
    integration test).

    This is identical to the atexit drain but without a deadline — it
    blocks until the queue is empty.

    NOTE(review): this drains only jobs still *in the queue*; a job a
    background worker has already dequeued but not finished is not
    waited on. Callers needing a full "drain until done" guarantee
    should coordinate with the worker (e.g. ``Queue.join()``).
    """
    while not self._enrichment_queue.empty():
        # Isolate the dequeue so a ``queue.Empty`` raised by a job
        # handler cannot be mistaken for queue exhaustion.
        try:
            job = self._enrichment_queue.get_nowait()
        except queue.Empty:
            break
        try:
            if job.defer:
                continue
            if job.job_type == "neighbor_evolution":
                self._run_evolution(job)
            elif job.job_type == "llm_ner":
                self._run_llm_ner(job)
            else:
                self._run_enrichment(job)
        except BackendClosedError:
            return
        except Exception:
            self._logger.warning("enrichment_flush_failed", exc_info=True)
        finally:
            # Always balance the dequeue — even on handler failure —
            # so the queue's unfinished-task count stays correct and a
            # later Queue.join() cannot deadlock.
            self._enrichment_queue.task_done()

def evolve_note(self, note_id: str, sync: bool = False) -> dict | None:
"""Trigger neighbor evolution for an existing note.

Expand Down
8 changes: 6 additions & 2 deletions src/zettelforge/note_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,17 +120,21 @@ def extract_causal_triples(self, text: str, note_id: str = "") -> list[dict[str,
JSON:"""

try:
from zettelforge.config import get_config
from zettelforge.llm_client import generate

cfg = get_config()
max_tokens = cfg.llm.max_tokens_causal

# 8000 tokens for causal extraction — the highest cap in the
# codebase. This prompt asks the model to enumerate every causal
# relation in a passage, which triggers the longest reasoning
# chains anywhere in the system. Empirical: qwen3.5:9b at
# num_predict=4000 was *stochastically* sufficient (~70% success
# rate), eval_count varied between 2.8k (success) and 4k+ (still
# in <think> tags when budget exhausted). 8000 keeps the
# in thinking tags when budget exhausted). 8000 keeps the
# success rate >95% on the same model. v2.5.2 CHANGELOG.
output = generate(prompt, max_tokens=8000, temperature=0.1)
output = generate(prompt, max_tokens=max_tokens, temperature=0.1)

parsed = extract_json(output, expect="array")
if parsed is None:
Expand Down
Empty file.
Loading
Loading