Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions config.default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,12 @@ embedding:
# ZETTELFORGE_LLM_MAX_RETRIES=2
# ZETTELFORGE_LLM_FALLBACK=ollama
# ZETTELFORGE_LLM_LOCAL_BACKEND=onnxruntime-genai # only when provider=local
# ZETTELFORGE_LLM_MAX_TOKENS_CAUSAL=8000
# ZETTELFORGE_LLM_MAX_TOKENS_SYNTHESIS=2500
# ZETTELFORGE_LLM_MAX_TOKENS_EXTRACTION=2500
# ZETTELFORGE_LLM_MAX_TOKENS_NER=2500
# ZETTELFORGE_LLM_MAX_TOKENS_EVOLVE=2500
# ZETTELFORGE_LLM_REASONING_MODEL=false
#
llm:
provider: ollama
Expand All @@ -218,6 +224,12 @@ llm:
max_retries: 2
fallback: ""
local_backend: llama-cpp-python # used when provider=local (RFC-011)
max_tokens_causal: 8000 # note_constructor.extract_causal_triples
max_tokens_synthesis: 2500 # synthesis_generator._generate_synthesis
max_tokens_extraction: 2500 # fact_extractor.extract
max_tokens_ner: 2500 # entity_indexer.extract_llm
max_tokens_evolve: 2500 # memory_evolver.evaluate_evolution
reasoning_model: false # auto-scale timeout + budgets for thinking models
extra: {}


Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ ignore = [
"src/zettelforge/__init__.py" = ["F401", "E402"] # re-exports and conditional imports
# S104: web host binding to 0.0.0.0 is intentional (RFC-015 web UI)
"src/zettelforge/config.py" = ["S104"]
# S101: assert is standard pytest convention in test files
# RUF012: mutable class attributes (POSITIVE/NEGATIVE lists) are test fixtures, not production bugs
"tests/*" = ["S101", "RUF012"]
# T20 (print) is forbidden in production code (GOV-003), but every CLI
# entrypoint legitimately uses print() for stdout output. The patterns
# below scope T20 to library code only.
Expand Down
89 changes: 88 additions & 1 deletion src/zettelforge/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,17 @@ class LLMConfig:
max_retries: int = 2
fallback: str = "" # empty preserves implicit local→ollama fallback
local_backend: str = "llama-cpp-python" # RFC-011: "llama-cpp-python" or "onnxruntime-genai"
# Per-call-site max_tokens budgets (RFC-125). Each maps to a specific
# generate() call site so operators can tune without editing source.
max_tokens_causal: int = 8000 # note_constructor.extract_causal_triples
max_tokens_synthesis: int = 2500 # synthesis_generator._generate_synthesis
max_tokens_extraction: int = 2500 # fact_extractor.extract
max_tokens_ner: int = 2500 # entity_indexer.extract_llm
max_tokens_evolve: int = 2500 # memory_evolver.evaluate_evolution
# RFC-125: reasoning-model auto-scaling flag. When True, timeout is
# auto-scaled to >= 180s and each per-call-site budget is bumped so
# thinking tokens do not exhaust the limit before JSON output.
reasoning_model: bool = False
extra: dict[str, Any] = field(default_factory=dict)

# Keys under ``extra`` that are commonly used for secrets. Matched
Expand All @@ -125,6 +136,33 @@ def _redact_extra(self) -> dict[str, Any]:
redacted[k] = v
return redacted

def _apply_reasoning_model_scaling(self) -> None:
    """Auto-scale timeout and token budgets when ``reasoning_model`` is set.

    Reasoning models (``deepseek-r1``, ``qwq-32b``, etc.) emit long
    thinking chains before JSON output, easily exhausting default
    budgets. When ``reasoning_model`` is True this bumps both
    ``timeout`` and every per-call-site ``max_tokens_*`` attribute to
    reasoning-safe tiers so callers don't have to hand-tune six knobs.
    """
    if not self.reasoning_model:
        return
    # Long thinking chains need at least a 180s request timeout.
    self.timeout = max(self.timeout, 180.0)
    # Each per-call-site budget is doubled, with a 4000-token floor.
    budget_attrs = (
        "max_tokens_causal",
        "max_tokens_synthesis",
        "max_tokens_extraction",
        "max_tokens_ner",
        "max_tokens_evolve",
    )
    for name in budget_attrs:
        setattr(self, name, max(2 * getattr(self, name, 4000), 4000))

def __repr__(self) -> str:
# Redact api_key plus any sensitive-looking keys inside ``extra`` so
# secrets resolved via ``${ENV_VAR}`` refs don't leak into structured
Expand All @@ -136,7 +174,13 @@ def __repr__(self) -> str:
f"temperature={self.temperature}, timeout={self.timeout}, "
f"max_retries={self.max_retries}, fallback={self.fallback!r}, "
f"local_backend={self.local_backend!r}, "
f"extra={self._redact_extra()!r})"
f"extra={self._redact_extra()!r}, "
f"max_tokens_causal={self.max_tokens_causal}, "
f"max_tokens_synthesis={self.max_tokens_synthesis}, "
f"max_tokens_extraction={self.max_tokens_extraction}, "
f"max_tokens_ner={self.max_tokens_ner}, "
f"max_tokens_evolve={self.max_tokens_evolve}, "
f"reasoning_model={self.reasoning_model})"
)


Expand Down Expand Up @@ -523,6 +567,46 @@ def _apply_env(cfg: ZettelForgeConfig):
if v := os.environ.get("ZETTELFORGE_LLM_LOCAL_BACKEND"):
cfg.llm.local_backend = v

# RFC-125: per-call-site max_tokens budgets
if v := os.environ.get("ZETTELFORGE_LLM_MAX_TOKENS_CAUSAL"):
try:
cfg.llm.max_tokens_causal = int(v)
except ValueError:
get_logger("zettelforge.config").warning(
"invalid_max_tokens_causal", value=v, hint="Must be an int"
)
if v := os.environ.get("ZETTELFORGE_LLM_MAX_TOKENS_SYNTHESIS"):
try:
cfg.llm.max_tokens_synthesis = int(v)
except ValueError:
get_logger("zettelforge.config").warning(
"invalid_max_tokens_synthesis", value=v, hint="Must be an int"
)
if v := os.environ.get("ZETTELFORGE_LLM_MAX_TOKENS_EXTRACTION"):
try:
cfg.llm.max_tokens_extraction = int(v)
except ValueError:
get_logger("zettelforge.config").warning(
"invalid_max_tokens_extraction", value=v, hint="Must be an int"
)
if v := os.environ.get("ZETTELFORGE_LLM_MAX_TOKENS_NER"):
try:
cfg.llm.max_tokens_ner = int(v)
except ValueError:
get_logger("zettelforge.config").warning(
"invalid_max_tokens_ner", value=v, hint="Must be an int"
)
if v := os.environ.get("ZETTELFORGE_LLM_MAX_TOKENS_EVOLVE"):
try:
cfg.llm.max_tokens_evolve = int(v)
except ValueError:
get_logger("zettelforge.config").warning(
"invalid_max_tokens_evolve", value=v, hint="Must be an int"
)
# RFC-125: reasoning_model flag
if v := os.environ.get("ZETTELFORGE_LLM_REASONING_MODEL"):
cfg.llm.reasoning_model = v.lower() in ("true", "1", "yes")

# LLM NER
if v := os.environ.get("ZETTELFORGE_LLM_NER_ENABLED"):
cfg.llm_ner.enabled = v.lower() in ("true", "1", "yes")
Expand Down Expand Up @@ -583,6 +667,9 @@ def get_config() -> ZettelForgeConfig:
# Layer 2: environment variables (override)
_apply_env(_config)

# Layer 3: reasoning-model auto-scaling (RFC-125)
_config.llm._apply_reasoning_model_scaling()

return _config


Expand Down
13 changes: 10 additions & 3 deletions src/zettelforge/entity_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,15 +268,20 @@ def extract_llm(self, text: str) -> dict[str, list[str]]:
return empty

try:
from zettelforge.config import get_config

cfg = get_config()
max_tokens = cfg.llm.max_tokens_ner

from zettelforge.llm_client import generate

prompt = f"Extract named entities from this text:\n\n{text[:2000]}\n\nJSON:"
# 2500-token budget for reasoning-model headroom (v2.5.2; pre-fix
# 300 was exhausted by qwen3.5+ <think> tokens, leaving the NER
# 300 was exhausted by qwen3.5+ thinking tokens, leaving the NER
# JSON empty and entity extraction silently no-opping).
output = generate(
prompt,
max_tokens=2500,
max_tokens=max_tokens,
temperature=0.0,
system=self.NER_SYSTEM_PROMPT,
)
Expand All @@ -285,7 +290,9 @@ def extract_llm(self, text: str) -> dict[str, list[str]]:
if parsed is None and output and output.strip():
_logger.info("retry_parse", site="entity_indexer_ner", attempt=2)
retry_prompt = prompt + "\n\nRespond with valid JSON only."
output = generate(retry_prompt, max_tokens=2500, temperature=0.3, json_mode=True)
output = generate(
retry_prompt, max_tokens=max_tokens, temperature=0.3, json_mode=True
)
parsed = extract_json(output, expect="object")
return self._parse_ner_output_from_parsed(parsed, output, conversational_types)

Expand Down
8 changes: 6 additions & 2 deletions src/zettelforge/fact_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,15 @@ def extract(
prompt = self._build_prompt(content, context)

try:
from zettelforge.config import get_config
from zettelforge.llm_client import generate

cfg = get_config()
max_tokens = cfg.llm.max_tokens_extraction

# 2500-token budget for reasoning-model headroom (see v2.5.2
# CHANGELOG; pre-fix 400 was exhausted by qwen3.5+ <think> tokens).
raw_output = generate(prompt, max_tokens=2500, temperature=0.1)
# CHANGELOG; pre-fix 400 was exhausted by qwen3.5+ thinking tokens).
raw_output = generate(prompt, max_tokens=max_tokens, temperature=0.1)
return self._parse_extraction_response(raw_output)
except Exception:
_logger.warning("llm_fact_extraction_failed", exc_info=True)
Expand Down
24 changes: 23 additions & 1 deletion src/zettelforge/json_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,25 @@
_parse_stats = {"success": 0, "failure": 0}


def strip_thinking_tags(text: str) -> str:
    """Strip ``<think>``/``<thinking>`` and ``**thinking**`` blocks from LLM output.

    Reasoning models wrap internal reasoning in ``<think> ... </think>``
    (or ``<thinking> ... </thinking>``) tags; some emit a markdown-style
    ``**thinking** ... **`` block instead. These must be removed before
    JSON extraction, otherwise the regex searches match the thinking
    text instead of the JSON payload.

    Args:
        text: Raw LLM output that may contain thinking blocks.

    Returns:
        Cleaned text with all thinking blocks removed. Text without any
        such blocks is returned unchanged.
    """
    # DOTALL so multi-line thinking chains are matched; non-greedy so the
    # first closing delimiter ends each block.
    return re.sub(
        r"(?:\*\*thinking\*\*.*?\*\*|<think(?:ing)?>.*?</think(?:ing)?>)", "", text, flags=re.DOTALL
    )


def extract_json(raw: str | None, expect: str = "object") -> dict | list | None:
"""Extract JSON from LLM output, handling code fences and surrounding text.

Expand All @@ -28,7 +47,10 @@ def extract_json(raw: str | None, expect: str = "object") -> dict | list | None:
_parse_stats["failure"] += 1
return None

text = _strip_code_fences(raw)
# Strip <think> ... </think> (and **thinking** ... **) blocks before
# fence-stripping so the regex searches operate on clean text (RFC-125).
text = strip_thinking_tags(raw)
text = _strip_code_fences(text)

# Try to find JSON in the text
if expect == "array":
Expand Down
9 changes: 7 additions & 2 deletions src/zettelforge/memory_evolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,18 @@ def evaluate_evolution(
when the LLM responds with valid JSON, or ``None`` on parse failure
(after one retry).
"""
from zettelforge.config import get_config

cfg = get_config()
max_tokens = cfg.llm.max_tokens_evolve

prompt = EVOLUTION_PROMPT.format(
neighbor_content=neighbor.content.raw[:1000],
new_content=new_note.content.raw[:1000],
)

# First attempt
output = generate(prompt, max_tokens=2500, temperature=0.2, json_mode=True)
output = generate(prompt, max_tokens=max_tokens, temperature=0.2, json_mode=True)
result = extract_json(output, expect="object")

# Single retry on parse failure (AD-2). Capture what the model
Expand All @@ -108,7 +113,7 @@ def evaluate_evolution(
raw_chars=len(output or ""),
prompt_preview=prompt[:240],
)
output = generate(prompt, max_tokens=2500, temperature=0.1, json_mode=True)
output = generate(prompt, max_tokens=max_tokens, temperature=0.1, json_mode=True)
result = extract_json(output, expect="object")

if result is None:
Expand Down
31 changes: 31 additions & 0 deletions src/zettelforge/memory_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1180,6 +1180,37 @@ def _drain_enrichment_queue(self) -> None:
except Exception:
self._logger.warning("enrichment_drain_failed", exc_info=True)

def flush(self) -> None:
    """Process all pending enrichment jobs synchronously (blocking).

    Intended for bulk-ingest callers that passed ``sync=False`` to
    :meth:`remember` and want to guarantee all queued enrichment work is
    complete before returning (e.g. before exiting a CLI command or
    integration test).

    This is identical to the atexit drain but without a deadline — it
    blocks until the queue is empty.

    NOTE(review): this drains only jobs still *in the queue*; a job a
    background worker has already dequeued but not finished is not
    waited on. Callers needing a full "drain until done" guarantee
    should coordinate with the worker (e.g. ``Queue.join()``).
    """
    while not self._enrichment_queue.empty():
        # Isolate the dequeue so a ``queue.Empty`` raised by a job
        # handler cannot be mistaken for queue exhaustion.
        try:
            job = self._enrichment_queue.get_nowait()
        except queue.Empty:
            break
        try:
            if job.defer:
                continue
            if job.job_type == "neighbor_evolution":
                self._run_evolution(job)
            elif job.job_type == "llm_ner":
                self._run_llm_ner(job)
            else:
                self._run_enrichment(job)
        except BackendClosedError:
            return
        except Exception:
            self._logger.warning("enrichment_flush_failed", exc_info=True)
        finally:
            # Always balance the dequeue — even on handler failure —
            # so the queue's unfinished-task count stays correct and a
            # later Queue.join() cannot deadlock.
            self._enrichment_queue.task_done()

def evolve_note(self, note_id: str, sync: bool = False) -> dict | None:
"""Trigger neighbor evolution for an existing note.

Expand Down
8 changes: 6 additions & 2 deletions src/zettelforge/note_constructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,17 +120,21 @@ def extract_causal_triples(self, text: str, note_id: str = "") -> list[dict[str,
JSON:"""

try:
from zettelforge.config import get_config
from zettelforge.llm_client import generate

cfg = get_config()
max_tokens = cfg.llm.max_tokens_causal

# 8000 tokens for causal extraction — the highest cap in the
# codebase. This prompt asks the model to enumerate every causal
# relation in a passage, which triggers the longest reasoning
# chains anywhere in the system. Empirical: qwen3.5:9b at
# num_predict=4000 was *stochastically* sufficient (~70% success
# rate), eval_count varied between 2.8k (success) and 4k+ (still
# in <think> tags when budget exhausted). 8000 keeps the
# in thinking tags when budget exhausted). 8000 keeps the
# success rate >95% on the same model. v2.5.2 CHANGELOG.
output = generate(prompt, max_tokens=8000, temperature=0.1)
output = generate(prompt, max_tokens=max_tokens, temperature=0.1)

parsed = extract_json(output, expect="array")
if parsed is None:
Expand Down
Empty file.
Loading
Loading