VectorInstitute · saidul-islam98 · Mar 19, 2026 · Feb 27, 2026 · Feb 28, 2026 · Feb 28, 2026
diff --git a/.gitignore b/.gitignore
@@ -140,8 +140,11 @@ dmypy.json
 .idea/
 
 logs/
+src/task_generation/init_book_chapter_text_files
+src/task_generation/other_scripts
 src/outputs/
 outputs/
+base_output/
 
 # inspect result logs
 seed_datasets_inspect_logs/

diff --git a/src/agentic_task_generator.py b/src/agentic_task_generator.py
diff --git a/src/area_generation/generator.py b/src/area_generation/generator.py
@@ -188,6 +188,6 @@ async def generate_areas(cfg: DictConfig, langfuse_client: Langfuse) -> None:
                 },
             )
 
-            if langfuse_client is None:
+            if langfuse_client is not None:
                 langfuse_client.flush()
             raise
diff --git a/src/base_stages/stage3_tasks.py b/src/base_stages/stage3_tasks.py
@@ -15,6 +15,7 @@
 )
 from src.schemas.io_utils import load_capabilities, save_tasks
 from src.schemas.metadata_schemas import PipelineMetadata
+from src.task_generation.runner import run_from_stage3
 from src.utils import constants
 from src.utils.model_client_utils import get_standard_model_client
 from src.utils.timestamp_utils import iso_timestamp, timestamp_tag
@@ -49,6 +50,7 @@ def run_stage3(
     """
     experiment_id = cfg.exp_cfg.exp_id
     output_base_dir = Path(cfg.global_cfg.output_dir)
+    task_gen_mode = str(cfg.task_generation_cfg.get("mode", "base")).strip().lower()
 
     # Determine tasks tag (resume or new)
     is_resume = tasks_tag is not None
@@ -57,6 +59,22 @@ def run_stage3(
     else:
         tasks_tag = timestamp_tag()
         logger.info(f"Starting new Stage 3 with tasks_tag: {tasks_tag}")
+    # In agentic mode, delegate to the runner module, which calls back into this module.
+    if task_gen_mode == "agentic":
+        logger.info("Stage 3 mode: agentic")
+        return run_from_stage3(
+            experiment_id=experiment_id,
+            output_base_dir=output_base_dir,
+            capabilities_tag=capabilities_tag,
+            tasks_tag=tasks_tag,
+            is_resume=is_resume,
+        )
+
+    if task_gen_mode != "base":
+        raise ValueError(
+            f"Invalid task_generation_cfg.mode={task_gen_mode}. "
+            "Expected one of: base, agentic"
+        )
 
     # Initialize scientist LLM client using task_generation config
     scientist_llm_gen_cfg = dict(cfg.scientist_llm.generation_cfg.task_generation)

diff --git a/src/cfg/run_cfg.yaml b/src/cfg/run_cfg.yaml
@@ -65,8 +65,9 @@ embedding_cfg:
   embedding_size: 1536
   filtering_similarity_threshold: 0.85
 
-# Stage 3: Task generation
+# Stage 3: Task generation (includes the base-mode settings; mode is "base" or "agentic")
 task_generation_cfg:
+  mode: base
   tasks_per_blueprint: 1
   min_subtopics: 1
   max_subtopics: 1

diff --git a/src/cfg/task_generation/agent_config.yaml b/src/cfg/task_generation/agent_config.yaml
@@ -0,0 +1,40 @@
+# src/cfg/task_generation/agent_config.yaml
+
+defaults:
+  temperature: 0
+  top_p: 0.95
+  timeout: 120
+  cache_seed: 42
+
+agents:
+  designer:
+    model: "gemini_pro"
+    model_config:
+      config_list:
+        - api_type: "google"
+          model: "gemini-3.1-pro-preview"
+          api_key: "${GOOGLE_API_KEY}"
+      temperature: 0
+      top_p: 0.95
+      timeout: 1200
+      cache_seed: 42
+
+  verifier:
+    model: "claude"
+    model_config:
+      config_list:
+        - api_type: "claude"
+          model: "claude-opus-4-6"
+          api_key: "${ANTHROPIC_API_KEY}"
+      temperature: 0
+      top_p: 0.95
+      timeout: 600
+      cache_seed: 42
+
+dedup:
+  enabled: false
+  embedding_model: "text-embedding-3-small"
+  threshold: 0.90
+  keep_policy: "first"
+  cache_embeddings: true
+  save_discarded: true
diff --git a/src/cfg/task_generation/pipeline_config.yaml b/src/cfg/task_generation/pipeline_config.yaml
@@ -0,0 +1,43 @@
+pipeline:
+  # Experiment identifier - used to name output folders - required by runner
+  # Keep these aligned with src/cfg/run_cfg.yaml defaults for compatibility
+  # (exp_cfg.exp_id and global_cfg.output_dir) when switching between
+  # `python -m src.task_generation.runner` and `python -m src.run_base_pipeline`.
+  experiment_id: "test_exp"
+  output_base_dir: "base_output"         # folder where outputs are written
+
+  # Chapter corpus root (relative to repo root)
+  # runner expects: <book_chapter_dir>/**.txt
+  book_chapter_dir: "init_book_chapter_text_files"
+
+  # Blueprints file (relative to cfg/)
+  blueprints_file: "blueprints.json"
+
+  # Capability source mode:
+  # - placeholder: synthesize capability metadata from chapter files (current mode)
+  # - from_stage2: load capabilities from Stage 2 outputs and persist their lineage
+  capability_source_mode: "placeholder"
+  # capabilities_tag: null  # required when capability_source_mode=from_stage2
+  # capability_chapter_mapping_file: "src/task_generation/capability_to_chapters.json"
+  # Optional JSON object mapping capability_id or area_id/capability_id to one or more
+  # chapter relpaths under book_chapter_dir. If omitted:
+  # - single discovered chapter => that chapter is used for every capability
+  # - multiple discovered chapters => all chapters are provided as context
+
+  # Loop constraints
+  max_retries: 3
+  num_tasks: 50
+  hardening_rounds: 5
+
+  # Default tasks per combo (can be overridden by each combo's "num_tasks" in blueprints.json)
+  num_tasks_per_combo: 15
+
+  # Checkpoint configuration for incremental save/resume within a chapter run
+  checkpoint:
+    enabled: true
+    resume_from_checkpoint: true
+    every: 5
+    dir_name: "checkpoints"
+    file_name: "passed_tasks_checkpoint.json"
+
+  # Resumability is controlled by the Stage-3 tasks_tag passed into the runner.
diff --git a/src/schemas/GENERATION_PIPELINE_SCHEMAS.md b/src/schemas/GENERATION_PIPELINE_SCHEMAS.md
@@ -562,6 +562,16 @@ This stage creates two files:
 }
 ```
 
+### Current Agentic Status
+
+The current Stage 3 agentic task generation pipeline is experimental.
+
+- It is currently focused on generating problems from chapter text, not on preserving exact Stage 1 -> Stage 2 -> Stage 3 area/capability matching.
+- In this mode, area and capability fields may be placeholder lineage values used to keep outputs compatible with the Stage 3 task schema and directory structure.
+- Because of that, running the full base pipeline from Stage 1 is not the supported path for this experimental agentic task generation flow.
+- Downstream Stage 4 (Solution Generation) and Stage 5 (Validation) should also be treated as incompatible with this experimental chapter-based task-generation flow unless they are explicitly adapted for it.
+- To generate problems with the current chapter-based agentic pipeline, start from Stage 3 and follow the instructions in [`src/task_generation/INSTRUCTIONS.md`](https://github.com/VectorInstitute/automated_capability_evaluation/blob/main/src/task_generation/INSTRUCTIONS.md).
+
 ---
 
 ## Stage 4: Solution Generation

diff --git a/src/schemas/task_gen_io_utils.py b/src/schemas/task_gen_io_utils.py
@@ -0,0 +1,99 @@
+"""Shared I/O-oriented utilities for task-generation agents."""
+
+from __future__ import annotations
+
+import re
+from typing import Any, Sequence
+
+from autogen_agentchat.messages import TextMessage
+
+
+def strip_agent_terminator(text: str) -> str:
+    """Remove a trailing ``TERMINATE`` token from model output.
+
+    One ``TERMINATE`` (matched case-insensitively) at the very end of ``text``
+    is removed together with the whitespace around it, and the result is
+    stripped of leading and trailing whitespace. The token is only recognised
+    when it is not glued to a preceding ASCII letter, so ordinary words that
+    merely end in "terminate" (for example "indeterminate") are left intact.
+
+    Args:
+        text: Raw model output; falsy values (``""`` or ``None``) yield ``""``.
+
+    Returns
+    -------
+        The cleaned text.
+    """
+    if not text:
+        return ""
+    # The lookbehind keeps e.g. "... is indeterminate" from being cut to "... is inde".
+    return re.sub(
+        r"\s*(?<![A-Za-z])TERMINATE\s*$", "", text, flags=re.IGNORECASE
+    ).strip()
+
+
+def escape_invalid_json_backslashes(s: str) -> str:
+    """
+    Escape backslashes that are invalid in JSON escape sequences.
+
+    Example repaired patterns:
+    - "\\dot" -> "\\\\dot"
+    - "\\frac" -> "\\\\frac"
+    - "\\beta" -> "\\\\beta"
+
+    Valid JSON escapes (\\", \\\\, \\/, \\b, \\f, \\n, \\r, \\t, \\uXXXX) are preserved.
+    For \\u, preservation only applies when followed by four hex digits.
+    For \\b and \\f, preservation only applies when the next character is not
+    an ASCII letter: "\\beta" or "\\frac" is treated as a command name rather
+    than a backspace or form feed glued to a word, and is escaped.
+
+    \\n, \\r and \\t are always preserved, because a line break or tab directly
+    followed by a word is ordinary text. Sequences such as "\\nu" or "\\theta"
+    are therefore NOT repaired by this function. A lone backslash at the very
+    end of the input is doubled.
+
+    Args:
+        s: The input string to process.
+
+    Returns
+    -------
+        A new string with invalid JSON backslashes escaped.
+    """  # noqa: D301
+    out: list[str] = []
+    i = 0
+    n = len(s)
+    while i < n:
+        ch = s[i]
+        if ch != "\\":
+            out.append(ch)
+            i += 1
+            continue
+
+        if i + 1 >= n:
+            # Trailing backslash with nothing after it: double it.
+            out.append("\\\\")
+            i += 1
+            continue
+
+        nxt = s[i + 1]
+        if nxt in ("b", "f"):
+            # Slicing (not indexing) yields "" at end of input, which is not alpha.
+            following = s[i + 2 : i + 3]
+            if following.isascii() and following.isalpha():
+                # "\b<letter>" / "\f<letter>" is a command such as \beta or
+                # \frac, not a control character: escape the backslash.
+                out.append("\\\\")
+                out.append(nxt)
+                i += 2
+                continue
+
+        if nxt in ('"', "\\", "/", "b", "f", "n", "r", "t"):
+            out.append("\\")
+            out.append(nxt)
+            i += 2
+            continue
+
+        if nxt == "u":
+            # Indices i+2 .. i+5 must exist and hold four hex digits.
+            if i + 5 < n and re.fullmatch(r"[0-9a-fA-F]{4}", s[i + 2 : i + 6]):
+                out.append("\\u")
+                out.append(s[i + 2 : i + 6])
+                i += 6
+                continue
+            out.append("\\\\u")
+            i += 2
+            continue
+
+        # Any other character after a backslash is not a JSON escape.
+        out.append("\\\\")
+        out.append(nxt)
+        i += 2
+
+    return "".join(out)
+
+
+def extract_last_message_text(
+    messages: Sequence[Any],
+    *,
+    include_text_attr: bool,
+    strip_content: bool,
+) -> str:
+    """Return the text of the most recent message that carries any.
+
+    Messages are scanned from last to first. The first one that qualifies
+    decides the result:
+
+    - a ``TextMessage`` always qualifies; its ``content`` is used, with a falsy
+      value replaced by ``""``;
+    - otherwise a message whose ``content`` attribute is a ``str`` qualifies;
+    - otherwise, only when ``include_text_attr`` is true, a message whose
+      ``text`` attribute is a ``str`` qualifies.
+
+    Messages matching none of these are skipped.
+
+    Args:
+        messages: AgentChat result messages, oldest first.
+        include_text_attr: Whether to fall back to a string ``text`` attribute.
+        strip_content: Whether to strip surrounding whitespace from the result.
+
+    Returns
+    -------
+        The selected text, or ``""`` when ``messages`` is empty or no message
+        qualifies.
+    """
+
+    def _finish(value: str) -> str:
+        # Apply the optional whitespace trimming in one place.
+        return value.strip() if strip_content else value
+
+    if not messages:
+        return ""
+
+    for candidate in reversed(messages):
+        if isinstance(candidate, TextMessage):
+            return _finish(candidate.content or "")
+
+        body = getattr(candidate, "content", None)
+        if isinstance(body, str):
+            return _finish(body)
+
+        if not include_text_attr:
+            continue
+
+        fallback = getattr(candidate, "text", None)
+        if isinstance(fallback, str):
+            return _finish(fallback)
+
+    return ""