Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,11 @@ dmypy.json
.idea/

logs/
src/task_generation/init_book_chapter_text_files
src/task_generation/other_scripts
src/outputs/
outputs/
base_output/

# inspect result logs
seed_datasets_inspect_logs/
Expand Down
120 changes: 0 additions & 120 deletions src/agentic_task_generator.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/area_generation/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,6 @@ async def generate_areas(cfg: DictConfig, langfuse_client: Langfuse) -> None:
},
)

if langfuse_client is None:
if langfuse_client is not None:
langfuse_client.flush()
raise
18 changes: 18 additions & 0 deletions src/base_stages/stage3_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
)
from src.schemas.io_utils import load_capabilities, save_tasks
from src.schemas.metadata_schemas import PipelineMetadata
from src.task_generation.runner import run_from_stage3
from src.utils import constants
from src.utils.model_client_utils import get_standard_model_client
from src.utils.timestamp_utils import iso_timestamp, timestamp_tag
Expand Down Expand Up @@ -49,6 +50,7 @@ def run_stage3(
"""
experiment_id = cfg.exp_cfg.exp_id
output_base_dir = Path(cfg.global_cfg.output_dir)
task_gen_mode = str(cfg.task_generation_cfg.get("mode", "base")).strip().lower()

# Determine tasks tag (resume or new)
is_resume = tasks_tag is not None
Expand All @@ -57,6 +59,22 @@ def run_stage3(
else:
tasks_tag = timestamp_tag()
logger.info(f"Starting new Stage 3 with tasks_tag: {tasks_tag}")
# If agentic mode, delegate to runner module which will call back into this module.
if task_gen_mode == "agentic":
logger.info("Stage 3 mode: agentic")
return run_from_stage3(
experiment_id=experiment_id,
output_base_dir=output_base_dir,
capabilities_tag=capabilities_tag,
tasks_tag=tasks_tag,
is_resume=is_resume,
)

if task_gen_mode != "base":
raise ValueError(
f"Invalid task_generation_cfg.mode={task_gen_mode}. "
"Expected one of: base, agentic"
)

# Initialize scientist LLM client using task_generation config
scientist_llm_gen_cfg = dict(cfg.scientist_llm.generation_cfg.task_generation)
Expand Down
3 changes: 2 additions & 1 deletion src/cfg/run_cfg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,9 @@ embedding_cfg:
embedding_size: 1536
filtering_similarity_threshold: 0.85

# Stage 3: Task generation
# Stage 3: Task generation (mode: "base" or "agentic")
task_generation_cfg:
mode: base
tasks_per_blueprint: 1
min_subtopics: 1
max_subtopics: 1
Expand Down
40 changes: 40 additions & 0 deletions src/cfg/task_generation/agent_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# src/cfg/task_generation/agent_config.yaml

defaults:
temperature: 0
top_p: 0.95
timeout: 120
cache_seed: 42

agents:
designer:
model: "gemini_pro"
model_config:
config_list:
- api_type: "google"
model: "gemini-3.1-pro-preview"
api_key: "${GOOGLE_API_KEY}"
temperature: 0
top_p: 0.95
timeout: 1200
cache_seed: 42

verifier:
model: "claude"
model_config:
config_list:
- api_type: "claude"
model: "claude-opus-4-6"
api_key: "${ANTHROPIC_API_KEY}"
temperature: 0
top_p: 0.95
timeout: 600
cache_seed: 42

dedup:
enabled: false
embedding_model: "text-embedding-3-small"
threshold: 0.90
keep_policy: "first"
cache_embeddings: true
save_discarded: true
43 changes: 43 additions & 0 deletions src/cfg/task_generation/pipeline_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
pipeline:
# Experiment identifier - used to name output folders - required by runner
# Keep these aligned with src/cfg/run_cfg.yaml defaults for compatibility
# (exp_cfg.exp_id and global_cfg.output_dir) when switching between
# `python -m src.task_generation.runner` and `python -m src.run_base_pipeline`.
experiment_id: "test_exp"
output_base_dir: "base_output" # folder where outputs are written

# Chapter corpus root (relative to repo root)
# runner expects: <book_chapter_dir>/**.txt
book_chapter_dir: "init_book_chapter_text_files"

# Blueprints file (relative to cfg/)
blueprints_file: "blueprints.json"

# Capability source mode:
# - placeholder: synthesize capability metadata from chapter files (current mode)
# - from_stage2: load capabilities from Stage 2 outputs and persist their lineage
capability_source_mode: "placeholder"
# capabilities_tag: null # required when capability_source_mode=from_stage2
# capability_chapter_mapping_file: "src/task_generation/capability_to_chapters.json"
# Optional JSON object mapping capability_id or area_id/capability_id to one or more
# chapter relpaths under book_chapter_dir. If omitted:
# - single discovered chapter => that chapter is used for every capability
# - multiple discovered chapters => all chapters are provided as context

# Loop constraints
max_retries: 3
num_tasks: 50
hardening_rounds: 5

# Default tasks per combo (can be overridden by each combo's "num_tasks" in blueprints.json)
num_tasks_per_combo: 15

# Checkpoint configuration for incremental save/resume within a chapter run
checkpoint:
enabled: true
resume_from_checkpoint: true
every: 5
dir_name: "checkpoints"
file_name: "passed_tasks_checkpoint.json"

# Resumability is controlled by the Stage-3 tasks_tag passed into the runner.
10 changes: 10 additions & 0 deletions src/schemas/GENERATION_PIPELINE_SCHEMAS.md
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,16 @@ This stage creates two files:
}
```

### Current Agentic Status

The current Stage 3 agentic task generation pipeline is experimental.

- It is currently focused on generating problems from chapter text, not on preserving exact Stage 1 -> Stage 2 -> Stage 3 area/capability matching.
- In this mode, area and capability fields may be placeholder lineage values used to keep outputs compatible with the Stage 3 task schema and directory structure.
- Because of that, running the full base pipeline from Stage 1 is not the supported path for this experimental agentic task generation flow.
- Downstream Stage 4 (Solution Generation) and Stage 5 (Validation) should also be treated as incompatible with this experimental chapter-based task-generation flow unless they are explicitly adapted for it.
- To generate problems with the current chapter-based agentic pipeline, start from Stage 3 and follow the instructions in [`src/task_generation/INSTRUCTIONS.md`](https://github.com/VectorInstitute/automated_capability_evaluation/blob/main/src/task_generation/INSTRUCTIONS.md).

---

## Stage 4: Solution Generation
Expand Down
99 changes: 99 additions & 0 deletions src/schemas/task_gen_io_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""Shared I/O-oriented utilities for task-generation agents."""

from __future__ import annotations

import re
from typing import Any, Sequence

from autogen_agentchat.messages import TextMessage


def strip_agent_terminator(text: str) -> str:
    """Remove a trailing ``TERMINATE`` token from model output.

    The token is matched case-insensitively, together with any whitespace
    around it, and only at the very end of ``text``. It must not be directly
    preceded by an ASCII letter, so ordinary words that merely end in
    "terminate" (for example "indeterminate") are left intact.

    Args:
        text: Raw agent/model output. Falsy values (empty string, ``None``)
            are accepted.

    Returns
    -------
        The text without the trailing terminator and with leading/trailing
        whitespace stripped. An empty string is returned for falsy input.
    """
    if not text:
        return ""
    # The negative lookbehind keeps words such as "indeterminate" or
    # "exterminate" from being truncated by the case-insensitive match.
    return re.sub(
        r"\s*(?<![A-Za-z])TERMINATE\s*$", "", text, flags=re.IGNORECASE
    ).strip()


def escape_invalid_json_backslashes(s: str) -> str:
    r"""
    Double every backslash that does not begin a valid JSON escape sequence.

    A backslash is kept unchanged when it is followed by one of
    ``"``, ``\``, ``/``, ``b``, ``f``, ``n``, ``r``, ``t``, or by ``u`` and
    exactly four hexadecimal digits. Every other backslash, including a lone
    backslash at the very end of the string, is replaced by two backslashes.

    Example repaired patterns:
    - ``\dot`` -> ``\\dot``
    - ``\alpha`` -> ``\\alpha``
    - ``\u12`` -> ``\\u12`` (fewer than four hex digits after ``u``)

    Note that sequences whose first letter coincides with a JSON escape
    (such as ``\frac``, ``\nu`` or ``\theta``) cannot be told apart from
    ``\f``, ``\n`` and ``\t`` and are therefore left untouched.

    Args:
        s: The input string to process.

    Returns
    -------
        A new string with invalid JSON backslashes escaped.
    """

    def _repair(match: "re.Match[str]") -> str:
        # Group 1 is populated only when a legal escape body follows the
        # backslash; in that case the whole escape is copied verbatim.
        if match.group(1) is not None:
            return match.group(0)
        # Otherwise only the backslash was consumed: emit it doubled and let
        # the following characters pass through unchanged.
        return "\\\\"

    # Scanning left to right, each backslash either pairs with a valid escape
    # body (consumed together) or stands alone and gets doubled.
    return re.sub(r'\\(["\\/bfnrt]|u[0-9a-fA-F]{4})?', _repair, s)


def extract_last_message_text(
    messages: Sequence[Any],
    *,
    include_text_attr: bool,
    strip_content: bool,
) -> str:
    """Return the text of the most recent message that carries any.

    Messages are scanned from last to first. For each one, a ``TextMessage``
    yields its ``content`` (or ``""`` when that is falsy); otherwise a string
    ``content`` attribute is used; otherwise, when ``include_text_attr`` is
    true, a string ``text`` attribute is used. Messages offering none of
    these are skipped.

    Args:
        messages: AgentChat result messages, oldest first.
        include_text_attr: Whether to fall back to a ``text`` attribute.
        strip_content: Whether to strip surrounding whitespace from the
            returned text.

    Returns
    -------
        The extracted text, or an empty string if ``messages`` is empty or
        no message provides usable text.
    """

    def _finish(value: str) -> str:
        return value.strip() if strip_content else value

    for candidate in reversed(messages or ()):
        if isinstance(candidate, TextMessage):
            return _finish(candidate.content or "")

        content = getattr(candidate, "content", None)
        if isinstance(content, str):
            return _finish(content)

        if not include_text_attr:
            continue

        text = getattr(candidate, "text", None)
        if isinstance(text, str):
            return _finish(text)

    return ""
Loading
Loading