Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ options = metrics_logger.MetricsLoggerOptions(
logger = metrics_logger.MetricsLogger(metrics_logger_options=options)
```

With the above, agentic_grpo_learner will by default start an async trajectory
With the above, agentic_learner will by default start an async trajectory
logger which logs the trajectories including prompts, responses, etc. to the
specified `log_dir`.

Expand Down
2 changes: 1 addition & 1 deletion docs/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ out_data = sampler(
During reinforcement learning (RL) training, it is often useful to analyze the
generated trajectories (prompts, responses, rewards, etc.). Tunix provides an
`AsyncTrajectoryLogger` to log this data asynchronously to CSV files without
blocking the training loop. It's enabled in agentic_grpo_learner by default, if
blocking the training loop. It's enabled in agentic_learner by default, if
you provide a log directory in your cluster configuration training config.

```python
Expand Down
2 changes: 1 addition & 1 deletion examples/agentic/gemma_grpo_demo_nb.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
from tunix.models.gemma import model as gemma_lib
from tunix.sft import utils
from tunix.utils import script_utils
from tunix.rl.agentic.agentic_grpo_learner import GRPOConfig, GRPOLearner
from tunix.rl.agentic.agentic_learner import GRPOConfig, GRPOLearner
from flax import nnx
from tunix.cli.utils import model as model_utils

Expand Down
26 changes: 13 additions & 13 deletions examples/deepscaler/run_deepscaler_disagg_v5p16.sh
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,19 @@ python -m tunix.cli.grpo_main \
tokenizer_config.add_eos=false \
\
`# ── GRPO algorithm ───────────────────────────────────────────────────` \
agentic_grpo_config.num_generations=8 \
agentic_grpo_config.num_iterations=1 \
agentic_grpo_config.beta=0.0 \
agentic_grpo_config.epsilon=0.2 \
agentic_grpo_config.epsilon_high=0.28 \
agentic_grpo_config.system_prompt="" \
agentic_grpo_config.max_concurrency=1024 \
agentic_grpo_config.max_response_length="$max_response_length" \
agentic_grpo_config.off_policy_steps=0 \
agentic_grpo_config.loss_agg_mode="token-mean" \
agentic_grpo_config.kl_loss_mode="low_var_kl" \
agentic_grpo_config.max_turns=1 \
agentic_grpo_config.context_ratio=1 \
agentic_config.num_generations=8 \
agentic_config.num_iterations=1 \
agentic_config.beta=0.0 \
agentic_config.epsilon=0.2 \
agentic_config.epsilon_high=0.28 \
agentic_config.system_prompt="" \
agentic_config.max_concurrency=1024 \
agentic_config.max_response_length="$max_response_length" \
agentic_config.off_policy_steps=0 \
agentic_config.loss_agg_mode="token-mean" \
agentic_config.kl_loss_mode="low_var_kl" \
agentic_config.max_turns=1 \
agentic_config.context_ratio=1 \
\
`# ── Optimizer ────────────────────────────────────────────────────────` \
rl_training_config.actor_optimizer_config.opt_type="adamw" \
Expand Down
2 changes: 1 addition & 1 deletion examples/deepscaler/train_deepscaler_nb.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
from tunix.models.qwen2 import params as params_lib
from tunix.models.qwen2 import model as model_lib
from tunix.sft import metrics_logger
from tunix.rl.agentic.agentic_grpo_learner import GRPOConfig, GRPOLearner
from tunix.rl.agentic.agentic_learner import GRPOConfig, GRPOLearner
from tunix.rl.agentic.agents import model_agent
from tunix.rl.agentic.environments import task_environment
from tunix.rl.agentic.trajectory import trajectory_collect_engine
Expand Down
26 changes: 13 additions & 13 deletions examples/deepswe/run_deepswe_disagg_v5p_32.sh
Original file line number Diff line number Diff line change
Expand Up @@ -149,21 +149,21 @@ python -m tunix.cli.grpo_main \
kubernetes_config.node_selector_val="deepswe-cpu-pool" \
\
`# ── Agentic / multi-turn ─────────────────────────────────────────────` \
agentic_grpo_config.max_turns=20 \
agentic_grpo_config.per_turn_timeout_secs=300 \
agentic_grpo_config.context_ratio=2 \
agentic_grpo_config.max_concurrency=100 \
agentic_config.max_turns=20 \
agentic_config.per_turn_timeout_secs=300 \
agentic_config.context_ratio=2 \
agentic_config.max_concurrency=100 \
\
`# ── GRPO algorithm ───────────────────────────────────────────────────` \
agentic_grpo_config.num_generations="$num_generations" \
agentic_grpo_config.max_response_length="$max_response_length" \
agentic_grpo_config.num_iterations=1 \
agentic_grpo_config.beta=0.001 \
agentic_grpo_config.epsilon=0.2 \
agentic_grpo_config.epsilon_high=0.28 \
agentic_grpo_config.off_policy_steps=0 \
agentic_grpo_config.loss_agg_mode="seq-mean-token-mean" \
agentic_grpo_config.kl_loss_mode="low_var_kl" \
agentic_config.num_generations="$num_generations" \
agentic_config.max_response_length="$max_response_length" \
agentic_config.num_iterations=1 \
agentic_config.beta=0.001 \
agentic_config.epsilon=0.2 \
agentic_config.epsilon_high=0.28 \
agentic_config.off_policy_steps=0 \
agentic_config.loss_agg_mode="seq-mean-token-mean" \
agentic_config.kl_loss_mode="low_var_kl" \
\
`# ── Optimizer ────────────────────────────────────────────────────────` \
rl_training_config.actor_optimizer_config.opt_type="adamw" \
Expand Down
8 changes: 4 additions & 4 deletions examples/deepswe/train_deepswe_nb.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@
from tunix.sft import metrics_logger
from tunix.rl import rl_cluster as rl_cluster_lib
from tunix.rl.rollout import base_rollout
from tunix.rl.agentic import agentic_grpo_learner
from tunix.rl.agentic import agentic_learner
from tunix.rl.agentic.parser.chat_template_parser import parser as template_parser
from tunix.rl.agentic.rewards.reward_types import RewardOutput
from examples.deepswe.swe_agent import (
Expand Down Expand Up @@ -564,7 +564,7 @@ def transform(entry):
# ==========================================
# 11. Learner & Agent Setup
# ==========================================
grpo_config = agentic_grpo_learner.GRPOConfig(
grpo_config = agentic_learner.GRPOConfig(
num_generations=NUM_GENERATIONS,
num_iterations=NUM_ITERATIONS,
max_response_length=MAX_RESPONSE_LENGTH,
Expand All @@ -579,7 +579,7 @@ def transform(entry):
)


agentic_grpo_learner = agentic_grpo_learner.GRPOLearner(
agentic_learner = agentic_learner.GRPOLearner(
rl_cluster=rl_cluster,
reward_fns=None,
agent_class=SWEAgent,
Expand Down Expand Up @@ -652,7 +652,7 @@ def mixed_type_batch_fn(elements):


print("Starting training...")
agentic_grpo_learner.train(train_dataset=train_dataset)
agentic_learner.train(train_dataset=train_dataset)


# %%
18 changes: 9 additions & 9 deletions examples/rl/grpo/gsm8k/run_qwen3_8b.sh
Original file line number Diff line number Diff line change
Expand Up @@ -130,15 +130,15 @@ python -m tunix.cli.grpo_main \
tokenizer_config.add_eos=false \
\
`# -- GRPO algorithm ---------------------------------------------------` \
agentic_grpo_config.num_generations="$num_generations" \
agentic_grpo_config.num_iterations=1 \
agentic_grpo_config.beta=0.08 \
agentic_grpo_config.epsilon=0.2 \
agentic_grpo_config.system_prompt="You are given a grade school math problem. Think step by step and respond using <reasoning>...</reasoning> followed by <answer>...</answer> with only the final numeric answer inside <answer>." \
agentic_grpo_config.max_concurrency=128 \
agentic_grpo_config.max_response_length=768 \
agentic_grpo_config.max_turns=1 \
agentic_grpo_config.context_ratio=1 \
agentic_config.num_generations="$num_generations" \
agentic_config.num_iterations=1 \
agentic_config.beta=0.08 \
agentic_config.epsilon=0.2 \
agentic_config.system_prompt="You are given a grade school math problem. Think step by step and respond using <reasoning>...</reasoning> followed by <answer>...</answer> with only the final numeric answer inside <answer>." \
agentic_config.max_concurrency=128 \
agentic_config.max_response_length=768 \
agentic_config.max_turns=1 \
agentic_config.context_ratio=1 \
\
`# -- Optimizer --------------------------------------------------------` \
rl_training_config.actor_optimizer_config.opt_type="adamw" \
Expand Down
28 changes: 14 additions & 14 deletions tests/cli/grpo_main_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def test_agentic_data_module_receives_data_config_for_raw_dataset(self):
env_class_path: null
env_kwargs: {}
kubernetes_config: null
agentic_grpo_config:
agentic_config:
num_generations: 2
num_iterations: 1
beta: 0.0
Expand Down Expand Up @@ -277,7 +277,7 @@ def test_agentic_nullable_string_can_be_overridden_from_cli(self):
env_class_path: null
env_kwargs: {}
kubernetes_config: null
agentic_grpo_config:
agentic_config:
num_generations: 2
num_iterations: 1
beta: 0.0
Expand Down Expand Up @@ -321,7 +321,7 @@ def test_agentic_nullable_dict_can_be_overridden_from_cli(self):
env_class_path: null
env_kwargs: {}
kubernetes_config: null
agentic_grpo_config:
agentic_config:
num_generations: 2
num_iterations: 1
beta: 0.0
Expand Down Expand Up @@ -371,7 +371,7 @@ def test_agentic_nullable_string_can_be_overridden_from_env(self):
env_class_path: null
env_kwargs: {}
kubernetes_config: null
agentic_grpo_config:
agentic_config:
num_generations: 2
num_iterations: 1
beta: 0.0
Expand Down Expand Up @@ -417,7 +417,7 @@ def test_standard_grpo_dispatches_to_standard(self):
pipeline.run_grpo_trainer()
mock_run.assert_called_once_with(mode="grpo")

def test_agentic_grpo_dispatches_to_agentic(self):
def test_agentic_dispatches_to_agentic(self):
extra = """
training_mode: "agentic_grpo"
data_module: "tunix.cli.recipes.deepscaler_data"
Expand All @@ -436,7 +436,7 @@ def test_agentic_grpo_dispatches_to_agentic(self):
env_class_path: null
env_kwargs: {}
kubernetes_config: null
agentic_grpo_config:
agentic_config:
num_generations: 2
num_iterations: 1
beta: 0.0
Expand Down Expand Up @@ -530,7 +530,7 @@ def _make_agentic_pipeline(self, max_turns, context_ratio):
env_class_path: null
env_kwargs: {{}}
kubernetes_config: null
agentic_grpo_config:
agentic_config:
num_generations: 2
num_iterations: 1
beta: 0.0
Expand Down Expand Up @@ -603,7 +603,7 @@ def _base_extra(self, agentic_overrides="", system_prompt='""'):
env_class_path: null
env_kwargs: {{}}
kubernetes_config: null
agentic_grpo_config:
agentic_config:
num_generations: 2
num_iterations: 1
beta: 0.001
Expand All @@ -623,26 +623,26 @@ def test_episode_timeout_computed(self):
p = _make_pipeline(
self._base_extra("max_turns: 20\n per_turn_timeout_secs: 300")
)
algo = p._create_agentic_grpo_config()
algo = p._create_agentic_config()
self.assertEqual(algo.episode_timeout, 300 * 20)

def test_max_response_length_from_rollout(self):
p = _make_pipeline(self._base_extra("max_turns: 1"))
algo = p._create_agentic_grpo_config()
algo = p._create_agentic_config()
# rollout_config.total_generation_steps = 512
self.assertEqual(algo.max_response_length, 512)

def test_num_generations_passed_through(self):
p = _make_pipeline(self._base_extra("max_turns: 1"))
algo = p._create_agentic_grpo_config()
algo = p._create_agentic_config()
self.assertEqual(algo.num_generations, 2)

def test_cli_empty_system_prompt_stays_empty_string(self):
p = _make_pipeline_with_cli_args(
self._base_extra("max_turns: 1", system_prompt='"base"'),
['agentic_grpo_config.system_prompt=""'],
['agentic_config.system_prompt=""'],
)
self.assertEqual(p.config["agentic_grpo_config"]["system_prompt"], "")
self.assertEqual(p.config["agentic_config"]["system_prompt"], "")


class SplitMeshConfigTest(absltest.TestCase):
Expand All @@ -665,7 +665,7 @@ def test_split_mesh_uses_explicit_role_meshes(self):
env_class_path: null
env_kwargs: {}
kubernetes_config: null
agentic_grpo_config:
agentic_config:
num_generations: 2
num_iterations: 1
beta: 0.0
Expand Down
Loading
Loading