-
Notifications
You must be signed in to change notification settings - Fork 264
Torch 2.8.0 + vLLM 0.10.2 开启 use_kl_loss 后训练qwen2.5-vl-3B-agentic崩溃,Torch 2.6.0 + vLLM 0.8.4 正常 #398
Copy link
Open
Description
相同配置，在设置 use_kl_loss 为 true 后，使用 Torch 2.8.0 + vLLM 0.10.2 镜像训练到一定步数后会崩溃，但是使用 Torch 2.6.0 + vLLM 0.8.4 镜像则可以正常训练，训练曲线对比如下：
yaml配置如下:
defaults:
- ../config/vl_traj_envs@_here_
- ../config/deepspeed_zero@_here_
- ../config/deepspeed_zero2@_here_
- ../config/deepspeed_zero3@_here_
- ../config/deepspeed_zero3_cpuoffload@_here_
hydra:
run:
dir: .
output_subdir: null
exp_name: "agentic_pipeline_3b"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
render_save_dir: ./output/render
system_envs:
USE_MODELSCOPE: '1'
track_with: wandb
tracker_kwargs:
api_key: ""
project: ""
name: ${exp_name}_sokoban
notes: "agentic_pipeline"
tags:
- agentic
- roll
- baseline
# track_with: tensorboard
# tracker_kwargs:
# log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_sokoban
checkpoint_config:
type: file_system
output_dir: /data/cpfs_0/rl_examples/models/${exp_name}
num_gpus_per_node: 8
max_steps: 1024
save_steps: 10000
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false
rollout_batch_size: 1024
val_batch_size: 1024
sequence_length: 8192
advantage_clip: 0.2
ppo_epochs: 1
adv_estimator: "grpo"
#pg_clip: 0.1
#dual_clip_loss: True
init_kl_coef: 0.0
whiten_advantages: true
entropy_loss_coef: 0
max_grad_norm: 1.0
enable_old_logprobs_recompute: true
use_kl_loss: true
kl_loss_coef: 0.01
pretrain: qwen/Qwen2.5-VL-3B-Instruct
reward_pretrain: qwen/Qwen2.5-VL-3B-Instruct
actor_train:
model_args:
attn_implementation: fa2
disable_gradient_checkpointing: false
dtype: bf16
model_type: ~
training_args:
learning_rate: 1.0e-6
weight_decay: 0
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
warmup_steps: 10
lr_scheduler_type: cosine
data_args:
template: qwen2_5
strategy_args:
strategy_name: deepspeed_train
strategy_config: ${deepspeed_zero3}
# strategy_name: megatron_train
# strategy_config:
# tensor_model_parallel_size: 1
# pipeline_model_parallel_size: 1
# expert_model_parallel_size: 1
# use_distributed_optimizer: true
# recompute_granularity: full
device_mapping: list(range(0,64))
infer_batch_size: 2
actor_infer:
model_args:
disable_gradient_checkpointing: true
dtype: bf16
generating_args:
max_new_tokens: 128 # single-turn response length
top_p: 0.99
top_k: 100
num_beams: 1
temperature: 0.99
num_return_sequences: 1
data_args:
template: qwen2_5
strategy_args:
strategy_name: vllm
strategy_config:
gpu_memory_utilization: 0.8
block_size: 16
load_format: auto
enable_prefix_caching: false
limit_mm_per_prompt:
image: ${max_actions_per_traj}
device_mapping: list(range(0,64))
reference:
model_args:
attn_implementation: fa2
disable_gradient_checkpointing: true
dtype: bf16
model_type: ~
data_args:
template: qwen2_5
strategy_args:
strategy_name: deepspeed_infer
strategy_config: ${deepspeed_zero3}
device_mapping: list(range(0,64))
infer_batch_size: 2
reward_normalization:
grouping: traj_group_id # 可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... group_by计算reward/adv
method: mean_std # asym_clip / identity / mean_std
train_env_manager:
format_penalty: -0.15 # sokoban env penalty_for_step=-0.1
max_env_num_per_worker: 16
num_env_groups: 128
# under the same group, the env config and env seed are ensured to be equal
group_size: 8
tags: [SimpleSokoban]
num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation
val_env_manager:
max_env_num_per_worker: 32
num_env_groups: 1024
group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output
tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake]
num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation
# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64
max_tokens_per_step: 64
custom_envs:
SimpleSokoban:
${custom_env.SimpleSokoban}
LargerSokoban:
${custom_env.LargerSokoban}
SokobanDifferentGridVocab:
${custom_env.SokobanDifferentGridVocab}
FrozenLake:
${custom_env.FrozenLake}
FrozenLakeThink:
${custom_env.FrozenLakeThink}
dockerfile如下:
FROM roll-registry-vpc.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-25.06-py3-torch280-vllm0102
RUN apt-get update && apt-get install --fix-missing -y -o Acquire::Retries=3 iproute2 tmux zip
COPY requirements_torch280_vllm.txt ./requirements_torch280_vllm.txt
COPY requirements_common.txt ./requirements_common.txt
COPY requirements_vision.txt ./requirements_vision.txt
COPY mcore_adapter ./mcore_adapter
RUN pip install -r requirements_torch280_vllm.txt
辛苦解答,感谢🙏
Reactions are currently unavailable
Metadata
Assignees
Labels
No labels