Training qwen2.5-vl-3B-agentic crashes with use_kl_loss enabled on Torch 2.8.0 + vLLM 0.10.2; Torch 2.6.0 + vLLM 0.8.4 works fine #398

@SF-evil

Description

With an otherwise identical configuration, after setting use_kl_loss to true, training crashes after a certain number of steps on the torch 2.8.0 + vLLM 0.10.2 image, while the torch 2.6.0 + vLLM 0.8.4 image trains normally. The training curves are compared below:

[Image: training-curve comparison between the two images]

The YAML config is as follows:

defaults:
  - ../config/vl_traj_envs@_here_
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "agentic_pipeline_3b"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
render_save_dir: ./output/render
system_envs:
  USE_MODELSCOPE: '1'
  

track_with: wandb
tracker_kwargs:
 api_key: ""
 project: ""
 name: ${exp_name}_sokoban
 notes: "agentic_pipeline"
 tags:
   - agentic
   - roll
   - baseline

# track_with: tensorboard
# tracker_kwargs:
#   log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_sokoban

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

num_gpus_per_node: 8

max_steps: 1024
save_steps: 10000
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

rollout_batch_size: 1024
val_batch_size: 1024
sequence_length: 8192

advantage_clip: 0.2
ppo_epochs: 1
adv_estimator: "grpo"
#pg_clip: 0.1
#dual_clip_loss: True
init_kl_coef: 0.0
whiten_advantages: true
entropy_loss_coef: 0
max_grad_norm: 1.0
enable_old_logprobs_recompute: true
use_kl_loss: true
kl_loss_coef: 0.01

pretrain: qwen/Qwen2.5-VL-3B-Instruct
reward_pretrain: qwen/Qwen2.5-VL-3B-Instruct
actor_train:
  model_args:
    attn_implementation: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 2
    gradient_accumulation_steps: 8
    warmup_steps: 10
    lr_scheduler_type: cosine
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: deepspeed_train
    strategy_config: ${deepspeed_zero3}
    # strategy_name: megatron_train
    # strategy_config:
    #   tensor_model_parallel_size: 1
    #   pipeline_model_parallel_size: 1
    #   expert_model_parallel_size: 1
    #   use_distributed_optimizer: true
    #   recompute_granularity: full
  device_mapping: list(range(0,64))
  infer_batch_size: 2

actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: 128 # single-turn response length
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: 1
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.8
      block_size: 16
      load_format: auto
      enable_prefix_caching: false
      limit_mm_per_prompt:
        image: ${max_actions_per_traj}
  device_mapping: list(range(0,64))

reference:
  model_args:
    attn_implementation: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: deepspeed_infer
    strategy_config: ${deepspeed_zero3}
  device_mapping: list(range(0,64))
  infer_batch_size: 2

reward_normalization:
  grouping: traj_group_id # group_by key for computing reward/adv: tags (env_type) / traj_group_id (group) / batch (rollout_batch) ...
  method: mean_std # asym_clip / identity / mean_std

train_env_manager:
  format_penalty: -0.15 # sokoban env penalty_for_step=-0.1
  max_env_num_per_worker: 16
  num_env_groups: 128
  # under the same group, the env config and env seed are ensured to be equal
  group_size: 8
  tags: [SimpleSokoban]
  num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation

val_env_manager:
  max_env_num_per_worker: 32
  num_env_groups: 1024
  group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output
  tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake]
  num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation

# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64
max_tokens_per_step: 64

custom_envs:
  SimpleSokoban:
    ${custom_env.SimpleSokoban}
  LargerSokoban:
    ${custom_env.LargerSokoban}
  SokobanDifferentGridVocab:
    ${custom_env.SokobanDifferentGridVocab}
  FrozenLake:
    ${custom_env.FrozenLake}
  FrozenLakeThink:
    ${custom_env.FrozenLakeThink}
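For context on the setting being toggled: with use_kl_loss: true the trainer adds a per-token KL penalty against the reference model, scaled by kl_loss_coef. A minimal sketch, assuming the low-variance k3 estimator common in GRPO-style implementations (whether ROLL uses exactly this form is an assumption on my part):

```python
import math

def kl_penalty_k3(logprob: float, ref_logprob: float) -> float:
    """k3 KL estimator: exp(d) - d - 1 with d = ref_logprob - logprob.
    Non-negative, and zero when the policy and reference agree on the token."""
    d = ref_logprob - logprob
    return math.exp(d) - d - 1

# hypothetical per-token log-probs from the actor and reference models
logp, ref_logp = -1.2, -1.0
kl_loss_coef = 0.01  # matches kl_loss_coef in the config above
loss_term = kl_loss_coef * kl_penalty_k3(logp, ref_logp)
```

The estimator is zero when the two log-probs match and grows roughly quadratically for small gaps, but the exp(d) term grows fast when they diverge sharply, so differences in how log-probs are recomputed between the two vLLM versions would show up directly in this loss term.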

The Dockerfile is as follows:

FROM roll-registry-vpc.cn-hangzhou.cr.aliyuncs.com/roll/pytorch:nvcr-25.06-py3-torch280-vllm0102

RUN apt-get update && apt-get install --fix-missing -y -o Acquire::Retries=3 iproute2 tmux zip
COPY requirements_torch280_vllm.txt ./requirements_torch280_vllm.txt 
COPY requirements_common.txt ./requirements_common.txt 
COPY requirements_vision.txt ./requirements_vision.txt 
COPY mcore_adapter ./mcore_adapter
RUN pip install -r requirements_torch280_vllm.txt

Thanks in advance for your help 🙏
