diff --git a/bitbrain/finetune/run.sh b/bitbrain/finetune/run.sh
index bae7524..eff94ad 100644
--- a/bitbrain/finetune/run.sh
+++ b/bitbrain/finetune/run.sh
@@ -1,53 +1,50 @@
-##########################
 # llama factory start script
 ##########################
 
-#CUDA_VISIBLE_DEVICES=5,6 python ../LLaMA-Factory/src/train.py \
-export LLaMA_PATH=/home/chenyuhang/LLaMA-Factory
+export LLaMA_PATH="local path to LLaMA-Factory"
 OUTPUT_DIR="output path"
 
-#export CUDA_DEVICE_ORDER=PCI_BUS_ID
-export CUDA_VISIBLE_DEVICES=4,5,6,7
+export CUDA_VISIBLE_DEVICES=0,1,2,3
 export NCCL_P2P_LEVEL=NVL
 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 
-# Optional mix strategy: --mix_strategy interleave_over\
-# --interleave_probs 0.1,0.35,0.2,0.2,0.1,0.05 \
-
-# Optional custom evaluation datasets: --eval_dataset ceval,cmmlu \
-
-FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=4,5,6,7 torchrun --nproc_per_node 4 $LLaMA_PATH/src/train.py \
+FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun --nproc_per_node 4 $LLaMA_PATH/src/train.py \
     --stage sft \
     --do_train \
-    --model_name_or_path "pretrained model path" \
+    --model_name_or_path "model path" \
     --cutoff_len 2048 \
     --dataset_dir "dataset path" \
-    --dataset shared_gpt_format\
+    --dataset chinese_instruct,deepctrl_200k,ultrachat_200k,code_feedback_custom \
     --overwrite_cache \
-    --max_samples 5000000 \
-    --packing True \
+    --enable_liger_kernel True \
+    --packing False \
     --use_swanlab true \
     --report_to swanlab \
-    --run_name sft_bit-brain \
+    --swanlab_project "swanlab project name" \
+    --run_name "run name" \
     --preprocessing_num_workers 30 \
     --template qwen \
     --finetuning_type full \
-    --output_dir ${OUTPUT_DIR}/sft \
+    --output_dir ${OUTPUT_DIR}/"subdirectory for this run" \
     --overwrite_output_dir \
-    --per_device_train_batch_size 16 \
+    --per_device_train_batch_size 4 \
     --per_device_eval_batch_size 4 \
     --do_eval \
     --val_size 100 \
     --eval_strategy steps \
     --eval_steps 1000 \
-    --flash_attn sdpa\
-    --gradient_accumulation_steps 4 \
+    --flash_attn fa2 \
+    --gradient_accumulation_steps 16 \
     --lr_scheduler_type cosine \
+    --warmup_ratio 0.0125 \
+    --max_grad_norm 1.0 \
     --logging_steps 10 \
-    --save_steps 500 \
-    --learning_rate 3e-4 \
+    --save_steps 5000 \
+    --learning_rate 2e-5 \
     --weight_decay 0.01 \
-    --num_train_epochs 4.0 \
+    --num_train_epochs 3.0 \
     --plot_loss \
-    --bf16 \
-    --resume_from_checkpoint "checkpoint path to resume training from"
+    --bf16
+
+    #--resume_from_checkpoint ${OUTPUT_DIR}/"subdirectory for this run/checkpoint-xxxx"
+