1 change: 1 addition & 0 deletions docker/axolotl/axoltl-rocm.Dockerfile
@@ -58,6 +58,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \

# Install minio
RUN curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+    --location \
     --create-dirs \
     -o /minio-binaries/mc && \
     chown -hR ${USER_NAME} /minio-binaries/ && \
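The recurring change in this PR is the new `--location` flag on every MinIO client download (here and in the files below): without it, curl treats an HTTP redirect as a completed request and writes the short redirect body to /minio-binaries/mc instead of the binary. A minimal probe of that behavior, as a hypothetical illustration that is not part of the PR and assumes dl.min.io may answer with a 3xx:

```python
# Probe how dl.min.io answers when redirects are NOT followed,
# mimicking `curl` without `--location`. Hypothetical illustration.
import urllib.error
import urllib.request

class NoRedirect(urllib.request.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        return None  # refuse to follow, like curl without --location

opener = urllib.request.build_opener(NoRedirect())
url = "https://dl.min.io/client/mc/release/linux-amd64/mc"
try:
    with opener.open(url) as resp:
        print("served directly, status", resp.status)
except urllib.error.HTTPError as err:  # an unfollowed 3xx surfaces here
    print("redirected:", err.code, "->", err.headers.get("Location"))
```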
1 change: 1 addition & 0 deletions docker/llama-factory/llama-factory-rocm.Dockerfile
@@ -7,6 +7,7 @@ RUN git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git && \
# Install minio
RUN curl https://dl.min.io/client/mc/release/linux-amd64/mc \
     --create-dirs \
+    --location \
     -o /minio-binaries/mc && \
     chmod +x /minio-binaries/mc
ENV PATH="${PATH}:/minio-binaries/:/root/scripts/"
1 change: 1 addition & 0 deletions docker/logistics/logistics.Dockerfile
@@ -21,6 +21,7 @@ RUN pip install --no-cache-dir -r requirements.txt
# Install minio
RUN curl https://dl.min.io/client/mc/release/linux-amd64/mc \
     --create-dirs \
+    --location \
     -o /minio-binaries/mc && \
     chmod +x /minio-binaries/mc

@@ -425,6 +425,7 @@ data:
{{- end }}
echo "Installing MinIO:"
curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+  --location \
   --create-dirs \
   -o /minio-binaries/mc
chmod +x /minio-binaries/mc
@@ -6,6 +6,7 @@ echo '--------------------------------------------'
echo 'Installing minio client'
echo '--------------------------------------------'
curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+  --location \
   --create-dirs \
   -o /minio-binaries/mc
chmod +x /minio-binaries/mc
@@ -0,0 +1,40 @@

# Data download and preprocess script:
dataScript: |
  #!/usr/bin/env python3
  """Custom preprocessing script for generating random long sequences for speed tests."""
  import random
  import string
  import os
  import datasets

  def generate_random_string(length):
      return ''.join(random.choices(string.ascii_letters + string.digits + " ", k=length))

  rows = [
      {
          "messages": [
              {"role": "system", "content": "This is random content for speed testing and max sequence length (with truncation) validation."},
              {"role": "user", "content": generate_random_string(8192)},
              {"role": "assistant", "content": generate_random_string(8192)},
          ],
          "data_source": "random_generated",
          "extra_info": {"split": "test", "index": i, "skip_reason": ""},
      } for i in range(2048)
  ]
  dataset = datasets.Dataset.from_list(rows)
  dataset.to_json("/downloads/datasets/long-random-data.jsonl")  # Data files must be saved under this directory to be uploaded.

# Where the resources should be stored:
bucketDataDir: default-bucket/datasets/
bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80

# Bucket credentials from a secret:
bucketCredentialsSecret:
  name: minio-credentials
  accessKeyKey: minio-access-key
  secretKeyKey: minio-secret-key

# Storage configuration:
storageClass: mlstorage
storageQuantity: "128Mi"
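The dataScript above materializes 2048 chat rows whose user and assistant turns are 8192 random characters each, so max_seq_length truncation is guaranteed to trigger during the speed test. A sanity check of the generated file might look like the following hypothetical sketch, which assumes the `datasets` package is installed:

```python
# Sketch: verify the generated speed-test file before upload.
import datasets

ds = datasets.load_dataset(
    "json",
    data_files="/downloads/datasets/long-random-data.jsonl",
    split="train",
)
assert len(ds) == 2048
row = ds[0]
assert [m["role"] for m in row["messages"]] == ["system", "user", "assistant"]
assert len(row["messages"][1]["content"]) == 8192  # the random user turn
print(row["data_source"], row["extra_info"])
```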
1 change: 1 addition & 0 deletions workloads/llm-evaluation-judge/helm/templates/_helpers.tpl
@@ -10,6 +10,7 @@ echo '--------------------------------------------'
echo 'Installing and setting up minio client'
echo '--------------------------------------------'
curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+  --location \
   --create-dirs \
   -o /minio-binaries/mc
chmod +x /minio-binaries/mc
@@ -10,6 +10,7 @@ echo '--------------------------------------------'
echo 'Installing and setting up minio client'
echo '--------------------------------------------'
curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+  --location \
   --create-dirs \
   -o /minio-binaries/mc
chmod +x /minio-binaries/mc
290 changes: 144 additions & 146 deletions workloads/llm-finetune-silogen-engine/helm/config_doc_dpo.md

Large diffs are not rendered by default.

293 changes: 145 additions & 148 deletions workloads/llm-finetune-silogen-engine/helm/config_doc_sft.md

Large diffs are not rendered by default.

@@ -0,0 +1,6 @@
# This works for the default path in the corresponding data download override
finetuning_config:
  data_conf:
    training_data:
      datasets:
        - path: default-bucket/datasets/long-random-data.jsonl
@@ -0,0 +1,91 @@
metadata:
  compatibleAccelerators:
    # MI300X:
    - 74a1
    - 74a9
    - 74b5
    - 74bd
    # MI325X:
    - 74a5
    - 74b9
    # MI350X:
    - 75a0
    - 75b0
    # MI355X:
    - 75a3
    - 75b3

# Canonical model name:
model: "google/gemma-3-27b-it"

# Resources:
downloadsReservedSize: 80Gi
checkpointsReservedSize: 160Gi
finetuningGpus: 1
memoryPerGpu: 192
cpuPerGpu: 8

# Runtime configuration:
distributedType: "auto-deepspeed-stage1"

### Finetuning config section ###
finetuning_config:
  method: sft
  data_conf:
    training_data:
      type: CONCATENATION
    validation_data:
      type: AUTO_SPLIT
      ratio: 0.1
    chat_template_name: "keep-original"
    missing_pad_token_strategy: "bos-repurpose"
  training_args:
    learning_rate: 0.000005
    max_grad_norm: 1.0
    weight_decay: 0.000001
    optim: "adamw_torch"
    num_train_epochs: 1
    lr_scheduler_type: cosine
    warmup_ratio: 0.01
    logging_strategy: steps
    logging_steps: 0.01
    save_strategy: "no"
    seed: 42
    bf16: true
    report_to:
      - none
    push_to_hub: false
    gradient_checkpointing: true
    gradient_checkpointing_kwargs:
      use_reentrant: false
    eval_steps: 0.2
    eval_strategy: "steps"
    metric_for_best_model: "loss"
    greater_is_better: false
    load_best_model_at_end: false
  batchsize_conf:
    max_per_device_train_batch_size: 1
  peft_conf:
    peft_type: "LORA"
    task_type: "CAUSAL_LM"
    peft_kwargs:
      r: 64
      lora_alpha: 16.0
      lora_dropout: 0.05
      target_modules:
        - q_proj
        - k_proj
        - v_proj
        - o_proj
        - up_proj
        - down_proj
        - gate_proj
  run_conf:
    model_args:
      torch_dtype: bfloat16
      attn_implementation: "flash_attention_2"
    resume_from_checkpoint: auto
  sft_args:
    max_seq_length: 8192

basemodel: hf://google/gemma-3-27b-it
aimManifest:
  modelId: "google/gemma-3-27b-it"
  aimId: "google/gemma-3-27b-it"
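Assuming the engine passes peft_kwargs through to PEFT unchanged (an assumption; the plumbing is not shown in this diff), the peft_conf block above corresponds roughly to:

```python
# Rough equivalent of the peft_conf block, assuming a pass-through to PEFT.
from peft import LoraConfig

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,                 # adapter rank
    lora_alpha=16.0,      # scaling numerator; effective scale alpha/r = 0.25
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "up_proj", "down_proj", "gate_proj",
    ],
)
```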
@@ -0,0 +1,91 @@
metadata:
  compatibleAccelerators:
    # AMD Radeon Pro series
    - 7551 # AI PRO R9700 / R9700S / R9600D

finetuningImage: ghcr.io/silogen/rocm-silogen-finetuning-worker:v0.7.2-rocm7.2
runAs: 1001

# Canonical model name:
model: "meta-llama/Llama-3.1-8B-Instruct"
hfDownloadExcludeGlob: "original/*"

# Resources:
downloadsReservedSize: 64Gi
checkpointsReservedSize: 128Gi
memoryPerGpu: 24
cpusPerGpu: 2
finetuningGpus: 1

# Runtime configuration:
distributedType: "auto-single-process"
mergeAdapter: true


### Finetuning config section ###
finetuning_config:
  method: sft
  data_conf:
    training_data:
      type: CONCATENATION
    validation_data:
      type: AUTO_SPLIT
      ratio: 0.1
    chat_template_name: "keep-original"
    missing_pad_token_strategy: "bos-repurpose"
  training_args:
    learning_rate: 0.00005
    max_grad_norm: 7.0
    weight_decay: 0.000001
    optim: "adamw_torch"
    num_train_epochs: 1
    lr_scheduler_type: cosine
    warmup_ratio: 0.01
    logging_strategy: steps
    logging_steps: 0.01
    save_strategy: steps
    save_steps: 0.2
    seed: 42
    bf16: true
    report_to:
      - none
    push_to_hub: false
    gradient_checkpointing: true
    gradient_checkpointing_kwargs:
      use_reentrant: true
    eval_steps: 0.1
    eval_strategy: "steps"
    metric_for_best_model: "loss"
    greater_is_better: false
    load_best_model_at_end: true
  batchsize_conf:
    max_per_device_train_batch_size: 1
  peft_conf:
    peft_type: "LORA"
    task_type: "CAUSAL_LM"
    peft_kwargs:
      r: 64
      lora_alpha: 16.0
      lora_dropout: 0.05
      target_modules:
        - q_proj
        - k_proj
        - v_proj
        - o_proj
        - up_proj
        - down_proj
        - gate_proj
        - lm_head
        - embed_tokens
  run_conf:
    model_args:
      dtype: bfloat16
      attn_implementation: "sdpa"
    resume_from_checkpoint: auto
  sft_args:
    max_seq_length: 8192

basemodel: hf://meta-llama/Llama-3.1-8B-Instruct
aimManifest:
  modelId: "meta-llama/Llama-3.1-8B-Instruct"
  aimId: "meta-llama/Llama-3.1-8B-Instruct"
@@ -21,7 +21,7 @@ model: "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Resources:
downloadsReservedSize: 256Gi
checkpointsReservedSize: 512Gi
-finetuningGpus: 8
+finetuningGpus: 1
memoryPerGpu: 192
cpuPerGpu: 8

@@ -55,9 +55,9 @@ finetuning_config:
    report_to:
      - none
    push_to_hub: false
-    gradient_checkpointing: true
-    gradient_checkpointing_kwargs:
-      use_reentrant: true
+    gradient_checkpointing: false
+    #gradient_checkpointing_kwargs:
+    #  use_reentrant: true
    eval_steps: 0.2
    eval_strategy: "steps"
    metric_for_best_model: "loss"
@@ -66,15 +66,31 @@ finetuning_config:
  batchsize_conf:
    max_per_device_train_batch_size: 1
  peft_conf:
-    peft_type: "NO_PEFT"
+    peft_type: "LORA"
+    task_type: "CAUSAL_LM"
+    peft_kwargs:
+      bias: "none"
+      fan_in_fan_out: false
+      lora_alpha: 16
+      lora_dropout: 0.05
+      r: 16
+      target_modules:
+        - "o_proj"
+        - "v_proj"
+        - "k_proj"
+        - "q_proj"
+        - "w1"
+        - "gate"
+        - "w3"
+        - "w2"
  run_conf:
    model_args:
      torch_dtype: bfloat16
      use_cache: false
      attn_implementation: "flash_attention_2"
    resume_from_checkpoint: auto
  sft_args:
-    max_seq_length: 8192
+    max_seq_length: 4096

basemodel: hf://mistralai/Mixtral-8x7B-Instruct-v0.1
aimManifest:
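The switch from NO_PEFT to LoRA, alongside dropping finetuningGpus from 8 to 1, disabling gradient checkpointing, and halving max_seq_length, trades full finetuning for a much smaller trainable footprint. A back-of-envelope count of the adapter parameters, with dimensions taken from the published Mixtral-8x7B config; this is illustrative arithmetic, not a measurement:

```python
# Crude adapter-size estimate for the LoRA target_modules above (r=16).
# Dimensions assumed from the published Mixtral-8x7B config.
r, layers, d_model, d_ff, n_experts, d_kv = 16, 32, 4096, 14336, 8, 1024

def lora_params(d_in, d_out):
    return r * (d_in + d_out)  # A is (r x d_in), B is (d_out x r)

attn = (lora_params(d_model, d_model)      # q_proj
        + lora_params(d_model, d_kv)       # k_proj (grouped-query KV)
        + lora_params(d_model, d_kv)       # v_proj
        + lora_params(d_model, d_model))   # o_proj
moe = n_experts * (lora_params(d_model, d_ff)     # w1
                   + lora_params(d_ff, d_model)   # w2
                   + lora_params(d_model, d_ff))  # w3
moe += lora_params(d_model, n_experts)            # router gate
total = layers * (attn + moe)
print(f"~{total / 1e6:.0f}M trainable adapter params vs ~46.7B base weights")
```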
@@ -0,0 +1,6 @@
finetuning_config:
  training_args:
    include_num_input_tokens_seen: 'non_padding'
    include_tokens_per_second: true
  sft_args:
    length_handling: "truncate"
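The training_args overrides map onto Hugging Face TrainingArguments fields, while length_handling under sft_args appears to be engine-specific. A sketch of the equivalent TrainingArguments, assuming a transformers version in which include_num_input_tokens_seen accepts "non_padding" in addition to a boolean:

```python
# Sketch of the equivalent TrainingArguments (version support assumed).
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="/workdir/checkpoints",            # placeholder path
    include_num_input_tokens_seen="non_padding",  # count only non-pad tokens
    include_tokens_per_second=True,               # log tokens/sec throughput
)
```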
@@ -119,6 +119,15 @@ merge_adapter $merge_base ./checkpoints/checkpoint-final-adapter ./checkpoints/c
echo 'Copying AIMModel manifest to checkpoint directory...'
cp /configs/aim-model-manifest.yaml /workdir/checkpoints/aim-model-manifest.yaml
{{- end }}
+# Copy preprocessor config into final checkpoint if present in downloaded base model.
+# This keeps deployment artifacts self-contained without a separate processor job.
+if [ -f /local_resources/basemodel/preprocessor_config.json ]; then
+  mkdir -p /workdir/checkpoints/checkpoint-final
+  cp /local_resources/basemodel/preprocessor_config.json /workdir/checkpoints/checkpoint-final/preprocessor_config.json
+  echo 'Copied preprocessor_config.json to checkpoint-final'
+else
+  echo 'No preprocessor_config.json found in basemodel, skipping copy'
+fi
{{- if not .Values.debug.skip_checkpoint_upload }}
# Once more to ensure everything gets uploaded
echo 'Training done, syncing once more...'
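The effect of the copied preprocessor_config.json is that the final checkpoint can be loaded as a processor without reaching back to the base model repo. A short sketch, assuming a base model such as gemma-3 that ships this file and the checkpoint path used in the script above:

```python
# Sketch: load the self-contained checkpoint produced by the script above.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("/workdir/checkpoints/checkpoint-final")
print(type(processor).__name__)  # processor class resolved from the copied config
```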