1 change: 1 addition & 0 deletions docker/axolotl/axoltl-rocm.Dockerfile
@@ -58,6 +58,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \

# Install minio
RUN curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+    --location \
     --create-dirs \
     -o /minio-binaries/mc && \
     chown -hR ${USER_NAME} /minio-binaries/ && \
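The recurring change in this PR is the new `--location` flag on every MinIO client download (here and in the files below): without it, curl treats an HTTP redirect as a completed request and writes the short redirect body to /minio-binaries/mc instead of the binary. A minimal probe of that behavior, as a hypothetical illustration that is not part of the PR and assumes dl.min.io may answer with a 3xx:

```python
# Probe how dl.min.io answers when redirects are NOT followed,
# mimicking `curl` without `--location`. Hypothetical illustration.
import urllib.error
import urllib.request

class NoRedirect(urllib.request.HTTPRedirectHandler):
    def redirect_request(self, req, fp, code, msg, headers, newurl):
        return None  # refuse to follow, like curl without --location

opener = urllib.request.build_opener(NoRedirect())
url = "https://dl.min.io/client/mc/release/linux-amd64/mc"
try:
    with opener.open(url) as resp:
        print("served directly, status", resp.status)
except urllib.error.HTTPError as err:  # an unfollowed 3xx surfaces here
    print("redirected:", err.code, "->", err.headers.get("Location"))
```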
1 change: 1 addition & 0 deletions docker/llama-factory/llama-factory-rocm.Dockerfile
@@ -7,6 +7,7 @@ RUN git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git && \
# Install minio
RUN curl https://dl.min.io/client/mc/release/linux-amd64/mc \
     --create-dirs \
+    --location \
     -o /minio-binaries/mc && \
     chmod +x /minio-binaries/mc
ENV PATH="${PATH}:/minio-binaries/:/root/scripts/"
1 change: 1 addition & 0 deletions docker/logistics/logistics.Dockerfile
@@ -21,6 +21,7 @@ RUN pip install --no-cache-dir -r requirements.txt
# Install minio
RUN curl https://dl.min.io/client/mc/release/linux-amd64/mc \
     --create-dirs \
+    --location \
     -o /minio-binaries/mc && \
     chmod +x /minio-binaries/mc

@@ -425,6 +425,7 @@ data:
{{- end }}
echo "Installing MinIO:"
curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+  --location \
   --create-dirs \
   -o /minio-binaries/mc
chmod +x /minio-binaries/mc
@@ -6,6 +6,7 @@ echo '--------------------------------------------'
echo 'Installing minio client'
echo '--------------------------------------------'
curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+  --location \
   --create-dirs \
   -o /minio-binaries/mc
chmod +x /minio-binaries/mc
@@ -0,0 +1,40 @@

# Data download and preprocess script:
dataScript: |
  #!/usr/bin/env python3
  """Custom preprocessing script for generating random long sequences for speed tests."""
  import random
  import string
  import os
  import datasets

  def generate_random_string(length):
      return ''.join(random.choices(string.ascii_letters + string.digits + " ", k=length))

  rows = [
      {
          "messages": [
              {"role": "system", "content": "This is random content for speed testing and max sequence length (with truncation) validation."},
              {"role": "user", "content": generate_random_string(8192)},
              {"role": "assistant", "content": generate_random_string(8192)},
          ],
          "data_source": "random_generated",
          "extra_info": {"split": "test", "index": i, "skip_reason": ""},
      } for i in range(2048)
  ]
  dataset = datasets.Dataset.from_list(rows)
  dataset.to_json("/downloads/datasets/long-random-data.jsonl")  # Data files must be saved under this directory to be uploaded.

# Where the resources should be stored:
bucketDataDir: default-bucket/datasets/
bucketStorageHost: http://minio.minio-tenant-default.svc.cluster.local:80

# Bucket credentials from a secret:
bucketCredentialsSecret:
  name: minio-credentials
  accessKeyKey: minio-access-key
  secretKeyKey: minio-secret-key

# Storage configuration:
storageClass: mlstorage
storageQuantity: "128Mi"
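The dataScript above materializes 2048 chat rows whose user and assistant turns are 8192 random characters each, so max_seq_length truncation is guaranteed to trigger during the speed test. A sanity check of the generated file might look like the following hypothetical sketch, which assumes the `datasets` package is installed:

```python
# Sketch: verify the generated speed-test file before upload.
import datasets

ds = datasets.load_dataset(
    "json",
    data_files="/downloads/datasets/long-random-data.jsonl",
    split="train",
)
assert len(ds) == 2048
row = ds[0]
assert [m["role"] for m in row["messages"]] == ["system", "user", "assistant"]
assert len(row["messages"][1]["content"]) == 8192  # the random user turn
print(row["data_source"], row["extra_info"])
```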
1 change: 1 addition & 0 deletions workloads/llm-evaluation-judge/helm/templates/_helpers.tpl
@@ -10,6 +10,7 @@ echo '--------------------------------------------'
echo 'Installing and setting up minio client'
echo '--------------------------------------------'
curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+  --location \
   --create-dirs \
   -o /minio-binaries/mc
chmod +x /minio-binaries/mc
@@ -10,6 +10,7 @@ echo '--------------------------------------------'
echo 'Installing and setting up minio client'
echo '--------------------------------------------'
curl https://dl.min.io/client/mc/release/linux-amd64/mc \
+  --location \
   --create-dirs \
   -o /minio-binaries/mc
chmod +x /minio-binaries/mc
290 changes: 144 additions & 146 deletions workloads/llm-finetune-silogen-engine/helm/config_doc_dpo.md

Large diffs are not rendered by default.

293 changes: 145 additions & 148 deletions workloads/llm-finetune-silogen-engine/helm/config_doc_sft.md

Large diffs are not rendered by default.

@@ -0,0 +1,6 @@
# This works for the default path in the corresponding data download override
finetuning_config:
  data_conf:
    training_data:
      datasets:
        - path: default-bucket/datasets/long-random-data.jsonl
@@ -0,0 +1,91 @@
metadata:
  compatibleAccelerators:
    # MI300X:
    - 74a1
    - 74a9
    - 74b5
    - 74bd
    # MI325X:
    - 74a5
    - 74b9
    # MI350X:
    - 75a0
    - 75b0
    # MI355X:
    - 75a3
    - 75b3

# Canonical model name:
model: "google/gemma-3-27b-it"

# Resources:
downloadsReservedSize: 80Gi
checkpointsReservedSize: 160Gi
finetuningGpus: 1
memoryPerGpu: 192
cpuPerGpu: 8

# Runtime configuration:
distributedType: "auto-deepspeed-stage1"

### Finetuning config section ###
finetuning_config:
  method: sft
  data_conf:
    training_data:
      type: CONCATENATION
    validation_data:
      type: AUTO_SPLIT
      ratio: 0.1
    chat_template_name: "keep-original"
    missing_pad_token_strategy: "bos-repurpose"
  training_args:
    learning_rate: 0.000005
    max_grad_norm: 1.0
    weight_decay: 0.000001
    optim: "adamw_torch"
    num_train_epochs: 1
    lr_scheduler_type: cosine
    warmup_ratio: 0.01
    logging_strategy: steps
    logging_steps: 0.01
    save_strategy: "no"
    seed: 42
    bf16: true
    report_to:
      - none
    push_to_hub: false
    gradient_checkpointing: true
    gradient_checkpointing_kwargs:
      use_reentrant: false
    eval_steps: 0.2
    eval_strategy: "steps"
    metric_for_best_model: "loss"
    greater_is_better: false
    load_best_model_at_end: false
  batchsize_conf:
    max_per_device_train_batch_size: 1
  peft_conf:
    peft_type: "LORA"
    task_type: "CAUSAL_LM"
    peft_kwargs:
      r: 64
      lora_alpha: 16.0
      lora_dropout: 0.05
      target_modules:
        - q_proj
        - k_proj
        - v_proj
        - o_proj
        - up_proj
        - down_proj
        - gate_proj
  run_conf:
    model_args:
      torch_dtype: bfloat16
      attn_implementation: "flash_attention_2"
    resume_from_checkpoint: auto
  sft_args:
    max_seq_length: 8192

basemodel: hf://google/gemma-3-27b-it
aimManifest:
  modelId: "google/gemma-3-27b-it"
  aimId: "google/gemma-3-27b-it"
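Assuming the engine passes peft_kwargs through to PEFT unchanged (an assumption; the plumbing is not shown in this diff), the peft_conf block above corresponds roughly to:

```python
# Rough equivalent of the peft_conf block, assuming a pass-through to PEFT.
from peft import LoraConfig

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,                 # adapter rank
    lora_alpha=16.0,      # scaling numerator; effective scale alpha/r = 0.25
    lora_dropout=0.05,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "up_proj", "down_proj", "gate_proj",
    ],
)
```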
@@ -0,0 +1,91 @@
metadata:
  compatibleAccelerators:
    # AMD Radeon Pro series
    - 7551 # AI PRO R9700 / R9700S / R9600D

finetuningImage: ghcr.io/silogen/rocm-silogen-finetuning-worker:v0.7.2-rocm7.2
runAs: 1001

# Canonical model name:
model: "meta-llama/Llama-3.1-8B-Instruct"
hfDownloadExcludeGlob: "original/*"

# Resources:
downloadsReservedSize: 64Gi
checkpointsReservedSize: 128Gi
memoryPerGpu: 24
cpusPerGpu: 2
finetuningGpus: 1

# Runtime configuration:
distributedType: "auto-single-process"
mergeAdapter: true


### Finetuning config section ###
finetuning_config:
  method: sft
  data_conf:
    training_data:
      type: CONCATENATION
    validation_data:
      type: AUTO_SPLIT
      ratio: 0.1
    chat_template_name: "keep-original"
    missing_pad_token_strategy: "bos-repurpose"
  training_args:
    learning_rate: 0.00005
    max_grad_norm: 7.0
    weight_decay: 0.000001
    optim: "adamw_torch"
    num_train_epochs: 1
    lr_scheduler_type: cosine
    warmup_ratio: 0.01
    logging_strategy: steps
    logging_steps: 0.01
    save_strategy: steps
    save_steps: 0.2
    seed: 42
    bf16: true
    report_to:
      - none
    push_to_hub: false
    gradient_checkpointing: true
    gradient_checkpointing_kwargs:
      use_reentrant: true
    eval_steps: 0.1
    eval_strategy: "steps"
    metric_for_best_model: "loss"
    greater_is_better: false
    load_best_model_at_end: true
  batchsize_conf:
    max_per_device_train_batch_size: 1
  peft_conf:
    peft_type: "LORA"
    task_type: "CAUSAL_LM"
    peft_kwargs:
      r: 64
      lora_alpha: 16.0
      lora_dropout: 0.05
      target_modules:
        - q_proj
        - k_proj
        - v_proj
        - o_proj
        - up_proj
        - down_proj
        - gate_proj
        - lm_head
        - embed_tokens
  run_conf:
    model_args:
      dtype: bfloat16
      attn_implementation: "sdpa"
    resume_from_checkpoint: auto
  sft_args:
    max_seq_length: 8192

basemodel: hf://meta-llama/Llama-3.1-8B-Instruct
aimManifest:
  modelId: "meta-llama/Llama-3.1-8B-Instruct"
  aimId: "meta-llama/Llama-3.1-8B-Instruct"
@@ -21,7 +21,7 @@ model: "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Resources:
downloadsReservedSize: 256Gi
checkpointsReservedSize: 512Gi
-finetuningGpus: 8
+finetuningGpus: 1
memoryPerGpu: 192
cpuPerGpu: 8

@@ -55,9 +55,9 @@ finetuning_config:
    report_to:
      - none
    push_to_hub: false
-    gradient_checkpointing: true
-    gradient_checkpointing_kwargs:
-      use_reentrant: true
+    gradient_checkpointing: false
+    #gradient_checkpointing_kwargs:
+    #  use_reentrant: true
    eval_steps: 0.2
    eval_strategy: "steps"
    metric_for_best_model: "loss"
@@ -66,15 +66,31 @@ finetuning_config:
  batchsize_conf:
    max_per_device_train_batch_size: 1
  peft_conf:
-    peft_type: "NO_PEFT"
+    peft_type: "LORA"
+    task_type: "CAUSAL_LM"
+    peft_kwargs:
+      bias: "none"
+      fan_in_fan_out: false
+      lora_alpha: 16
+      lora_dropout: 0.05
+      r: 16
+      target_modules:
+        - "o_proj"
+        - "v_proj"
+        - "k_proj"
+        - "q_proj"
+        - "w1"
+        - "gate"
+        - "w3"
+        - "w2"
  run_conf:
    model_args:
      torch_dtype: bfloat16
      use_cache: false
      attn_implementation: "flash_attention_2"
    resume_from_checkpoint: auto
  sft_args:
-    max_seq_length: 8192
+    max_seq_length: 4096

basemodel: hf://mistralai/Mixtral-8x7B-Instruct-v0.1
aimManifest:
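The switch from NO_PEFT to LoRA, alongside dropping finetuningGpus from 8 to 1, disabling gradient checkpointing, and halving max_seq_length, trades full finetuning for a much smaller trainable footprint. A back-of-envelope count of the adapter parameters, with dimensions taken from the published Mixtral-8x7B config; this is illustrative arithmetic, not a measurement:

```python
# Crude adapter-size estimate for the LoRA target_modules above (r=16).
# Dimensions assumed from the published Mixtral-8x7B config.
r, layers, d_model, d_ff, n_experts, d_kv = 16, 32, 4096, 14336, 8, 1024

def lora_params(d_in, d_out):
    return r * (d_in + d_out)  # A is (r x d_in), B is (d_out x r)

attn = (lora_params(d_model, d_model)      # q_proj
        + lora_params(d_model, d_kv)       # k_proj (grouped-query KV)
        + lora_params(d_model, d_kv)       # v_proj
        + lora_params(d_model, d_model))   # o_proj
moe = n_experts * (lora_params(d_model, d_ff)     # w1
                   + lora_params(d_ff, d_model)   # w2
                   + lora_params(d_model, d_ff))  # w3
moe += lora_params(d_model, n_experts)            # router gate
total = layers * (attn + moe)
print(f"~{total / 1e6:.0f}M trainable adapter params vs ~46.7B base weights")
```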
@@ -0,0 +1,6 @@
finetuning_config:
  training_args:
    include_num_input_tokens_seen: 'non_padding'
    include_tokens_per_second: true
  sft_args:
    length_handling: "truncate"
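The training_args overrides map onto Hugging Face TrainingArguments fields, while length_handling under sft_args appears to be engine-specific. A sketch of the equivalent TrainingArguments, assuming a transformers version in which include_num_input_tokens_seen accepts "non_padding" in addition to a boolean:

```python
# Sketch of the equivalent TrainingArguments (version support assumed).
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="/workdir/checkpoints",            # placeholder path
    include_num_input_tokens_seen="non_padding",  # count only non-pad tokens
    include_tokens_per_second=True,               # log tokens/sec throughput
)
```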
@@ -119,6 +119,15 @@ merge_adapter $merge_base ./checkpoints/checkpoint-final-adapter ./checkpoints/c
echo 'Copying AIMModel manifest to checkpoint directory...'
cp /configs/aim-model-manifest.yaml /workdir/checkpoints/aim-model-manifest.yaml
{{- end }}
+# Copy preprocessor config into final checkpoint if present in downloaded base model.
+# This keeps deployment artifacts self-contained without a separate processor job.
+if [ -f /local_resources/basemodel/preprocessor_config.json ]; then
+  mkdir -p /workdir/checkpoints/checkpoint-final
+  cp /local_resources/basemodel/preprocessor_config.json /workdir/checkpoints/checkpoint-final/preprocessor_config.json
+  echo 'Copied preprocessor_config.json to checkpoint-final'
+else
+  echo 'No preprocessor_config.json found in basemodel, skipping copy'
+fi
{{- if not .Values.debug.skip_checkpoint_upload }}
# Once more to ensure everything gets uploaded
echo 'Training done, syncing once more...'
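The effect of the copied preprocessor_config.json is that the final checkpoint can be loaded as a processor without reaching back to the base model repo. A short sketch, assuming a base model such as gemma-3 that ships this file and the checkpoint path used in the script above:

```python
# Sketch: load the self-contained checkpoint produced by the script above.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("/workdir/checkpoints/checkpoint-final")
print(type(processor).__name__)  # processor class resolved from the copied config
```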