Skip to content
19 changes: 17 additions & 2 deletions osdc/clusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,19 @@ clusters:
vpc_cidr: "10.4.0.0/16"
node_compactor:
min_node_age_seconds: 900
buildkit:
amd64_instance_type: m6id.24xlarge
amd64_pods_per_node: 2
arm64_instance_type: m7gd.16xlarge
arm64_pods_per_node: 4
autoscaling:
enabled: true
amd64_min: 2 # 1x m6id.24xlarge (2 pods/node)
amd64_max: 360 # ~90d peak ≈180, x2 for headroom
arm64_min: 4 # 1x m7gd.16xlarge (4 pods/node)
arm64_max: 30 # ~90d peak ≈15, x2 for headroom
amd64_fallback: 32 # if KEDA can't read metrics, hold the proven fixed pool
arm64_fallback: 8
arc-runners:
github_config_url: "https://github.com/pytorch"
github_secret_name: meta-prod-aws-ue1
Expand All @@ -320,6 +333,8 @@ clusters:
- arc
- nodepools
- arc-runners
- keda
- buildkit
- pypi-cache
- cache-enforcer
- zombie-cleanup
Expand All @@ -346,9 +361,9 @@ clusters:
arm64_pods_per_node: 4
autoscaling:
enabled: true
amd64_min: 32 # warm baseline = proven fixed pool (16x m6id.24xlarge)
amd64_min: 2 # 1x m6id.24xlarge (2 pods/node)
amd64_max: 360 # ~90d peak ≈180, x2 for headroom
arm64_min: 8 # warm baseline = proven fixed pool (2x m7gd.16xlarge)
arm64_min: 4 # 1x m7gd.16xlarge (4 pods/node)
arm64_max: 30 # ~90d peak ≈15, x2 for headroom
amd64_fallback: 32 # if KEDA can't read metrics, hold the proven fixed pool
arm64_fallback: 8
Expand Down
Loading