diff --git a/osdc/clusters.yaml b/osdc/clusters.yaml
index 70e6bd4d..b36121e3 100644
--- a/osdc/clusters.yaml
+++ b/osdc/clusters.yaml
@@ -308,6 +308,19 @@ clusters:
     vpc_cidr: "10.4.0.0/16"
     node_compactor:
       min_node_age_seconds: 900
+    buildkit:
+      amd64_instance_type: m6id.24xlarge
+      amd64_pods_per_node: 2
+      arm64_instance_type: m7gd.16xlarge
+      arm64_pods_per_node: 4
+      autoscaling:
+        enabled: true
+        amd64_min: 2  # 1x m6id.24xlarge (2 pods/node)
+        amd64_max: 360  # ~90d peak ≈180, x2 for headroom
+        arm64_min: 4  # 1x m7gd.16xlarge (4 pods/node)
+        arm64_max: 30  # ~90d peak ≈15, x2 for headroom
+        amd64_fallback: 32  # if KEDA can't read metrics, hold the proven fixed pool
+        arm64_fallback: 8
     arc-runners:
       github_config_url: "https://github.com/pytorch"
       github_secret_name: meta-prod-aws-ue1
@@ -320,6 +333,8 @@ clusters:
       - arc
       - nodepools
       - arc-runners
+      - keda
+      - buildkit
       - pypi-cache
       - cache-enforcer
       - zombie-cleanup
@@ -346,9 +361,9 @@ clusters:
       arm64_pods_per_node: 4
       autoscaling:
         enabled: true
-        amd64_min: 32  # warm baseline = proven fixed pool (16x m6id.24xlarge)
+        amd64_min: 2  # 1x m6id.24xlarge (2 pods/node)
         amd64_max: 360  # ~90d peak ≈180, x2 for headroom
-        arm64_min: 8  # warm baseline = proven fixed pool (2x m7gd.16xlarge)
+        arm64_min: 4  # 1x m7gd.16xlarge (4 pods/node)
         arm64_max: 30  # ~90d peak ≈15, x2 for headroom
         amd64_fallback: 32  # if KEDA can't read metrics, hold the proven fixed pool
         arm64_fallback: 8