diff --git a/osdc/modules/bin-pack-scheduler/kubernetes/config.yaml b/osdc/modules/bin-pack-scheduler/kubernetes/config.yaml
new file mode 100644
index 00000000..dc5e2b8a
--- /dev/null
+++ b/osdc/modules/bin-pack-scheduler/kubernetes/config.yaml
@@ -0,0 +1,24 @@
+# KubeSchedulerConfiguration for the bin-pack-scheduler secondary scheduler.
+# Stock upstream kube-scheduler; only NodeResourcesFit scoring is overridden to
+# MostAllocated (pack onto the fullest node). Every other plugin/default is
+# inherited. EKS's managed default scheduler can't be reconfigured, hence a
+# second scheduler that pods opt into via spec.schedulerName.
+apiVersion: kubescheduler.config.k8s.io/v1
+kind: KubeSchedulerConfiguration
+leaderElection:
+  leaderElect: true  # the Deployment runs 2 replicas; only the lease holder schedules
+  # MUST differ from the managed scheduler's lease ("kube-scheduler").
+  resourceName: bin-pack-scheduler
+  resourceNamespace: kube-system
+profiles:
+  - schedulerName: bin-pack-scheduler  # the value pods put in spec.schedulerName to opt in
+    pluginConfig:
+      - name: NodeResourcesFit
+        args:
+          scoringStrategy:
+            # cpu/memory weights match the upstream default, so `type` is the
+            # only delta from the managed default-scheduler.
+            type: MostAllocated
+            resources:
+              - {name: cpu, weight: 1}
+              - {name: memory, weight: 1}
diff --git a/osdc/modules/bin-pack-scheduler/kubernetes/deployment.yaml b/osdc/modules/bin-pack-scheduler/kubernetes/deployment.yaml
new file mode 100644
index 00000000..5509572a
--- /dev/null
+++ b/osdc/modules/bin-pack-scheduler/kubernetes/deployment.yaml
@@ -0,0 +1,73 @@
+# bin-pack-scheduler: a stock kube-scheduler run as a secondary scheduler.
+# Image tracks eks_version (clusters.yaml); a guard test enforces the minor
+# matches. Only ±1 minor skew is tolerated, so bump on EKS upgrade.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: bin-pack-scheduler
+  namespace: kube-system
+  labels:
+    osdc.io/module: bin-pack-scheduler
+    app: bin-pack-scheduler
+spec:
+  replicas: 2 # HA via leader election; only one schedules at a time
+  selector:
+    matchLabels:
+      app: bin-pack-scheduler
+  template:
+    metadata:
+      labels:
+        app: bin-pack-scheduler
+        osdc.io/module: bin-pack-scheduler
+    spec:
+      serviceAccountName: bin-pack-scheduler
+      priorityClassName: system-cluster-critical
+      # Run on stable infra nodes, not ephemeral runner/workflow nodes.
+      nodeSelector:
+        role: base-infrastructure
+      tolerations:
+        - key: CriticalAddonsOnly
+          operator: Equal  # NOTE(review): only matches a taint whose value is exactly "true" — confirm against the node taint
+          value: "true"
+          effect: NoSchedule
+      # Spread the 2 replicas across nodes so a single node loss keeps a leader.
+      affinity:
+        podAntiAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:  # soft rule: replicas may still share a node
+            - weight: 100
+              podAffinityTerm:
+                topologyKey: kubernetes.io/hostname
+                labelSelector:
+                  matchLabels:
+                    app: bin-pack-scheduler
+      containers:
+        - name: kube-scheduler  # the image-version guard test looks this container up by name
+          image: registry.k8s.io/kube-scheduler:v1.35.0
+          command:
+            - kube-scheduler
+            - --config=/etc/kubernetes/bin-pack-scheduler/config.yaml  # file supplied by the "config" volume mount below
+            - --v=2
+          ports:
+            - name: https
+              containerPort: 10259  # same port both probes target over HTTPS
+              protocol: TCP
+          resources:
+            requests: {cpu: 200m, memory: 256Mi}
+            limits: {cpu: 500m, memory: 512Mi}
+          livenessProbe:
+            httpGet: {path: /healthz, port: 10259, scheme: HTTPS}
+            initialDelaySeconds: 15
+          readinessProbe:
+            httpGet: {path: /readyz, port: 10259, scheme: HTTPS}
+          securityContext:
+            allowPrivilegeEscalation: false
+            readOnlyRootFilesystem: true
+            runAsNonRoot: true  # NOTE(review): no runAsUser is set — confirm the image declares a non-root user, otherwise the kubelet refuses to start the container
+          volumeMounts:
+            - name: config
+              mountPath: /etc/kubernetes/bin-pack-scheduler
+              readOnly: true
+      volumes:
+        - name: config
+          configMap:
+            name: bin-pack-scheduler-config # kustomize rewrites to the hashed name
diff --git a/osdc/modules/bin-pack-scheduler/kubernetes/kustomization.yaml b/osdc/modules/bin-pack-scheduler/kubernetes/kustomization.yaml
new file mode 100644
index 00000000..769bb25f
--- /dev/null
+++ b/osdc/modules/bin-pack-scheduler/kubernetes/kustomization.yaml
@@ -0,0 +1,18 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:
+  - rbac.yaml
+  - deployment.yaml
+
+# Generate the config ConfigMap with a content-hash suffix so the Deployment
+# auto-rolls whenever config.yaml changes (no manual rollout restart needed).
+configMapGenerator:
+  - name: bin-pack-scheduler-config  # must match the configMap name referenced in deployment.yaml
+    namespace: kube-system
+    files:
+      - config.yaml
+
+generatorOptions:
+  labels:
+    osdc.io/module: bin-pack-scheduler  # same module label the other manifests carry
diff --git a/osdc/modules/bin-pack-scheduler/kubernetes/rbac.yaml b/osdc/modules/bin-pack-scheduler/kubernetes/rbac.yaml
new file mode 100644
index 00000000..11c87acb
--- /dev/null
+++ b/osdc/modules/bin-pack-scheduler/kubernetes/rbac.yaml
@@ -0,0 +1,91 @@
+# RBAC for the bin-pack-scheduler. Reuses the built-in system:kube-scheduler
+# and system:volume-scheduler ClusterRoles (track upstream across k8s versions)
+# and adds get/update on our own leader-election Lease, which system:kube-scheduler
+# scopes to resourceName "kube-scheduler" only.
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: bin-pack-scheduler  # identity the Deployment runs as (serviceAccountName)
+  namespace: kube-system
+  labels:
+    osdc.io/module: bin-pack-scheduler
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: bin-pack-scheduler
+  labels:
+    osdc.io/module: bin-pack-scheduler
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: system:kube-scheduler  # built-in role, bound as-is rather than copied
+subjects:
+  - kind: ServiceAccount
+    name: bin-pack-scheduler
+    namespace: kube-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: bin-pack-scheduler-volume
+  labels:
+    osdc.io/module: bin-pack-scheduler
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: system:volume-scheduler  # built-in role, bound as-is rather than copied
+subjects:
+  - kind: ServiceAccount
+    name: bin-pack-scheduler
+    namespace: kube-system
+---
+# Lets the scheduler read the extension-apiserver-authentication ConfigMap so
+# its HTTPS serving port (10259) can set up delegated auth cleanly on startup.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: bin-pack-scheduler-auth-reader
+  namespace: kube-system
+  labels:
+    osdc.io/module: bin-pack-scheduler
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: extension-apiserver-authentication-reader  # NOTE(review): not defined in this module — presumably the built-in kube-system Role; confirm
+subjects:
+  - kind: ServiceAccount
+    name: bin-pack-scheduler
+    namespace: kube-system
+---
+# Lease create is already granted cluster-wide by system:kube-scheduler;
+# get/update on our custom lease name is not, so grant it here.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: bin-pack-scheduler-lease
+  namespace: kube-system
+  labels:
+    osdc.io/module: bin-pack-scheduler
+rules:
+  - apiGroups: ["coordination.k8s.io"]
+    resources: ["leases"]
+    resourceNames: ["bin-pack-scheduler"]
+    verbs: ["get", "update"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: bin-pack-scheduler-lease
+  namespace: kube-system
+  labels:
+    osdc.io/module: bin-pack-scheduler
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: bin-pack-scheduler-lease
+subjects:
+  - kind: ServiceAccount
+    name: bin-pack-scheduler
+    namespace: kube-system
diff --git a/osdc/modules/bin-pack-scheduler/scripts/python/test_image_version.py b/osdc/modules/bin-pack-scheduler/scripts/python/test_image_version.py
new file mode 100644
index 00000000..6f4eb247
--- /dev/null
+++ b/osdc/modules/bin-pack-scheduler/scripts/python/test_image_version.py
@@ -0,0 +1,42 @@
+"""Guard: the bin-pack-scheduler image minor must track clusters.yaml eks_version.
+
+kube-scheduler tolerates only +/-1 minor skew from the API server (the EKS
+control-plane version). This pure-logic test fails loudly if someone bumps
+eks_version without bumping the scheduler image (or vice-versa), so the two
+can't silently diverge across an EKS upgrade. Runs in `just test` — no cluster.
+"""
+
+from pathlib import Path
+
+import yaml
+
+_OSDC_ROOT = Path(__file__).resolve().parents[4]
+_DEPLOYMENT = _OSDC_ROOT / "modules" / "bin-pack-scheduler" / "kubernetes" / "deployment.yaml"
+_CLUSTERS = _OSDC_ROOT / "clusters.yaml"
+
+
+def _scheduler_image_minor():
+    """Return the major.minor of the kube-scheduler container image (e.g. '1.35').
+
+    A trailing ``@sha256:...`` digest is dropped before the tag is read, so a
+    digest-pinned image reference still yields the minor of its tag.
+    """
+    doc = yaml.safe_load(_DEPLOYMENT.read_text())
+    containers = doc["spec"]["template"]["spec"]["containers"]
+    image = next(c["image"] for c in containers if c["name"] == "kube-scheduler")
+    tagged = image.partition("@")[0]  # "name:v1.35.0@sha256:..." -> "name:v1.35.0"
+    tag = tagged.rsplit(":", 1)[1].lstrip("v")  # "v1.35.0" -> "1.35.0"
+    return ".".join(tag.split(".")[:2])  # -> "1.35"
+
+
+def test_scheduler_image_minor_matches_eks_version():
+    """Fail when the image minor and clusters.yaml eks_version differ."""
+    # str(): an unquoted YAML scalar such as 1.35 is loaded as a float, which
+    # would never compare equal to the string returned above.
+    eks_version = str(yaml.safe_load(_CLUSTERS.read_text())["defaults"]["eks_version"])
+    image_minor = _scheduler_image_minor()
+    msg = (
+        f"bin-pack-scheduler image minor is {image_minor!r} but clusters.yaml "
+        f"eks_version is {eks_version!r}; bump them together on EKS upgrade."
+    )
+    assert image_minor == eks_version, msg
diff --git a/osdc/modules/bin-pack-scheduler/tests/smoke/conftest.py b/osdc/modules/bin-pack-scheduler/tests/smoke/conftest.py
new file mode 100644
index 00000000..8dfd070c
--- /dev/null
+++ b/osdc/modules/bin-pack-scheduler/tests/smoke/conftest.py
@@ -0,0 +1 @@
+from smoke_conftest import *  # noqa: F403
diff --git a/osdc/modules/bin-pack-scheduler/tests/smoke/test_bin_pack_scheduler.py b/osdc/modules/bin-pack-scheduler/tests/smoke/test_bin_pack_scheduler.py
new file mode 100644
index 00000000..34145109
--- /dev/null
+++ b/osdc/modules/bin-pack-scheduler/tests/smoke/test_bin_pack_scheduler.py
@@ -0,0 +1,26 @@
+"""Smoke tests for the bin-pack-scheduler secondary scheduler.
+
+Validates that the scheduler Deployment is ready and has acquired its
+leader-election Lease (so it is actually scheduling, not just running).
+"""
+
+from __future__ import annotations
+
+import pytest
+from helpers import assert_deployment_ready, run_kubectl
+
+pytestmark = [pytest.mark.live]  # applies the `live` marker to every test in this module
+
+NAMESPACE = "kube-system"
+NAME = "bin-pack-scheduler"  # used below as both the Deployment name and the Lease name
+
+
+class TestBinPackScheduler:
+    def test_deployment_ready(self, all_deployments: dict) -> None:
+        assert_deployment_ready(all_deployments, NAMESPACE, NAME)
+
+    def test_leader_lease_held(self) -> None:
+        """Leader election Lease exists and names a holder (lease freshness is not checked)."""
+        lease = run_kubectl(["get", "lease", NAME], namespace=NAMESPACE)  # assumes the parsed object is returned as a dict — TODO confirm
+        holder = lease.get("spec", {}).get("holderIdentity")  # None when spec or holderIdentity is absent
+        assert holder, f"Lease {NAME} has no holderIdentity — no scheduler is leading"