Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions osdc/modules/bin-pack-scheduler/kubernetes/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Scheduler configuration for bin-pack-scheduler, a second kube-scheduler that
# runs next to the EKS-managed default one (which can't be reconfigured). Pods
# opt in through spec.schedulerName. This is stock upstream kube-scheduler: the
# only override is NodeResourcesFit scoring, switched to MostAllocated so pods
# are packed onto the fullest node. Every other plugin/default is inherited.
kind: KubeSchedulerConfiguration
apiVersion: kubescheduler.config.k8s.io/v1
leaderElection:
  leaderElect: true
  resourceNamespace: kube-system
  # Must NOT be "kube-scheduler": that lease belongs to the managed scheduler.
  resourceName: bin-pack-scheduler
profiles:
  - schedulerName: bin-pack-scheduler
    pluginConfig:
      - name: NodeResourcesFit
        args:
          scoringStrategy:
            # `type` is the sole delta from the managed default-scheduler; the
            # cpu/memory weights below match the upstream default.
            type: MostAllocated
            resources:
              - name: cpu
                weight: 1
              - name: memory
                weight: 1
73 changes: 73 additions & 0 deletions osdc/modules/bin-pack-scheduler/kubernetes/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# bin-pack-scheduler: a stock kube-scheduler run as a secondary scheduler.
# Image tracks eks_version (clusters.yaml); a guard test enforces that the
# minor matches. Per the Kubernetes version skew policy kube-scheduler must not
# be newer than the API server and may be at most one minor version older, so
# bump the image together with every EKS upgrade.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: bin-pack-scheduler
  namespace: kube-system
  labels:
    osdc.io/module: bin-pack-scheduler
    app: bin-pack-scheduler
spec:
  replicas: 2  # HA via leader election; only one schedules at a time
  selector:
    matchLabels:
      app: bin-pack-scheduler
  template:
    metadata:
      labels:
        app: bin-pack-scheduler
        osdc.io/module: bin-pack-scheduler
    spec:
      serviceAccountName: bin-pack-scheduler
      priorityClassName: system-cluster-critical
      # Run on stable infra nodes, not ephemeral runner/workflow nodes.
      nodeSelector:
        role: base-infrastructure
      tolerations:
        - key: CriticalAddonsOnly
          operator: Equal
          value: "true"
          effect: NoSchedule
      # Spread the 2 replicas across nodes so a single node loss keeps a leader.
      # Only a preference: both replicas may still land on one node if needed.
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
            - weight: 100
              podAffinityTerm:
                topologyKey: kubernetes.io/hostname
                labelSelector:
                  matchLabels:
                    app: bin-pack-scheduler
      containers:
        # The container name is looked up by the image-version guard test.
        - name: kube-scheduler
          image: registry.k8s.io/kube-scheduler:v1.35.0
          command:
            - kube-scheduler
            - --config=/etc/kubernetes/bin-pack-scheduler/config.yaml
            - --v=2
          ports:
            - name: https
              containerPort: 10259
              protocol: TCP
          resources:
            requests:
              cpu: 200m
              memory: 256Mi
            limits:
              cpu: 500m
              memory: 512Mi
          livenessProbe:
            httpGet:
              path: /healthz
              port: 10259
              scheme: HTTPS
            initialDelaySeconds: 15
          readinessProbe:
            httpGet:
              path: /readyz
              port: 10259
              scheme: HTTPS
          securityContext:
            allowPrivilegeEscalation: false
            readOnlyRootFilesystem: true
            runAsNonRoot: true
            # runAsNonRoot only *verifies* the UID; it does not pick one. The
            # upstream kube-scheduler image appears to declare no non-root USER
            # (verify for the pinned tag), in which case the kubelet refuses to
            # start the container. Pin an explicit unprivileged UID/GID
            # (65534 = nobody) so the check always passes.
            runAsUser: 65534
            runAsGroup: 65534
            # The scheduler only talks to the API server and serves on an
            # unprivileged port, so it needs no Linux capabilities.
            capabilities:
              drop:
                - ALL
          volumeMounts:
            - name: config
              mountPath: /etc/kubernetes/bin-pack-scheduler
              readOnly: true
      volumes:
        - name: config
          configMap:
            name: bin-pack-scheduler-config  # kustomize rewrites to the hashed name
18 changes: 18 additions & 0 deletions osdc/modules/bin-pack-scheduler/kubernetes/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1

resources:
  - rbac.yaml
  - deployment.yaml

# Labels applied to everything the generators below emit.
generatorOptions:
  labels:
    osdc.io/module: bin-pack-scheduler

# config.yaml ships as a generated ConfigMap whose name carries a content-hash
# suffix, so any change to config.yaml renames the ConfigMap and the Deployment
# auto-rolls (no manual rollout restart needed).
configMapGenerator:
  - name: bin-pack-scheduler-config
    namespace: kube-system
    files:
      - config.yaml
91 changes: 91 additions & 0 deletions osdc/modules/bin-pack-scheduler/kubernetes/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# RBAC for the bin-pack-scheduler. Reuses the built-in system:kube-scheduler
# and system:volume-scheduler ClusterRoles (track upstream across k8s versions)
# and adds get/update on our own leader-election Lease, which system:kube-scheduler
# scopes to resourceName "kube-scheduler" only.
---
# Identity that the role bindings in this file grant permissions to.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: kube-system
  name: bin-pack-scheduler
  labels:
    osdc.io/module: bin-pack-scheduler
---
# Grants the built-in system:kube-scheduler ClusterRole to our ServiceAccount.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: bin-pack-scheduler
  labels:
    osdc.io/module: bin-pack-scheduler
subjects:
  - kind: ServiceAccount
    namespace: kube-system
    name: bin-pack-scheduler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:kube-scheduler
---
# Grants the built-in system:volume-scheduler ClusterRole to our ServiceAccount.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: bin-pack-scheduler-volume
  labels:
    osdc.io/module: bin-pack-scheduler
subjects:
  - kind: ServiceAccount
    namespace: kube-system
    name: bin-pack-scheduler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: system:volume-scheduler
---
# The scheduler's HTTPS serving port (10259) uses delegated auth, which reads
# the extension-apiserver-authentication ConfigMap on startup. Binding the
# reader Role lets that setup complete cleanly.
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  namespace: kube-system
  name: bin-pack-scheduler-auth-reader
  labels:
    osdc.io/module: bin-pack-scheduler
subjects:
  - kind: ServiceAccount
    namespace: kube-system
    name: bin-pack-scheduler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: extension-apiserver-authentication-reader
---
# system:kube-scheduler already grants Lease `create` cluster-wide, but its
# get/update rule does not cover our custom lease name, so add those two verbs
# for that one Lease here.
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  namespace: kube-system
  name: bin-pack-scheduler-lease
  labels:
    osdc.io/module: bin-pack-scheduler
rules:
  - apiGroups:
      - "coordination.k8s.io"
    resources:
      - "leases"
    resourceNames:
      - "bin-pack-scheduler"
    verbs:
      - "get"
      - "update"
---
# Attaches the bin-pack-scheduler-lease Role to the scheduler's ServiceAccount.
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  namespace: kube-system
  name: bin-pack-scheduler-lease
  labels:
    osdc.io/module: bin-pack-scheduler
subjects:
  - kind: ServiceAccount
    namespace: kube-system
    name: bin-pack-scheduler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: bin-pack-scheduler-lease
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""Guard: the bin-pack-scheduler image minor must track clusters.yaml eks_version.

Per the Kubernetes version skew policy, kube-scheduler must not be newer than
the API server (the EKS control-plane version) and may be at most one minor
version older. This pure-logic test fails loudly if someone bumps eks_version
without bumping the scheduler image (or vice-versa), so the two can't silently
diverge across an EKS upgrade. Runs in `just test` — no cluster.
"""

from pathlib import Path

import yaml

# Fifth ancestor directory of this test file; the two inputs are resolved from it.
_OSDC_ROOT = Path(__file__).resolve().parents[4]
# Cluster definitions; the guard reads defaults.eks_version from here.
_CLUSTERS = _OSDC_ROOT.joinpath("clusters.yaml")
# Manifest that carries the kube-scheduler image tag under test.
_DEPLOYMENT = _OSDC_ROOT.joinpath("modules", "bin-pack-scheduler", "kubernetes", "deployment.yaml")


def _scheduler_image_minor():
    """Return the major.minor of the kube-scheduler container image (e.g. '1.35').

    Reads the Deployment manifest at ``_DEPLOYMENT``, finds the container named
    ``kube-scheduler`` and derives the version from its image tag. A digest pin
    (``repo:v1.35.0@sha256:...``) is ignored so the tag is still what gets parsed.

    Raises:
        AssertionError: if the container is missing or its image has no tag.
    """
    doc = yaml.safe_load(_DEPLOYMENT.read_text())
    containers = doc["spec"]["template"]["spec"]["containers"]
    image = next((c["image"] for c in containers if c["name"] == "kube-scheduler"), None)
    assert image is not None, f"no container named 'kube-scheduler' in {_DEPLOYMENT}"

    # Drop an optional "@sha256:..." digest; otherwise the last ':' would split
    # inside the digest and the "tag" would be the hash.
    reference = image.split("@", 1)[0]
    _repo, colon, tag = reference.rpartition(":")
    # A '/' after the last ':' means that colon was a registry port
    # ("host:5000/kube-scheduler"), i.e. the image carries no tag at all.
    assert colon and "/" not in tag, f"kube-scheduler image {image!r} has no version tag"

    version = tag.lstrip("v")  # "v1.35.0" -> "1.35.0"
    return ".".join(version.split(".")[:2])  # -> "1.35"


def test_scheduler_image_minor_matches_eks_version():
    """The scheduler image's major.minor must equal defaults.eks_version in clusters.yaml."""
    raw_eks_version = yaml.safe_load(_CLUSTERS.read_text())["defaults"]["eks_version"]
    # An unquoted `eks_version: 1.35` is loaded as a float, which can never
    # equal the string '1.35'. Compare on the text form so quoting style in
    # clusters.yaml does not cause a spurious failure. (A float also drops
    # trailing zeros, e.g. 1.30 -> 1.3; that case still fails here, and the
    # message shows the raw value so the cause is visible.)
    eks_version = str(raw_eks_version)
    image_minor = _scheduler_image_minor()
    msg = (
        f"bin-pack-scheduler image minor is {image_minor!r} but clusters.yaml "
        f"eks_version is {raw_eks_version!r}; bump them together on EKS upgrade."
    )
    assert image_minor == eks_version, msg
1 change: 1 addition & 0 deletions osdc/modules/bin-pack-scheduler/tests/smoke/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from smoke_conftest import * # noqa: F403
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Smoke tests for the bin-pack-scheduler secondary scheduler.

Validates that the scheduler Deployment is ready and has acquired its
leader-election Lease (so it is actually scheduling, not just running).
"""

from __future__ import annotations

import pytest
from helpers import assert_deployment_ready, run_kubectl

# Name shared by the Deployment and the Lease queried in this module, and the
# namespace both are looked up in.
NAME = "bin-pack-scheduler"
NAMESPACE = "kube-system"

# Applies the `live` marker to every test in this module.
pytestmark = [pytest.mark.live]


class TestBinPackScheduler:
    """Smoke checks for the bin-pack-scheduler Deployment and its Lease."""

    def test_deployment_ready(self, all_deployments: dict) -> None:
        """The Deployment passes the shared readiness assertion."""
        assert_deployment_ready(all_deployments, NAMESPACE, NAME)

    def test_leader_lease_held(self) -> None:
        """The leader-election Lease exists and names a holder."""
        lease_obj = run_kubectl(["get", "lease", NAME], namespace=NAMESPACE)
        lease_spec = lease_obj.get("spec", {})
        current_holder = lease_spec.get("holderIdentity")
        # Only a non-empty holderIdentity is required; lease freshness is not checked.
        assert current_holder, f"Lease {NAME} has no holderIdentity — no scheduler is leading"
Loading