diff --git a/cmd/plugins/balloons/Dockerfile b/cmd/plugins/balloons/Dockerfile index 05681ef88..89c861fc1 100644 --- a/cmd/plugins/balloons/Dockerfile +++ b/cmd/plugins/balloons/Dockerfile @@ -1,6 +1,6 @@ ARG GO_VERSION=1.25 -FROM golang:${GO_VERSION}-bookworm AS builder +FROM docker.io/library/golang:${GO_VERSION}-bookworm AS builder ARG IMAGE_VERSION ARG BUILD_VERSION diff --git a/cmd/plugins/balloons/policy/balloons-policy.go b/cmd/plugins/balloons/policy/balloons-policy.go index 53f59fe6a..3211893f5 100644 --- a/cmd/plugins/balloons/policy/balloons-policy.go +++ b/cmd/plugins/balloons/policy/balloons-policy.go @@ -76,9 +76,10 @@ type balloons struct { options *policy.BackendOptions // configuration common to all policies bpoptions *BalloonsOptions // balloons-specific configuration cch cache.Cache // nri-resource-policy cache - allowed cpuset.CPUSet // bounding set of CPUs we're allowed to use - reserved cpuset.CPUSet // system-/kube-reserved CPUs - freeCpus cpuset.CPUSet // CPUs to be included in growing or new ballons + allowed cpuset.CPUSet // bounding set of CPUs we're allowed to use + reserved cpuset.CPUSet // system-/kube-reserved CPUs + reservedExact bool // keep built-in reserved balloon on the exact reserved cpuset + freeCpus cpuset.CPUSet // CPUs to be included in growing or new balloons ifreeCpus cpuset.CPUSet // initially free CPUs before assigning any containers cpuTree *cpuTreeNode // system CPU topology @@ -1102,7 +1103,28 @@ func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, erro } } } - if err := p.resizeBalloon(bln, blnDef.MinCpus*1000); err != nil { + if p.reservedExact && blnDef.Name == reservedBalloonDefName { + requested := p.reserved.Clone() + if requested.Size() == 0 { + return nil, balloonsError("reservedCPUMode=hard-exact requires non-empty reserved cpuset") + } + if requested.Difference(p.freeCpus).Size() > 0 { + return nil, balloonsError("reserved exact cpuset %q is not fully free, missing %q", requested, 
requested.Difference(p.freeCpus)) + } + // AllocateCpus() may mutate the preferred/requested cpuset argument. + // Keep an immutable copy for post-allocation validation and logging. + exact := requested.Clone() + newCpus, err := p.cpuAllocator.AllocateCpus(&exact, requested.Size(), blnDef.AllocatorPriority.Value().Option()) + if err != nil { + return nil, balloonsError("failed to allocate exact reserved cpuset %q: %w", requested, err) + } + if !newCpus.Equals(requested) { + return nil, balloonsError("failed to allocate exact reserved cpuset: requested %q, got %q", requested, newCpus) + + } + p.freeCpus = p.freeCpus.Difference(newCpus) + bln.Cpus = newCpus + } else if err := p.resizeBalloon(bln, blnDef.MinCpus*1000); err != nil { return nil, err } bln.Mems = p.closestMems(bln.Cpus) @@ -1809,7 +1831,19 @@ func (p *balloons) fillBuiltinBalloonDefs(bpoptions *BalloonsOptions) (*BalloonD cset, p.allowed, cset.Difference(p.allowed)) } p.reserved = p.allowed.Intersection(cset) - if reservedBalloonDef.MinCpus == 0 { + if bpoptions.ReservedCPUMode == cfgapi.ReservedCPUModeHardExact { + p.reservedExact = true + if reservedBalloonDef.MinCpus != 0 && reservedBalloonDef.MinCpus != p.reserved.Size() { + return nil, nil, balloonsError("mismatching reserved balloon minCpus: %d and ReservedResources cpuset size: %d", + reservedBalloonDef.MinCpus, p.reserved.Size()) + } + if reservedBalloonDef.MaxCpus != 0 && reservedBalloonDef.MaxCpus != p.reserved.Size() { + return nil, nil, balloonsError("mismatching reserved balloon maxCpus: %d and ReservedResources cpuset size: %d", + reservedBalloonDef.MaxCpus, p.reserved.Size()) + } + reservedBalloonDef.MinCpus = p.reserved.Size() + reservedBalloonDef.MaxCpus = p.reserved.Size() + } else if reservedBalloonDef.MinCpus == 0 { if p.reserved.Size() < reservedBalloonDef.MaxCpus { reservedBalloonDef.MinCpus = p.reserved.Size() } else { @@ -1828,6 +1862,9 @@ func (p *balloons) fillBuiltinBalloonDefs(bpoptions *BalloonsOptions) (*BalloonD // balloon 
type. reservedBalloonDef.PreferCloseToDevices = append([]string{virtDevReservedCpus}, reservedBalloonDef.PreferCloseToDevices...) case cfgapi.AmountQuantity: + if bpoptions.ReservedCPUMode == cfgapi.ReservedCPUModeHardExact { + return nil, nil, balloonsError("reservedCPUMode=hard-exact requires ReservedResources cpu in cpuset form") + } // ReservedResources.cpus defines number of // CPUs. Treat the value as a minimum size for the // reserved balloon, but the balloon is allowed to @@ -1996,6 +2033,13 @@ func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error { if bln.Def.MinCpus > 0 && newCpuCount < bln.Def.MinCpus { newCpuCount = bln.Def.MinCpus } + if p.reservedExact && bln.Def.Name == reservedBalloonDefName { + exactCount := p.reserved.Size() + if oldCpuCount == exactCount && bln.Cpus.Equals(p.reserved) { + return nil + } + newCpuCount = exactCount + } log.Debugf("resize %s to fit %d mCPU", bln, newMilliCpus) log.Debugf("- change size from %d to %d full cpus", oldCpuCount, newCpuCount) log.Debugf("- free cpus: %q", p.freeCpus) @@ -2003,6 +2047,15 @@ func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error { return nil } cpuCountDelta := newCpuCount - oldCpuCount + if p.reservedExact && bln.Def.Name == reservedBalloonDefName { + if cpuCountDelta == 0 { + if !bln.Cpus.Equals(p.reserved) { + return balloonsError("reserved balloon exact cpuset drifted: expected %q, got %q", p.reserved, bln.Cpus) + } + return nil + } + return balloonsError("reserved balloon exact cpuset cannot be resized: expected %q, current %q", p.reserved, bln.Cpus) + } p.forgetCpuClass(bln) defer func() { if err := p.useCpuClass(bln); err != nil { diff --git a/cmd/plugins/balloons/policy/flags.go b/cmd/plugins/balloons/policy/flags.go index 19889ea15..77a3d0168 100644 --- a/cmd/plugins/balloons/policy/flags.go +++ b/cmd/plugins/balloons/policy/flags.go @@ -25,6 +25,7 @@ type ( LoadClass = cfgapi.LoadClass SchedulingClass = cfgapi.SchedulingClass CPUTopologyLevel = 
cfgapi.CPUTopologyLevel + ReservedCPUMode = cfgapi.ReservedCPUMode ) var ( @@ -35,6 +36,9 @@ var ( ) const ( + ReservedCPUModePreferred = cfgapi.ReservedCPUModePreferred + ReservedCPUModeHardExact = cfgapi.ReservedCPUModeHardExact + CPUTopologyLevelUndefined = cfgapi.CPUTopologyLevelUndefined CPUTopologyLevelSystem = cfgapi.CPUTopologyLevelSystem CPUTopologyLevelPackage = cfgapi.CPUTopologyLevelPackage diff --git a/config/crd/bases/config.nri_balloonspolicies.yaml b/config/crd/bases/config.nri_balloonspolicies.yaml index 321f3f417..d43efae82 100644 --- a/config/crd/bases/config.nri_balloonspolicies.yaml +++ b/config/crd/bases/config.nri_balloonspolicies.yaml @@ -964,6 +964,15 @@ spec: type: object type: array type: object + reservedCPUMode: + description: |- + ReservedCPUMode controls how strictly ReservedResources.cpu is interpreted. + Value "preferred" keeps the current behavior where a reserved cpuset is only a preference. + Value "hard-exact" makes the reserved balloon stay within the exact configured cpuset. + enum: + - preferred + - hard-exact + type: string reservedPoolNamespaces: description: |- ReservedPoolNamespaces is a list of namespace globs that diff --git a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml index 321f3f417..d43efae82 100644 --- a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml +++ b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml @@ -964,6 +964,15 @@ spec: type: object type: array type: object + reservedCPUMode: + description: |- + ReservedCPUMode controls how strictly ReservedResources.cpu is interpreted. + Value "preferred" keeps the current behavior where a reserved cpuset is only a preference. + Value "hard-exact" makes the reserved balloon stay within the exact configured cpuset. 
+ enum: + - preferred + - hard-exact + type: string reservedPoolNamespaces: description: |- ReservedPoolNamespaces is a list of namespace globs that diff --git a/deployment/helm/balloons/templates/daemonset.yaml b/deployment/helm/balloons/templates/daemonset.yaml index 2d190fb94..5e744cff7 100644 --- a/deployment/helm/balloons/templates/daemonset.yaml +++ b/deployment/helm/balloons/templates/daemonset.yaml @@ -22,6 +22,7 @@ spec: {{ $name }}: "{{ $value }}" {{- end }} spec: + hostPID: true {{- with .Values.tolerations }} tolerations: {{- toYaml . | nindent 8 }} @@ -99,7 +100,9 @@ spec: image: {{ .Values.image.name }}:{{ .Values.image.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.image.pullPolicy }} securityContext: - allowPrivilegeEscalation: false + privileged: true + allowPrivilegeEscalation: true + runAsUser: 0 capabilities: drop: - ALL diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go index 496f851a6..bcf1743ad 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go @@ -32,6 +32,7 @@ type ( CPUTopologyLevel = policy.CPUTopologyLevel ComponentCreationStrategy = policy.ComponentCreationStrategy SchedulingClass = policy.SchedulingClass + ReservedCPUMode string ) const ( @@ -42,6 +43,9 @@ const ( AmountCPUSet = policy.AmountCPUSet AmountExcludeCPUSet = policy.AmountExcludeCPUSet + ReservedCPUModePreferred ReservedCPUMode = "preferred" + ReservedCPUModeHardExact ReservedCPUMode = "hard-exact" + CPUTopologyLevelUndefined = policy.CPUTopologyLevelUndefined CPUTopologyLevelSystem = policy.CPUTopologyLevelSystem CPUTopologyLevelPackage = policy.CPUTopologyLevelPackage @@ -115,6 +119,11 @@ type Config struct { // Reserved (CPU) resources for kube-system namespace. 
// +kubebuilder:validation:Required ReservedResources Constraints `json:"reservedResources"` + // ReservedCPUMode controls how strictly ReservedResources.cpu is interpreted. + // Value "preferred" keeps the current behavior where a reserved cpuset is only a preference. + // Value "hard-exact" makes the reserved balloon stay within the exact configured cpuset. + // +kubebuilder:validation:Enum=preferred;hard-exact + ReservedCPUMode ReservedCPUMode `json:"reservedCPUMode,omitempty"` // Preserve specifies containers whose resource pinning must not be // modified by the policy. Preserve *ContainerMatchConfig `json:"preserve,omitempty"`