diff --git a/cmd/plugins/balloons/policy/balloons-policy.go b/cmd/plugins/balloons/policy/balloons-policy.go index a2f31323f..af9dfbd11 100644 --- a/cmd/plugins/balloons/policy/balloons-policy.go +++ b/cmd/plugins/balloons/policy/balloons-policy.go @@ -89,9 +89,10 @@ type balloons struct { meters *Meters // balloon metrics meterLock sync.RWMutex // protects metrics collection against allocation - cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy - memAllocator *libmem.Allocator // memory allocator used by the policy - loadVirtDev map[string]*loadClassVirtDev // map LoadClasses to virtual devices + cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy + memAllocator *libmem.Allocator // memory allocator used by the policy + turboAllocator *CPUClassTurboAllocator // turbo budget allocator based on CPUClasses + loadVirtDev map[string]*loadClassVirtDev // map LoadClasses to virtual devices } // Balloon contains attributes of a balloon instance @@ -791,80 +792,62 @@ func largest(sliceLen int, valueOf func(i int) int) ([]int, int) { // resetCpuClass resets CPU configurations globally. All balloons can // be ignored, their CPU configurations will be applied later. func (p *balloons) resetCpuClass() error { - // Usual inputs: - // - p.allowed (cpuset.CPUset): all CPUs available for this - // policy. - // - p.IdleCpuClass (string): CPU class for allowed CPUs. - // - // Other inputs, if needed: - // - p.reserved (cpuset.CPUset): CPUs of ReservedResources - // (typically for kube-system containers). - // - // Note: p.useCpuClass(balloon) will be called before assigning - // containers on the balloon, including the reserved balloon. 
- // - // TODO: don't depend on cpu controller directly - if err := cpucontrol.Assign(p.cch, p.bpoptions.IdleCpuClass, p.allowed.UnsortedList()...); err != nil { + // p.useCpuClass(balloon) will be called later for every balloon, + // including the reserved balloon, to set the per-balloon CPU + // class. Here we only assign the idle class to all allowed CPUs. + if p.turboAllocator == nil { + return nil + } + idle := p.turboAllocator.ResolveClassName(p.bpoptions.IdleCpuClass) + if err := p.turboAllocator.ResetIdle(p.allowed); err != nil { log.Warnf("failed to reset class of available cpus: %v", err) } else { - log.Debugf("reset class of available cpus: %q (reserved: %q)", p.allowed, p.reserved) + log.Debugf("reset class of available cpus: %q to idle class %q (reserved: %q)", + p.allowed, idle, p.reserved) } return nil } -// useCpuClass configures CPUs of a balloon. +// useCpuClass configures CPUs of a balloon by delegating to the +// turbo-aware CPU class allocator. func (p *balloons) useCpuClass(bln *Balloon) error { - // Usual inputs: - // - CPUs that cpuallocator has reserved for this balloon: - // bln.Cpus (cpuset.CPUSet). - // - User-defined CPU configuration for CPUs of balloon of this type: - // bln.Def.CpuClass (string). - // - Current configuration(?): feel free to add data - // structure for this. For instance policy-global p.cpuConfs, - // or balloon-local bln.cpuConfs. - // - // Other input examples, if needed: - // - Requested CPU resources by all containers in the balloon: - // p.requestedMilliCpus(bln). - // - Free CPU resources in the balloon: p.freeMilliCpus(bln). - // - Number of assigned containers: bln.ContainerCount(). - // - Container details: access p.cch with bln.ContainerIDs(). - // - User-defined CPU AllocatorPriority: bln.Def.AllocatorPriority. - // - All existing balloon instances: p.balloons. 
- // - CPU configurations by user: bln.Def.CpuClass (for bln in p.balloons) if len(bln.components) > 0 { - // If this is a composite balloon, CPU class is - // defined in the component balloons. - log.Debugf("apply CPU class %q on CPUs %s of composite balloon %q", - bln.Def.CpuClass, bln.Cpus, bln.PrettyName()) + // Composite balloon: each component carries its own CpuClass. + log.Debugf("apply CPU classes of components of composite balloon %q on CPUs %s", + bln.PrettyName(), bln.Cpus) for _, compBln := range bln.components { if err := p.useCpuClass(compBln); err != nil { log.Warnf("failed to apply CPU class %q on CPUs %s of %q in composite balloon %q: %v", compBln.Def.CpuClass, compBln.Cpus, compBln.PrettyName(), bln.PrettyName(), err) } - } return nil } - if err := cpucontrol.Assign(p.cch, bln.Def.CpuClass, bln.Cpus.UnsortedList()...); err != nil { - log.Warnf("failed to apply class %q on CPUs %q: %v", bln.Def.CpuClass, bln.Cpus, err) - } else { - log.Debugf("apply CPU class %q on CPUs %q of %q", bln.Def.CpuClass, bln.Cpus, bln.PrettyName()) + if p.turboAllocator == nil { + return nil + } + className := p.turboAllocator.ResolveClassName(bln.Def.CpuClass) + log.Debugf("apply CPU class %q on CPUs %q of %q", className, bln.Cpus, bln.PrettyName()) + if err := p.turboAllocator.UseClass(bln.Def.CpuClass, bln.Cpus); err != nil { + log.Warnf("failed to apply class %q on CPUs %q: %v", className, bln.Cpus, err) } return nil } // forgetCpuClass is called when CPUs of a balloon are released from duty. func (p *balloons) forgetCpuClass(bln *Balloon) { - // Use p.IdleCpuClass for bln.Cpus. 
- // Usual inputs: see useCpuClass - if err := cpucontrol.Assign(p.cch, p.bpoptions.IdleCpuClass, bln.Cpus.UnsortedList()...); err != nil { - log.Warnf("failed to forget class %q of cpus %q: %v", bln.Def.CpuClass, bln.Cpus, err) + if p.turboAllocator == nil { + return + } + idle := p.turboAllocator.ResolveClassName(p.bpoptions.IdleCpuClass) + if err := p.turboAllocator.ForgetClass(bln.Cpus); err != nil { + log.Warnf("failed to forget class of cpus %q (idle class %q): %v", bln.Cpus, idle, err) } else { if len(bln.components) > 0 { - log.Debugf("forget classes of composite balloon %q cpus %q", bln.Def.Name, bln.Cpus) + log.Debugf("forget classes of composite balloon %q cpus %q (idle class %q)", + bln.Def.Name, bln.Cpus, idle) } else { - log.Debugf("forget class %q of cpus %q", bln.Def.CpuClass, bln.Cpus) + log.Debugf("forget class of cpus %q (idle class %q)", bln.Cpus, idle) } } } @@ -1432,6 +1415,13 @@ func changesCpuClasses(opts0, opts1 *BalloonsOptions) bool { return true } } + // Detect changes in CPUClasses definitions (turbo attributes, frequencies, etc.) + if len(opts0.CPUClasses) != len(opts1.CPUClasses) { + return true + } + if utils.DumpJSON(opts0.CPUClasses) != utils.DumpJSON(opts1.CPUClasses) { + return true + } return false } @@ -1454,6 +1444,14 @@ func (p *balloons) Reconfigure(newCfg interface{}) error { log.Infof("no configuration changes") } else { log.Infof("configuration changes only on CPU classes") + // Update CPUClasses definitions. + p.bpoptions.CPUClasses = newBalloonsOptions.CPUClasses + p.bpoptions.IdleCpuClass = newBalloonsOptions.IdleCpuClass + if p.turboAllocator != nil { + if err := p.turboAllocator.Reconfigure(p.bpoptions.CPUClasses, p.bpoptions.IdleCpuClass); err != nil { + log.Warnf("failed to reconfigure CPU class allocator: %v", err) + } + } // Update new CPU classes to existing balloon // definitions. 
The same BalloonDef instances // must be kept in use, because each Balloon @@ -1600,6 +1598,31 @@ func (p *balloons) validateConfig(bpoptions *BalloonsOptions) error { if len(undefinedSchedulingClasses) > 0 { return balloonsError("schedulingClass(es) defined in balloonTypes but missing from schedulingClasses: %v", undefinedSchedulingClasses) } + // Validate CPUClasses. + cpuClassNames := map[string]struct{}{} + for _, cc := range bpoptions.CPUClasses { + if cc.Name == "" { + return balloonsError("missing or empty name in a cpuClasses entry") + } + if _, dup := cpuClassNames[cc.Name]; dup { + return balloonsError("duplicate cpuClasses name: %q", cc.Name) + } + cpuClassNames[cc.Name] = struct{}{} + } + // Verify that cpuClass references in balloon types are defined + // in either cpuClasses or existing control.cpu.classes. + existingControlClasses := cpucontrol.GetClasses() + for _, blnDef := range bpoptions.BalloonDefs { + if blnDef.CpuClass == "" { + continue + } + _, inCPUClasses := cpuClassNames[blnDef.CpuClass] + _, inControlClasses := existingControlClasses[blnDef.CpuClass] + if !inCPUClasses && !inControlClasses { + log.Warnf("cpuClass %q referenced by balloon type %q is not defined in cpuClasses or control.cpu.classes", + blnDef.CpuClass, blnDef.Name) + } + } var circularCheck func(name string, seen map[string]int) error circularCheck = func(name string, seen map[string]int) error { if seen[name] > 0 { @@ -1671,6 +1694,30 @@ func (p *balloons) setConfig(bpoptions *BalloonsOptions) error { setOmittedDefaults(bpoptions) + // Set bpoptions early so the turbo allocator construction below + // has access to CPUClasses. + p.bpoptions = bpoptions + + // Construct or reconfigure the turbo-aware CPU class allocator. + // All cpucontrol.SetClass / cpucontrol.Assign calls flow through + // it. 
+ if p.turboAllocator == nil { + ta, err := NewCPUClassTurboAllocator( + WithSystem(p.options.System), + WithCache(p.cch), + WithCPUClasses(bpoptions.CPUClasses), + WithIdleClass(bpoptions.IdleCpuClass), + ) + if err != nil { + return balloonsError("failed to create CPU class turbo allocator: %w", err) + } + p.turboAllocator = ta + } else { + if err := p.turboAllocator.Reconfigure(bpoptions.CPUClasses, bpoptions.IdleCpuClass); err != nil { + return balloonsError("failed to reconfigure CPU class turbo allocator: %w", err) + } + } + reservedBalloonDef, defaultBalloonDef, err := p.fillBuiltinBalloonDefs(bpoptions) if err != nil { return err diff --git a/cmd/plugins/balloons/policy/cpuclass.go b/cmd/plugins/balloons/policy/cpuclass.go new file mode 100644 index 000000000..60cd3cc04 --- /dev/null +++ b/cmd/plugins/balloons/policy/cpuclass.go @@ -0,0 +1,448 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package balloons + +import ( + "fmt" + + cpucfg "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/control/cpu" + "github.com/containers/nri-plugins/pkg/resmgr/cache" + cpucontrol "github.com/containers/nri-plugins/pkg/resmgr/control/cpu" + "github.com/containers/nri-plugins/pkg/sysfs" + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +// CPUClassTurboAllocator owns all CPU-class lifecycle concerns for the +// balloons policy: resolution of symbolic frequencies (min/base/turbo), +// turbo-priority winner selection, and the actual cpucontrol.SetClass / +// cpucontrol.Assign calls that follow from those decisions. +// +// The allocator keeps the policy code free of any direct CPU controller +// access. Balloons code only needs to call UseClass/ForgetClass for the +// CPU sets it manages; the allocator takes care of pushing class +// definitions to the CPU controller and re-assigning CPUs of classes +// whose effective turbo frequency changes when the active winner +// changes. +type CPUClassTurboAllocator struct { + sys sysfs.System + cch cache.Cache + classes []*CPUClass + classByName map[string]*CPUClass + idleClassName string + turboInfo *platformTurboInfo + + // activeCpus tracks the set of CPUs currently assigned to each + // CPU class (by the latest UseClass/ForgetClass calls). It is the + // allocator's local model of "which classes are active". The + // recalculation of the turbo winner consults this map without + // reaching back into balloons or the CPU controller. + activeCpus map[string]cpuset.CPUSet + + // winnerPrio is the highest TurboPriority among CPU classes that + // had any active CPUs the last time recalculateTurbo() ran. + // Initialized to -1 to force the first recalculation. + winnerPrio int +} + +// TurboOption is a functional option for NewCPUClassTurboAllocator. +type TurboOption func(*CPUClassTurboAllocator) error + +// WithSystem provides the sysfs system topology for symbolic frequency +// resolution. 
+func WithSystem(sys sysfs.System) TurboOption { + return func(a *CPUClassTurboAllocator) error { + a.sys = sys + return nil + } +} + +// WithCache provides the resource manager cache for cpucontrol.Assign. +func WithCache(cch cache.Cache) TurboOption { + return func(a *CPUClassTurboAllocator) error { + a.cch = cch + return nil + } +} + +// WithCPUClasses provides the user-facing CPUClass definitions. +func WithCPUClasses(classes []*CPUClass) TurboOption { + return func(a *CPUClassTurboAllocator) error { + a.classes = classes + a.classByName = make(map[string]*CPUClass, len(classes)) + for _, cc := range classes { + a.classByName[cc.Name] = cc + } + return nil + } +} + +// WithIdleClass provides the name of the idle CPU class used by +// ForgetClass and ResetIdle. +func WithIdleClass(name string) TurboOption { + return func(a *CPUClassTurboAllocator) error { + a.idleClassName = name + return nil + } +} + +// NewCPUClassTurboAllocator creates a turbo allocator and applies the +// given options. The constructor pushes initial CPU class definitions +// (with symbolic frequencies resolved against sysfs, when possible) +// into the CPU controller via cpucontrol.SetClass, so subsequent +// cpucontrol.Assign calls see the correct effective frequencies. +func NewCPUClassTurboAllocator(opts ...TurboOption) (*CPUClassTurboAllocator, error) { + a := &CPUClassTurboAllocator{ + activeCpus: map[string]cpuset.CPUSet{}, + winnerPrio: -1, + } + for _, opt := range opts { + if err := opt(a); err != nil { + return nil, err + } + } + if a.sys == nil { + return nil, fmt.Errorf("CPUClassTurboAllocator: missing required option WithSystem") + } + if a.cch == nil { + return nil, fmt.Errorf("CPUClassTurboAllocator: missing required option WithCache") + } + a.discoverPlatformInfo() + a.pushInitialClassDefinitions() + return a, nil +} + +// Reconfigure replaces the CPU class set and idle class name. 
Resets +// the turbo winner so the next UseClass/ForgetClass call recomputes +// the effective frequencies, and re-pushes class definitions to the +// CPU controller. +func (a *CPUClassTurboAllocator) Reconfigure(classes []*CPUClass, idleClass string) error { + a.classes = classes + a.classByName = make(map[string]*CPUClass, len(classes)) + for _, cc := range classes { + a.classByName[cc.Name] = cc + } + a.idleClassName = idleClass + a.winnerPrio = -1 + a.pushInitialClassDefinitions() + return nil +} + +// Classes returns the current user-facing CPUClass set. +func (a *CPUClassTurboAllocator) Classes() []*CPUClass { + return a.classes +} + +// ClassByName looks up a CPUClass by name. +func (a *CPUClassTurboAllocator) ClassByName(name string) *CPUClass { + return a.classByName[name] +} + +// defaultClassName is the name of the CPU class used as a fallback +// when a balloon type does not specify cpuClass or when idleCpuClass +// is left empty. +const defaultClassName = "default" + +// isKnownClass reports whether the given class name is known to either +// the user-facing CPUClasses configuration of this allocator or to the +// CPU controller (via cpucontrol.GetClasses, which contains classes +// defined in the legacy control.cpu.classes section as well as every +// class pushed via cpucontrol.SetClass). The two sources can differ: +// classByName carries the turbo metadata needed by recalculateTurbo, +// while cpucontrol's class map is what actually drives sysfs writes, +// so a class defined only via control.cpu.classes is unknown to +// classByName but known to cpucontrol. +func (a *CPUClassTurboAllocator) isKnownClass(name string) bool { + if _, ok := a.classByName[name]; ok { + return true + } + if _, ok := cpucontrol.GetClasses()[name]; ok { + return true + } + return false +} + +// ResolveClassName resolves a (possibly empty or unknown) configured +// CPU class name to the class that should actually be applied. 
If the
+// name is known to cpuClasses or control.cpu.classes it is returned
+// as is, else "default" if that class is known, else the name itself.
+// Only non-empty unknown names are logged; unset is a valid config.
+func (a *CPUClassTurboAllocator) ResolveClassName(name string) string {
+	if a.isKnownClass(name) {
+		return name
+	}
+	if a.isKnownClass(defaultClassName) {
+		if name != "" {
+			log.Errorf("unknown CPU class %q: falling back to using %q", name, defaultClassName)
+		}
+		return defaultClassName
+	}
+	if name != "" {
+		log.Errorf("unknown CPU class %q and fallback class %q missing from cpuClasses", name, defaultClassName)
+	}
+	return name
+}
+
+// UseClass marks the given CPUs as active under className, recalculates
+// the turbo winner, then assigns the CPUs to className via the CPU
+// controller. The recalculation runs first so that the controller's
+// in-memory class definition reflects the correct effective turbo
+// frequency at the time of Assign. An empty or unknown className
+// resolves to the "default" CPU class when one is configured.
+func (a *CPUClassTurboAllocator) UseClass(className string, cpus cpuset.CPUSet) error {
+	if cpus.IsEmpty() {
+		return nil
+	}
+	className = a.ResolveClassName(className)
+	a.removeCpusFromAllClasses(cpus)
+	if className != "" {
+		a.activeCpus[className] = a.activeCpus[className].Union(cpus)
+	}
+	a.recalculateTurbo()
+	if err := cpucontrol.Assign(a.cch, className, cpus.UnsortedList()...); err != nil {
+		return fmt.Errorf("failed to assign CPUs %s to class %q: %w", cpus, className, err)
+	}
+	return nil
+}
+
+// ForgetClass removes the given CPUs from any active class set,
+// assigns them to the idle class via the CPU controller, then
+// recalculates the turbo winner (the previously dominant class may
+// have lost its last active balloon).
An empty or unknown idle class +// name resolves to the "default" CPU class when one is configured. +func (a *CPUClassTurboAllocator) ForgetClass(cpus cpuset.CPUSet) error { + if cpus.IsEmpty() { + return nil + } + idle := a.ResolveClassName(a.idleClassName) + a.removeCpusFromAllClasses(cpus) + if err := cpucontrol.Assign(a.cch, idle, cpus.UnsortedList()...); err != nil { + return fmt.Errorf("failed to assign CPUs %s to idle class %q: %w", cpus, idle, err) + } + a.recalculateTurbo() + return nil +} + +// ResetIdle assigns the given CPU set to the idle class via the CPU +// controller. Used at policy startup to bring all allowed CPUs to a +// known baseline before any container-driven UseClass call. Does not +// affect the active-class tracking. An empty or unknown idle class +// name resolves to the "default" CPU class when one is configured. +func (a *CPUClassTurboAllocator) ResetIdle(cpus cpuset.CPUSet) error { + if cpus.IsEmpty() { + return nil + } + idle := a.ResolveClassName(a.idleClassName) + if err := cpucontrol.Assign(a.cch, idle, cpus.UnsortedList()...); err != nil { + return fmt.Errorf("failed to assign CPUs %s to idle class %q: %w", cpus, idle, err) + } + return nil +} + +// removeCpusFromAllClasses removes the given CPUs from every active +// class set. Empty class sets are deleted. +func (a *CPUClassTurboAllocator) removeCpusFromAllClasses(cpus cpuset.CPUSet) { + for name, set := range a.activeCpus { + newSet := set.Difference(cpus) + if newSet.IsEmpty() { + delete(a.activeCpus, name) + } else { + a.activeCpus[name] = newSet + } + } +} + +// discoverPlatformInfo reads platform turbo capabilities from sysfs. +// Failure is non-fatal; symbolic frequencies will resolve to 0 in +// that case (matching the behavior of the pre-allocator code path). 
+func (a *CPUClassTurboAllocator) discoverPlatformInfo() { + info, err := discoverTurboInfo(a.sys) + if err != nil { + log.Warnf("CPUClassTurboAllocator: cannot discover platform turbo info: %v", err) + return + } + a.turboInfo = info +} + +// pushInitialClassDefinitions resolves symbolic frequencies in every +// CPUClass and pushes the resulting cpucfg.Class to the CPU controller +// via cpucontrol.SetClass. At this point no class has been declared a +// turbo winner yet, so symbolic "turbo" resolves to the platform max +// turbo frequency for every class. The first UseClass call will +// trigger recalculateTurbo() to enforce the priority-based effective +// turbo. +func (a *CPUClassTurboAllocator) pushInitialClassDefinitions() { + for _, cc := range a.classes { + var controlClass cpucfg.Class + if a.turboInfo != nil { + controlClass = resolvedCpuClassToControlClass(cc, a.turboInfo, 0) + } else { + controlClass = cpuClassToControlClass(cc) + } + cpucontrol.SetClass(cc.Name, controlClass) + log.Infof("cpuClass %q configured: minFreq=%s(%d) maxFreq=%s(%d) disabledCstates=%v", + cc.Name, cc.MinFreq, controlClass.MinFreq, cc.MaxFreq, controlClass.MaxFreq, cc.DisabledCstates) + } +} + +// recalculateTurbo resolves exclusive turbo frequency access based on +// turboPriority across all CPU classes that currently have active CPUs. +// +// Algorithm (steady-state no-op): +// 1. Find the highest turboPriority among classes with non-empty +// active CPU sets. +// 2. If the new highest priority equals the previously computed one, +// return immediately. Effective frequencies cannot have changed. +// 3. Otherwise: update CPU controller class definitions for ALL +// CPUClasses via cpucontrol.SetClass. SetClass records the new +// definition in memory and marks every CPU currently assigned to +// the affected class as dirty. 
The CPU controller's Commit() +// (called once per NRI request after all per-container hooks) +// then issues the minimal set of sysfs writes needed to reach +// the new desired state, deduplicated against the per-CPU +// lastFreq cache. +func (a *CPUClassTurboAllocator) recalculateTurbo() { + if len(a.classes) == 0 { + return + } + + // Find the highest turboPriority among classes with active CPUs. + newPrio := 0 + for _, cc := range a.classes { + if cc.TurboPriority <= newPrio { + continue + } + if set, ok := a.activeCpus[cc.Name]; ok && !set.IsEmpty() { + newPrio = cc.TurboPriority + } + } + + // Steady-state fast path. + if newPrio == a.winnerPrio { + return + } + + a.winnerPrio = newPrio + + if a.turboInfo == nil { + // No platform info -> we cannot compute effective turbo. + // Still update winnerPrio to avoid repeated warnings. + log.Warnf("turbo recalculation skipped: no platform turbo info available") + return + } + + // Update CPU controller class definitions for every CPUClass with + // the new effective turbo. The actual sysfs writes are deferred + // until the CPU controller's next Commit() call. + for _, cc := range a.classes { + effectiveTurboKHz := a.turboInfo.baseFreqKHz + if newPrio == 0 || cc.TurboPriority >= newPrio { + effectiveTurboKHz = a.turboInfo.maxTurboFreqKHz + } + controlClass := resolvedCpuClassToControlClass(cc, a.turboInfo, effectiveTurboKHz) + cpucontrol.SetClass(cc.Name, controlClass) + log.Infof("turbo: class %q (prio=%d, winner=%v): minFreq=%d maxFreq=%d", + cc.Name, cc.TurboPriority, + newPrio == 0 || cc.TurboPriority >= newPrio, + controlClass.MinFreq, controlClass.MaxFreq) + } +} + +// cpuClassToControlClass converts a user-friendly CPUClass definition +// into the internal cpu.Class representation used by the CPU controller. +// Symbolic frequencies (min, base, turbo) are left as 0; use +// resolvedCpuClassToControlClass() when platform info is available. 
+func cpuClassToControlClass(cc *CPUClass) cpucfg.Class {
+	resolveFreq := func(f Frequency) uint {
+		if f.IsSymbolic() {
+			return 0
+		}
+		return f.KHz()
+	}
+	return cpucfg.Class{
+		MinFreq:                     resolveFreq(cc.MinFreq),
+		MaxFreq:                     resolveFreq(cc.MaxFreq),
+		EnergyPerformancePreference: cc.EnergyPerformancePreference,
+		UncoreMinFreq:               resolveFreq(cc.UncoreMinFreq),
+		UncoreMaxFreq:               resolveFreq(cc.UncoreMaxFreq),
+		FreqGovernor:                cc.FreqGovernor,
+		DisabledCstates:             cc.DisabledCstates,
+	}
+}
+
+// resolvedCpuClassToControlClass converts a CPUClass to a control
+// class with symbolic frequencies resolved using platform info.
+// effectiveTurboKHz overrides the turbo frequency used when resolving
+// the "turbo" symbolic name (0 means use the platform turbo frequency).
+// A nil info is tolerated: the result is then the same as that of
+// cpuClassToControlClass(), with symbolic frequencies left as 0.
+func resolvedCpuClassToControlClass(cc *CPUClass, info *platformTurboInfo, effectiveTurboKHz uint) cpucfg.Class {
+	if info == nil {
+		// Must be checked before the first dereference of info.
+		return cpuClassToControlClass(cc)
+	}
+	turboKHz := info.maxTurboFreqKHz
+	if effectiveTurboKHz > 0 {
+		turboKHz = effectiveTurboKHz
+	}
+	resolve := func(f Frequency) uint {
+		return f.Resolve(info.minFreqKHz, info.baseFreqKHz, turboKHz)
+	}
+	return cpucfg.Class{
+		MinFreq:                     resolve(cc.MinFreq),
+		MaxFreq:                     resolve(cc.MaxFreq),
+		EnergyPerformancePreference: cc.EnergyPerformancePreference,
+		UncoreMinFreq:               resolve(cc.UncoreMinFreq),
+		UncoreMaxFreq:               resolve(cc.UncoreMaxFreq),
+		FreqGovernor:                cc.FreqGovernor,
+		DisabledCstates:             cc.DisabledCstates,
+	}
+}
+
+// platformTurboInfo holds platform-level turbo frequency capabilities
+// discovered from sysfs.
+type platformTurboInfo struct {
+	// baseFreqKHz is the base frequency shared by all CPUs (kHz).
+	baseFreqKHz uint
+	// maxTurboFreqKHz is the maximum single-core turbo frequency (kHz).
+	maxTurboFreqKHz uint
+	// minFreqKHz is the platform minimum frequency (kHz).
+	minFreqKHz uint
+}
+
+// discoverTurboInfo reads platform turbo capabilities from sysfs.
+// It uses the first online CPU's frequency range as representative. +func discoverTurboInfo(sys sysfs.System) (*platformTurboInfo, error) { + cpuIDs := sys.CPUIDs() + if len(cpuIDs) == 0 { + return nil, fmt.Errorf("no CPUs found in system topology") + } + for _, id := range cpuIDs { + cpu := sys.CPU(id) + if cpu == nil || !cpu.Online() { + continue + } + freq := cpu.FrequencyRange() + baseFreq := cpu.BaseFrequency() + if baseFreq == 0 || freq.Max == 0 { + continue + } + return &platformTurboInfo{ + baseFreqKHz: uint(baseFreq), + maxTurboFreqKHz: uint(freq.Max), + minFreqKHz: uint(freq.Min), + }, nil + } + return nil, fmt.Errorf("no online CPU with valid frequency information found") +} diff --git a/cmd/plugins/balloons/policy/flags.go b/cmd/plugins/balloons/policy/flags.go index 19889ea15..e39809ca3 100644 --- a/cmd/plugins/balloons/policy/flags.go +++ b/cmd/plugins/balloons/policy/flags.go @@ -24,6 +24,8 @@ type ( BalloonDef = cfgapi.BalloonDef LoadClass = cfgapi.LoadClass SchedulingClass = cfgapi.SchedulingClass + CPUClass = cfgapi.CPUClass + Frequency = cfgapi.Frequency CPUTopologyLevel = cfgapi.CPUTopologyLevel ) diff --git a/config/crd/bases/config.nri_balloonspolicies.yaml b/config/crd/bases/config.nri_balloonspolicies.yaml index 321f3f417..a93bb7d0d 100644 --- a/config/crd/bases/config.nri_balloonspolicies.yaml +++ b/config/crd/bases/config.nri_balloonspolicies.yaml @@ -733,6 +733,76 @@ spec: type: boolean type: object type: object + cpuClasses: + description: |- + CPUClasses define CPU frequency, C-state, and turbo + attributes for CPU classes referenced by balloon types. + Exclusive turbo frequency access is controlled via + turboPriority. + items: + description: |- + CPUClass specifies CPU frequency, C-state, and turbo attributes + for a CPU class. + properties: + disabledCstates: + description: |- + DisabledCstates lists C-states disabled for CPUs in this class. 
+ Example: ["C4", "C6", "C8", "C10"] + items: + type: string + type: array + energyPerformancePreference: + description: EnergyPerformancePreference for CPUs in this class. + minimum: 0 + type: integer + freqGovernor: + description: |- + FreqGovernor is the CPUFreq governor for this class + (e.g., "performance", "powersave", "schedutil"). + type: string + maxFreq: + description: |- + MaxFreq is the maximum CPU frequency for this class. + Same format and symbolic names as MinFreq. + type: string + minFreq: + description: |- + MinFreq is the minimum CPU frequency for this class. + Accepts values with units: "3.2GHz", "2900MHz", "2900000kHz", + or a plain number in kHz. Also accepts symbolic names: "min" + (platform minimum), "base" (CPU base frequency), "turbo" + (maximum turbo frequency), resolved at runtime from sysfs. + When turboPriority is set, "turbo" resolves to actual turbo + only for the highest-priority active class; others get base. + type: string + name: + description: Name of the CPU class. + type: string + turboPriority: + description: |- + TurboPriority controls exclusive turbo frequency access. + Among CPU classes with active balloons, only the class with + the highest turboPriority gets the symbolic frequency "turbo" + resolved to the actual turbo frequency. All other classes get + "turbo" resolved to the base frequency instead. + If all classes have turboPriority 0 (default), every class + gets actual turbo frequencies -- no competition occurs. + minimum: 0 + type: integer + uncoreMaxFreq: + description: |- + UncoreMaxFreq is the maximum uncore frequency for this class. + Accepts values with units like MinFreq. + type: string + uncoreMinFreq: + description: |- + UncoreMinFreq is the minimum uncore frequency for this class. + Accepts values with units like MinFreq. 
+ type: string + required: + - name + type: object + type: array idleCPUClass: description: |- IdleCpuClass controls how unusded CPUs outside any a diff --git a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml index 321f3f417..a93bb7d0d 100644 --- a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml +++ b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml @@ -733,6 +733,76 @@ spec: type: boolean type: object type: object + cpuClasses: + description: |- + CPUClasses define CPU frequency, C-state, and turbo + attributes for CPU classes referenced by balloon types. + Exclusive turbo frequency access is controlled via + turboPriority. + items: + description: |- + CPUClass specifies CPU frequency, C-state, and turbo attributes + for a CPU class. + properties: + disabledCstates: + description: |- + DisabledCstates lists C-states disabled for CPUs in this class. + Example: ["C4", "C6", "C8", "C10"] + items: + type: string + type: array + energyPerformancePreference: + description: EnergyPerformancePreference for CPUs in this class. + minimum: 0 + type: integer + freqGovernor: + description: |- + FreqGovernor is the CPUFreq governor for this class + (e.g., "performance", "powersave", "schedutil"). + type: string + maxFreq: + description: |- + MaxFreq is the maximum CPU frequency for this class. + Same format and symbolic names as MinFreq. + type: string + minFreq: + description: |- + MinFreq is the minimum CPU frequency for this class. + Accepts values with units: "3.2GHz", "2900MHz", "2900000kHz", + or a plain number in kHz. Also accepts symbolic names: "min" + (platform minimum), "base" (CPU base frequency), "turbo" + (maximum turbo frequency), resolved at runtime from sysfs. + When turboPriority is set, "turbo" resolves to actual turbo + only for the highest-priority active class; others get base. + type: string + name: + description: Name of the CPU class. 
+ type: string + turboPriority: + description: |- + TurboPriority controls exclusive turbo frequency access. + Among CPU classes with active balloons, only the class with + the highest turboPriority gets the symbolic frequency "turbo" + resolved to the actual turbo frequency. All other classes get + "turbo" resolved to the base frequency instead. + If all classes have turboPriority 0 (default), every class + gets actual turbo frequencies -- no competition occurs. + minimum: 0 + type: integer + uncoreMaxFreq: + description: |- + UncoreMaxFreq is the maximum uncore frequency for this class. + Accepts values with units like MinFreq. + type: string + uncoreMinFreq: + description: |- + UncoreMinFreq is the minimum uncore frequency for this class. + Accepts values with units like MinFreq. + type: string + required: + - name + type: object + type: array idleCPUClass: description: |- IdleCpuClass controls how unusded CPUs outside any a diff --git a/docs/resource-policy/policy/balloons.md b/docs/resource-policy/policy/balloons.md index 741fad8de..35d3546bf 100644 --- a/docs/resource-policy/policy/balloons.md +++ b/docs/resource-policy/policy/balloons.md @@ -851,23 +851,32 @@ memory-type.resource-policy.nri.io/container.CONTAINER_NAME: HBM,DRAM These options configure CPU behavior and power management. **`cpuClass`** (string) -- References a CPU class defined in `control.cpu.classes` - (policy-level configuration). +- References a CPU class defined in `cpuClasses` (preferred) or + `control.cpu.classes` (legacy, policy-level configuration). - Applied when balloon is created, inflated, or deflated. - Configures frequency scaling and C-states for CPUs in the balloon. +- If left unset and a `cpuClasses` entry named `default` exists, that + `default` class is applied instead. **`idleCPUClass`** (string, policy-level configuration) - CPU class for idle CPUs (not in any balloon). - Applied when CPUs are removed from balloons. 
- -**`control.cpu.classes`** (object, policy-level configuration): - -Each CPU class (keyed by name) can define: - -- `minFreq` (integer): Minimum CPU frequency in kHz. -- `maxFreq` (integer): Maximum CPU frequency in kHz. -- `uncoreMinFreq` (integer): Minimum uncore frequency in kHz. -- `uncoreMaxFreq` (integer): Maximum uncore frequency in kHz. +- If left unset and a `cpuClasses` entry named `default` exists, that + `default` class is applied to idle CPUs instead. + +**`cpuClasses`** (list, policy-level configuration): + +User-friendly CPU class definitions. Each class is an object with: + +- `name` (string): Class name referenced by `cpuClass` in balloon types. +- `minFreq` (string): Minimum CPU frequency. Accepts values + with units: `"3.2GHz"`, `"2900MHz"`, `"2900000kHz"`, or a plain + number in kHz written as a quoted string such as `"2900000"` (the CRD schema types frequency fields as strings). Also accepts symbolic names: `"min"` (platform + minimum), `"base"` (CPU base frequency), `"turbo"` (maximum turbo + frequency), which are resolved at runtime from sysfs. +- `maxFreq` (string): Maximum CPU frequency (same format). +- `uncoreMinFreq` / `uncoreMaxFreq` (string): Uncore + frequency limits (same format). - `disabledCstates` (list): C-state names to disable (e.g., `["C6", "C8"]`). - Disabling deep C-states reduces latency by preventing deep sleep. - Disabling intermediate C-states keeps CPU more responsive longer @@ -875,6 +884,17 @@ Each CPU class (keyed by name) can define: not needed. - List available C-states: `grep . /sys/devices/system/cpu/cpu0/cpuidle/state*/name`. +- `energyPerformancePreference` (integer): EPP value for CPUs. +- `freqGovernor` (string): CPUFreq governor (e.g., `"performance"`). +- `turboPriority` (integer): Controls exclusive turbo frequency + access. Among CPU classes with active balloons, only the class + with the highest `turboPriority` gets the symbolic frequency + `"turbo"` resolved to the actual turbo frequency. All other + classes get `"turbo"` resolved to the base frequency.
When the + highest-priority class no longer has active balloons, the next + highest-priority class regains turbo. If all classes have + `turboPriority` 0 (default), every class gets real turbo -- no + competition occurs. ```yaml balloonTypes: @@ -884,6 +904,37 @@ balloonTypes: cpuClass: normal idleCPUClass: powersave +cpuClasses: +- name: turbo + minFreq: "turbo" + maxFreq: "turbo" + disabledCstates: [C6, C8, C10] + turboPriority: 10 +- name: normal + minFreq: "min" + maxFreq: "turbo" + turboPriority: 1 +- name: powersave + minFreq: "min" + maxFreq: "1.2GHz" +``` + +**`control.cpu.classes`** (object, legacy policy-level configuration): + +This is the original low-level CPU class configuration. It continues +to work for backwards compatibility. If a class name is defined in +both `cpuClasses` and `control.cpu.classes`, the `cpuClasses` +definition takes precedence. + +Each CPU class (keyed by name) can define: + +- `minFreq` (integer): Minimum CPU frequency in kHz. +- `maxFreq` (integer): Maximum CPU frequency in kHz. +- `uncoreMinFreq` (integer): Minimum uncore frequency in kHz. +- `uncoreMaxFreq` (integer): Maximum uncore frequency in kHz. +- `disabledCstates` (list): C-state names to disable (e.g., `["C6", "C8"]`). 
+ +```yaml control: cpu: classes: @@ -1352,21 +1403,19 @@ spec: overloadsLevelInBalloon: false # Share L2 between CPUs within balloon # CPU classes for frequency and C-state control - control: - cpu: - classes: - ultra-low-latency: - minFreq: 3500000 - maxFreq: 3900000 - uncoreMinFreq: 2400000 - uncoreMaxFreq: 2400000 - disabledCstates: [C6, C7, C8, C10] - normal: - minFreq: 800000 - maxFreq: 2500000 - powersave: - minFreq: 800000 - maxFreq: 800000 + cpuClasses: + - name: ultra-low-latency + minFreq: "base" + maxFreq: "turbo" + uncoreMinFreq: "2.4GHz" + uncoreMaxFreq: "2.4GHz" + disabledCstates: [C6, C7, C8, C10] + - name: normal + minFreq: "min" + maxFreq: "base" + - name: powersave + minFreq: "min" + maxFreq: "min" # Scheduling for high priority schedulingClasses: diff --git a/pkg/apis/config/v1alpha1/balloons-policy.go b/pkg/apis/config/v1alpha1/balloons-policy.go index 259e1afaa..20eea30d5 100644 --- a/pkg/apis/config/v1alpha1/balloons-policy.go +++ b/pkg/apis/config/v1alpha1/balloons-policy.go @@ -14,6 +14,11 @@ package v1alpha1 +import ( + cpucfg "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/control/cpu" + policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy" +) + var ( _ ResmgrConfig = &BalloonsPolicy{} ) @@ -32,13 +37,45 @@ func (c *BalloonsPolicy) CommonConfig() *CommonConfig { if c == nil { return nil } + ctrl := c.Spec.Control + // Inject user-friendly cpuClasses into control.cpu.classes so + // the CPU controller sees them at startup. CPUClasses entries + // take precedence over identically-named control.cpu.classes. + // Symbolic frequencies (min, base, turbo) are passed as 0 here; + // the balloons policy resolves them at runtime using sysfs data. 
+ if len(c.Spec.CPUClasses) > 0 { + if ctrl.CPU.Classes == nil { + ctrl.CPU.Classes = make(map[string]cpucfg.Class) + } + for _, cc := range c.Spec.CPUClasses { + ctrl.CPU.Classes[cc.Name] = cpucfg.Class{ + MinFreq: freqKHzOrZero(cc.MinFreq), + MaxFreq: freqKHzOrZero(cc.MaxFreq), + EnergyPerformancePreference: cc.EnergyPerformancePreference, + UncoreMinFreq: freqKHzOrZero(cc.UncoreMinFreq), + UncoreMaxFreq: freqKHzOrZero(cc.UncoreMaxFreq), + FreqGovernor: cc.FreqGovernor, + DisabledCstates: cc.DisabledCstates, + } + } + } return &CommonConfig{ - Control: c.Spec.Control, + Control: ctrl, Log: c.Spec.Log, Instrumentation: c.Spec.Instrumentation, } } +// freqKHzOrZero returns the kHz value of a frequency, or 0 if it is +// symbolic (min/base/turbo). Symbolic frequencies are resolved later +// by the policy using actual platform sysfs data. +func freqKHzOrZero(f policyapi.Frequency) uint { + if f.IsSymbolic() { + return 0 + } + return f.KHz() +} + func (c *BalloonsPolicy) PolicyConfig() interface{} { if c == nil { return nil diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go index 496f851a6..5105b8792 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go @@ -32,6 +32,8 @@ type ( CPUTopologyLevel = policy.CPUTopologyLevel ComponentCreationStrategy = policy.ComponentCreationStrategy SchedulingClass = policy.SchedulingClass + CPUClass = policy.CPUClass + Frequency = policy.Frequency ) const ( @@ -135,6 +137,11 @@ type Config struct { // SchedulingClasses specify scheduling classes available in // balloon types. SchedulingClasses []*SchedulingClass `json:"schedulingClasses,omitempty"` + // CPUClasses define CPU frequency, C-state, and turbo + // attributes for CPU classes referenced by balloon types. + // Exclusive turbo frequency access is controlled via + // turboPriority. 
+ CPUClasses []*CPUClass `json:"cpuClasses,omitempty"` } // BalloonDef contains a balloon definition. diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go index 74276ce1d..e4b9ceae1 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go @@ -185,6 +185,17 @@ func (in *Config) DeepCopyInto(out *Config) { } } } + if in.CPUClasses != nil { + in, out := &in.CPUClasses, &out.CPUClasses + *out = make([]*CPUClass, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(CPUClass) + (*in).DeepCopyInto(*out) + } + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Config. diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/cpuclass.go b/pkg/apis/config/v1alpha1/resmgr/policy/cpuclass.go new file mode 100644 index 000000000..0dd6f00e5 --- /dev/null +++ b/pkg/apis/config/v1alpha1/resmgr/policy/cpuclass.go @@ -0,0 +1,59 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package policy + +// CPUClass specifies CPU frequency, C-state, and turbo attributes +// for a CPU class. +// +k8s:deepcopy-gen=true +type CPUClass struct { + // Name of the CPU class. 
+ // +kubebuilder:validation:Required + Name string `json:"name"` + // MinFreq is the minimum CPU frequency for this class. + // Accepts values with units: "3.2GHz", "2900MHz", "2900000kHz", + // or a plain number in kHz. Also accepts symbolic names: "min" + // (platform minimum), "base" (CPU base frequency), "turbo" + // (maximum turbo frequency), resolved at runtime from sysfs. + // When turboPriority is set, "turbo" resolves to actual turbo + // only for the highest-priority active class; others get base. + MinFreq Frequency `json:"minFreq,omitempty"` + // MaxFreq is the maximum CPU frequency for this class. + // Same format and symbolic names as MinFreq. + MaxFreq Frequency `json:"maxFreq,omitempty"` + // EnergyPerformancePreference for CPUs in this class. + // +kubebuilder:validation:Minimum=0 + EnergyPerformancePreference uint `json:"energyPerformancePreference,omitempty"` + // UncoreMinFreq is the minimum uncore frequency for this class. + // Accepts values with units like MinFreq. + UncoreMinFreq Frequency `json:"uncoreMinFreq,omitempty"` + // UncoreMaxFreq is the maximum uncore frequency for this class. + // Accepts values with units like MinFreq. + UncoreMaxFreq Frequency `json:"uncoreMaxFreq,omitempty"` + // FreqGovernor is the CPUFreq governor for this class + // (e.g., "performance", "powersave", "schedutil"). + FreqGovernor string `json:"freqGovernor,omitempty"` + // DisabledCstates lists C-states disabled for CPUs in this class. + // Example: ["C4", "C6", "C8", "C10"] + DisabledCstates []string `json:"disabledCstates,omitempty"` + // TurboPriority controls exclusive turbo frequency access. + // Among CPU classes with active balloons, only the class with + // the highest turboPriority gets the symbolic frequency "turbo" + // resolved to the actual turbo frequency. All other classes get + // "turbo" resolved to the base frequency instead. 
+ // If all classes have turboPriority 0 (default), every class + // gets actual turbo frequencies -- no competition occurs. + // +kubebuilder:validation:Minimum=0 + TurboPriority int `json:"turboPriority,omitempty"` +} diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/frequency.go b/pkg/apis/config/v1alpha1/resmgr/policy/frequency.go new file mode 100644 index 000000000..0095117ec --- /dev/null +++ b/pkg/apis/config/v1alpha1/resmgr/policy/frequency.go @@ -0,0 +1,205 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package policy + +import ( + "encoding/json" + "fmt" + "math" + "regexp" + "strconv" + "strings" +) + +// Frequency represents a CPU frequency value that can be specified +// with human-readable units in YAML/JSON configuration. Supported +// formats: +// - "3.2G" or "3.2GHz" = 3200000 (kHz) +// - "2900M" or "2900MHz" = 2900000 (kHz) +// - "2900000k" or "2900000kHz" = 2900000 (kHz) +// - "2900000" (bare number) = 2900000 (kHz, backwards compatible) +// - 2900000 (JSON number) = 2900000 (kHz, backwards compatible) +// - "min" = platform minimum frequency (resolved at runtime) +// - "base" = CPU base frequency (resolved at runtime) +// - "turbo" = maximum turbo frequency (resolved at runtime) +// +// The internal representation is always in kHz (the unit used by Linux +// kernel sysfs cpufreq interface). 
Symbolic values ("min", "base", +// "turbo") are stored as sentinel constants and must be resolved with +// Resolve() before being passed to the CPU controller. +// +kubebuilder:validation:Type=string +type Frequency uint + +const ( + // FrequencyMin is a sentinel indicating the platform minimum frequency. + FrequencyMin Frequency = math.MaxUint - 2 + // FrequencyBase is a sentinel indicating the CPU base frequency. + FrequencyBase Frequency = math.MaxUint - 1 + // FrequencyTurbo is a sentinel indicating the maximum turbo frequency. + FrequencyTurbo Frequency = math.MaxUint +) + +var frequencyRegexp = regexp.MustCompile(`(?i)^\s*([0-9]*\.?[0-9]+)\s*(GHz|G|MHz|M|kHz|k)?\s*$`) + +// parseFrequency parses a frequency string into kHz. +func parseFrequency(s string) (Frequency, error) { + s = strings.TrimSpace(s) + if s == "" { + return 0, nil + } + + // Check for symbolic frequency names. + switch strings.ToLower(s) { + case "min": + return FrequencyMin, nil + case "base": + return FrequencyBase, nil + case "turbo": + return FrequencyTurbo, nil + } + + matches := frequencyRegexp.FindStringSubmatch(s) + if matches == nil { + return 0, fmt.Errorf("invalid frequency %q: expected number with optional unit (GHz, MHz, kHz) or symbolic name (min, base, turbo)", s) + } + + numStr := matches[1] + unit := strings.ToLower(matches[2]) + + val, err := strconv.ParseFloat(numStr, 64) + if err != nil { + return 0, fmt.Errorf("invalid frequency %q: %w", s, err) + } + if val < 0 { + return 0, fmt.Errorf("invalid frequency %q: negative value", s) + } + + var kHz float64 + switch unit { + case "ghz", "g": + kHz = val * 1_000_000 + case "mhz", "m": + kHz = val * 1_000 + case "khz", "k": + kHz = val + case "": + // Bare number: interpret as kHz for backwards compatibility + // with the existing uint config fields. 
+ kHz = val + } + + result := uint(math.Round(kHz)) + if result == 0 && val > 0 { + return 0, fmt.Errorf("invalid frequency %q: value too small to represent in kHz", s) + } + + return Frequency(result), nil +} + +// UnmarshalJSON implements json.Unmarshaler. Accepts both JSON strings +// with units (e.g., "3.2GHz") and plain JSON numbers (interpreted as kHz). +func (f *Frequency) UnmarshalJSON(data []byte) error { + // Try string first (quoted value with optional unit). + var s string + if err := json.Unmarshal(data, &s); err == nil { + parsed, err := parseFrequency(s) + if err != nil { + return err + } + *f = parsed + return nil + } + + // Try plain number (backwards compatible with uint kHz). + var n float64 + if err := json.Unmarshal(data, &n); err == nil { + if n < 0 { + return fmt.Errorf("invalid frequency: negative value %v", n) + } + *f = Frequency(uint(math.Round(n))) + return nil + } + + return fmt.Errorf("invalid frequency: expected string or number, got %s", string(data)) +} + +// MarshalJSON implements json.Marshaler. Symbolic frequencies are +// marshaled as their string name; numeric values as plain numbers (kHz) +// for backwards compatibility. +func (f Frequency) MarshalJSON() ([]byte, error) { + switch f { + case FrequencyMin: + return json.Marshal("min") + case FrequencyBase: + return json.Marshal("base") + case FrequencyTurbo: + return json.Marshal("turbo") + } + return json.Marshal(uint(f)) +} + +// KHz returns the frequency value in kHz. For symbolic frequencies +// (min, base, turbo) this returns the sentinel value; use Resolve() +// first to obtain the actual platform frequency. +func (f Frequency) KHz() uint { + return uint(f) +} + +// IsSymbolic returns true if this frequency is a symbolic name +// (min, base, or turbo) that requires runtime resolution. 
+func (f Frequency) IsSymbolic() bool { + return f == FrequencyMin || f == FrequencyBase || f == FrequencyTurbo +} + +// Resolve converts a symbolic frequency to its concrete kHz value +// using platform frequency information. For non-symbolic frequencies, +// the value is returned unchanged. The parameters are: +// - minKHz: platform minimum frequency (cpufreq/cpuinfo_min_freq) +// - baseKHz: CPU base frequency (cpufreq/base_frequency) +// - turboKHz: maximum turbo frequency (cpufreq/cpuinfo_max_freq) +func (f Frequency) Resolve(minKHz, baseKHz, turboKHz uint) uint { + switch f { + case FrequencyMin: + return minKHz + case FrequencyBase: + return baseKHz + case FrequencyTurbo: + return turboKHz + } + return uint(f) +} + +// String returns a human-readable representation. +func (f Frequency) String() string { + switch f { + case FrequencyMin: + return "min" + case FrequencyBase: + return "base" + case FrequencyTurbo: + return "turbo" + } + kHz := uint(f) + if kHz == 0 { + return "0" + } + if kHz >= 1_000_000 && kHz%1_000_000 == 0 { + return fmt.Sprintf("%dGHz", kHz/1_000_000) + } + if kHz >= 1_000 && kHz%1_000 == 0 { + return fmt.Sprintf("%dMHz", kHz/1_000) + } + return fmt.Sprintf("%dkHz", kHz) +} diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go b/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go index 3bef85a34..9720ac1a7 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go @@ -20,6 +20,26 @@ package policy import () +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *CPUClass) DeepCopyInto(out *CPUClass) { + *out = *in + if in.DisabledCstates != nil { + in, out := &in.DisabledCstates, &out.DisabledCstates + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CPUClass. +func (in *CPUClass) DeepCopy() *CPUClass { + if in == nil { + return nil + } + out := new(CPUClass) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SchedulingClass) DeepCopyInto(out *SchedulingClass) { *out = *in diff --git a/pkg/resmgr/control/control.go b/pkg/resmgr/control/control.go index 96d958c4c..be66edc1c 100644 --- a/pkg/resmgr/control/control.go +++ b/pkg/resmgr/control/control.go @@ -40,6 +40,11 @@ type Control interface { RunPostUpdateHooks(cache.Container) error // RunPostStopHooks runs the post-stop hooks of all registered controllers. RunPostStopHooks(cache.Container) error + // RunCommit invokes Commit on every running controller. It is meant + // to be called once per NRI request, after all per-container hooks + // have run, so that controllers can flush any deferred state + // changes (e.g. sysfs writes) in a single batch. + RunCommit() error } // Controller is the interface all resource controllers must implement. @@ -58,6 +63,11 @@ type Controller interface { PostUpdateHook(cache.Container) error // PostStopHook is the controller's post-stop hook. PostStopHook(cache.Container) error + // Commit applies any deferred state changes the controller has + // accumulated since the previous Commit. It is called once per + // NRI request, after all per-container hooks have run. Controllers + // with no deferred state should return nil. + Commit() error } // control encapsulates our controller-agnostic runtime state. 
@@ -191,6 +201,23 @@ func (c *control) RunPostStopHooks(container cache.Container) error { return nil } +// RunCommit invokes Commit() on every running controller, collecting +// all errors. Controllers are expected to defer sysfs (or other +// expensive) state changes accumulated during a single NRI request +// and apply them here in one batch. +func (c *control) RunCommit() error { + var errs []error + for _, controller := range c.controllers { + if !controller.running { + continue + } + if err := controller.c.Commit(); err != nil { + errs = append(errs, controlError("%s commit failed: %v", controller.name, err)) + } + } + return errors.Join(errs...) +} + // runhook executes the given container hook according to the controller settings func (c *control) runhook(controller *controller, hook string, container cache.Container) error { if !controller.running { diff --git a/pkg/resmgr/control/cpu/api.go b/pkg/resmgr/control/cpu/api.go index 33bd02025..fb8be183b 100644 --- a/pkg/resmgr/control/cpu/api.go +++ b/pkg/resmgr/control/cpu/api.go @@ -24,10 +24,31 @@ func GetClasses() map[string]Class { return getCPUController().getClasses() } +// SetClass adds or updates a CPU class definition. This allows +// policies to dynamically define CPU classes (e.g., from +// user-friendly CPUClasses configuration) without requiring them +// to be present in the static control.cpu.classes config. +// +// The change is purely in-memory: any CPUs currently assigned to the +// updated class are marked dirty so the next Commit() will re-enforce +// the new definition on them in a single batch. +func SetClass(name string, class Class) { + ctl := getCPUController() + if ctl.classes == nil { + ctl.classes = make(map[string]Class) + } + ctl.classes[name] = class + ctl.markClassDirty(name) +} + // Assign assigns a set of cpus to a class. // -// TODO: Drop this function. Don't store cpu class in policy data but implement -// controller-specific data store in cache. 
+// The assignment is recorded in the cache (so it survives across +// restarts) and the affected CPUs are marked dirty. No sysfs writes +// happen here; the CPU controller's Commit() (invoked once per NRI +// request after all per-container hooks have run) coalesces all +// pending changes into the minimal set of writes needed to reach the +// final desired state. func Assign(c cache.Cache, class string, cpus ...int) error { // NOTE: no locking implemented anywhere around -> we don't expect multiple parallel callers @@ -54,20 +75,7 @@ func Assign(c cache.Cache, class string, cpus ...int) error { setClassAssignments(c, &assignments) - if getCPUController().started { - // We don't want to try to enforce until the controller has been fully - // started. Enforcement of all assignments happens on StarT(), anyway. - ctl := getCPUController() - if err := ctl.enforceCpufreq(class, cpus...); err != nil { - log.Errorf("cpufreq enforcement failed: %v", err) - } - if err := ctl.enforceCstates(class, cpus...); err != nil { - log.Errorf("cstate enforcement failed: %v", err) - } - if err := ctl.enforceUncore(assignments, cpus...); err != nil { - log.Errorf("uncore frequency enforcement failed: %v", err) - } - } + getCPUController().markCPUsDirty(cpus...) return nil } diff --git a/pkg/resmgr/control/cpu/cache.go b/pkg/resmgr/control/cpu/cache.go index 0279dbd2d..3affed367 100644 --- a/pkg/resmgr/control/cpu/cache.go +++ b/pkg/resmgr/control/cpu/cache.go @@ -32,7 +32,10 @@ func getClassAssignments(c cache.Cache) *cpuClassAssignments { a := &cpuClassAssignments{} if !c.GetPolicyEntry(cacheKeyCPUAssignments, a) { - log.Errorf("no cached state of CPU class assignments found") + // Expected on a fresh policy startup: the cache has no + // prior CPU class assignments. Callers create entries + // via Assign() and persist them via setClassAssignments. 
+ log.Debugf("no cached state of CPU class assignments found") } return a diff --git a/pkg/resmgr/control/cpu/cpu.go b/pkg/resmgr/control/cpu/cpu.go index 90159a5fa..05ab0e678 100644 --- a/pkg/resmgr/control/cpu/cpu.go +++ b/pkg/resmgr/control/cpu/cpu.go @@ -45,6 +45,25 @@ type cpuctl struct { cstates *cstates.Cstates // C-states handler uncoreEnabled bool // whether we need to care about uncore started bool + lastFreq map[int]cpufreqState // cpu id -> last successfully written cpufreq values + // dirtyCPUs accumulates CPUs whose desired class definition or + // class assignment has changed since the last Commit(). Writes to + // sysfs are deferred until Commit() so that intermediate bursts + // of Assign()/SetClass() calls within a single policy decision + // do not produce sequences of redundant or temporarily-wrong + // sysfs writes. + dirtyCPUs map[int]bool +} + +// cpufreqState tracks the last successfully written cpufreq values +// for a single CPU. Used to skip redundant sysfs writes. +type cpufreqState struct { + min uint + max uint + governor string + hasMin bool + hasMax bool + hasGov bool } type Class = cfgcpu.Class @@ -90,8 +109,6 @@ func (ctl *cpuctl) Start(cache cache.Cache, cfg *cfgapi.Config) (bool, error) { log.Errorf("failed apply /cpuinitial configuration: %v", err) } - ctl.started = true - return true, nil } @@ -124,35 +141,176 @@ func (ctl *cpuctl) PostStopHook(c cache.Container) error { return nil } -// enforceCpufreq enforces a class-specific cpufreq configuration to a cpuset +// markCPUsDirty records the given CPUs as needing a sysfs re-check at +// the next Commit(). +func (ctl *cpuctl) markCPUsDirty(cpus ...int) { + if ctl.dirtyCPUs == nil { + ctl.dirtyCPUs = make(map[int]bool) + } + for _, c := range cpus { + ctl.dirtyCPUs[c] = true + } +} + +// markClassDirty records every CPU currently assigned to the given +// class as dirty. Used when SetClass changes a class definition that +// already has CPUs assigned to it. 
+func (ctl *cpuctl) markClassDirty(class string) { + if ctl.cache == nil { + return + } + assignments := *getClassAssignments(ctl.cache) + cpus, ok := assignments[class] + if !ok { + return + } + if ctl.dirtyCPUs == nil { + ctl.dirtyCPUs = make(map[int]bool, len(cpus)) + } + for id := range cpus { + ctl.dirtyCPUs[int(id)] = true + } +} + +// Commit flushes deferred per-CPU sysfs updates accumulated since the +// previous Commit. It is the choke point that converts the desired +// state (class definitions in ctl.classes + cached class assignments) +// into the minimal set of sysfs writes needed to reach it. Per-CPU +// writes are still deduplicated against ctl.lastFreq, so even if a +// CPU is marked dirty by multiple intermediate Assign/SetClass calls +// the final value is written at most once. +func (ctl *cpuctl) Commit() error { + if !ctl.started || len(ctl.dirtyCPUs) == 0 { + return nil + } + + assignments := *getClassAssignments(ctl.cache) + + // Group dirty CPUs by their currently assigned class. CPUs that + // no longer appear in any class assignment are skipped: there is + // no class definition to enforce on them. 
+ cpuClass := make(map[int]string, len(ctl.dirtyCPUs)) + for class, cpus := range assignments { + for id := range cpus { + if ctl.dirtyCPUs[int(id)] { + cpuClass[int(id)] = class + } + } + } + + byClass := make(map[string][]int, len(ctl.classes)) + for cpu, class := range cpuClass { + byClass[class] = append(byClass[class], cpu) + } + + var firstErr error + for class, cpus := range byClass { + if _, ok := ctl.classes[class]; !ok { + log.Warnf("commit: class %q (cpus %v) missing from configuration", class, cpus) + continue + } + if err := ctl.enforceCpufreq(class, cpus...); err != nil { + log.Errorf("commit: cpufreq enforcement failed for class %q: %v", class, err) + if firstErr == nil { + firstErr = err + } + } + if err := ctl.enforceCstates(class, cpus...); err != nil { + log.Errorf("commit: cstate enforcement failed for class %q: %v", class, err) + if firstErr == nil { + firstErr = err + } + } + } + + // Uncore is per-die; recompute over all dirty CPUs in one pass. + affectedCPUs := make([]int, 0, len(ctl.dirtyCPUs)) + for cpu := range ctl.dirtyCPUs { + affectedCPUs = append(affectedCPUs, cpu) + } + if err := ctl.enforceUncore(assignments, affectedCPUs...); err != nil { + log.Errorf("commit: uncore enforcement failed: %v", err) + if firstErr == nil { + firstErr = err + } + } + + // Clear the dirty set unconditionally. enforceCpufreq has its own + // per-property lastFreq update logic that avoids re-trying writes + // that keep failing for unchanged desired values. + ctl.dirtyCPUs = nil + + return firstErr +} + +// enforceCpufreq enforces a class-specific cpufreq configuration to a cpuset. +// Per-CPU sysfs writes are skipped when the desired value matches the +// last successfully written value (tracked in ctl.lastFreq). A write +// failure on one CPU/property is logged but does not stop processing +// of remaining CPUs/properties. The first error encountered is +// returned to the caller. 
func (ctl *cpuctl) enforceCpufreq(class string, cpus ...int) error { c, ok := ctl.classes[class] if !ok { return fmt.Errorf("non-existent cpu class %q", class) } + if ctl.lastFreq == nil { + ctl.lastFreq = make(map[int]cpufreqState) + } - if min := int(c.MinFreq); min > 0 { - log.Debugf("enforcing cpu frequency min %d from class %q on %v", min, class, cpus) - if err := utils.SetCPUsScalingMinFreq(cpus, min); err != nil { - return fmt.Errorf("cannot set min freq %d: %w", min, err) + min := uint(c.MinFreq) + max := uint(c.MaxFreq) + governor := c.FreqGovernor + + var firstErr error + for _, cpu := range cpus { + state := ctl.lastFreq[cpu] + + if min > 0 && (!state.hasMin || state.min != min) { + log.Debugf("enforcing cpu frequency min %d from class %q on cpu %d", min, class, cpu) + if err := utils.SetCPUScalingMinFreq(utils.ID(cpu), int(min)); err != nil { + log.Errorf("cannot set min freq %d on cpu %d: %v", min, cpu, err) + if firstErr == nil { + firstErr = err + } + } + // Update the cache even on failure: the desired value + // is unchanged so retrying on every Assign would just + // spam logs without ever succeeding. A subsequent + // configure() resets lastFreq so a real configuration + // change still triggers a fresh attempt. 
+ state.min = min + state.hasMin = true } - } - if max := int(c.MaxFreq); max > 0 { - log.Debugf("enforcing cpu frequency max %d from class %q on %v", max, class, cpus) - if err := utils.SetCPUsScalingMaxFreq(cpus, max); err != nil { - return fmt.Errorf("cannot set max freq %d: %w", max, err) + if max > 0 && (!state.hasMax || state.max != max) { + log.Debugf("enforcing cpu frequency max %d from class %q on cpu %d", max, class, cpu) + if err := utils.SetCPUScalingMaxFreq(utils.ID(cpu), int(max)); err != nil { + log.Errorf("cannot set max freq %d on cpu %d: %v", max, cpu, err) + if firstErr == nil { + firstErr = err + } + } + state.max = max + state.hasMax = true } - } - if governor := c.FreqGovernor; governor != "" { - log.Debugf("enforcing cpu frequency governor %q from class %q on %v", governor, class, cpus) - if err := utils.SetScalingGovernorForCPUs(cpus, governor); err != nil { - return fmt.Errorf("cannot set cpufreq governor %q: %w", governor, err) + if governor != "" && (!state.hasGov || state.governor != governor) { + log.Debugf("enforcing cpu frequency governor %q from class %q on cpu %d", governor, class, cpu) + if err := utils.SetCPUScalingGovernor(utils.ID(cpu), governor); err != nil { + log.Errorf("cannot set cpufreq governor %q on cpu %d: %v", governor, cpu, err) + if firstErr == nil { + firstErr = err + } + } + state.governor = governor + state.hasGov = true } + + ctl.lastFreq[cpu] = state } - return nil + return firstErr } // enforceCstates enforces a class-specific C-state configuration to a cpuset @@ -276,11 +434,37 @@ func idSetIntersects(a, b utils.IDSet) bool { } func (ctl *cpuctl) configure(cfg *cfgapi.Config) error { + // Preserve any class definitions that were pushed via SetClass + // before the controller started. The balloons policy uses + // SetClass to publish CPU class definitions with proper kHz + // values resolved from symbolic frequencies (min/base/turbo). 
+ // CommonConfig() also injects placeholder entries (kHz=0) into + // cfg.CPU.Classes so that controller startup sanity checks see + // the class names. Merge them: cfg-provided classes seed the + // map, then any SetClass-pushed values take precedence. + preserved := ctl.classes ctl.classes = nil ctl.uncoreEnabled = false + // Reset per-CPU last-written cache: a config change may + // alter min/max for the same class, so the next enforce + // pass must actually write to sysfs. + ctl.lastFreq = nil + // Reset the dirty set; we'll re-populate it below with every + // CPU currently assigned to a known class so that the Commit() + // at the end of configure() re-enforces the full desired state. + ctl.dirtyCPUs = nil if cfg != nil && len(cfg.CPU.Classes) != 0 { - ctl.classes = cfg.CPU.Classes + ctl.classes = make(map[string]Class, len(cfg.CPU.Classes)) + for name, c := range cfg.CPU.Classes { + ctl.classes[name] = c + } + } + for name, c := range preserved { + if ctl.classes == nil { + ctl.classes = make(map[string]Class) + } + ctl.classes[name] = c } // Re-configure CPUs that are assigned to some known class @@ -321,15 +505,15 @@ func (ctl *cpuctl) configure(cfg *cfgapi.Config) error { } } - // Configure the system + // Mark every CPU assigned to a known class as dirty so the + // Commit() below re-enforces all per-CPU values in one batch. + // Classes that have disappeared from the configuration are + // preserved in the cache, but their CPUs are not re-enforced + // (see the warning below). 
for class, cpus := range assignments { if _, ok := ctl.classes[class]; ok { - // Re-configure cpus (sysfs) according to new class parameters - if err := ctl.enforceCpufreq(class, cpus.SortedMembers()...); err != nil { - log.Errorf("cpufreq enforcement on re-configure failed: %v", err) - } - if err := ctl.enforceCstates(class, cpus.SortedMembers()...); err != nil { - log.Errorf("cpufreq enforcement on re-configure failed: %v", err) + for id := range cpus { + ctl.markCPUsDirty(int(id)) } } else { // TODO: what should we really do with classes that do not exist in @@ -339,8 +523,13 @@ func (ctl *cpuctl) configure(cfg *cfgapi.Config) error { log.Warnf("class %q with cpus %v missing from the configuration", class, cpus) } } - if err := ctl.enforceUncore(assignments); err != nil { - log.Errorf("uncore frequency enforcement on re-configure failed: %v", err) + + // Set started=true before the in-line Commit() call below: Commit + // gates on this flag (so a stray pre-Start call is a no-op), and + // configure() is invoked from Start() once classes are ready. + ctl.started = true + if err := ctl.Commit(); err != nil { + log.Errorf("initial commit failed: %v", err) } log.Debugf("cpu controller configured") diff --git a/pkg/resmgr/control/e2e-test/e2e-test.go b/pkg/resmgr/control/e2e-test/e2e-test.go index de9076c14..efd209c04 100644 --- a/pkg/resmgr/control/e2e-test/e2e-test.go +++ b/pkg/resmgr/control/e2e-test/e2e-test.go @@ -122,6 +122,11 @@ func (ctl *testctl) PostStopHook(c cache.Container) error { return nil } +// Commit is a no-op for the e2e test controller. +func (ctl *testctl) Commit() error { + return nil +} + // dumpE2ETestControllerState prints internal info used by e2e testing script. 
func (ctl *testctl) dumpE2ETestControllerState(w http.ResponseWriter, req *http.Request) { log.Debugf("output E2E test controller state...") diff --git a/pkg/resmgr/nri.go b/pkg/resmgr/nri.go index 0678fe0bf..3d93cb2bb 100644 --- a/pkg/resmgr/nri.go +++ b/pkg/resmgr/nri.go @@ -1074,6 +1074,9 @@ func (p *nriPlugin) runPostAllocateHooks(method string, created cache.Container) c.PrettyName(), c.GetState()) } } + if err := m.control.RunCommit(); err != nil { + nri.Warnf("%s: controller commit failed: %v", method, err) + } return nil } @@ -1083,6 +1086,9 @@ func (p *nriPlugin) runPostStartHooks(method string, c cache.Container) error { if err := m.control.RunPostStartHooks(c); err != nil { nri.Errorf("%s: post-start hook failed for %s: %v", method, c.PrettyName(), err) } + if err := m.control.RunCommit(); err != nil { + nri.Warnf("%s: controller commit failed: %v", method, err) + } return nil } @@ -1109,5 +1115,8 @@ func (p *nriPlugin) runPostReleaseHooks(method string, released ...cache.Contain method, c.PrettyName(), c.GetState()) } } + if err := m.control.RunCommit(); err != nil { + nri.Warnf("%s: controller commit failed: %v", method, err) + } return nil } diff --git a/pkg/resmgr/resource-manager.go b/pkg/resmgr/resource-manager.go index 28e5b9434..6537e9849 100644 --- a/pkg/resmgr/resource-manager.go +++ b/pkg/resmgr/resource-manager.go @@ -331,6 +331,10 @@ func (m *resmgr) reconfigure(cfg cfgapi.ResmgrConfig) error { log.Warnf("failed to apply configuration to containers: %v", err) } + if err := m.control.RunCommit(); err != nil { + log.Warnf("failed to commit controller state after reconfigure: %v", err) + } + return nil } diff --git a/pkg/sysfs/system.go b/pkg/sysfs/system.go index 394b39d55..16db6ea29 100644 --- a/pkg/sysfs/system.go +++ b/pkg/sysfs/system.go @@ -292,8 +292,10 @@ var ( PerformanceCore: "OVERRIDE_SYS_CORE_CPUS", EfficientCore: "OVERRIDE_SYS_ATOM_CPUS", } - cacheEnvOverridesVar = "OVERRIDE_SYS_CACHES" - cacheEnvOverridesJson = 
os.Getenv(cacheEnvOverridesVar) + cacheEnvOverridesVar = "OVERRIDE_SYS_CACHES" + cacheEnvOverridesJson = os.Getenv(cacheEnvOverridesVar) + cpufreqEnvOverridesVar = "OVERRIDE_SYS_CPUFREQ" + cpufreqEnvOverridesJson = os.Getenv(cpufreqEnvOverridesVar) ) // MemInfo contains data read from a NUMA node meminfo file. @@ -338,6 +340,16 @@ type cacheOverride struct { var cacheEnvOverrides map[int][]*Cache +// cpufreqOverride specifies frequency values to use instead of reading sysfs. +type cpufreqOverride struct { + Cpus string `json:"cpus"` // CPU ids in list format, e.g. "0-15" + Base uint64 `json:"base"` // base frequency (kHz) + Min uint64 `json:"min"` // minimum frequency (kHz) + Max uint64 `json:"max"` // maximum/turbo frequency (kHz) +} + +var cpufreqEnvOverrides map[int]CPUFreq + // SetSysRoot sets the sys root directory. func SetSysRoot(root string) { if root != "" { @@ -1063,6 +1075,10 @@ func (sys *system) discoverCPU(path string) error { if _, err := readSysfsEntry(path, "cpufreq/cpuinfo_max_freq", &cpu.freq.Max); err != nil { cpu.freq.Max = 0 } + // Apply cpufreq overrides from OVERRIDE_SYS_CPUFREQ if set. + if err := sys.applyCpufreqOverrides(cpu); err != nil { + log.Warnf("failed to apply cpufreq overrides for cpu%d: %v", cpu.id, err) + } if _, err := readSysfsEntry(path, "cpufreq/energy_performance_preference", &cpu.epp); err != nil { cpu.epp = EPPUnknown } @@ -2082,7 +2098,45 @@ func (sys *system) discoverCacheFromOverrides(cpu *cpu) (bool, error) { return false, nil } -// Discover cache associated with the given CPU. +// applyCpufreqOverrides overrides CPU frequency values from OVERRIDE_SYS_CPUFREQ. 
+func (sys *system) applyCpufreqOverrides(cpu *cpu) error { + if cpufreqEnvOverridesJson == "" { + return nil + } + if cpufreqEnvOverrides == nil { + sys.Debugf("parsing cpufreq overrides from %s=%q", cpufreqEnvOverridesVar, cpufreqEnvOverridesJson) + overrides, err := parseCpufreqOverrides(cpufreqEnvOverridesJson) + if err != nil { + return fmt.Errorf("failed to parse %s: %v", cpufreqEnvOverridesVar, err) + } + cpufreqEnvOverrides = overrides + } + if freq, ok := cpufreqEnvOverrides[cpu.id]; ok { + sys.Debugf("cpufreq override for cpu%d: base=%d min=%d max=%d", cpu.id, freq.Base, freq.Min, freq.Max) + cpu.freq = freq + } + return nil +} + +// parseCpufreqOverrides parses JSON cpufreq overrides into a per-CPU map. +func parseCpufreqOverrides(jsonData string) (map[int]CPUFreq, error) { + var overrides []cpufreqOverride + if err := json.Unmarshal([]byte(jsonData), &overrides); err != nil { + return nil, err + } + result := make(map[int]CPUFreq) + for _, o := range overrides { + cpus, err := idset.NewIDSetFromString(o.Cpus) + if err != nil { + return nil, fmt.Errorf("invalid CPU list %q: %v", o.Cpus, err) + } + freq := CPUFreq{Base: o.Base, Min: o.Min, Max: o.Max} + for cpu := range cpus { + result[cpu] = freq + } + } + return result, nil +} func (sys *system) discoverCache(cpu *cpu, path string) error { var id idset.ID diff --git a/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in b/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in index 8ebcfaacb..aa11dcf33 100644 --- a/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in +++ b/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in @@ -61,19 +61,29 @@ spec: debug: - policy - control: - cpu: - classes: - default: - minFreq: ${CPU_DEFAULT_MIN:-800000} - maxFreq: ${CPU_DEFAULT_MAX:-2800000} - classA: - minFreq: ${CPU_CLASSA_MIN:-900000} - maxFreq: ${CPU_CLASSA_MAX:-2900000} - classB: - minFreq: ${CPU_CLASSB_MIN:-1000000} - maxFreq: ${CPU_CLASSB_MAX:-3000000} - classC: - minFreq: 
${CPU_CLASSC_MIN:-1100000} - maxFreq: ${CPU_CLASSC_MAX:-3100000} - energyPerformancePreference: ${CPU_CLASSC_EPP:-1} + cpuClasses: + + $([ -n "$CPUCLASS_DEFAULT_SKIP" ] || echo " + - name: default + minFreq: ${CPU_DEFAULT_MIN:-800MHz} + maxFreq: ${CPU_DEFAULT_MAX:-2.8GHz} + ") + + $([ -n "$CPUCLASS_A_SKIP" ] || echo " + - name: classA + minFreq: ${CPU_CLASSA_MIN:-900MHz} + maxFreq: ${CPU_CLASSA_MAX:-2.9GHz} + ") + + $([ -n "$CPUCLASS_B_SKIP" ] || echo " + - name: classB + minFreq: ${CPU_CLASSB_MIN:-1GHz} + maxFreq: ${CPU_CLASSB_MAX:-3GHz} + ") + + $([ -n "$CPUCLASS_C_SKIP" ] || echo " + - name: classC + minFreq: ${CPU_CLASSC_MIN:-1.1GHz} + maxFreq: ${CPU_CLASSC_MAX:-3.1GHz} + energyPerformancePreference: ${CPU_CLASSC_EPP:-1} + ") diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg index 215432bf1..a89a699d2 100644 --- a/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg +++ b/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg @@ -16,13 +16,11 @@ config: cpuClass: lowlatency-class schedulingClass: realtime - control: - cpu: - classes: - lowlatency-class: - disabledCstates: [C4, C6, C8, C10] - default-class: - disabledCstates: [] + cpuClasses: + - name: lowlatency-class + disabledCstates: [C4, C6, C8, C10] + - name: default-class + disabledCstates: [] schedulingClasses: - name: realtime diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-defaultclass.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-defaultclass.cfg new file mode 100644 index 000000000..eb2a2b9cc --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-defaultclass.cfg @@ -0,0 +1,35 @@ +config: + agent: + nodeResourceTopology: true + 
allocatorTopologyBalancing: false + availableResources: + cpu: cpuset:2-7,10-13 + reservedResources: + cpu: 750m + + pinCPU: true + + # Intentionally no idleCPUClass and no cpuClass on the reserved + # balloon type: both must fall back to the cpuClass named "default". + balloonTypes: + - name: reserved + - name: fast-bln + cpuClass: fast + minCPUs: 1 + maxCPUs: 1 + + cpuClasses: + - name: default + minFreq: "min" + maxFreq: "base" + - name: fast + minFreq: "turbo" + maxFreq: "turbo" + + log: + debug: + - policy + - nri-plugin + - cpu +extraEnv: + OVERRIDE_SYS_CPUFREQ: '''[{"cpus": "0-15", "base": 2900000, "min": 800000, "max": 3800000}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-oldsyntax.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-oldsyntax.cfg new file mode 100644 index 000000000..61f2888fe --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-oldsyntax.cfg @@ -0,0 +1,36 @@ +config: + agent: + nodeResourceTopology: true + allocatorTopologyBalancing: false + availableResources: + cpu: cpuset:2-7,10-13 + reservedResources: + cpu: 750m + + pinCPU: true + + idleCPUClass: legacy-idle + + balloonTypes: + - name: legacy-bln + cpuClass: legacy-fast + minCPUs: 1 + maxCPUs: 1 + + control: + cpu: + classes: + legacy-idle: + minFreq: 800000 + maxFreq: 2900000 + legacy-fast: + minFreq: 3800000 + maxFreq: 3800000 + + log: + debug: + - policy + - nri-plugin + - cpu +extraEnv: + OVERRIDE_SYS_CPUFREQ: '''[{"cpus": "0-15", "base": 2900000, "min": 800000, "max": 3800000}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo.cfg new file mode 100644 index 000000000..e73f41a37 --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo.cfg @@ -0,0 +1,48 @@ +config: + agent: 
+ nodeResourceTopology: true + allocatorTopologyBalancing: false + availableResources: + cpu: cpuset:2-7,10-13 + reservedResources: + cpu: 750m + + pinCPU: true + + idleCPUClass: default-noturbo + + balloonTypes: + - name: reserved + cpuClass: default-turbo + - name: turbo-high-bln + cpuClass: turbo-high + minCPUs: 1 + maxCPUs: 2 + - name: turbo-low-bln + cpuClass: turbo-low + minCPUs: 1 + maxCPUs: 2 + + cpuClasses: + - name: turbo-high + minFreq: "turbo" + maxFreq: "turbo" + turboPriority: 10 + - name: turbo-low + minFreq: "turbo" + maxFreq: "turbo" + turboPriority: 1 + - name: default-turbo + minFreq: "min" + maxFreq: "turbo" + - name: default-noturbo + minFreq: "min" + maxFreq: "base" + + log: + debug: + - policy + - nri-plugin + - cpu +extraEnv: + OVERRIDE_SYS_CPUFREQ: '''[{"cpus": "0-15", "base": 2900000, "min": 800000, "max": 3800000}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/code.var.sh b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/code.var.sh new file mode 100644 index 000000000..c840cfba5 --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/code.var.sh @@ -0,0 +1,495 @@ +# Test turbo priority: highest-priority active CPU class gets turbo, +# others get base. When the highest-priority balloon is removed, +# the next highest-priority class regains turbo. +# +# Also verifies CPU frequency write minimality: +# - no duplicate sysfs writes (each (cpu, prop, freq) tuple is logged +# at most once per recorded snapshot window, thanks to the per-CPU +# last-written cache in pkg/resmgr/control/cpu), +# - writes do happen on class transitions (turbo<->base) and when +# idle CPUs need their initial class settings, +# - a no-op event (creating a 2nd container that lands in the +# *same* turbo-low balloon as pod0) does not produce any new +# enforce writes. 
+ +helm-terminate +helm_config=$TEST_DIR/balloons-turbo.cfg helm-launch balloons + +# turbo-log fetches the latest turbo recalculation log lines +turbo-log() { + local last_n=${1:-20} + vm-command "kubectl -n kube-system logs ds/nri-resource-policy-balloons | grep -E 'turbo:|cpuClass' | tail -n $last_n" +} + +# verify-turbo-winner checks that the given class is logged as a turbo winner +# with the expected maxFreq, within the last N turbo log lines. +verify-turbo-winner() { + local class=$1 + local expected_max_freq=$2 + local last_n=${3:-20} + echo "verify turbo winner: class=$class maxFreq=$expected_max_freq" + turbo-log $last_n + grep "class \"$class\"" <<< "$COMMAND_OUTPUT" | grep "winner=true" | tail -n 1 | grep -q "maxFreq=$expected_max_freq" || { + command-error "expected class $class as turbo winner with maxFreq=$expected_max_freq" + } +} + +# verify-turbo-loser checks that the given class is logged as NOT a turbo winner +# (winner=false) with the expected maxFreq (base), within the last N turbo log lines. +verify-turbo-loser() { + local class=$1 + local expected_max_freq=$2 + local last_n=${3:-20} + echo "verify turbo loser: class=$class maxFreq=$expected_max_freq" + turbo-log $last_n + grep "class \"$class\"" <<< "$COMMAND_OUTPUT" | grep "winner=false" | tail -n 1 | grep -q "maxFreq=$expected_max_freq" || { + command-error "expected class $class as turbo loser with maxFreq=$expected_max_freq" + } +} + +ENFORCE_PATTERN='enforcing cpu frequency' + +# enforce-count returns the total number of "enforcing cpu frequency" log lines so far. +enforce-count() { + vm-command "kubectl -n kube-system logs ds/nri-resource-policy-balloons | grep -c '$ENFORCE_PATTERN' || true" >/dev/null + echo "$COMMAND_OUTPUT" | tr -d '[:space:]' +} + +# wait-enforce-grows BASELINE [timeout=15] +# Polls until the cumulative number of enforce writes is greater than BASELINE.
+wait-enforce-grows() { + local baseline=$1 + local timeout=${2:-15} + vm-run-until --timeout "$timeout" \ + "[ \$(kubectl -n kube-system logs ds/nri-resource-policy-balloons 2>/dev/null | grep -c '$ENFORCE_PATTERN') -gt $baseline ]" || { + command-error "expected enforce-count to grow above $baseline within ${timeout}s" + } +} + +# wait-pod-gone POD [timeout=30] +# Polls until the named pod no longer exists. +wait-pod-gone() { + local pod=$1 + local timeout=${2:-30} + vm-run-until --timeout "$timeout" "! kubectl get pod $pod -o name 2>/dev/null | grep -q ." || { + command-error "pod $pod did not disappear within ${timeout}s" + } +} + +# enforce-lines-since prints the enforce log lines added since the given absolute count. +enforce-lines-since() { + local from=$1 + vm-command "kubectl -n kube-system logs ds/nri-resource-policy-balloons | grep '$ENFORCE_PATTERN' | tail -n +$((from+1))" >/dev/null +} + +# assert-step-writes