2 changes: 1 addition & 1 deletion cmd/plugins/balloons/policy/cputree_test.go
@@ -651,7 +651,7 @@ func TestWalk(t *testing.T) {
foundLevel := CPUTopologyLevelUndefined
rv := tree.DepthFirstWalk(func(tn *cpuTreeNode) error {
foundName = tn.name
foundLevel = string(tn.level)
foundLevel = tn.level
return nil
})
if rv != nil {
30 changes: 30 additions & 0 deletions cmd/plugins/topology-aware/policy/node.go
@@ -17,6 +17,7 @@ package topologyaware
import (
"fmt"

cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware"
system "github.com/containers/nri-plugins/pkg/sysfs"
"github.com/containers/nri-plugins/pkg/topology"
idset "github.com/intel/goresctrl/pkg/utils"
@@ -50,6 +51,35 @@ const (
VirtualNode NodeKind = "virtual node"
)

// TopologyLevel returns the topology level for this node.
func (k NodeKind) TopologyLevel() cfgapi.CPUTopologyLevel {
switch k {
case VirtualNode:
return cfgapi.CPUTopologyLevelSystem
case SocketNode:
return cfgapi.CPUTopologyLevelPackage
case DieNode:
return cfgapi.CPUTopologyLevelDie
case NumaNode:
return cfgapi.CPUTopologyLevelNuma
}
return cfgapi.CPUTopologyLevelUndefined
}

func NodeKindForTopologyLevel(level cfgapi.CPUTopologyLevel) NodeKind {
switch level {
case cfgapi.CPUTopologyLevelSystem:
return VirtualNode
case cfgapi.CPUTopologyLevelPackage:
return SocketNode
case cfgapi.CPUTopologyLevelDie:
return DieNode
case cfgapi.CPUTopologyLevelNuma:
return NumaNode
}
return UnknownNode
}

const (
// OverfitPenalty is the per layer penalty for overfitting in the node tree.
OverfitPenalty = 0.9
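The two new helpers are deliberately symmetric: NodeKindForTopologyLevel is the inverse of TopologyLevel for the four defined kinds, with UnknownNode and CPUTopologyLevelUndefined as the fallbacks. A self-contained sketch of the round trip, using stand-in types and assumed constant values (only VirtualNode's value is visible in this diff):

```go
package main

import "fmt"

// Stand-ins for the policy's NodeKind and cfgapi.CPUTopologyLevel types.
type NodeKind string
type CPUTopologyLevel string

const (
	UnknownNode NodeKind = ""            // assumed zero value
	VirtualNode NodeKind = "virtual node"
	SocketNode  NodeKind = "socket node" // assumed value
	DieNode     NodeKind = "die node"    // assumed value
	NumaNode    NodeKind = "numa node"   // assumed value

	CPUTopologyLevelUndefined CPUTopologyLevel = ""        // assumed
	CPUTopologyLevelSystem    CPUTopologyLevel = "system"  // assumed
	CPUTopologyLevelPackage   CPUTopologyLevel = "package" // assumed
	CPUTopologyLevelDie       CPUTopologyLevel = "die"     // assumed
	CPUTopologyLevelNuma      CPUTopologyLevel = "numa"    // assumed
)

// toLevel mirrors NodeKind.TopologyLevel() from the diff above.
func toLevel(k NodeKind) CPUTopologyLevel {
	switch k {
	case VirtualNode:
		return CPUTopologyLevelSystem
	case SocketNode:
		return CPUTopologyLevelPackage
	case DieNode:
		return CPUTopologyLevelDie
	case NumaNode:
		return CPUTopologyLevelNuma
	}
	return CPUTopologyLevelUndefined
}

// toKind mirrors NodeKindForTopologyLevel() from the diff above.
func toKind(l CPUTopologyLevel) NodeKind {
	switch l {
	case CPUTopologyLevelSystem:
		return VirtualNode
	case CPUTopologyLevelPackage:
		return SocketNode
	case CPUTopologyLevelDie:
		return DieNode
	case CPUTopologyLevelNuma:
		return NumaNode
	}
	return UnknownNode
}

func main() {
	// Every defined kind survives the round trip.
	for _, k := range []NodeKind{VirtualNode, SocketNode, DieNode, NumaNode} {
		fmt.Printf("%q -> %q -> %q\n", k, toLevel(k), toKind(toLevel(k)))
	}
}
```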
71 changes: 55 additions & 16 deletions cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -17,6 +17,7 @@ package topologyaware
import (
"encoding/json"
"fmt"
"math"
"path/filepath"
"strconv"
"strings"
@@ -27,6 +28,7 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware"
"github.com/containers/nri-plugins/pkg/kubernetes"
"github.com/containers/nri-plugins/pkg/resmgr/cache"
libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
@@ -58,6 +60,8 @@ const (
hideHyperthreadsKey = keyHideHyperthreads + "." + kubernetes.ResmgrKeyNamespace
// effective annotation key for picking resources by topology hints
pickResourcesByHints = keyPickResourcesByHints + "." + kubernetes.ResmgrKeyNamespace

unlimitedCPU = math.MaxInt // 'unlimited' burstable CPU limit
)

type prefKind int
@@ -308,10 +312,11 @@ func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) {
// Returned values:
// 1. full: number of full CPUs
// 2. fraction: amount of fractional CPU in milli-CPU
// 3. isolate: (bool) whether to prefer isolated full CPUs
// 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
// 5. cpuPrio: preferred CPU allocator priority for CPU allocation.
func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass, cpuPrio) {
// 3. limit: CPU limit for this container
// 4. isolate: (bool) whether to prefer isolated full CPUs
// 5. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
// 6. cpuPrio: preferred CPU allocator priority for CPU allocation.
func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, int, bool, cpuClass, cpuPrio) {
//
// CPU allocation preferences for a container consist of
//
@@ -381,52 +386,68 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in
qosClass := pod.GetQOSClass()
fraction := int(request.MilliValue())
prio := defaultPrio // ignored for fractional allocations
limit := 0

switch qosClass {
case corev1.PodQOSBestEffort:
case corev1.PodQOSBurstable:
if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
limit = int(lim.MilliValue())
} else {
limit = unlimitedCPU
}
case corev1.PodQOSGuaranteed:
if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
limit = int(lim.MilliValue())
}
}

// easy cases: kube-system namespace, Burstable or BestEffort QoS class containers
preferReserved, explicitReservation := checkReservedCPUsAnnotations(container)
switch {
case container.PreserveCpuResources():
return 0, fraction, false, cpuPreserve, prio
return 0, fraction, limit, false, cpuPreserve, prio
case preferReserved:
return 0, fraction, false, cpuReserved, prio
return 0, fraction, limit, false, cpuReserved, prio
case checkReservedPoolNamespaces(namespace) && !explicitReservation:
return 0, fraction, false, cpuReserved, prio
return 0, fraction, limit, false, cpuReserved, prio
case qosClass == corev1.PodQOSBurstable:
return 0, fraction, false, cpuNormal, prio
return 0, fraction, limit, false, cpuNormal, prio
case qosClass == corev1.PodQOSBestEffort:
return 0, 0, false, cpuNormal, prio
return 0, 0, 0, false, cpuNormal, prio
}

// complex case: Guaranteed QoS class containers
cores := fraction / 1000
fraction = fraction % 1000
limit = 1000*cores + fraction
preferIsolated, isolPrefKind := isolatedCPUsPreference(pod, container)
preferShared, sharedPrefKind := sharedCPUsPreference(pod, container)
prio = cpuPrioPreference(pod, container, defaultPrio) // ignored for fractional allocations

switch {
case cores == 0: // sub-core CPU request
return 0, fraction, false, cpuNormal, prio
return 0, fraction, limit, false, cpuNormal, prio
case cores < 2: // 1 <= CPU request < 2
if preferShared {
return 0, 1000*cores + fraction, false, cpuNormal, prio
return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
}
// potentially mixed allocation (1 core + some fraction)
return cores, fraction, preferIsolated, cpuNormal, prio
return cores, fraction, limit, preferIsolated, cpuNormal, prio
default: // CPU request >= 2
// fractional allocation, only mixed if explicitly annotated as unshared
if fraction > 0 {
if !preferShared && sharedPrefKind == prefAnnotated {
return cores, fraction, preferIsolated, cpuNormal, prio
return cores, fraction, limit, preferIsolated, cpuNormal, prio
}
return 0, 1000*cores + fraction, false, cpuNormal, prio
return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
}
// non-fractional allocation
if preferShared {
return 0, 1000 * cores, false, cpuNormal, prio
return 0, 1000 * cores, limit, false, cpuNormal, prio
}
// for multiple cores, isolated preference must be explicitly annotated
return cores, 0, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
return cores, 0, limit, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
}
}
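The net effect of the new QoS switch, condensed into one hedged sketch (a plain string stands in for corev1.PodQOSClass, and hasLimit/limitMilli stand in for the reqs.Limits[corev1.ResourceCPU] lookup):

```go
package main

import (
	"fmt"
	"math"
)

// unlimitedCPU marks a burstable container with no CPU limit, as in the diff.
const unlimitedCPU = math.MaxInt

// cpuLimit condenses the QoS switch added to cpuAllocationPreferences.
// All CPU quantities are milli-CPU.
func cpuLimit(qos string, requestMilli, limitMilli int, hasLimit bool) int {
	switch qos {
	case "BestEffort":
		return 0 // no request, no limit
	case "Burstable":
		if hasLimit {
			return limitMilli
		}
		return unlimitedCPU // may burst as far as the pool allows
	case "Guaranteed":
		// request == limit by definition; the diff recomputes it as
		// 1000*cores + fraction from the request
		return requestMilli
	}
	return 0
}

func main() {
	fmt.Println(cpuLimit("Burstable", 500, 2000, true))   // 2000
	fmt.Println(cpuLimit("Burstable", 500, 0, false))     // unlimitedCPU
	fmt.Println(cpuLimit("Guaranteed", 1500, 1500, true)) // 1500
}
```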

@@ -472,6 +493,24 @@ func pickByHintsPreference(pod cache.Pod, container cache.Container) bool {
return pick
}

// unlimitedBurstablePreference returns the preferred unlimited burstable topology level.
func (p *policy) unlimitedBurstablePreference(container cache.Container) cfgapi.CPUTopologyLevel {
prefer, ok := container.GetEffectiveAnnotation(cache.UnlimitedBurstableKey)
if !ok {
return opt.UnlimitedBurstable
}

level := cfgapi.CPUTopologyLevel(prefer)
if level.Value() == cfgapi.CPUTopologyLevelUndefined.Value() {
log.Errorf("ignoring invalid annotated burstable preference %q", prefer)
level = opt.UnlimitedBurstable
} else {
level = p.findExistingTopologyLevel(level)
}

return level
}

// String stringifies a cpuClass.
func (t cpuClass) String() string {
if cpuClassName, ok := cpuClassNames[t]; ok {
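A minimal sketch of the resolution order in unlimitedBurstablePreference, with stand-in types and assumed level values; the real code additionally clamps a valid level to one that exists on the node via findExistingTopologyLevel, which is stubbed out here:

```go
package main

import "fmt"

type CPUTopologyLevel string

// Assumed string values; only the fallback behavior matters for this sketch.
const (
	LevelUndefined CPUTopologyLevel = ""
	LevelPackage   CPUTopologyLevel = "package"
	LevelDie       CPUTopologyLevel = "die"
)

var validLevels = map[CPUTopologyLevel]bool{LevelPackage: true, LevelDie: true}

// resolvePreference mirrors unlimitedBurstablePreference: a valid effective
// annotation overrides the configured default, anything else falls back to it.
func resolvePreference(annotation string, annotated bool, configured CPUTopologyLevel) CPUTopologyLevel {
	if !annotated {
		return configured
	}
	level := CPUTopologyLevel(annotation)
	if !validLevels[level] {
		fmt.Printf("ignoring invalid annotated burstable preference %q\n", annotation)
		return configured
	}
	return level // real code: p.findExistingTopologyLevel(level)
}

func main() {
	fmt.Println(resolvePreference("die", true, LevelPackage))   // die
	fmt.Println(resolvePreference("bogus", true, LevelPackage)) // package
	fmt.Println(resolvePreference("", false, LevelPackage))     // package
}
```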
2 changes: 1 addition & 1 deletion cmd/plugins/topology-aware/policy/pod-preferences_test.go
@@ -1052,7 +1052,7 @@ func TestCpuAllocationPreferences(t *testing.T) {
}
opt.PreferIsolated, opt.PreferShared = &tc.preferIsolated, &tc.preferShared
opt.ReservedPoolNamespaces = tc.reservedPoolNamespaces
full, fraction, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
full, fraction, _, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
require.Equal(t, tc.expectedFull, full, "full CPU cores")
require.Equal(t, tc.expectedFraction, fraction, "CPU core fraction")
require.Equal(t, tc.expectedIsolate, isolate, "isolation preference")
108 changes: 94 additions & 14 deletions cmd/plugins/topology-aware/policy/pools.go
@@ -20,6 +20,7 @@ import (
"sort"

"github.com/containers/nri-plugins/pkg/utils/cpuset"
corev1 "k8s.io/api/core/v1"

"github.com/containers/nri-plugins/pkg/resmgr/cache"
libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
@@ -673,7 +674,9 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
// - if we have topology hints
// * better hint score wins
// * for a tie, prefer the lower node then the smaller id
// - if we have a better matching or tighter fitting memory offer, it wins
// - if we have a better matching memory offer, it wins
// - if we have a burstable container, sufficient capacity for the limit wins
// - if we have a tighter fitting memory offer, it wins
// - if only one node matches the memory type request, it wins
// - for low-prio and high-prio CPU preference, if only one node has such CPUs, it wins
// - if a node is lower in the tree it wins
@@ -772,7 +775,7 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
}
}

// better matching or tighter memory offer wins
// better matching offer wins
switch {
case o1 != nil && o2 == nil:
log.Debug(" => %s loses on memory offer (failed offer)", node2.Name())
@@ -809,22 +812,99 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
}
log.Debug(" - memory offers burstability are a TIE")
}
}

if m1.Size() < m2.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node2.Name(), m2, m1)
return true
}
if m2.Size() < m1.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node1.Name(), m1, m2)
return false
}
if m2.Size() == m1.Size() {
log.Debug(" - memory offers are a TIE (%s vs. %s)", m1, m2)
if ctr := request.GetContainer(); ctr.GetQOSClass() == corev1.PodQOSBurstable {
var (
limit = request.CPULimit()
b1 = score1.Supply().AllocatableSharedCPU()
b2 = score2.Supply().AllocatableSharedCPU()
r1 = b1 - limit
r2 = b2 - limit
)

log.Debug(" - CPU burstability %s=%d, %s=%d, limit=%d",
node1.Name(), b1, node2.Name(), b2, limit)

if limit != unlimitedCPU {
// prefer pool with enough burstable capacity
switch {
case r1 >= 0 && r2 < 0:
log.Debug(" - %s loses on insufficient CPU burstability (%d vs. %d for limit %d)",
node2.Name(), b1, b2, limit)
return true
case r2 >= 0 && r1 < 0:
log.Debug(" - %s loses on insufficient CPU burstability", node1.Name())
return false
default:
log.Debug(" - CPU burstability is a TIE")
}
} else {
// prefer a pool at configured topology level with more burstable capacity
var (
level1 = node1.Kind().TopologyLevel()
level2 = node2.Kind().TopologyLevel()
target = node1.Policy().unlimitedBurstablePreference(ctr)
)

log.Debug(" - unlimited CPU burstable topology level: %s", target)
log.Debug(" - %s topology level: %s", node1.Name(), level1)
log.Debug(" - %s topology level: %s", node2.Name(), level2)

switch {
case level1 == target && level2 != target:
log.Debug(" - %s WINS on burstability topology level (%s)",
node1.Name(), target)
return true
case level2 == target && level1 != target:
log.Debug(" - %s WINS on burstability topology level (%s)",
node2.Name(), target)
return false
case level1 == target && level2 == target:
log.Debug(" - burstability topology level (%s) is a TIE", target)
if b1 > b2 {
log.Debug(" - %s WINS on more CPU burstability", node1.Name())
return true
}
if b2 > b1 {
log.Debug(" - %s WINS on more CPU burstability", node2.Name())
return false
}
log.Debug(" - CPU burstability is a TIE")
return id1 < id2
default:
if level1.Value() > target.Value() && level2.Value() < target.Value() {
log.Debug(" - %s WINS on CPU burstability limit (%s, limit %s)",
node1.Name(), level1, target)
return true
}
if level2.Value() > target.Value() && level1.Value() < target.Value() {
log.Debug(" - %s WINS on CPU burstability limit (%s, limit %s)",
node2.Name(), level2, target)
return false
}
log.Debug(" - CPU burstability limit is a TIE")
return id1 < id2
}
}
}

// tighter memory offer wins
m1, m2 := o1.NodeMask(), o2.NodeMask()
if m1.Size() < m2.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node2.Name(), m2, m1)
return true
}
if m2.Size() < m1.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node1.Name(), m1, m2)
return false
}
if m2.Size() == m1.Size() {
log.Debug(" - memory offers are a TIE (%s vs. %s)", m1, m2)
}

// matching memory type wins
if reqType := request.MemoryType(); reqType != memoryUnspec && reqType != memoryPreserve {
if node1.HasMemoryType(reqType) && !node2.HasMemoryType(reqType) {
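The burstable branch of compareScores, condensed into a hedged sketch with plain ints instead of the policy's Node and Score types (the default sub-case comparing levels on either side of the target is omitted for brevity): a positive result means the first pool wins, negative the second, and zero means a tie, which the real code resolves with the node-id order or the remaining comparisons.

```go
package main

import (
	"fmt"
	"math"
)

const unlimitedCPU = math.MaxInt

// compareBurstable condenses the comparison added above: for a finite limit,
// a pool whose allocatable shared milli-CPU covers the whole limit wins; for
// an unlimited one, a pool at the configured topology level wins, with more
// shared capacity breaking the tie.
func compareBurstable(b1, b2, limit int, atTarget1, atTarget2 bool) int {
	if limit != unlimitedCPU {
		switch {
		case b1 >= limit && b2 < limit:
			return 1
		case b2 >= limit && b1 < limit:
			return -1
		}
		return 0 // both (or neither) can cover the limit
	}
	switch {
	case atTarget1 && !atTarget2:
		return 1
	case atTarget2 && !atTarget1:
		return -1
	case atTarget1 && atTarget2:
		return b1 - b2 // more allocatable shared CPU wins
	}
	return 0
}

func main() {
	fmt.Println(compareBurstable(4000, 1500, 2000, false, false))      // 1: only pool 1 covers the limit
	fmt.Println(compareBurstable(4000, 6000, unlimitedCPU, true, true)) // -2000: pool 2 has more headroom
}
```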
11 changes: 10 additions & 1 deletion cmd/plugins/topology-aware/policy/resources.go
@@ -107,6 +107,8 @@ type Request interface {
FullCPUs() int
// CPUFraction returns the amount of fractional milli-CPU requested.
CPUFraction() int
// CPULimit returns the CPU limit in milli-CPU, unlimitedCPU if no limit is set.
CPULimit() int
// Isolate returns whether isolated CPUs are preferred for this request.
Isolate() bool
// MemoryType returns the type(s) of requested memory.
@@ -223,6 +225,7 @@ type request struct {
container cache.Container // container for this request
full int // number of full CPUs requested
fraction int // amount of fractional CPU requested
limit int // CPU limit, MaxInt for no limit
isolate bool // prefer isolated exclusive CPUs
cpuType cpuClass // preferred CPU type (normal, reserved)
prio cpuPrio // CPU priority preference, ignored for fraction requests
@@ -715,7 +718,7 @@ func prettyMem(value int64) string {
// newRequest creates a new request for the given container.
func newRequest(container cache.Container, types libmem.TypeMask) Request {
pod, _ := container.GetPod()
full, fraction, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
full, fraction, cpuLimit, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
req, lim, mtype := memoryAllocationPreference(pod, container)
coldStart := time.Duration(0)

@@ -752,6 +755,7 @@ func newRequest(container cache.Container, types libmem.TypeMask) Request {
container: container,
full: full,
fraction: fraction,
limit: cpuLimit,
isolate: isolate,
cpuType: cpuType,
memReq: req,
@@ -815,6 +819,11 @@ func (cr *request) CPUFraction() int {
return cr.fraction
}

// CPULimit returns the CPU limit in milli-CPU, unlimitedCPU if no limit is set.
func (cr *request) CPULimit() int {
return cr.limit
}

// Isolate returns whether isolated CPUs are preferred for this request.
func (cr *request) Isolate() bool {
return cr.isolate
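A toy trace (stand-in types, not the real policy API) of how the new field travels: cpuAllocationPreferences computes the limit, newRequest stores it alongside full and fraction, and CPULimit() hands it to the pool comparison:

```go
package main

import "fmt"

// A trimmed stand-in for the policy's Request interface with the new method.
type Request interface {
	FullCPUs() int
	CPUFraction() int
	CPULimit() int
}

type request struct {
	full, fraction, limit int // all milli-CPU except full (whole cores)
}

func (r *request) FullCPUs() int    { return r.full }
func (r *request) CPUFraction() int { return r.fraction }
func (r *request) CPULimit() int    { return r.limit }

// newRequest stands in for the real constructor: the limit arrives from
// cpuAllocationPreferences together with full and fraction.
func newRequest(full, fraction, limit int) Request {
	return &request{full: full, fraction: fraction, limit: limit}
}

func main() {
	r := newRequest(1, 500, 2000)
	fmt.Println(r.FullCPUs(), r.CPUFraction(), r.CPULimit()) // 1 500 2000
}
```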