2 changes: 1 addition & 1 deletion cmd/plugins/balloons/policy/cputree_test.go
@@ -651,7 +651,7 @@ func TestWalk(t *testing.T) {
foundLevel := CPUTopologyLevelUndefined
rv := tree.DepthFirstWalk(func(tn *cpuTreeNode) error {
foundName = tn.name
foundLevel = string(tn.level)
foundLevel = tn.level
return nil
})
if rv != nil {
30 changes: 30 additions & 0 deletions cmd/plugins/topology-aware/policy/node.go
@@ -17,6 +17,7 @@ package topologyaware
import (
"fmt"

cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware"
system "github.com/containers/nri-plugins/pkg/sysfs"
"github.com/containers/nri-plugins/pkg/topology"
idset "github.com/intel/goresctrl/pkg/utils"
@@ -50,6 +51,35 @@ const (
VirtualNode NodeKind = "virtual node"
)

// TopologyLevel returns the topology level for this node.
func (k NodeKind) TopologyLevel() cfgapi.CPUTopologyLevel {
switch k {
case VirtualNode:
return cfgapi.CPUTopologyLevelSystem
case SocketNode:
return cfgapi.CPUTopologyLevelPackage
case DieNode:
return cfgapi.CPUTopologyLevelDie
case NumaNode:
return cfgapi.CPUTopologyLevelNuma
}
return cfgapi.CPUTopologyLevelUndefined
}

func NodeKindForTopologyLevel(level cfgapi.CPUTopologyLevel) NodeKind {
switch level {
case cfgapi.CPUTopologyLevelSystem:
return VirtualNode
case cfgapi.CPUTopologyLevelPackage:
return SocketNode
case cfgapi.CPUTopologyLevelDie:
return DieNode
case cfgapi.CPUTopologyLevelNuma:
return NumaNode
}
return UnknownNode
}

const (
// OverfitPenalty is the per layer penalty for overfitting in the node tree.
OverfitPenalty = 0.9
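The two new helpers are deliberately symmetric: NodeKindForTopologyLevel is the inverse of TopologyLevel for the four defined kinds, with UnknownNode and CPUTopologyLevelUndefined as the fallbacks. A self-contained sketch of the round trip, using stand-in types and assumed constant values (only VirtualNode's value is visible in this diff):

```go
package main

import "fmt"

// Stand-ins for the policy's NodeKind and cfgapi.CPUTopologyLevel types.
type NodeKind string
type CPUTopologyLevel string

const (
	UnknownNode NodeKind = ""            // assumed zero value
	VirtualNode NodeKind = "virtual node"
	SocketNode  NodeKind = "socket node" // assumed value
	DieNode     NodeKind = "die node"    // assumed value
	NumaNode    NodeKind = "numa node"   // assumed value

	CPUTopologyLevelUndefined CPUTopologyLevel = ""        // assumed
	CPUTopologyLevelSystem    CPUTopologyLevel = "system"  // assumed
	CPUTopologyLevelPackage   CPUTopologyLevel = "package" // assumed
	CPUTopologyLevelDie       CPUTopologyLevel = "die"     // assumed
	CPUTopologyLevelNuma      CPUTopologyLevel = "numa"    // assumed
)

// toLevel mirrors NodeKind.TopologyLevel() from the diff above.
func toLevel(k NodeKind) CPUTopologyLevel {
	switch k {
	case VirtualNode:
		return CPUTopologyLevelSystem
	case SocketNode:
		return CPUTopologyLevelPackage
	case DieNode:
		return CPUTopologyLevelDie
	case NumaNode:
		return CPUTopologyLevelNuma
	}
	return CPUTopologyLevelUndefined
}

// toKind mirrors NodeKindForTopologyLevel() from the diff above.
func toKind(l CPUTopologyLevel) NodeKind {
	switch l {
	case CPUTopologyLevelSystem:
		return VirtualNode
	case CPUTopologyLevelPackage:
		return SocketNode
	case CPUTopologyLevelDie:
		return DieNode
	case CPUTopologyLevelNuma:
		return NumaNode
	}
	return UnknownNode
}

func main() {
	// Every defined kind survives the round trip.
	for _, k := range []NodeKind{VirtualNode, SocketNode, DieNode, NumaNode} {
		fmt.Printf("%q -> %q -> %q\n", k, toLevel(k), toKind(toLevel(k)))
	}
}
```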
71 changes: 55 additions & 16 deletions cmd/plugins/topology-aware/policy/pod-preferences.go
@@ -17,6 +17,7 @@ package topologyaware
import (
"encoding/json"
"fmt"
"math"
"path/filepath"
"strconv"
"strings"
@@ -27,6 +28,7 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware"
"github.com/containers/nri-plugins/pkg/kubernetes"
"github.com/containers/nri-plugins/pkg/resmgr/cache"
libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
@@ -58,6 +60,8 @@ const (
hideHyperthreadsKey = keyHideHyperthreads + "." + kubernetes.ResmgrKeyNamespace
// effective annotation key for picking resources by topology hints
pickResourcesByHints = keyPickResourcesByHints + "." + kubernetes.ResmgrKeyNamespace

unlimitedCPU = math.MaxInt // 'unlimited' burstable CPU limit
)

type prefKind int
@@ -308,10 +312,11 @@ func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) {
// Returned values:
// 1. full: number of full CPUs
// 2. fraction: amount of fractional CPU in milli-CPU
// 3. isolate: (bool) whether to prefer isolated full CPUs
// 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
// 5. cpuPrio: preferred CPU allocator priority for CPU allocation.
func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass, cpuPrio) {
// 3. limit: CPU limit for this container
// 4. isolate: (bool) whether to prefer isolated full CPUs
// 5. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal)
// 6. cpuPrio: preferred CPU allocator priority for CPU allocation.
func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, int, bool, cpuClass, cpuPrio) {
//
// CPU allocation preferences for a container consist of
//
@@ -381,52 +386,68 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in
qosClass := pod.GetQOSClass()
fraction := int(request.MilliValue())
prio := defaultPrio // ignored for fractional allocations
limit := 0

switch qosClass {
case corev1.PodQOSBestEffort:
case corev1.PodQOSBurstable:
if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
limit = int(lim.MilliValue())
} else {
limit = unlimitedCPU
}
case corev1.PodQOSGuaranteed:
if lim, ok := reqs.Limits[corev1.ResourceCPU]; ok {
limit = int(lim.MilliValue())
}
}

// easy cases: kube-system namespace, Burstable or BestEffort QoS class containers
preferReserved, explicitReservation := checkReservedCPUsAnnotations(container)
switch {
case container.PreserveCpuResources():
return 0, fraction, false, cpuPreserve, prio
return 0, fraction, limit, false, cpuPreserve, prio
case preferReserved:
return 0, fraction, false, cpuReserved, prio
return 0, fraction, limit, false, cpuReserved, prio
case checkReservedPoolNamespaces(namespace) && !explicitReservation:
return 0, fraction, false, cpuReserved, prio
return 0, fraction, limit, false, cpuReserved, prio
case qosClass == corev1.PodQOSBurstable:
return 0, fraction, false, cpuNormal, prio
return 0, fraction, limit, false, cpuNormal, prio
case qosClass == corev1.PodQOSBestEffort:
return 0, 0, false, cpuNormal, prio
return 0, 0, 0, false, cpuNormal, prio
}

// complex case: Guaranteed QoS class containers
cores := fraction / 1000
fraction = fraction % 1000
limit = 1000*cores + fraction
preferIsolated, isolPrefKind := isolatedCPUsPreference(pod, container)
preferShared, sharedPrefKind := sharedCPUsPreference(pod, container)
prio = cpuPrioPreference(pod, container, defaultPrio) // ignored for fractional allocations

switch {
case cores == 0: // sub-core CPU request
return 0, fraction, false, cpuNormal, prio
return 0, fraction, limit, false, cpuNormal, prio
case cores < 2: // 1 <= CPU request < 2
if preferShared {
return 0, 1000*cores + fraction, false, cpuNormal, prio
return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
}
// potentially mixed allocation (1 core + some fraction)
return cores, fraction, preferIsolated, cpuNormal, prio
return cores, fraction, limit, preferIsolated, cpuNormal, prio
default: // CPU request >= 2
// fractional allocation, only mixed if explicitly annotated as unshared
if fraction > 0 {
if !preferShared && sharedPrefKind == prefAnnotated {
return cores, fraction, preferIsolated, cpuNormal, prio
return cores, fraction, limit, preferIsolated, cpuNormal, prio
}
return 0, 1000*cores + fraction, false, cpuNormal, prio
return 0, 1000*cores + fraction, limit, false, cpuNormal, prio
}
// non-fractional allocation
if preferShared {
return 0, 1000 * cores, false, cpuNormal, prio
return 0, 1000 * cores, limit, false, cpuNormal, prio
}
// for multiple cores, isolated preference must be explicitly annotated
return cores, 0, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
return cores, 0, limit, preferIsolated && isolPrefKind == prefAnnotated, cpuNormal, prio
}
}
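The net effect of the new QoS switch, condensed into one hedged sketch (a plain string stands in for corev1.PodQOSClass, and hasLimit/limitMilli stand in for the reqs.Limits[corev1.ResourceCPU] lookup):

```go
package main

import (
	"fmt"
	"math"
)

// unlimitedCPU marks a burstable container with no CPU limit, as in the diff.
const unlimitedCPU = math.MaxInt

// cpuLimit condenses the QoS switch added to cpuAllocationPreferences.
// All CPU quantities are milli-CPU.
func cpuLimit(qos string, requestMilli, limitMilli int, hasLimit bool) int {
	switch qos {
	case "BestEffort":
		return 0 // no request, no limit
	case "Burstable":
		if hasLimit {
			return limitMilli
		}
		return unlimitedCPU // may burst as far as the pool allows
	case "Guaranteed":
		// request == limit by definition; the diff recomputes it as
		// 1000*cores + fraction from the request
		return requestMilli
	}
	return 0
}

func main() {
	fmt.Println(cpuLimit("Burstable", 500, 2000, true))   // 2000
	fmt.Println(cpuLimit("Burstable", 500, 0, false))     // unlimitedCPU
	fmt.Println(cpuLimit("Guaranteed", 1500, 1500, true)) // 1500
}
```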

@@ -472,6 +493,24 @@ func pickByHintsPreference(pod cache.Pod, container cache.Container) bool {
return pick
}

// unlimitedBurstablePreference returns the preferred unlimited burstable topology level.
func (p *policy) unlimitedBurstablePreference(container cache.Container) cfgapi.CPUTopologyLevel {
prefer, ok := container.GetEffectiveAnnotation(cache.UnlimitedBurstableKey)
if !ok {
return opt.UnlimitedBurstable
}

level := cfgapi.CPUTopologyLevel(prefer)
if level.Value() == cfgapi.CPUTopologyLevelUndefined.Value() {
log.Errorf("ignoring invalid annotated burstable preference %q", prefer)
level = opt.UnlimitedBurstable
} else {
level = p.findExistingTopologyLevel(level)
}

return level
}

// String stringifies a cpuClass.
func (t cpuClass) String() string {
if cpuClassName, ok := cpuClassNames[t]; ok {
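A minimal sketch of the resolution order in unlimitedBurstablePreference, with stand-in types and assumed level values; the real code additionally clamps a valid level to one that exists on the node via findExistingTopologyLevel, which is stubbed out here:

```go
package main

import "fmt"

type CPUTopologyLevel string

// Assumed string values; only the fallback behavior matters for this sketch.
const (
	LevelUndefined CPUTopologyLevel = ""
	LevelPackage   CPUTopologyLevel = "package"
	LevelDie       CPUTopologyLevel = "die"
)

var validLevels = map[CPUTopologyLevel]bool{LevelPackage: true, LevelDie: true}

// resolvePreference mirrors unlimitedBurstablePreference: a valid effective
// annotation overrides the configured default, anything else falls back to it.
func resolvePreference(annotation string, annotated bool, configured CPUTopologyLevel) CPUTopologyLevel {
	if !annotated {
		return configured
	}
	level := CPUTopologyLevel(annotation)
	if !validLevels[level] {
		fmt.Printf("ignoring invalid annotated burstable preference %q\n", annotation)
		return configured
	}
	return level // real code: p.findExistingTopologyLevel(level)
}

func main() {
	fmt.Println(resolvePreference("die", true, LevelPackage))   // die
	fmt.Println(resolvePreference("bogus", true, LevelPackage)) // package
	fmt.Println(resolvePreference("", false, LevelPackage))     // package
}
```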
2 changes: 1 addition & 1 deletion cmd/plugins/topology-aware/policy/pod-preferences_test.go
@@ -1052,7 +1052,7 @@ func TestCpuAllocationPreferences(t *testing.T) {
}
opt.PreferIsolated, opt.PreferShared = &tc.preferIsolated, &tc.preferShared
opt.ReservedPoolNamespaces = tc.reservedPoolNamespaces
full, fraction, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
full, fraction, _, isolate, cpuType, _ := cpuAllocationPreferences(tc.pod, tc.container)
require.Equal(t, tc.expectedFull, full, "full CPU cores")
require.Equal(t, tc.expectedFraction, fraction, "CPU core fraction")
require.Equal(t, tc.expectedIsolate, isolate, "isolation preference")
108 changes: 94 additions & 14 deletions cmd/plugins/topology-aware/policy/pools.go
@@ -20,6 +20,7 @@ import (
"sort"

"github.com/containers/nri-plugins/pkg/utils/cpuset"
corev1 "k8s.io/api/core/v1"

"github.com/containers/nri-plugins/pkg/resmgr/cache"
libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory"
@@ -673,7 +674,9 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
// - if we have topology hints
// * better hint score wins
// * for a tie, prefer the lower node then the smaller id
// - if we have a better matching or tighter fitting memory offer, it wins
// - if we have a better matching memory offer, it wins
// - if we have a burstable container, sufficient capacity for the limit wins
// - if we have a tighter fitting memory offer, it wins
// - if only one node matches the memory type request, it wins
// - for low-prio and high-prio CPU preference, if only one node has such CPUs, it wins
// - if a node is lower in the tree it wins
@@ -772,7 +775,7 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
}
}

// better matching or tighter memory offer wins
// better matching offer wins
switch {
case o1 != nil && o2 == nil:
log.Debug(" => %s loses on memory offer (failed offer)", node2.Name())
@@ -809,22 +812,99 @@ func (p *policy) compareScores(request Request, pools []Node, scores map[int]Sco
}
log.Debug(" - memory offers burstability are a TIE")
}
}

if m1.Size() < m2.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node2.Name(), m2, m1)
return true
}
if m2.Size() < m1.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node1.Name(), m1, m2)
return false
}
if m2.Size() == m1.Size() {
log.Debug(" - memory offers are a TIE (%s vs. %s)", m1, m2)
if ctr := request.GetContainer(); ctr.GetQOSClass() == corev1.PodQOSBurstable {
var (
limit = request.CPULimit()
b1 = score1.Supply().AllocatableSharedCPU()
b2 = score2.Supply().AllocatableSharedCPU()
r1 = b1 - limit
r2 = b2 - limit
)

log.Debug(" - CPU burstability %s=%d, %s=%d, limit=%d",
node1.Name(), b1, node2.Name(), b2, limit)

if limit != unlimitedCPU {
// prefer pool with enough burstable capacity
switch {
case r1 >= 0 && r2 < 0:
log.Debug(" - %s loses on insufficient CPU burstability (%d vs. %d for limit %d)",
node2.Name(), b1, b2, limit)
return true
case r2 >= 0 && r1 < 0:
log.Debug(" - %s loses on insufficient CPU burstability", node1.Name())
return false
default:
log.Debug(" - CPU burstability is a TIE")
}
} else {
// prefer a pool at configured topology level with more burstable capacity
var (
level1 = node1.Kind().TopologyLevel()
level2 = node2.Kind().TopologyLevel()
target = node1.Policy().unlimitedBurstablePreference(ctr)
)

log.Debug(" - unlimited CPU burstable topology level: %s", target)
log.Debug(" - %s topology level: %s", node1.Name(), level1)
log.Debug(" - %s topology level: %s", node2.Name(), level2)

switch {
case level1 == target && level2 != target:
log.Debug(" - %s WINS on burstability topology level (%s)",
node1.Name(), target)
return true
case level2 == target && level1 != target:
log.Debug(" - %s WINS on burstability topology level (%s)",
node2.Name(), target)
return false
case level1 == target && level2 == target:
log.Debug(" - burstability topology level (%s) is a TIE", target)
if b1 > b2 {
log.Debug(" - %s WINS on more CPU burstability", node1.Name())
return true
}
if b2 > b1 {
log.Debug(" - %s WINS on more CPU burstability", node2.Name())
return false
}
log.Debug(" - CPU burstability is a TIE")
return id1 < id2
default:
if level1.Value() > target.Value() && level2.Value() < target.Value() {
log.Debug(" - %s WINS on CPU burstability limit (%s, limit %s)",
node1.Name(), level1, target)
return true
}
if level2.Value() > target.Value() && level1.Value() < target.Value() {
log.Debug(" - %s WINS on CPU burstability limit (%s, limit %s)",
node2.Name(), level2, target)
return false
}
log.Debug(" - CPU burstability limit is a TIE")
return id1 < id2
}
}
}

// tighter memory offer wins
m1, m2 := o1.NodeMask(), o2.NodeMask()
if m1.Size() < m2.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node2.Name(), m2, m1)
return true
}
if m2.Size() < m1.Size() {
log.Debug(" - %s loses on memory offer (%s less tight than %s)",
node1.Name(), m1, m2)
return false
}
if m2.Size() == m1.Size() {
log.Debug(" - memory offers are a TIE (%s vs. %s)", m1, m2)
}

// matching memory type wins
if reqType := request.MemoryType(); reqType != memoryUnspec && reqType != memoryPreserve {
if node1.HasMemoryType(reqType) && !node2.HasMemoryType(reqType) {
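The burstable branch of compareScores, condensed into a hedged sketch with plain ints instead of the policy's Node and Score types (the default sub-case comparing levels on either side of the target is omitted for brevity): a positive result means the first pool wins, negative the second, and zero means a tie, which the real code resolves with the node-id order or the remaining comparisons.

```go
package main

import (
	"fmt"
	"math"
)

const unlimitedCPU = math.MaxInt

// compareBurstable condenses the comparison added above: for a finite limit,
// a pool whose allocatable shared milli-CPU covers the whole limit wins; for
// an unlimited one, a pool at the configured topology level wins, with more
// shared capacity breaking the tie.
func compareBurstable(b1, b2, limit int, atTarget1, atTarget2 bool) int {
	if limit != unlimitedCPU {
		switch {
		case b1 >= limit && b2 < limit:
			return 1
		case b2 >= limit && b1 < limit:
			return -1
		}
		return 0 // both (or neither) can cover the limit
	}
	switch {
	case atTarget1 && !atTarget2:
		return 1
	case atTarget2 && !atTarget1:
		return -1
	case atTarget1 && atTarget2:
		return b1 - b2 // more allocatable shared CPU wins
	}
	return 0
}

func main() {
	fmt.Println(compareBurstable(4000, 1500, 2000, false, false))      // 1: only pool 1 covers the limit
	fmt.Println(compareBurstable(4000, 6000, unlimitedCPU, true, true)) // -2000: pool 2 has more headroom
}
```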
11 changes: 10 additions & 1 deletion cmd/plugins/topology-aware/policy/resources.go
@@ -107,6 +107,8 @@ type Request interface {
FullCPUs() int
// CPUFraction returns the amount of fractional milli-CPU requested.
CPUFraction() int
// CPULimit returns the CPU limit in milli-CPU, unlimitedCPU if no limit is set.
CPULimit() int
// Isolate returns whether isolated CPUs are preferred for this request.
Isolate() bool
// MemoryType returns the type(s) of requested memory.
@@ -223,6 +225,7 @@ type request struct {
container cache.Container // container for this request
full int // number of full CPUs requested
fraction int // amount of fractional CPU requested
limit int // CPU limit, MaxInt for no limit
isolate bool // prefer isolated exclusive CPUs
cpuType cpuClass // preferred CPU type (normal, reserved)
prio cpuPrio // CPU priority preference, ignored for fraction requests
@@ -715,7 +718,7 @@ func prettyMem(value int64) string {
// newRequest creates a new request for the given container.
func newRequest(container cache.Container, types libmem.TypeMask) Request {
pod, _ := container.GetPod()
full, fraction, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
full, fraction, cpuLimit, isolate, cpuType, prio := cpuAllocationPreferences(pod, container)
req, lim, mtype := memoryAllocationPreference(pod, container)
coldStart := time.Duration(0)

@@ -752,6 +755,7 @@ func newRequest(container cache.Container, types libmem.TypeMask) Request {
container: container,
full: full,
fraction: fraction,
limit: cpuLimit,
isolate: isolate,
cpuType: cpuType,
memReq: req,
@@ -815,6 +819,11 @@ func (cr *request) CPUFraction() int {
return cr.fraction
}

// CPULimit returns the CPU limit in milli-CPU, unlimitedCPU if no limit is set.
func (cr *request) CPULimit() int {
return cr.limit
}

// Isolate returns whether isolated CPUs are preferred for this request.
func (cr *request) Isolate() bool {
return cr.isolate
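A toy trace (stand-in types, not the real policy API) of how the new field travels: cpuAllocationPreferences computes the limit, newRequest stores it alongside full and fraction, and CPULimit() hands it to the pool comparison:

```go
package main

import "fmt"

// A trimmed stand-in for the policy's Request interface with the new method.
type Request interface {
	FullCPUs() int
	CPUFraction() int
	CPULimit() int
}

type request struct {
	full, fraction, limit int // all milli-CPU except full (whole cores)
}

func (r *request) FullCPUs() int    { return r.full }
func (r *request) CPUFraction() int { return r.fraction }
func (r *request) CPULimit() int    { return r.limit }

// newRequest stands in for the real constructor: the limit arrives from
// cpuAllocationPreferences together with full and fraction.
func newRequest(full, fraction, limit int) Request {
	return &request{full: full, fraction: fraction, limit: limit}
}

func main() {
	r := newRequest(1, 500, 2000)
	fmt.Println(r.FullCPUs(), r.CPUFraction(), r.CPULimit()) // 1 500 2000
}
```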