From c98ff9713af8ef55d1ce60f85f195fa7154e8f85 Mon Sep 17 00:00:00 2001 From: Krisztian Litkey Date: Wed, 13 Mar 2024 00:03:53 +0200 Subject: [PATCH] WiP: topology-aware: support for CPU allocator priorities. Add support for configurable default and annotated per-container CPU priority preferences. These determine the preferred priority for CPUs when doing fully or partially exclusive CPU allocation. Priorities are calculated for such allocations and passed on to the CPU allocator which then tries to fulfill these preferences. It should now be possible to configure the policy to allocate (exclusive) E-cores by default and P-cores to containers which are annotated so, or to do it the other way around. Signed-off-by: Krisztian Litkey --- .../topology-aware/policy/pod-preferences.go | 66 ++++++++++++++----- .../topology-aware/policy/resources.go | 40 +++++++++-- .../policy/topology-aware-policy.go | 4 +- .../config.nri_topologyawarepolicies.yaml | 13 ++++ .../config.nri_topologyawarepolicies.yaml | 13 ++++ .../resmgr/policy/topologyaware/config.go | 32 +++++++++ 6 files changed, 147 insertions(+), 21 deletions(-) diff --git a/cmd/plugins/topology-aware/policy/pod-preferences.go b/cmd/plugins/topology-aware/policy/pod-preferences.go index af1bb1b90..ce35215c9 100644 --- a/cmd/plugins/topology-aware/policy/pod-preferences.go +++ b/cmd/plugins/topology-aware/policy/pod-preferences.go @@ -42,6 +42,8 @@ const ( keyColdStartPreference = "cold-start" // annotation key for reserved pools keyReservedCPUsPreference = "prefer-reserved-cpus" + // annotation key for CPU Priority preference + keyCpuPriorityPreference = "prefer-cpu-priority" // effective annotation key for isolated CPU preference preferIsolatedCPUsKey = keyIsolationPreference + "." + kubernetes.ResmgrKeyNamespace @@ -53,6 +55,8 @@ const ( preferColdStartKey = keyColdStartPreference + "." + kubernetes.ResmgrKeyNamespace // annotation key for reserved pools preferReservedCPUsKey = keyReservedCPUsPreference + "." 
+ kubernetes.ResmgrKeyNamespace + // effective annotation key for CPU priority preference + preferCpuPriorityKey = keyCpuPriorityPreference + "." + kubernetes.ResmgrKeyNamespace ) // cpuClass is a type of CPU to allocate @@ -153,6 +157,36 @@ func sharedCPUsPreference(pod cache.Pod, container cache.Container) (bool, bool) return preference, true } +// cpuPrioPreference returns the CPU priority preference for the given container +// and whether the container was explicitly annotated with this setting. +func cpuPrioPreference(pod cache.Pod, container cache.Container, fallback cpuPrio) (cpuPrio, bool) { + key := preferCpuPriorityKey + value, ok := pod.GetEffectiveAnnotation(key, container.GetName()) + + if !ok { + prio := fallback + log.Debug("%s: implicit CPU priority preference %q", container.PrettyName(), prio) + return prio, false + } + + if value == "default" { + prio := defaultPrio + log.Debug("%s: explicit CPU priority preference %q", container.PrettyName(), prio) + return prio, true + } + + prio, ok := cpuPrioByName[value] + if !ok { + log.Error("%s: invalid CPU priority preference %q", container.PrettyName(), value) + prio := fallback + log.Debug("%s: implicit CPU priority preference %q", container.PrettyName(), prio) + return prio, false + } + + log.Debug("%s: explicit CPU priority preference %q", container.PrettyName(), prio) + return prio, true +} + // memoryTypePreference returns what type of memory should be allocated for the container. // // If the effective annotations are not found, this function falls back to @@ -370,7 +404,7 @@ func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) { // 2. fraction: amount of fractional CPU in milli-CPU // 3. isolate: (bool) whether to prefer isolated full CPUs // 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. 
normal) -func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass) { +func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass, cpuPrio) { // // CPU allocation preferences for a container consist of // @@ -439,20 +473,21 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in request := reqs.Requests[corev1.ResourceCPU] qosClass := pod.GetQOSClass() fraction := int(request.MilliValue()) + prio := defaultPrio // ignored for fractional allocations // easy cases: kube-system namespace, Burstable or BestEffort QoS class containers preferReserved, explicitReservation := checkReservedCPUsAnnotations(container) switch { case container.PreserveCpuResources(): - return 0, fraction, false, cpuPreserve + return 0, fraction, false, cpuPreserve, prio case preferReserved == true: - return 0, fraction, false, cpuReserved + return 0, fraction, false, cpuReserved, prio case checkReservedPoolNamespaces(namespace) && !explicitReservation: - return 0, fraction, false, cpuReserved + return 0, fraction, false, cpuReserved, prio case qosClass == corev1.PodQOSBurstable: - return 0, fraction, false, cpuNormal + return 0, fraction, false, cpuNormal, prio case qosClass == corev1.PodQOSBestEffort: - return 0, 0, false, cpuNormal + return 0, 0, false, cpuNormal, prio } // complex case: Guaranteed QoS class containers @@ -460,39 +495,40 @@ func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, in fraction = fraction % 1000 preferIsolated, explicitIsolated := isolatedCPUsPreference(pod, container) preferShared, explicitShared := sharedCPUsPreference(pod, container) + prio, _ = cpuPrioPreference(pod, container, defaultPrio) // ignored for fractional allocations switch { // sub-core CPU request case cores == 0: - return 0, fraction, false, cpuNormal + return 0, fraction, false, cpuNormal, prio // 1 <= CPU request < 2 case cores < 2: // fractional allocation, 
potentially mixed if fraction > 0 { if preferShared { - return 0, 1000*cores + fraction, false, cpuNormal + return 0, 1000*cores + fraction, false, cpuNormal, prio } - return cores, fraction, preferIsolated, cpuNormal + return cores, fraction, preferIsolated, cpuNormal, prio } // non-fractional allocation if preferShared && explicitShared { - return 0, 1000*cores + fraction, false, cpuNormal + return 0, 1000*cores + fraction, false, cpuNormal, prio } - return cores, fraction, preferIsolated, cpuNormal + return cores, fraction, preferIsolated, cpuNormal, prio // CPU request >= 2 default: // fractional allocation, only mixed if explicitly annotated as unshared if fraction > 0 { if !preferShared && explicitShared { - return cores, fraction, preferIsolated && explicitIsolated, cpuNormal + return cores, fraction, preferIsolated && explicitIsolated, cpuNormal, prio } - return 0, 1000*cores + fraction, false, cpuNormal + return 0, 1000*cores + fraction, false, cpuNormal, prio } // non-fractional allocation if preferShared && explicitShared { - return 0, 1000 * cores, false, cpuNormal + return 0, 1000 * cores, false, cpuNormal, prio } - return cores, fraction, preferIsolated && explicitIsolated, cpuNormal + return cores, fraction, preferIsolated && explicitIsolated, cpuNormal, prio } } diff --git a/cmd/plugins/topology-aware/policy/resources.go b/cmd/plugins/topology-aware/policy/resources.go index b2e4ef210..3c5121cc5 100644 --- a/cmd/plugins/topology-aware/policy/resources.go +++ b/cmd/plugins/topology-aware/policy/resources.go @@ -28,6 +28,28 @@ import ( idset "github.com/intel/goresctrl/pkg/utils" ) +type ( + cpuPrio = cpuallocator.CPUPriority +) + +const ( + highPrio = cpuallocator.PriorityHigh + normalPrio = cpuallocator.PriorityNormal + lowPrio = cpuallocator.PriorityLow + nonePrio = cpuallocator.PriorityNone +) + +var ( + defaultPrio = nonePrio + + cpuPrioByName = map[string]cpuPrio{ + "high": highPrio, + "normal": normalPrio, + "low": lowPrio, + "none": nonePrio, 
+ } +) + // Supply represents avaialbe CPU and memory capacity of a node. type Supply interface { // GetNode returns the node supplying this capacity. @@ -95,6 +117,8 @@ type Request interface { String() string // CPUType returns the type of requested CPU. CPUType() cpuClass + // CPUPrio returns the preferred priority of requested CPU. + CPUPrio() cpuPrio // SetCPUType sets the type of requested CPU. SetCPUType(cpuType cpuClass) // FullCPUs return the number of full CPUs requested. @@ -223,6 +247,7 @@ type request struct { fraction int // amount of fractional CPU requested isolate bool // prefer isolated exclusive CPUs cpuType cpuClass // preferred CPU type (normal, reserved) + prio cpuPrio // CPU priority preference, ignored for fraction requests memReq uint64 // memory request memLim uint64 // memory limit @@ -575,7 +600,7 @@ func (cs *supply) AllocateCPU(r Request) (Grant, error) { // allocate isolated exclusive CPUs or slice them off the sharable set switch { case full > 0 && cs.isolated.Size() >= full && cr.isolate: - exclusive, err = cs.takeCPUs(&cs.isolated, nil, full) + exclusive, err = cs.takeCPUs(&cs.isolated, nil, full, cr.CPUPrio()) if err != nil { return nil, policyError("internal error: "+ "%s: can't take %d exclusive isolated CPUs from %s: %v", @@ -583,7 +608,7 @@ func (cs *supply) AllocateCPU(r Request) (Grant, error) { } case full > 0 && cs.AllocatableSharedCPU() > 1000*full: - exclusive, err = cs.takeCPUs(&cs.sharable, nil, full) + exclusive, err = cs.takeCPUs(&cs.sharable, nil, full, cr.CPUPrio()) if err != nil { return nil, policyError("internal error: "+ "%s: can't take %d exclusive CPUs from %s: %v", @@ -764,8 +789,8 @@ func (cs *supply) ReserveMemory(g Grant) error { } // takeCPUs takes up to cnt CPUs from a given CPU set to another. 
-func (cs *supply) takeCPUs(from, to *cpuset.CPUSet, cnt int) (cpuset.CPUSet, error) { - cset, err := cs.node.Policy().cpuAllocator.AllocateCpus(from, cnt, cpuallocator.PriorityHigh) +func (cs *supply) takeCPUs(from, to *cpuset.CPUSet, cnt int, prio cpuPrio) (cpuset.CPUSet, error) { + cset, err := cs.node.Policy().cpuAllocator.AllocateCpus(from, cnt, prio) if err != nil { return cset, err } @@ -942,7 +967,7 @@ func (cs *supply) DumpMemoryState(prefix string) { // newRequest creates a new request for the given container. func newRequest(container cache.Container) Request { pod, _ := container.GetPod() - full, fraction, isolate, cpuType := cpuAllocationPreferences(pod, container) + full, fraction, isolate, cpuType, prio := cpuAllocationPreferences(pod, container) req, lim, mtype := memoryAllocationPreference(pod, container) coldStart := time.Duration(0) @@ -984,6 +1009,7 @@ func newRequest(container cache.Container) Request { memLim: lim, memType: mtype, coldStart: coldStart, + prio: prio, } } @@ -1019,6 +1045,10 @@ func (cr *request) CPUType() cpuClass { return cr.cpuType } +func (cr *request) CPUPrio() cpuPrio { + return cr.prio +} + // SetCPUType sets the requested type of CPU for the grant. 
func (cr *request) SetCPUType(cpuType cpuClass) { cr.cpuType = cpuType diff --git a/cmd/plugins/topology-aware/policy/topology-aware-policy.go b/cmd/plugins/topology-aware/policy/topology-aware-policy.go index e25984c2f..c3a5d2181 100644 --- a/cmd/plugins/topology-aware/policy/topology-aware-policy.go +++ b/cmd/plugins/topology-aware/policy/topology-aware-policy.go @@ -425,6 +425,7 @@ func (p *policy) Reconfigure(newCfg interface{}) error { opt = cfg p.cfg = cfg + defaultPrio = cfg.DefaultCPUPriority.Value() if err := p.initialize(); err != nil { *p = savedPolicy @@ -435,6 +436,7 @@ func (p *policy) Reconfigure(newCfg interface{}) error { if err := grant.RefetchNodes(); err != nil { *p = savedPolicy opt = p.cfg + defaultPrio = p.cfg.DefaultCPUPriority.Value() return policyError("failed to reconfigure: %v", err) } } @@ -523,7 +525,7 @@ func (p *policy) checkConstraints() error { // Use CpuAllocator to pick reserved CPUs among // allowed ones. Because using those CPUs is allowed, // they remain (they are put back) in the allowed set. - cset, err := p.cpuAllocator.AllocateCpus(&p.allowed, p.reserveCnt, cpuallocator.PriorityNormal) + cset, err := p.cpuAllocator.AllocateCpus(&p.allowed, p.reserveCnt, normalPrio) p.allowed = p.allowed.Union(cset) if err != nil { log.Fatal("cannot reserve %dm CPUs for ReservedResources from AvailableResources: %s", qty.MilliValue(), err) diff --git a/config/crd/bases/config.nri_topologyawarepolicies.yaml b/config/crd/bases/config.nri_topologyawarepolicies.yaml index af7faacee..caffebaeb 100644 --- a/config/crd/bases/config.nri_topologyawarepolicies.yaml +++ b/config/crd/bases/config.nri_topologyawarepolicies.yaml @@ -94,6 +94,19 @@ spec: - classes type: object type: object + defaultCPUPriority: + default: none + description: |- + DefaultCPUPriority (high, normal, low, none) + This parameter is passed to CPU allocator when allocating CPUs exclusively. 
+ If a container is not annotated otherwise, this is the requested priority + from the allocator. + enum: + - high + - normal + - low + - none + type: string instrumentation: description: Config provides runtime configuration for instrumentation. properties: diff --git a/deployment/helm/topology-aware/crds/config.nri_topologyawarepolicies.yaml b/deployment/helm/topology-aware/crds/config.nri_topologyawarepolicies.yaml index af7faacee..caffebaeb 100644 --- a/deployment/helm/topology-aware/crds/config.nri_topologyawarepolicies.yaml +++ b/deployment/helm/topology-aware/crds/config.nri_topologyawarepolicies.yaml @@ -94,6 +94,19 @@ spec: - classes type: object type: object + defaultCPUPriority: + default: none + description: |- + DefaultCPUPriority (high, normal, low, none) + This parameter is passed to CPU allocator when allocating CPUs exclusively. + If a container is not annotated otherwise, this is the requested priority + from the allocator. + enum: + - high + - normal + - low + - none + type: string instrumentation: description: Config provides runtime configuration for instrumentation. 
properties: diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware/config.go b/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware/config.go index 5adfca087..feafe17dc 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware/config.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/topologyaware/config.go @@ -15,7 +15,10 @@ package topologyaware import ( + "strings" + policy "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy" + "github.com/containers/nri-plugins/pkg/cpuallocator" ) type ( @@ -33,6 +36,27 @@ const ( AmountCPUSet = policy.AmountCPUSet ) +type CPUPriority string + +const ( + PriorityHigh CPUPriority = "high" + PriorityNormal CPUPriority = "normal" + PriorityLow CPUPriority = "low" + PriorityNone CPUPriority = "none" +) + +func (p CPUPriority) Value() cpuallocator.CPUPriority { + switch strings.ToLower(string(p)) { + case string(PriorityHigh): + return cpuallocator.PriorityHigh + case string(PriorityNormal): + return cpuallocator.PriorityNormal + case string(PriorityLow): + return cpuallocator.PriorityLow + } + return cpuallocator.PriorityNone +} + // +k8s:deepcopy-gen=true // +optional type Config struct { @@ -77,4 +101,11 @@ type Config struct { // of it. // +kubebuilder:validation:Required ReservedResources Constraints `json:"reservedResources"` + // DefaultCPUPriority (high, normal, low, none) + // This parameter is passed to CPU allocator when allocating CPUs exclusively. + // If a container is not annotated otherwise, this is the requested priority + // from the allocator. + // +kubebuilder:validation:Enum=high;normal;low;none + // +kubebuilder:default=none + DefaultCPUPriority CPUPriority `json:"defaultCPUPriority,omitempty"` }