diff --git a/cmd/plugins/balloons/policy/balloons-policy.go b/cmd/plugins/balloons/policy/balloons-policy.go index 44f4321a0..8a8068f62 100644 --- a/cmd/plugins/balloons/policy/balloons-policy.go +++ b/cmd/plugins/balloons/policy/balloons-policy.go @@ -17,6 +17,7 @@ package balloons import ( "fmt" "path/filepath" + "strconv" cfgapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/balloons" "github.com/containers/nri-plugins/pkg/cpuallocator" @@ -41,6 +42,8 @@ const ( // balloonKey is a pod annotation key, the value is a pod balloon name. balloonKey = "balloon." + PolicyName + "." + kubernetes.ResmgrKeyNamespace + // hideHyperthreadsKey is a pod annotation key for pod/container-specific hyperthread allowance. + hideHyperthreadsKey = "hide-hyperthreads." + kubernetes.ResmgrKeyNamespace // reservedBalloonDefName is the name in the reserved balloon definition. reservedBalloonDefName = "reserved" // defaultBalloonDefName is the name in the default balloon definition. @@ -1313,16 +1316,38 @@ func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error { func (p *balloons) updatePinning(blns ...*Balloon) { for _, bln := range blns { - cpus := bln.Cpus.Union(bln.SharedIdleCpus) - bln.Mems = p.closestMems(cpus) + var cpusNoHt cpuset.CPUSet + var allowedCpus cpuset.CPUSet + pinnableCpus := bln.Cpus.Union(bln.SharedIdleCpus) + bln.Mems = p.closestMems(pinnableCpus) for _, cID := range bln.ContainerIDs() { if c, ok := p.cch.LookupContainer(cID); ok { - p.pinCpuMem(c, cpus, bln.Mems) + if runWithoutHyperthreads(c, bln) { + if cpusNoHt.Size() == 0 { + cpusNoHt = p.cpuTree.system().SingleThreadForCPUs(pinnableCpus) + } + allowedCpus = cpusNoHt + } else { + allowedCpus = pinnableCpus + } + p.pinCpuMem(c, allowedCpus, bln.Mems) } } } } +// runWithoutHyperthreads returns true if a container should run using +// only single hyperthread from each physical core. +func runWithoutHyperthreads(c cache.Container, bln *Balloon) bool { + // Is balloon type configuration overridden by annotation? + if value, ok := c.GetEffectiveAnnotation(hideHyperthreadsKey); ok { + if hide, err := strconv.ParseBool(value); err == nil { + return hide + } + } + return bln.Def.HideHyperthreads != nil && *bln.Def.HideHyperthreads +} + // shareIdleCpus adds addCpus and removes removeCpus to those balloons // that whose containers are allowed to use shared idle CPUs. Returns // balloons that will need re-pinning. diff --git a/cmd/plugins/balloons/policy/cputree.go b/cmd/plugins/balloons/policy/cputree.go index ce4e8f0db..634291fb7 100644 --- a/cmd/plugins/balloons/policy/cputree.go +++ b/cmd/plugins/balloons/policy/cputree.go @@ -32,7 +32,7 @@ type cpuTreeNode struct { parent *cpuTreeNode children []*cpuTreeNode cpus cpuset.CPUSet // union of CPUs of child nodes - + sys system.System } // cpuTreeNodeAttributes contains various attributes of a CPU tree @@ -97,6 +97,13 @@ func (t *cpuTreeNode) PrettyPrint() string { return strings.Join(lines, "\n") } +func (t *cpuTreeNode) system() system.System { + if t.sys != nil || t.parent == nil { + return t.sys + } + return t.parent.system() +} + // String returns cpuTreeNodeAttributes as a string. func (tna cpuTreeNodeAttributes) String() string { return fmt.Sprintf("%s{%d,%v,%d,%d}", tna.t.name, tna.depth, @@ -247,6 +254,7 @@ func NewCpuTreeFromSystem() (*cpuTreeNode, error) { } // TODO: split deep nested loops into functions sysTree := NewCpuTree("system") + sysTree.sys = sys sysTree.level = CPUTopologyLevelSystem for _, packageID := range sys.PackageIDs() { packageTree := NewCpuTree(fmt.Sprintf("p%d", packageID)) diff --git a/config/crd/bases/config.nri_balloonspolicies.yaml b/config/crd/bases/config.nri_balloonspolicies.yaml index 80ad36af3..f2a1d3d44 100644 --- a/config/crd/bases/config.nri_balloonspolicies.yaml +++ b/config/crd/bases/config.nri_balloonspolicies.yaml @@ -91,6 +91,21 @@ spec: ${pod/labels/mylabel} will be substituted with corresponding values. type: string + hideHyperthreads: + description: |- + HideHyperthreads allows containers in a balloon use only + one hyperthread from each physical CPU core in the + balloon. For instance, if a balloon contains 16 logical + CPUs from 8 physical cores and this option is true, then + containers in the balloon will be allowed to use 8 logical + CPUs, one from each physical core. This option is best used + with PreferSpreadOnPhysicalCores=false in order to allocate + all hyperthreads of each physical core into the same + balloon, but allow containers to use only one hyperthread + from each core. This will ensure that hidden hyperthreads + will remain completely idle as they cannot be allocated to + other balloons. + type: boolean matchExpressions: description: |- MatchExpressions specifies one or more expressions which are evaluated diff --git a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml index 80ad36af3..f2a1d3d44 100644 --- a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml +++ b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml @@ -91,6 +91,21 @@ spec: ${pod/labels/mylabel} will be substituted with corresponding values. type: string + hideHyperthreads: + description: |- + HideHyperthreads allows containers in a balloon use only + one hyperthread from each physical CPU core in the + balloon. For instance, if a balloon contains 16 logical + CPUs from 8 physical cores and this option is true, then + containers in the balloon will be allowed to use 8 logical + CPUs, one from each physical core. This option is best used + with PreferSpreadOnPhysicalCores=false in order to allocate + all hyperthreads of each physical core into the same + balloon, but allow containers to use only one hyperthread + from each core. This will ensure that hidden hyperthreads + will remain completely idle as they cannot be allocated to + other balloons. + type: boolean matchExpressions: description: |- MatchExpressions specifies one or more expressions which are evaluated diff --git a/docs/resource-policy/policy/balloons.md b/docs/resource-policy/policy/balloons.md index 2091e1b1e..d9f641936 100644 --- a/docs/resource-policy/policy/balloons.md +++ b/docs/resource-policy/policy/balloons.md @@ -181,6 +181,17 @@ Balloons policy parameters: - `numa`: ...in the same numa node(s) as the balloon. - `core`: ...allowed to use idle CPU threads in the same cores with the balloon. + - `hideHyperthreads`: "soft" disable hyperthreads. If `true`, only + one hyperthread from every physical CPU core in the balloon is + allowed to be used by containers in the balloon. Hidden + hyperthreads are not available to any container in the system + either. If containers in the balloon are allowed to share idle + CPUs (see `shareIdleCPUsInSame`), hyperthreads of idle CPUs, too, + are hidden from the containers. If containers in another balloon + share the same idle CPUs, those containers are allowed to use both + hyperthreads of the idle CPUs if `hideHyperthreads` is `false` for + the other balloon. The default is `false`: containers are allowed + to use all hyperthreads of balloon's CPUs and shared idle CPUs. - `preferSpreadOnPhysicalCores` overrides the policy level option with the same name in the scope of this balloon type. - `preferCloseToDevices` prefers creating new balloons close to @@ -301,6 +312,28 @@ memory.preserve.resource-policy.nri.io/pod: "true" memory.preserve.resource-policy.nri.io: "true" ``` +### Selectively Disabling Hyperthreading + +If a container opts to hide hyperthreads, it is allowed to use only +one hyperthread from every physical CPU core allocated to it. Note +that as a result the container may be allowed to run on only half of +the CPUs it has requested. In case of workloads that do not benefit +from hyperthreading this nevertheless results in better performance +compared to running on all hyperthreads of the same CPU cores. If +container's CPU allocation is exclusive, no other container can run on +hidden hyperthreads either. + +```yaml +metadata: + annotations: + # allow the "LLM" container to use only single thread per physical CPU core + hide-hyperthreads.resource-policy.nri.io/container.LLM: "true" +``` + +The `hide-hyperthreads` pod annotation overrides the +`hideHyperthreads` balloon type parameter value for selected +containers in the pod. + ## Metrics and Debugging In order to enable more verbose logging and metrics exporting from the diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go index 889b0164c..ed44769cc 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go @@ -156,6 +156,19 @@ type BalloonDef struct { // PreferSpreadOnPhysicalCores is the balloon type specific // parameter of the policy level parameter with the same name. PreferSpreadOnPhysicalCores *bool `json:"preferSpreadOnPhysicalCores,omitempty"` + // HideHyperthreads allows containers in a balloon use only + // one hyperthread from each physical CPU core in the + // balloon. For instance, if a balloon contains 16 logical + // CPUs from 8 physical cores and this option is true, then + // containers in the balloon will be allowed to use 8 logical + // CPUs, one from each physical core. This option is best used + // with PreferSpreadOnPhysicalCores=false in order to allocate + // all hyperthreads of each physical core into the same + // balloon, but allow containers to use only one hyperthread + // from each core. This will ensure that hidden hyperthreads + // will remain completely idle as they cannot be allocated to + // other balloons. + HideHyperthreads *bool `json:"hideHyperthreads,omitempty"` // AllocatorTopologyBalancing is the balloon type specific // parameter of the policy level parameter with the same name. AllocatorTopologyBalancing *bool `json:"allocatorTopologyBalancing,omitempty"` diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go index 437c4df27..ed78429e7 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go @@ -43,6 +43,11 @@ func (in *BalloonDef) DeepCopyInto(out *BalloonDef) { *out = new(bool) **out = **in } + if in.HideHyperthreads != nil { + in, out := &in.HideHyperthreads, &out.HideHyperthreads + *out = new(bool) + **out = **in + } if in.AllocatorTopologyBalancing != nil { in, out := &in.AllocatorTopologyBalancing, &out.AllocatorTopologyBalancing *out = new(bool) diff --git a/test/e2e/policies.test-suite/balloons/balloons-busybox.yaml.in b/test/e2e/policies.test-suite/balloons/balloons-busybox.yaml.in index 2c7b2c7a5..359f4b6eb 100644 --- a/test/e2e/policies.test-suite/balloons/balloons-busybox.yaml.in +++ b/test/e2e/policies.test-suite/balloons/balloons-busybox.yaml.in @@ -4,7 +4,9 @@ metadata: name: ${NAME} $(if [ -n "$POD_ANNOTATION" ]; then echo " annotations: - $POD_ANNOTATION + $(for ann in "${POD_ANNOTATION[@]}"; do echo " + $ann + "; done) "; fi) labels: app: ${NAME} diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test01-basic-placement/code.var.sh b/test/e2e/policies.test-suite/balloons/n4c16/test01-basic-placement/code.var.sh index f3aef5f1f..d8b2b7129 100644 --- a/test/e2e/policies.test-suite/balloons/n4c16/test01-basic-placement/code.var.sh +++ b/test/e2e/policies.test-suite/balloons/n4c16/test01-basic-placement/code.var.sh @@ -46,11 +46,17 @@ cleanup # pod4: first two containers to the first instance, 3rd to new four-cpu instance CPUREQ="3" MEMREQ="" CPULIM="3" MEMLIM="" -POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: four-cpu" CONTCOUNT=3 create balloons-busybox +POD_ANNOTATION=( + "balloon.balloons.resource-policy.nri.io: four-cpu" + "hide-hyperthreads.resource-policy.nri.io/container.pod4c1: \"true\"" +) +CONTCOUNT=3 create balloons-busybox +unset POD_ANNOTATION report allowed -verify 'cpus["pod4c0"] == cpus["pod4c1"]' \ +verify 'cpus["pod4c1"].issubset(cpus["pod4c0"])' \ 'disjoint_sets(cpus["pod4c2"], cpus["pod4c0"])' \ 'len(cpus["pod4c0"]) == 6' \ + 'len(cpus["pod4c1"]) == 3' \ 'len(cpus["pod4c2"]) == 4' cleanup diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test02-prometheus-metrics/balloons-metrics.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test02-prometheus-metrics/balloons-metrics.cfg index 333b963c8..30769a444 100644 --- a/test/e2e/policies.test-suite/balloons/n4c16/test02-prometheus-metrics/balloons-metrics.cfg +++ b/test/e2e/policies.test-suite/balloons/n4c16/test02-prometheus-metrics/balloons-metrics.cfg @@ -5,6 +5,7 @@ config: - name: full-core minCPUs: 2 maxCPUs: 2 + hideHyperthreads: true cpuClass: normal - name: fast-dualcore diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test02-prometheus-metrics/code.var.sh b/test/e2e/policies.test-suite/balloons/n4c16/test02-prometheus-metrics/code.var.sh index 66cc8dfd2..f8797fa85 100644 --- a/test/e2e/policies.test-suite/balloons/n4c16/test02-prometheus-metrics/code.var.sh +++ b/test/e2e/policies.test-suite/balloons/n4c16/test02-prometheus-metrics/code.var.sh @@ -21,11 +21,19 @@ verify-metrics-has-no-line 'balloon_type="flex"' # pod0 in full-core[0] CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" -POD_ANNOTATION="balloon.balloons.resource-policy.nri.io: full-core" CONTCOUNT=2 create balloons-busybox +POD_ANNOTATION=( + "balloon.balloons.resource-policy.nri.io: full-core" + "hide-hyperthreads.resource-policy.nri.io/container.pod0c1: \"false\"" +) +CONTCOUNT=2 create balloons-busybox +unset POD_ANNOTATION report allowed verify-metrics-has-line 'balloon="default\[0\]"' verify-metrics-has-line 'balloon="reserved\[0\]"' verify-metrics-has-line 'balloons{balloon="full-core\[0\]",balloon_type="full-core",containers="pod0/pod0c0,pod0/pod0c1",cpu_class="normal",cpus=".*",cpus_allowed=".*",cpus_allowed_count="2",cpus_count="2",cpus_max="2",cpus_min="2",dies="p[01]d0",dies_count="1",groups="",mems="[0-3]",numas="p[01]d0n[0-3]",numas_count="1",packages="p[01]",packages_count="1",sharedidlecpus="",sharedidlecpus_count="0",tot_req_millicpu="(199|200)"} 2' +verify 'len(cpus["pod0c0"]) == 1' \ + 'len(cpus["pod0c1"]) == 2' \ + 'cpus["pod0c0"].issubset(cpus["pod0c1"])' # pod1 in fast-dualcore[0] CPUREQ="200m" MEMREQ="" CPULIM="200m" MEMLIM=""