Skip to content

Commit

Permalink
Delete By Deletion Cost (#622)
Browse files Browse the repository at this point in the history
* CloneSet support pod-deletion-cost

Signed-off-by: PokerBySwq <[email protected]>
  • Loading branch information
PokerBySwq authored May 18, 2021
1 parent 66d696d commit c5c37ae
Show file tree
Hide file tree
Showing 3 changed files with 209 additions and 5 deletions.
9 changes: 4 additions & 5 deletions pkg/controller/cloneset/sync/cloneset_scale.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ import (
"k8s.io/apimachinery/pkg/util/rand"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/klog"
kubecontroller "k8s.io/kubernetes/pkg/controller"
)

const (
Expand Down Expand Up @@ -369,10 +368,10 @@ func choosePodsToDelete(totalDiff int, currentRevDiff int, notUpdatedPods, updat
choose := func(pods []*v1.Pod, diff int) []*v1.Pod {
// No need to sort pods if we are about to delete all of them.
if diff < len(pods) {
// Sort the pods in the order such that not-ready < ready, unscheduled
// < scheduled, and pending < running. This ensures that we delete pods
// in the earlier stages whenever possible.
sort.Sort(kubecontroller.ActivePods(pods))
// Pods are classified according to whether they receive traffic.
// If pods are in the same category, they are sorted in the order of pod-deletion-cost.
// Otherwise, the pods are sorted according to controller.ActivePods
sort.Sort(ActivePodsWithDeletionCost(pods))
} else if diff > len(pods) {
klog.Warningf("Diff > len(pods) in choosePodsToDelete func which is not expected.")
return pods
Expand Down
121 changes: 121 additions & 0 deletions pkg/controller/cloneset/sync/cloneset_sync_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package sync

import (
"fmt"

appspub "github.com/openkruise/kruise/apis/apps/pub"
appsv1alpha1 "github.com/openkruise/kruise/apis/apps/v1alpha1"
clonesetcore "github.com/openkruise/kruise/pkg/controller/cloneset/core"
Expand All @@ -26,9 +28,14 @@ import (
utilfeature "github.com/openkruise/kruise/pkg/util/feature"
"github.com/openkruise/kruise/pkg/util/lifecycle"
"github.com/openkruise/kruise/pkg/util/specifieddelete"

"strconv"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
intstrutil "k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/klog"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
"k8s.io/utils/integer"
)

Expand Down Expand Up @@ -216,3 +223,117 @@ func isPodReady(coreControl clonesetcore.Control, pod *v1.Pod, minReadySeconds i
}
return coreControl.IsPodUpdateReady(pod, minReadySeconds)
}

// ActivePodsWithDeletionCost type allows custom sorting of pods so a controller can pick the best ones to delete.
type ActivePodsWithDeletionCost []*v1.Pod

const (
// PodDeletionCost can be used to set to an int32 that represent the cost of deleting
// a pod compared to other pods belonging to the same ReplicaSet. Pods with lower
// deletion cost are preferred to be deleted before pods with higher deletion cost.
PodDeletionCost = "controller.kubernetes.io/pod-deletion-cost"
)

func (s ActivePodsWithDeletionCost) Len() int { return len(s) }
func (s ActivePodsWithDeletionCost) Swap(i, j int) { s[i], s[j] = s[j], s[i] }

func (s ActivePodsWithDeletionCost) Less(i, j int) bool {
// 1. Unassigned < assigned
// If only one of the pods is unassigned, the unassigned one is smaller
if s[i].Spec.NodeName != s[j].Spec.NodeName && (len(s[i].Spec.NodeName) == 0 || len(s[j].Spec.NodeName) == 0) {
return len(s[i].Spec.NodeName) == 0
}
// 2. PodPending < PodUnknown < PodRunning
podPhaseToOrdinal := map[v1.PodPhase]int{v1.PodPending: 0, v1.PodUnknown: 1, v1.PodRunning: 2}
if podPhaseToOrdinal[s[i].Status.Phase] != podPhaseToOrdinal[s[j].Status.Phase] {
return podPhaseToOrdinal[s[i].Status.Phase] < podPhaseToOrdinal[s[j].Status.Phase]
}
// 3. Not ready < ready
// If only one of the pods is not ready, the not ready one is smaller
if podutil.IsPodReady(s[i]) != podutil.IsPodReady(s[j]) {
return !podutil.IsPodReady(s[i])
}

// 4. higher pod-deletion-cost < lower pod-deletion cost
pi, _ := getDeletionCostFromPodAnnotations(s[i].Annotations)
pj, _ := getDeletionCostFromPodAnnotations(s[j].Annotations)
if pi != pj {
return pi < pj
}

// TODO: take availability into account when we push minReadySeconds information from deployment into pods,
// see https://github.com/kubernetes/kubernetes/issues/22065
// 5. Been ready for empty time < less time < more time
// If both pods are ready, the latest ready one is smaller
if podutil.IsPodReady(s[i]) && podutil.IsPodReady(s[j]) {
readyTime1 := podReadyTime(s[i])
readyTime2 := podReadyTime(s[j])
if !readyTime1.Equal(readyTime2) {
return afterOrZero(readyTime1, readyTime2)
}
}
// 6. Pods with containers with higher restart counts < lower restart counts
if maxContainerRestarts(s[i]) != maxContainerRestarts(s[j]) {
return maxContainerRestarts(s[i]) > maxContainerRestarts(s[j])
}
// 7. Empty creation time pods < newer pods < older pods
if !s[i].CreationTimestamp.Equal(&s[j].CreationTimestamp) {
return afterOrZero(&s[i].CreationTimestamp, &s[j].CreationTimestamp)
}
return false
}

// afterOrZero checks if time t1 is after time t2; if one of them
// is zero, the zero time is seen as after non-zero time.
func afterOrZero(t1, t2 *metav1.Time) bool {
if t1.Time.IsZero() || t2.Time.IsZero() {
return t1.Time.IsZero()
}
return t1.After(t2.Time)
}

func podReadyTime(pod *v1.Pod) *metav1.Time {
if podutil.IsPodReady(pod) {
for _, c := range pod.Status.Conditions {
// we only care about pod ready conditions
if c.Type == v1.PodReady && c.Status == v1.ConditionTrue {
return &c.LastTransitionTime
}
}
}
return &metav1.Time{}
}

func maxContainerRestarts(pod *v1.Pod) int {
maxRestarts := 0
for _, c := range pod.Status.ContainerStatuses {
maxRestarts = integer.IntMax(maxRestarts, int(c.RestartCount))
}
return maxRestarts
}

// getDeletionCostFromPodAnnotations returns the integer value of pod-deletion-cost. Returns 0
// if not set or the value is invalid.
func getDeletionCostFromPodAnnotations(annotations map[string]string) (int32, error) {
if value, exist := annotations[PodDeletionCost]; exist {
// values that start with plus sign (e.g, "+10") or leading zeros (e.g., "008") are not valid.
if !validFirstDigit(value) {
return 0, fmt.Errorf("invalid value %q", value)
}

i, err := strconv.ParseInt(value, 10, 32)
if err != nil {
// make sure we default to 0 on error.
return 0, err
}
return int32(i), nil
}
return 0, nil
}

func validFirstDigit(str string) bool {
if len(str) == 0 {
return false
}
return str[0] == '-' || (str[0] == '0' && str == "0") || (str[0] >= '1' && str[0] <= '9')
}
84 changes: 84 additions & 0 deletions pkg/controller/cloneset/sync/cloneset_sync_utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -604,3 +604,87 @@ func createTestPod(revisionHash string, lifecycleState appspub.LifecycleStateTyp
}
return pod
}

func TestSortingActivePodsWithDeletionCost(t *testing.T) {
now := metav1.Now()
then := metav1.Time{Time: now.AddDate(0, -1, 0)}
zeroTime := metav1.Time{}
pod := func(podName, nodeName string, phase v1.PodPhase, ready bool, restarts int32, readySince metav1.Time, created metav1.Time, annotations map[string]string) *v1.Pod {
var conditions []v1.PodCondition
var containerStatuses []v1.ContainerStatus
if ready {
conditions = []v1.PodCondition{{Type: v1.PodReady, Status: v1.ConditionTrue, LastTransitionTime: readySince}}
containerStatuses = []v1.ContainerStatus{{RestartCount: restarts}}
}
return &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
CreationTimestamp: created,
Name: podName,
Annotations: annotations,
},
Spec: v1.PodSpec{NodeName: nodeName},
Status: v1.PodStatus{
Conditions: conditions,
ContainerStatuses: containerStatuses,
Phase: phase,
},
}
}
var (
unscheduledPod = pod("unscheduled", "", v1.PodPending, false, 0, zeroTime, zeroTime, nil)
scheduledPendingPod = pod("pending", "node", v1.PodPending, false, 0, zeroTime, zeroTime, nil)
unknownPhasePod = pod("unknown-phase", "node", v1.PodUnknown, false, 0, zeroTime, zeroTime, nil)
runningNotReadyPod = pod("not-ready", "node", v1.PodRunning, false, 0, zeroTime, zeroTime, nil)
runningReadyNoLastTransitionTimePod = pod("ready-no-last-transition-time", "node", v1.PodRunning, true, 0, zeroTime, zeroTime, nil)
runningReadyNow = pod("ready-now", "node", v1.PodRunning, true, 0, now, now, nil)
runningReadyThen = pod("ready-then", "node", v1.PodRunning, true, 0, then, then, nil)
runningReadyNowHighRestarts = pod("ready-high-restarts", "node", v1.PodRunning, true, 9001, now, now, nil)
runningReadyNowCreatedThen = pod("ready-now-created-then", "node", v1.PodRunning, true, 0, now, then, nil)
lowPodDeletionCost = pod("low-deletion-cost", "node", v1.PodRunning, true, 0, now, then, map[string]string{PodDeletionCost: "10"})
highPodDeletionCost = pod("high-deletion-cost", "node", v1.PodRunning, true, 0, now, then, map[string]string{PodDeletionCost: "100"})
)
equalityTests := []*v1.Pod{
unscheduledPod,
scheduledPendingPod,
unknownPhasePod,
runningNotReadyPod,
runningReadyNowCreatedThen,
runningReadyNow,
runningReadyThen,
runningReadyNowHighRestarts,
runningReadyNowCreatedThen,
}
for _, pod := range equalityTests {
podsWithDeletionCost := ActivePodsWithDeletionCost([]*v1.Pod{pod, pod})
if podsWithDeletionCost.Less(0, 1) || podsWithDeletionCost.Less(1, 0) {
t.Errorf("expected pod %q not to be less than than itself", pod.Name)
}
}
type podWithDeletionCost *v1.Pod
inequalityTests := []struct {
lesser, greater podWithDeletionCost
}{
{lesser: podWithDeletionCost(unscheduledPod), greater: podWithDeletionCost(scheduledPendingPod)},
{lesser: podWithDeletionCost(unscheduledPod), greater: podWithDeletionCost(scheduledPendingPod)},
{lesser: podWithDeletionCost(scheduledPendingPod), greater: podWithDeletionCost(unknownPhasePod)},
{lesser: podWithDeletionCost(unknownPhasePod), greater: podWithDeletionCost(runningNotReadyPod)},
{lesser: podWithDeletionCost(runningNotReadyPod), greater: podWithDeletionCost(runningReadyNoLastTransitionTimePod)},
{lesser: podWithDeletionCost(runningReadyNoLastTransitionTimePod), greater: podWithDeletionCost(runningReadyNow)},
{lesser: podWithDeletionCost(runningReadyNow), greater: podWithDeletionCost(runningReadyThen)},
{lesser: podWithDeletionCost(runningReadyNowHighRestarts), greater: podWithDeletionCost(runningReadyNow)},
{lesser: podWithDeletionCost(runningReadyNow), greater: podWithDeletionCost(runningReadyNowCreatedThen)},
{lesser: podWithDeletionCost(lowPodDeletionCost), greater: podWithDeletionCost(highPodDeletionCost)},
}
for i, test := range inequalityTests {
t.Run(fmt.Sprintf("test%d", i), func(t *testing.T) {

podsWithDeletionCost := ActivePodsWithDeletionCost([]*v1.Pod{test.lesser, test.greater})
if !podsWithDeletionCost.Less(0, 1) {
t.Errorf("expected pod %q to be less than %q", podsWithDeletionCost[0].Name, podsWithDeletionCost[1].Name)
}
if podsWithDeletionCost.Less(1, 0) {
t.Errorf("expected pod %q not to be less than %v", podsWithDeletionCost[1].Name, podsWithDeletionCost[0].Name)
}
})
}
}

0 comments on commit c5c37ae

Please sign in to comment.