/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package eviction

import (
	"fmt"
	"sort"
	"strconv"
	"strings"
	"time"

	"k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/klog"
	"k8s.io/kubernetes/pkg/features"
	statsapi "k8s.io/kubernetes/pkg/kubelet/apis/stats/v1alpha1"
	evictionapi "k8s.io/kubernetes/pkg/kubelet/eviction/api"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
	schedulerutils "k8s.io/kubernetes/pkg/scheduler/util"
)

const (
	unsupportedEvictionSignal = "unsupported eviction signal %v"
	// Reason is the reason reported back in status.
	Reason = "Evicted"
	// nodeLowMessageFmt is the message for evictions due to resource pressure.
	nodeLowMessageFmt = "The node was low on resource: %v. "
	// nodeConditionMessageFmt is the message for evictions due to resource pressure.
	nodeConditionMessageFmt = "The node had condition: %v. "
	// containerMessageFmt provides additional information for containers exceeding requests
	containerMessageFmt = "Container %s was using %s, which exceeds its request of %s. "
	// containerEphemeralStorageMessageFmt provides additional information for containers which have exceeded their ES limit
	containerEphemeralStorageMessageFmt = "Container %s exceeded its local ephemeral storage limit %q. "
	// podEphemeralStorageMessageFmt provides additional information for pods which have exceeded their ES limit
	podEphemeralStorageMessageFmt = "Pod ephemeral local storage usage exceeds the total limit of containers %s. "
	// emptyDirMessageFmt provides additional information for empty-dir volumes which have exceeded their size limit
	emptyDirMessageFmt = "Usage of EmptyDir volume %q exceeds the limit %q. "
	// resourceInodes, number. internal to this module, used to account for local disk inode consumption.
	resourceInodes v1.ResourceName = "inodes"
	// resourcePids, number. internal to this module, used to account for local pid consumption.
	resourcePids v1.ResourceName = "pids"
	// OffendingContainersKey is the key in eviction event annotations for the list of container names which exceeded their requests
	OffendingContainersKey = "offending_containers"
	// OffendingContainersUsageKey is the key in eviction event annotations for the list of usage of containers which exceeded their requests
	OffendingContainersUsageKey = "offending_containers_usage"
	// StarvedResourceKey is the key for the starved resource in eviction event annotations
	StarvedResourceKey = "starved_resource"
)

var (
	// signalToNodeCondition maps a signal to the node condition to report if threshold is met.
	signalToNodeCondition map[evictionapi.Signal]v1.NodeConditionType
	// signalToResource maps a Signal to its associated Resource.
	signalToResource map[evictionapi.Signal]v1.ResourceName
)

func init() {
	// map eviction signals to node conditions
	signalToNodeCondition = map[evictionapi.Signal]v1.NodeConditionType{}
	signalToNodeCondition[evictionapi.SignalMemoryAvailable] = v1.NodeMemoryPressure
	signalToNodeCondition[evictionapi.SignalAllocatableMemoryAvailable] = v1.NodeMemoryPressure
	signalToNodeCondition[evictionapi.SignalImageFsAvailable] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalNodeFsAvailable] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalImageFsInodesFree] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalNodeFsInodesFree] = v1.NodeDiskPressure
	signalToNodeCondition[evictionapi.SignalPIDAvailable] = v1.NodePIDPressure

	// map signals to resources (and vice-versa)
	signalToResource = map[evictionapi.Signal]v1.ResourceName{}
	signalToResource[evictionapi.SignalMemoryAvailable] = v1.ResourceMemory
	signalToResource[evictionapi.SignalAllocatableMemoryAvailable] = v1.ResourceMemory
	signalToResource[evictionapi.SignalImageFsAvailable] = v1.ResourceEphemeralStorage
	signalToResource[evictionapi.SignalImageFsInodesFree] = resourceInodes
	signalToResource[evictionapi.SignalNodeFsAvailable] = v1.ResourceEphemeralStorage
	signalToResource[evictionapi.SignalNodeFsInodesFree] = resourceInodes
	signalToResource[evictionapi.SignalPIDAvailable] = resourcePids
}

// validSignal returns true if the signal is supported.
func validSignal(signal evictionapi.Signal) bool {
	_, found := signalToResource[signal]
	return found
}

// ParseThresholdConfig parses the flags for thresholds.
func ParseThresholdConfig(allocatableConfig []string, evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim map[string]string) ([]evictionapi.Threshold, error) {
	results := []evictionapi.Threshold{}
	hardThresholds, err := parseThresholdStatements(evictionHard)
	if err != nil {
		return nil, err
	}
	results = append(results, hardThresholds...)
	softThresholds, err := parseThresholdStatements(evictionSoft)
	if err != nil {
		return nil, err
	}
	gracePeriods, err := parseGracePeriods(evictionSoftGracePeriod)
	if err != nil {
		return nil, err
	}
	minReclaims, err := parseMinimumReclaims(evictionMinimumReclaim)
	if err != nil {
		return nil, err
	}
	for i := range softThresholds {
		signal := softThresholds[i].Signal
		period, found := gracePeriods[signal]
		if !found {
			return nil, fmt.Errorf("grace period must be specified for the soft eviction threshold %v", signal)
		}
		softThresholds[i].GracePeriod = period
	}
	results = append(results, softThresholds...)
	for i := range results {
		for signal, minReclaim := range minReclaims {
			if results[i].Signal == signal {
				results[i].MinReclaim = &minReclaim
				break
			}
		}
	}
	for _, key := range allocatableConfig {
		if key == kubetypes.NodeAllocatableEnforcementKey {
			results = addAllocatableThresholds(results)
			break
		}
	}
	return results, nil
}

func addAllocatableThresholds(thresholds []evictionapi.Threshold) []evictionapi.Threshold {
	additionalThresholds := []evictionapi.Threshold{}
	for _, threshold := range thresholds {
		if threshold.Signal == evictionapi.SignalMemoryAvailable && isHardEvictionThreshold(threshold) {
			// Copy the SignalMemoryAvailable to SignalAllocatableMemoryAvailable
			additionalThresholds = append(additionalThresholds, evictionapi.Threshold{
				Signal:     evictionapi.SignalAllocatableMemoryAvailable,
				Operator:   threshold.Operator,
				Value:      threshold.Value,
				MinReclaim: threshold.MinReclaim,
			})
		}
	}
	return append(thresholds, additionalThresholds...)
}
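
// Illustrative example (values are hypothetical, not defaults): an evictionHard
// map of {"memory.available": "100Mi", "nodefs.available": "10%"} parses into a
// threshold carrying Quantity 100Mi for SignalMemoryAvailable and one carrying
// Percentage 0.1 for SignalNodeFsAvailable, each with the operator taken from
// evictionapi.OpForSignal; statements of "0%" or "100%" are ignored entirely.
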
// parseThresholdStatements parses the input statements into a list of Threshold objects.
func parseThresholdStatements(statements map[string]string) ([]evictionapi.Threshold, error) {
	if len(statements) == 0 {
		return nil, nil
	}
	results := []evictionapi.Threshold{}
	for signal, val := range statements {
		result, err := parseThresholdStatement(evictionapi.Signal(signal), val)
		if err != nil {
			return nil, err
		}
		if result != nil {
			results = append(results, *result)
		}
	}
	return results, nil
}

// parseThresholdStatement parses a threshold statement and returns a threshold,
// or nil if the threshold should be ignored.
func parseThresholdStatement(signal evictionapi.Signal, val string) (*evictionapi.Threshold, error) {
	if !validSignal(signal) {
		return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
	}
	operator := evictionapi.OpForSignal[signal]
	if strings.HasSuffix(val, "%") {
		// ignore 0% and 100%
		if val == "0%" || val == "100%" {
			return nil, nil
		}
		percentage, err := parsePercentage(val)
		if err != nil {
			return nil, err
		}
		if percentage < 0 {
			return nil, fmt.Errorf("eviction percentage threshold %v must be >= 0%%: %s", signal, val)
		}
		if percentage > 100 {
			return nil, fmt.Errorf("eviction percentage threshold %v must be <= 100%%: %s", signal, val)
		}
		return &evictionapi.Threshold{
			Signal:   signal,
			Operator: operator,
			Value: evictionapi.ThresholdValue{
				Percentage: percentage,
			},
		}, nil
	}
	quantity, err := resource.ParseQuantity(val)
	if err != nil {
		return nil, err
	}
	if quantity.Sign() < 0 || quantity.IsZero() {
		return nil, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity)
	}
	return &evictionapi.Threshold{
		Signal:   signal,
		Operator: operator,
		Value: evictionapi.ThresholdValue{
			Quantity: &quantity,
		},
	}, nil
}

// parsePercentage parses a string representing a percentage value
func parsePercentage(input string) (float32, error) {
	value, err := strconv.ParseFloat(strings.TrimRight(input, "%"), 32)
	if err != nil {
		return 0, err
	}
	return float32(value) / 100, nil
}

// parseGracePeriods parses the grace period statements
func parseGracePeriods(statements map[string]string) (map[evictionapi.Signal]time.Duration, error) {
	if len(statements) == 0 {
		return nil, nil
	}
	results := map[evictionapi.Signal]time.Duration{}
	for signal, val := range statements {
		signal := evictionapi.Signal(signal)
		if !validSignal(signal) {
			return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
		}
		gracePeriod, err := time.ParseDuration(val)
		if err != nil {
			return nil, err
		}
		if gracePeriod < 0 {
			return nil, fmt.Errorf("invalid eviction grace period specified: %v, must be a positive value", val)
		}
		results[signal] = gracePeriod
	}
	return results, nil
}

// parseMinimumReclaims parses the minimum reclaim statements
func parseMinimumReclaims(statements map[string]string) (map[evictionapi.Signal]evictionapi.ThresholdValue, error) {
	if len(statements) == 0 {
		return nil, nil
	}
	results := map[evictionapi.Signal]evictionapi.ThresholdValue{}
	for signal, val := range statements {
		signal := evictionapi.Signal(signal)
		if !validSignal(signal) {
			return nil, fmt.Errorf(unsupportedEvictionSignal, signal)
		}
		if strings.HasSuffix(val, "%") {
			percentage, err := parsePercentage(val)
			if err != nil {
				return nil, err
			}
			if percentage <= 0 {
				return nil, fmt.Errorf("eviction percentage minimum reclaim %v must be positive: %s", signal, val)
			}
			results[signal] = evictionapi.ThresholdValue{
				Percentage: percentage,
			}
			continue
		}
		quantity, err := resource.ParseQuantity(val)
		if err != nil {
			return nil, err
		}
		if quantity.Sign() < 0 {
			return nil, fmt.Errorf("negative eviction minimum reclaim specified for %v", signal)
		}
		results[signal] = evictionapi.ThresholdValue{
			Quantity: &quantity,
		}
	}
	return results, nil
}

// diskUsage converts used bytes into a resource quantity.
func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity {
	if fsStats == nil || fsStats.UsedBytes == nil {
		return &resource.Quantity{Format: resource.BinarySI}
	}
	usage := int64(*fsStats.UsedBytes)
	return resource.NewQuantity(usage, resource.BinarySI)
}

// inodeUsage converts inodes consumed into a resource quantity.
func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity {
	if fsStats == nil || fsStats.InodesUsed == nil {
		return &resource.Quantity{Format: resource.DecimalSI}
	}
	usage := int64(*fsStats.InodesUsed)
	return resource.NewQuantity(usage, resource.DecimalSI)
}

// memoryUsage converts working set into a resource quantity.
func memoryUsage(memStats *statsapi.MemoryStats) *resource.Quantity {
	if memStats == nil || memStats.WorkingSetBytes == nil {
		return &resource.Quantity{Format: resource.BinarySI}
	}
	usage := int64(*memStats.WorkingSetBytes)
	return resource.NewQuantity(usage, resource.BinarySI)
}

// localVolumeNames returns the set of volumes for the pod that are local
// TODO: summary API should report what volumes consume local storage rather than hard-code here.
func localVolumeNames(pod *v1.Pod) []string {
	result := []string{}
	for _, volume := range pod.Spec.Volumes {
		if volume.HostPath != nil ||
			(volume.EmptyDir != nil && volume.EmptyDir.Medium != v1.StorageMediumMemory) ||
			volume.ConfigMap != nil ||
			volume.GitRepo != nil {
			result = append(result, volume.Name)
		}
	}
	return result
}

// containerUsage aggregates container disk usage and inode consumption for the specified stats to measure.
func containerUsage(podStats statsapi.PodStats, statsToMeasure []fsStatsType) v1.ResourceList {
	disk := resource.Quantity{Format: resource.BinarySI}
	inodes := resource.Quantity{Format: resource.DecimalSI}
	for _, container := range podStats.Containers {
		if hasFsStatsType(statsToMeasure, fsStatsRoot) {
			disk.Add(*diskUsage(container.Rootfs))
			inodes.Add(*inodeUsage(container.Rootfs))
		}
		if hasFsStatsType(statsToMeasure, fsStatsLogs) {
			disk.Add(*diskUsage(container.Logs))
			inodes.Add(*inodeUsage(container.Logs))
		}
	}
	return v1.ResourceList{
		v1.ResourceEphemeralStorage: disk,
		resourceInodes:              inodes,
	}
}

// podLocalVolumeUsage aggregates pod local volumes disk usage and inode consumption for the specified stats to measure.
func podLocalVolumeUsage(volumeNames []string, podStats statsapi.PodStats) v1.ResourceList {
	disk := resource.Quantity{Format: resource.BinarySI}
	inodes := resource.Quantity{Format: resource.DecimalSI}
	for _, volumeName := range volumeNames {
		for _, volumeStats := range podStats.VolumeStats {
			if volumeStats.Name == volumeName {
				disk.Add(*diskUsage(&volumeStats.FsStats))
				inodes.Add(*inodeUsage(&volumeStats.FsStats))
				break
			}
		}
	}
	return v1.ResourceList{
		v1.ResourceEphemeralStorage: disk,
		resourceInodes:              inodes,
	}
}
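
// Illustrative note (hypothetical pod spec): for a pod with one disk-backed
// emptyDir volume and one emptyDir volume with medium "Memory", only the
// disk-backed volume is returned by localVolumeNames, so only its usage is
// added by podLocalVolumeUsage; memory-backed emptyDirs do not count against
// local disk.
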
// podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure.
func podDiskUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
	disk := resource.Quantity{Format: resource.BinarySI}
	inodes := resource.Quantity{Format: resource.DecimalSI}

	containerUsageList := containerUsage(podStats, statsToMeasure)
	disk.Add(containerUsageList[v1.ResourceEphemeralStorage])
	inodes.Add(containerUsageList[resourceInodes])

	if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
		volumeNames := localVolumeNames(pod)
		podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats)
		disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage])
		inodes.Add(podLocalVolumeUsageList[resourceInodes])
	}
	return v1.ResourceList{
		v1.ResourceEphemeralStorage: disk,
		resourceInodes:              inodes,
	}, nil
}

// localEphemeralVolumeNames returns the set of ephemeral volumes for the pod that are local
func localEphemeralVolumeNames(pod *v1.Pod) []string {
	result := []string{}
	for _, volume := range pod.Spec.Volumes {
		if volume.GitRepo != nil ||
			(volume.EmptyDir != nil && volume.EmptyDir.Medium != v1.StorageMediumMemory) ||
			volume.ConfigMap != nil || volume.DownwardAPI != nil {
			result = append(result, volume.Name)
		}
	}
	return result
}

// podLocalEphemeralStorageUsage aggregates pod local ephemeral storage usage and inode consumption for the specified stats to measure.
func podLocalEphemeralStorageUsage(podStats statsapi.PodStats, pod *v1.Pod, statsToMeasure []fsStatsType) (v1.ResourceList, error) {
	disk := resource.Quantity{Format: resource.BinarySI}
	inodes := resource.Quantity{Format: resource.DecimalSI}

	containerUsageList := containerUsage(podStats, statsToMeasure)
	disk.Add(containerUsageList[v1.ResourceEphemeralStorage])
	inodes.Add(containerUsageList[resourceInodes])

	if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
		volumeNames := localEphemeralVolumeNames(pod)
		podLocalVolumeUsageList := podLocalVolumeUsage(volumeNames, podStats)
		disk.Add(podLocalVolumeUsageList[v1.ResourceEphemeralStorage])
		inodes.Add(podLocalVolumeUsageList[resourceInodes])
	}
	return v1.ResourceList{
		v1.ResourceEphemeralStorage: disk,
		resourceInodes:              inodes,
	}, nil
}

// formatThreshold formats a threshold for logging.
func formatThreshold(threshold evictionapi.Threshold) string {
	return fmt.Sprintf("threshold(signal=%v, operator=%v, value=%v, gracePeriod=%v)", threshold.Signal, threshold.Operator, evictionapi.ThresholdValue(threshold.Value), threshold.GracePeriod)
}

// formatThresholdValue formats a thresholdValue for logging.
func formatThresholdValue(value evictionapi.ThresholdValue) string {
	if value.Quantity != nil {
		return value.Quantity.String()
	}
	return fmt.Sprintf("%f%%", value.Percentage*float32(100))
}

// cachedStatsFunc returns a statsFunc based on the provided pod stats.
func cachedStatsFunc(podStats []statsapi.PodStats) statsFunc {
	uid2PodStats := map[string]statsapi.PodStats{}
	for i := range podStats {
		uid2PodStats[podStats[i].PodRef.UID] = podStats[i]
	}
	return func(pod *v1.Pod) (statsapi.PodStats, bool) {
		stats, found := uid2PodStats[string(pod.UID)]
		return stats, found
	}
}

// Cmp compares p1 and p2 and returns:
//
//	-1 if p1 <  p2
//	 0 if p1 == p2
//	+1 if p1 >  p2
type cmpFunc func(p1, p2 *v1.Pod) int

// multiSorter implements the Sort interface, sorting changes within.
type multiSorter struct {
	pods []*v1.Pod
	cmp  []cmpFunc
}

// Sort sorts the argument slice according to the less functions passed to orderedBy.
func (ms *multiSorter) Sort(pods []*v1.Pod) {
	ms.pods = pods
	sort.Sort(ms)
}

// orderedBy returns a Sorter that sorts using the cmp functions, in order.
// Call its Sort method to sort the data.
func orderedBy(cmp ...cmpFunc) *multiSorter {
	return &multiSorter{
		cmp: cmp,
	}
}

// Len is part of sort.Interface.
func (ms *multiSorter) Len() int {
	return len(ms.pods)
}

// Swap is part of sort.Interface.
func (ms *multiSorter) Swap(i, j int) {
	ms.pods[i], ms.pods[j] = ms.pods[j], ms.pods[i]
}

// Less is part of sort.Interface.
func (ms *multiSorter) Less(i, j int) bool {
	p1, p2 := ms.pods[i], ms.pods[j]
	var k int
	for k = 0; k < len(ms.cmp)-1; k++ {
		cmpResult := ms.cmp[k](p1, p2)
		// p1 is less than p2
		if cmpResult < 0 {
			return true
		}
		// p1 is greater than p2
		if cmpResult > 0 {
			return false
		}
		// we don't know yet
	}
	// the last cmp func is the final decider
	return ms.cmp[k](p1, p2) < 0
}

// priority compares pods by Priority, if priority is enabled.
func priority(p1, p2 *v1.Pod) int {
	if !utilfeature.DefaultFeatureGate.Enabled(features.PodPriority) {
		// If priority is not enabled, all pods are equal.
		return 0
	}
	priority1 := schedulerutils.GetPodPriority(p1)
	priority2 := schedulerutils.GetPodPriority(p2)
	if priority1 == priority2 {
		return 0
	}
	if priority1 > priority2 {
		return 1
	}
	return -1
}

// exceedMemoryRequests compares whether or not pods' memory usage exceeds their requests
func exceedMemoryRequests(stats statsFunc) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}

		p1Memory := memoryUsage(p1Stats.Memory)
		p2Memory := memoryUsage(p2Stats.Memory)
		p1ExceedsRequests := p1Memory.Cmp(podRequest(p1, v1.ResourceMemory)) == 1
		p2ExceedsRequests := p2Memory.Cmp(podRequest(p2, v1.ResourceMemory)) == 1
		// prioritize evicting the pod which exceeds its requests
		return cmpBool(p1ExceedsRequests, p2ExceedsRequests)
	}
}
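
// Comparators compose via orderedBy; for example, memory-pressure ranking
// (rankMemoryPressure below) sorts with
//
//	orderedBy(exceedMemoryRequests(stats), priority, memory(stats)).Sort(pods)
//
// so pods whose usage exceeds their memory request come first, ties are broken
// by lower pod priority, and remaining ties by largest usage above request.
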
// memory compares pods by largest consumer of memory relative to request.
func memory(stats statsFunc) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}

		// adjust p1, p2 usage relative to the request (if any)
		p1Memory := memoryUsage(p1Stats.Memory)
		p1Request := podRequest(p1, v1.ResourceMemory)
		p1Memory.Sub(p1Request)

		p2Memory := memoryUsage(p2Stats.Memory)
		p2Request := podRequest(p2, v1.ResourceMemory)
		p2Memory.Sub(p2Request)

		// prioritize evicting the pod which has the larger consumption of memory
		return p2Memory.Cmp(*p1Memory)
	}
}

// podRequest returns the total resource request of a pod which is the
// max(max of init container requests, sum of container requests)
func podRequest(pod *v1.Pod, resourceName v1.ResourceName) resource.Quantity {
	containerValue := resource.Quantity{Format: resource.BinarySI}
	if resourceName == v1.ResourceEphemeralStorage && !utilfeature.DefaultFeatureGate.Enabled(features.LocalStorageCapacityIsolation) {
		// if the local storage capacity isolation feature gate is disabled, pods request 0 disk
		return containerValue
	}
	for i := range pod.Spec.Containers {
		switch resourceName {
		case v1.ResourceMemory:
			containerValue.Add(*pod.Spec.Containers[i].Resources.Requests.Memory())
		case v1.ResourceEphemeralStorage:
			containerValue.Add(*pod.Spec.Containers[i].Resources.Requests.StorageEphemeral())
		}
	}
	initValue := resource.Quantity{Format: resource.BinarySI}
	for i := range pod.Spec.InitContainers {
		switch resourceName {
		case v1.ResourceMemory:
			if initValue.Cmp(*pod.Spec.InitContainers[i].Resources.Requests.Memory()) < 0 {
				initValue = *pod.Spec.InitContainers[i].Resources.Requests.Memory()
			}
		case v1.ResourceEphemeralStorage:
			if initValue.Cmp(*pod.Spec.InitContainers[i].Resources.Requests.StorageEphemeral()) < 0 {
				initValue = *pod.Spec.InitContainers[i].Resources.Requests.StorageEphemeral()
			}
		}
	}
	if containerValue.Cmp(initValue) > 0 {
		return containerValue
	}
	return initValue
}

// exceedDiskRequests compares whether or not pods' disk usage exceeds their requests
func exceedDiskRequests(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}

		p1Usage, p1Err := podDiskUsage(p1Stats, p1, fsStatsToMeasure)
		p2Usage, p2Err := podDiskUsage(p2Stats, p2, fsStatsToMeasure)
		if p1Err != nil || p2Err != nil {
			// prioritize evicting the pod which had an error getting stats
			return cmpBool(p1Err != nil, p2Err != nil)
		}

		p1Disk := p1Usage[diskResource]
		p2Disk := p2Usage[diskResource]
		p1ExceedsRequests := p1Disk.Cmp(podRequest(p1, diskResource)) == 1
		p2ExceedsRequests := p2Disk.Cmp(podRequest(p2, diskResource)) == 1
		// prioritize evicting the pod which exceeds its requests
		return cmpBool(p1ExceedsRequests, p2ExceedsRequests)
	}
}

// disk compares pods by largest consumer of disk relative to request for the specified disk resource.
func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) cmpFunc {
	return func(p1, p2 *v1.Pod) int {
		p1Stats, p1Found := stats(p1)
		p2Stats, p2Found := stats(p2)
		if !p1Found || !p2Found {
			// prioritize evicting the pod for which no stats were found
			return cmpBool(!p1Found, !p2Found)
		}
		p1Usage, p1Err := podDiskUsage(p1Stats, p1, fsStatsToMeasure)
		p2Usage, p2Err := podDiskUsage(p2Stats, p2, fsStatsToMeasure)
		if p1Err != nil || p2Err != nil {
			// prioritize evicting the pod which had an error getting stats
			return cmpBool(p1Err != nil, p2Err != nil)
		}

		// adjust p1, p2 usage relative to the request (if any)
		p1Disk := p1Usage[diskResource]
		p2Disk := p2Usage[diskResource]
		p1Request := podRequest(p1, v1.ResourceEphemeralStorage)
		p1Disk.Sub(p1Request)
		p2Request := podRequest(p2, v1.ResourceEphemeralStorage)
		p2Disk.Sub(p2Request)

		// prioritize evicting the pod which has the larger consumption of disk
		return p2Disk.Cmp(p1Disk)
	}
}

// cmpBool compares booleans, placing true before false
func cmpBool(a, b bool) int {
	if a == b {
		return 0
	}
	if !b {
		return -1
	}
	return 1
}

// rankMemoryPressure orders the input pods for eviction in response to memory pressure.
// It ranks by whether or not the pod's usage exceeds its requests, then by priority, and
// finally by memory usage above requests.
func rankMemoryPressure(pods []*v1.Pod, stats statsFunc) {
	orderedBy(exceedMemoryRequests(stats), priority, memory(stats)).Sort(pods)
}

// rankPIDPressure orders the input pods by priority in response to PID pressure.
func rankPIDPressure(pods []*v1.Pod, stats statsFunc) {
	orderedBy(priority).Sort(pods)
}

// rankDiskPressureFunc returns a rankFunc that measures the specified fs stats.
func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource v1.ResourceName) rankFunc {
	return func(pods []*v1.Pod, stats statsFunc) {
		orderedBy(exceedDiskRequests(stats, fsStatsToMeasure, diskResource), priority, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods)
	}
}

// byEvictionPriority implements sort.Interface for []evictionapi.Threshold.
type byEvictionPriority []evictionapi.Threshold

func (a byEvictionPriority) Len() int      { return len(a) }
func (a byEvictionPriority) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

// Less ranks memory before all other resources, and ranks thresholds with no resource to reclaim last
func (a byEvictionPriority) Less(i, j int) bool {
	_, jSignalHasResource := signalToResource[a[j].Signal]
	return a[i].Signal == evictionapi.SignalMemoryAvailable || a[i].Signal == evictionapi.SignalAllocatableMemoryAvailable || !jSignalHasResource
}
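
// Illustrative example: given thresholds for imagefs.available, memory.available,
// and allocatableMemory.available, sorting with sort.Sort(byEvictionPriority(thresholds))
// is intended to place the two memory signals first and any threshold whose signal
// has no mapped resource to reclaim last (see Less above).
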
// makeSignalObservations derives observations using the specified summary provider.
func makeSignalObservations(summary *statsapi.Summary) (signalObservations, statsFunc) {
	// build the function to work against for pod stats
	statsFunc := cachedStatsFunc(summary.Pods)
	// build an evaluation context for current eviction signals
	result := signalObservations{}
	if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
		result[evictionapi.SignalMemoryAvailable] = signalObservation{
			available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
			capacity:  resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
			time:      memory.Time,
		}
	}
	if allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods); err != nil {
		klog.Errorf("eviction manager: failed to construct signal: %q error: %v", evictionapi.SignalAllocatableMemoryAvailable, err)
	} else {
		if memory := allocatableContainer.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil {
			result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{
				available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI),
				capacity:  resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI),
				time:      memory.Time,
			}
		}
	}
	if nodeFs := summary.Node.Fs; nodeFs != nil {
		if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil {
			result[evictionapi.SignalNodeFsAvailable] = signalObservation{
				available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI),
				capacity:  resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI),
				time:      nodeFs.Time,
			}
		}
		if nodeFs.InodesFree != nil && nodeFs.Inodes != nil {
			result[evictionapi.SignalNodeFsInodesFree] = signalObservation{
				available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.DecimalSI),
				capacity:  resource.NewQuantity(int64(*nodeFs.Inodes), resource.DecimalSI),
				time:      nodeFs.Time,
			}
		}
	}
	if summary.Node.Runtime != nil {
		if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil {
			if imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil {
				result[evictionapi.SignalImageFsAvailable] = signalObservation{
					available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI),
					capacity:  resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI),
					time:      imageFs.Time,
				}
				if imageFs.InodesFree != nil && imageFs.Inodes != nil {
					result[evictionapi.SignalImageFsInodesFree] = signalObservation{
						available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.DecimalSI),
						capacity:  resource.NewQuantity(int64(*imageFs.Inodes), resource.DecimalSI),
						time:      imageFs.Time,
					}
				}
			}
		}
	}
	if rlimit := summary.Node.Rlimit; rlimit != nil {
		if rlimit.NumOfRunningProcesses != nil && rlimit.MaxPID != nil {
			available := int64(*rlimit.MaxPID) - int64(*rlimit.NumOfRunningProcesses)
			result[evictionapi.SignalPIDAvailable] = signalObservation{
				available: resource.NewQuantity(available, resource.BinarySI),
				capacity:  resource.NewQuantity(int64(*rlimit.MaxPID), resource.BinarySI),
				time:      rlimit.Time,
			}
		}
	}
	return result, statsFunc
}

func getSysContainer(sysContainers []statsapi.ContainerStats, name string) (*statsapi.ContainerStats, error) {
	for _, cont := range sysContainers {
		if cont.Name == name {
			return &cont, nil
		}
	}
	return nil, fmt.Errorf("system container %q not found in metrics", name)
}
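
// Illustrative example (numbers are hypothetical): for a hard threshold of
// memory.available<1Gi with a minimum reclaim of 500Mi, thresholdsMet below
// reports the threshold as met once observed.available drops under 1Gi; when
// enforceMinReclaim is true, the comparison point becomes 1Gi+500Mi, so the
// threshold remains "met" until at least 1.5Gi is available again.
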
// thresholdsMet returns the set of thresholds that were met independent of grace period
func thresholdsMet(thresholds []evictionapi.Threshold, observations signalObservations, enforceMinReclaim bool) []evictionapi.Threshold {
	results := []evictionapi.Threshold{}
	for i := range thresholds {
		threshold := thresholds[i]
		observed, found := observations[threshold.Signal]
		if !found {
			klog.Warningf("eviction manager: no observation found for eviction signal %v", threshold.Signal)
			continue
		}
		// determine if we have met the specified threshold
		thresholdMet := false
		quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
		// if enforceMinReclaim is specified, we compare relative to value + minreclaim
		if enforceMinReclaim && threshold.MinReclaim != nil {
			quantity.Add(*evictionapi.GetThresholdQuantity(*threshold.MinReclaim, observed.capacity))
		}
		thresholdResult := quantity.Cmp(*observed.available)
		switch threshold.Operator {
		case evictionapi.OpLessThan:
			thresholdMet = thresholdResult > 0
		}
		if thresholdMet {
			results = append(results, threshold)
		}
	}
	return results
}

func debugLogObservations(logPrefix string, observations signalObservations) {
	if !klog.V(3) {
		return
	}
	for k, v := range observations {
		if !v.time.IsZero() {
			klog.Infof("eviction manager: %v: signal=%v, available: %v, capacity: %v, time: %v", logPrefix, k, v.available, v.capacity, v.time)
		} else {
			klog.Infof("eviction manager: %v: signal=%v, available: %v, capacity: %v", logPrefix, k, v.available, v.capacity)
		}
	}
}

func debugLogThresholdsWithObservation(logPrefix string, thresholds []evictionapi.Threshold, observations signalObservations) {
	if !klog.V(3) {
		return
	}
	for i := range thresholds {
		threshold := thresholds[i]
		observed, found := observations[threshold.Signal]
		if found {
			quantity := evictionapi.GetThresholdQuantity(threshold.Value, observed.capacity)
			klog.Infof("eviction manager: %v: threshold [signal=%v, quantity=%v] observed %v", logPrefix, threshold.Signal, quantity, observed.available)
		} else {
			klog.Infof("eviction manager: %v: threshold [signal=%v] had no observation", logPrefix, threshold.Signal)
		}
	}
}

func thresholdsUpdatedStats(thresholds []evictionapi.Threshold, observations, lastObservations signalObservations) []evictionapi.Threshold {
	results := []evictionapi.Threshold{}
	for i := range thresholds {
		threshold := thresholds[i]
		observed, found := observations[threshold.Signal]
		if !found {
			klog.Warningf("eviction manager: no observation found for eviction signal %v", threshold.Signal)
			continue
		}
		last, found := lastObservations[threshold.Signal]
		if !found || observed.time.IsZero() || observed.time.After(last.time.Time) {
			results = append(results, threshold)
		}
	}
	return results
}

// thresholdsFirstObservedAt merges the input set of thresholds with the previous observation to determine when the active set of thresholds was initially met.
func thresholdsFirstObservedAt(thresholds []evictionapi.Threshold, lastObservedAt thresholdsObservedAt, now time.Time) thresholdsObservedAt {
	results := thresholdsObservedAt{}
	for i := range thresholds {
		observedAt, found := lastObservedAt[thresholds[i]]
		if !found {
			observedAt = now
		}
		results[thresholds[i]] = observedAt
	}
	return results
}

// thresholdsMetGracePeriod returns the set of thresholds that have satisfied associated grace period
func thresholdsMetGracePeriod(observedAt thresholdsObservedAt, now time.Time) []evictionapi.Threshold {
	results := []evictionapi.Threshold{}
	for threshold, at := range observedAt {
		duration := now.Sub(at)
		if duration < threshold.GracePeriod {
			klog.V(2).Infof("eviction manager: eviction criteria not yet met for %v, duration: %v", formatThreshold(threshold), duration)
			continue
		}
		results = append(results, threshold)
	}
	return results
}

// nodeConditions returns the set of node conditions associated with a threshold
func nodeConditions(thresholds []evictionapi.Threshold) []v1.NodeConditionType {
	results := []v1.NodeConditionType{}
	for _, threshold := range thresholds {
		if nodeCondition, found := signalToNodeCondition[threshold.Signal]; found {
			if !hasNodeCondition(results, nodeCondition) {
				results = append(results, nodeCondition)
			}
		}
	}
	return results
}

// nodeConditionsLastObservedAt merges the input with the previous observation to determine when a condition was most recently met.
func nodeConditionsLastObservedAt(nodeConditions []v1.NodeConditionType, lastObservedAt nodeConditionsObservedAt, now time.Time) nodeConditionsObservedAt {
	results := nodeConditionsObservedAt{}
	// the input conditions were observed "now"
	for i := range nodeConditions {
		results[nodeConditions[i]] = now
	}
	// the conditions that were not observed now are merged in with their old time
	for key, value := range lastObservedAt {
		_, found := results[key]
		if !found {
			results[key] = value
		}
	}
	return results
}

// nodeConditionsObservedSince returns the set of conditions that have been observed within the specified period
func nodeConditionsObservedSince(observedAt nodeConditionsObservedAt, period time.Duration, now time.Time) []v1.NodeConditionType {
	results := []v1.NodeConditionType{}
	for nodeCondition, at := range observedAt {
		duration := now.Sub(at)
		if duration < period {
			results = append(results, nodeCondition)
		}
	}
	return results
}

// hasFsStatsType returns true if the fsStat is in the input list
func hasFsStatsType(inputs []fsStatsType, item fsStatsType) bool {
	for _, input := range inputs {
		if input == item {
			return true
		}
	}
	return false
}

// hasNodeCondition returns true if the node condition is in the input list
func hasNodeCondition(inputs []v1.NodeConditionType, item v1.NodeConditionType) bool {
	for _, input := range inputs {
		if input == item {
			return true
		}
	}
	return false
}

// mergeThresholds will merge both threshold lists eliminating duplicates.
func mergeThresholds(inputsA []evictionapi.Threshold, inputsB []evictionapi.Threshold) []evictionapi.Threshold {
	results := inputsA
	for _, threshold := range inputsB {
		if !hasThreshold(results, threshold) {
			results = append(results, threshold)
		}
	}
	return results
}

// hasThreshold returns true if the threshold is in the input list
func hasThreshold(inputs []evictionapi.Threshold, item evictionapi.Threshold) bool {
	for _, input := range inputs {
		if input.GracePeriod == item.GracePeriod && input.Operator == item.Operator && input.Signal == item.Signal && compareThresholdValue(input.Value, item.Value) {
			return true
		}
	}
	return false
}

// compareThresholdValue returns true if the two thresholdValue objects are logically the same
func compareThresholdValue(a evictionapi.ThresholdValue, b evictionapi.ThresholdValue) bool {
	if a.Quantity != nil {
		if b.Quantity == nil {
			return false
		}
		return a.Quantity.Cmp(*b.Quantity) == 0
	}
	if b.Quantity != nil {
		return false
	}
	return a.Percentage == b.Percentage
}

// isHardEvictionThreshold returns true if eviction should immediately occur
func isHardEvictionThreshold(threshold evictionapi.Threshold) bool {
	return threshold.GracePeriod == time.Duration(0)
}

func isAllocatableEvictionThreshold(threshold evictionapi.Threshold) bool {
	return threshold.Signal == evictionapi.SignalAllocatableMemoryAvailable
}

// buildSignalToRankFunc returns ranking functions associated with resources
func buildSignalToRankFunc(withImageFs bool) map[evictionapi.Signal]rankFunc {
	signalToRankFunc := map[evictionapi.Signal]rankFunc{
		evictionapi.SignalMemoryAvailable:            rankMemoryPressure,
		evictionapi.SignalAllocatableMemoryAvailable: rankMemoryPressure,
		evictionapi.SignalPIDAvailable:               rankPIDPressure,
	}
	// usage of an imagefs is optional
	if withImageFs {
		// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
		signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
		// with an imagefs, imagefs pod rank func for eviction only includes rootfs
		signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes)
	} else {
		// without an imagefs, nodefs pod rank func for eviction looks at all fs stats.
		// since imagefs and nodefs share a common device, they share common ranking functions.
		signalToRankFunc[evictionapi.SignalNodeFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalNodeFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
		signalToRankFunc[evictionapi.SignalImageFsAvailable] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, v1.ResourceEphemeralStorage)
		signalToRankFunc[evictionapi.SignalImageFsInodesFree] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
	}
	return signalToRankFunc
}

// PodIsEvicted returns true if the reported pod status is due to an eviction.
func PodIsEvicted(podStatus v1.PodStatus) bool {
	return podStatus.Phase == v1.PodFailed && podStatus.Reason == Reason
}

// buildSignalToNodeReclaimFuncs returns reclaim functions associated with resources.
func buildSignalToNodeReclaimFuncs(imageGC ImageGC, containerGC ContainerGC, withImageFs bool) map[evictionapi.Signal]nodeReclaimFuncs {
	signalToReclaimFunc := map[evictionapi.Signal]nodeReclaimFuncs{}
	// usage of an imagefs is optional
	if withImageFs {
		// with an imagefs, nodefs pressure should just delete logs
		signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{}
		signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{}
		// with an imagefs, imagefs pressure should delete unused images
		signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
	} else {
		// without an imagefs, nodefs pressure should delete logs, and unused images
		// since imagefs and nodefs share a common device, they share common reclaim functions
		signalToReclaimFunc[evictionapi.SignalNodeFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalNodeFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalImageFsAvailable] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
		signalToReclaimFunc[evictionapi.SignalImageFsInodesFree] = nodeReclaimFuncs{containerGC.DeleteAllUnusedContainers, imageGC.DeleteUnusedImages}
	}
	return signalToReclaimFunc
}

// evictionMessage constructs a useful message about why an eviction occurred, and annotations to provide metadata about the eviction
func evictionMessage(resourceToReclaim v1.ResourceName, pod *v1.Pod, stats statsFunc) (message string, annotations map[string]string) {
	annotations = make(map[string]string)
	message = fmt.Sprintf(nodeLowMessageFmt, resourceToReclaim)
	containers := []string{}
	containerUsage := []string{}
	podStats, ok := stats(pod)
	if !ok {
		return
	}
	for _, containerStats := range podStats.Containers {
		for _, container := range pod.Spec.Containers {
			if container.Name == containerStats.Name {
				requests := container.Resources.Requests[resourceToReclaim]
				var usage *resource.Quantity
				switch resourceToReclaim {
				case v1.ResourceEphemeralStorage:
					if containerStats.Rootfs != nil && containerStats.Rootfs.UsedBytes != nil && containerStats.Logs != nil && containerStats.Logs.UsedBytes != nil {
						usage = resource.NewQuantity(int64(*containerStats.Rootfs.UsedBytes+*containerStats.Logs.UsedBytes), resource.BinarySI)
					}
				case v1.ResourceMemory:
					if containerStats.Memory != nil && containerStats.Memory.WorkingSetBytes != nil {
						usage = resource.NewQuantity(int64(*containerStats.Memory.WorkingSetBytes), resource.BinarySI)
					}
				}
				if usage != nil && usage.Cmp(requests) > 0 {
					message += fmt.Sprintf(containerMessageFmt, container.Name, usage.String(), requests.String())
					containers = append(containers, container.Name)
					containerUsage = append(containerUsage, usage.String())
				}
			}
		}
	}
	annotations[OffendingContainersKey] = strings.Join(containers, ",")
	annotations[OffendingContainersUsageKey] = strings.Join(containerUsage, ",")
	annotations[StarvedResourceKey] = string(resourceToReclaim)
	return
}
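
// Illustrative example (container name and quantities are hypothetical): for a
// memory eviction where container "app" requests 1Gi and has a 2Gi working set,
// evictionMessage returns
//
//	"The node was low on resource: memory. Container app was using 2Gi, which exceeds its request of 1Gi. "
//
// with annotations offending_containers="app", offending_containers_usage="2Gi",
// and starved_resource="memory".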