From abe579053d1a86786ccaa1a88e13fb53b8ef8c03 Mon Sep 17 00:00:00 2001 From: juliogreff Date: Fri, 21 Feb 2020 10:27:43 +0100 Subject: [PATCH] capacity controller: better summarization of conditions Before this, the capacity controller would only put a list of unready clusters in the CapacityTarget's Ready condition when it would set it to False. This requires users to go digging into each cluster condition, and most likely they would only be directed to SadPods, where they could finally get some useful information. Now, that information is summarized in a very brief format, in the hopes that users will have to do less jumping around when investigating why their CapacityTarget is not progressing. For instance, if the CapacityTarget is stuck because one container can't pull its image, we'll now have the following in the CapacityTarget's .status.conditions: ``` [ { "lastTransitionTime": "2020-02-12T13:16:44Z", "status": "True", "type": "Operational" }, { "lastTransitionTime": "2020-02-12T13:16:44Z", "message": "docker-desktop: PodsNotReady 3/3: 3x\"test-nginx\" containers with [ImagePullBackOff]", "reason": "ClustersNotReady", "status": "False", "type": "Ready" } ] ``` As a bonus, this is also shown on a `kubectl get ct`: ``` % kubectl get ct snowflake-db84be2b-0 NAME OPERATIONAL READY REASON AGE snowflake-db84be2b-0 True False docker-desktop: PodsNotReady 3/3: 3x"test-nginx" containers with [ImagePullBackOff] 8d ``` --- crd/CapacityTarget-crd.yaml | 17 ++++ .../capacity/capacity_controller.go | 18 ++-- .../capacity/capacity_controller_test.go | 35 +++++--- pkg/controller/capacity/deployment.go | 62 +++++++++++++ pkg/controller/capacity/deployment_test.go | 87 +++++++++++++++++++ pkg/controller/capacity/utils_test.go | 11 +++ pkg/crds/capacitytarget.go | 26 ++++++ pkg/util/clusterstatus/conditions.go | 16 +++- 8 files changed, 253 insertions(+), 19 deletions(-) create mode 100644 pkg/controller/capacity/deployment_test.go diff --git a/crd/CapacityTarget-crd.yaml 
b/crd/CapacityTarget-crd.yaml index 97d637cb0..39c4954ff 100644 --- a/crd/CapacityTarget-crd.yaml +++ b/crd/CapacityTarget-crd.yaml @@ -4,6 +4,23 @@ metadata: # name must match the spec fields below, and be in the form: . name: capacitytargets.shipper.booking.com spec: + additionalPrinterColumns: + - JSONPath: .status.conditions[?(.type=="Operational")].status + description: Whether the capacity target is operational. + name: Operational + type: string + - JSONPath: .status.conditions[?(.type=="Ready")].status + description: Whether the capacity target is ready. + name: Ready + type: string + - JSONPath: .status.conditions[?(.status=="False")].message + description: Reason for the capacity target to not be ready or operational. + name: Reason + type: string + - JSONPath: .metadata.creationTimestamp + description: The capacity target's age. + name: Age + type: date # group name to use for REST API: /apis// group: shipper.booking.com # version name to use for REST API: /apis// diff --git a/pkg/controller/capacity/capacity_controller.go b/pkg/controller/capacity/capacity_controller.go index b7a29a6c4..3c7064175 100644 --- a/pkg/controller/capacity/capacity_controller.go +++ b/pkg/controller/capacity/capacity_controller.go @@ -4,6 +4,7 @@ import ( "fmt" "reflect" "sort" + "strings" "time" appsv1 "k8s.io/api/apps/v1" @@ -309,10 +310,11 @@ func (c *Controller) processCapacityTargetOnCluster( } else if l := len(sadPods); l > 0 { // We ran out of conditions to look at, but we have pods that // aren't Ready, so that's one reason to be concerned. + summary := summarizeSadPods(sadPods) reason = PodsNotReady msg = fmt.Sprintf( - "%d out of %d pods are not Ready. 
this might require intervention, check SadPods in this object for more information", - l, desiredReplicas, + "%d/%d: %s", + l, desiredReplicas, summary, ) } else { // None of the existing pods are non-Ready, and we presumably @@ -400,19 +402,21 @@ func (c *Controller) processCapacityTarget(ct *shipper.CapacityTarget) (*shipper ct.Status.Clusters = newClusterStatuses ct.Status.ObservedGeneration = ct.Generation - clustersNotReady := []string{} + notReadyReasons := []string{} for _, clusterStatus := range ct.Status.Clusters { - if !clusterstatusutil.IsClusterCapacityReady(clusterStatus.Conditions) { - clustersNotReady = append(clustersNotReady, clusterStatus.Name) + ready, reason := clusterstatusutil.IsClusterCapacityReady(clusterStatus.Conditions) + if !ready { + notReadyReasons = append(notReadyReasons, + fmt.Sprintf("%s: %s", clusterStatus.Name, reason)) } } - if len(clustersNotReady) == 0 { + if len(notReadyReasons) == 0 { ct.Status.Conditions = targetutil.TransitionToReady(diff, ct.Status.Conditions) } else { ct.Status.Conditions = targetutil.TransitionToNotReady( diff, ct.Status.Conditions, - ClustersNotReady, fmt.Sprintf("%v", clustersNotReady)) + ClustersNotReady, strings.Join(notReadyReasons, "; ")) } return ct, clusterErrors.Flatten() diff --git a/pkg/controller/capacity/capacity_controller_test.go b/pkg/controller/capacity/capacity_controller_test.go index 2c7072458..96bc81943 100644 --- a/pkg/controller/capacity/capacity_controller_test.go +++ b/pkg/controller/capacity/capacity_controller_test.go @@ -148,7 +148,7 @@ func TestCapacityShiftingPodsNotSadButNotAvailable(t *testing.T) { Type: shipper.TargetConditionTypeReady, Status: corev1.ConditionFalse, Reason: ClustersNotReady, - Message: fmt.Sprintf("%v", []string{clusterA}), + Message: fmt.Sprintf("%s: InProgress", clusterA), }, }, } @@ -192,11 +192,25 @@ func TestCapacityShiftingSadPods(t *testing.T) { Owner: shipper.ClusterCapacityReportOwner{Name: ctName}, Breakdown: 
[]shipper.ClusterCapacityReportBreakdown{ { - Type: "Ready", - Status: string(corev1.ConditionFalse), - Reason: "ExpectedFail", - Count: 1, - Containers: []shipper.ClusterCapacityReportContainerBreakdown{}, + Type: "Ready", + Status: string(corev1.ConditionFalse), + Reason: "ExpectedFail", + Count: 1, + Containers: []shipper.ClusterCapacityReportContainerBreakdown{ + { + Name: "app", + States: []shipper.ClusterCapacityReportContainerStateBreakdown{ + { + Count: 1, + Example: shipper.ClusterCapacityReportContainerBreakdownExample{ + Pod: "foobar-deadbeef", + }, + Reason: "ExpectedFail", + Type: "Waiting", + }, + }, + }, + }, }, }, }, @@ -213,13 +227,14 @@ func TestCapacityShiftingSadPods(t *testing.T) { Type: shipper.ClusterConditionTypeReady, Status: corev1.ConditionFalse, Reason: PodsNotReady, - Message: "1 out of 10 pods are not Ready. this might require intervention, check SadPods in this object for more information", + Message: `1/10: 1x"app" containers with [ExpectedFail]`, }, }, SadPods: []shipper.PodStatus{ { - Name: sadPod.Name, - Condition: sadPod.Status.Conditions[0], + Name: sadPod.Name, + Condition: sadPod.Status.Conditions[0], + Containers: sadPod.Status.ContainerStatuses, }, }, Reports: reports, @@ -231,7 +246,7 @@ func TestCapacityShiftingSadPods(t *testing.T) { Type: shipper.TargetConditionTypeReady, Status: corev1.ConditionFalse, Reason: ClustersNotReady, - Message: fmt.Sprintf("%v", []string{clusterA}), + Message: fmt.Sprintf(`%s: PodsNotReady 1/10: 1x"app" containers with [ExpectedFail]`, clusterA), }, }, } diff --git a/pkg/controller/capacity/deployment.go b/pkg/controller/capacity/deployment.go index 7b8d74e3d..8a18d5e26 100644 --- a/pkg/controller/capacity/deployment.go +++ b/pkg/controller/capacity/deployment.go @@ -4,6 +4,7 @@ import ( "fmt" "math" "sort" + "strings" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -94,3 +95,64 @@ func (c Controller) calculatePercentageFromAmount(total, amount int32) int32 { return 
int32(math.Ceil(result)) } + +type sadContainerSummary struct { + pods int + reasons map[string]struct{} +} + +func summarizeSadPods(sadPods []shipper.PodStatus) string { + summary := make(map[string]*sadContainerSummary) + + for _, sadPod := range sadPods { + summarizeContainers(sadPod.InitContainers, summary) + summarizeContainers(sadPod.Containers, summary) + } + + containers := make([]string, 0, len(summary)) + for c, _ := range summary { + containers = append(containers, c) + } + + sort.Strings(containers) + + summaryStrs := make([]string, 0, len(summary)) + for _, container := range containers { + summary := summary[container] + reasons := make([]string, 0, len(summary.reasons)) + for r, _ := range summary.reasons { + reasons = append(reasons, r) + } + + sort.Strings(reasons) + + summaryStrs = append(summaryStrs, fmt.Sprintf("%dx%q containers with %v", summary.pods, container, reasons)) + } + + return strings.Join(summaryStrs, "; ") +} + +func summarizeContainers(containers []corev1.ContainerStatus, summary map[string]*sadContainerSummary) { + for _, container := range containers { + if container.Ready { + continue + } + + sadContainer, ok := summary[container.Name] + if !ok { + sadContainer = &sadContainerSummary{ + reasons: make(map[string]struct{}), + } + + summary[container.Name] = sadContainer + } + + sadContainer.pods++ + + if state := container.State.Waiting; state != nil { + sadContainer.reasons[state.Reason] = struct{}{} + } else if state := container.State.Terminated; state != nil { + sadContainer.reasons[state.Reason] = struct{}{} + } + } +} diff --git a/pkg/controller/capacity/deployment_test.go b/pkg/controller/capacity/deployment_test.go new file mode 100644 index 000000000..16030a76b --- /dev/null +++ b/pkg/controller/capacity/deployment_test.go @@ -0,0 +1,87 @@ +package capacity + +import ( + "strings" + "testing" + + corev1 "k8s.io/api/core/v1" + + shipper "github.com/bookingcom/shipper/pkg/apis/shipper/v1alpha1" +) + +func 
TestSummarizeSadPods(t *testing.T) { + sadPods := []shipper.PodStatus{ + { + InitContainers: []corev1.ContainerStatus{ + { + Name: "ready-init-container", + Ready: true, + }, + { + Name: "waiting-init-container", + Ready: false, + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ImagePullBackOff", + }, + }, + }, + }, + Containers: []corev1.ContainerStatus{ + { + Name: "ready-container", + Ready: true, + }, + { + Name: "waiting-container", + Ready: false, + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ImagePullBackOff", + }, + }, + }, + { + Name: "terminated-container", + Ready: false, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + Reason: "Completed", + }, + }, + }, + }, + }, + { + InitContainers: []corev1.ContainerStatus{ + { + Name: "waiting-init-container", // but not really :D + Ready: true, + }, + }, + Containers: []corev1.ContainerStatus{ + { + Name: "waiting-container", + Ready: false, + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ErrImagePull", + }, + }, + }, + }, + }, + } + + expected := strings.Join([]string{ + `1x"terminated-container" containers with [Completed]`, + `2x"waiting-container" containers with [ErrImagePull ImagePullBackOff]`, + `1x"waiting-init-container" containers with [ImagePullBackOff]`, + }, "; ") + actual := summarizeSadPods(sadPods) + if expected != actual { + t.Fatalf( + "summary does not match.\nexpected: %s\nactual: %s", + expected, actual) + } +} diff --git a/pkg/controller/capacity/utils_test.go b/pkg/controller/capacity/utils_test.go index 4db509871..bc46f6c68 100644 --- a/pkg/controller/capacity/utils_test.go +++ b/pkg/controller/capacity/utils_test.go @@ -124,6 +124,17 @@ func buildSadPodForDeployment(deployment *appsv1.Deployment) *corev1.Pod { Message: "This failure is meant to happen!", }, }, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "app", + Ready: false, + 
State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ExpectedFail", + }, + }, + }, + }, }, } } diff --git a/pkg/crds/capacitytarget.go b/pkg/crds/capacitytarget.go index 8fd8dc1b6..730985daa 100644 --- a/pkg/crds/capacitytarget.go +++ b/pkg/crds/capacitytarget.go @@ -64,5 +64,31 @@ var CapacityTarget = &apiextensionv1beta1.CustomResourceDefinition{ }, }, }, + AdditionalPrinterColumns: []apiextensionv1beta1.CustomResourceColumnDefinition{ + apiextensionv1beta1.CustomResourceColumnDefinition{ + Name: "Operational", + Type: "string", + Description: "Whether the capacity target is operational.", + JSONPath: `.status.conditions[?(.type=="Operational")].status`, + }, + apiextensionv1beta1.CustomResourceColumnDefinition{ + Name: "Ready", + Type: "string", + Description: "Whether the capacity target is ready.", + JSONPath: `.status.conditions[?(.type=="Ready")].status`, + }, + apiextensionv1beta1.CustomResourceColumnDefinition{ + Name: "Reason", + Type: "string", + Description: "Reason for the capacity target to not be ready or operational.", + JSONPath: `.status.conditions[?(.status=="False")].message`, + }, + apiextensionv1beta1.CustomResourceColumnDefinition{ + Name: "Age", + Type: "date", + Description: "The capacity target's age.", + JSONPath: ".metadata.creationTimestamp", + }, + }, }, } diff --git a/pkg/util/clusterstatus/conditions.go b/pkg/util/clusterstatus/conditions.go index 5b6839ea6..6f0ccd766 100644 --- a/pkg/util/clusterstatus/conditions.go +++ b/pkg/util/clusterstatus/conditions.go @@ -1,6 +1,8 @@ package clusterstatus import ( + "fmt", + shipper "github.com/bookingcom/shipper/pkg/apis/shipper/v1alpha1" corev1 "k8s.io/api/core/v1" ) @@ -21,7 +23,7 @@ func IsClusterTrafficReady(conditions []shipper.ClusterTrafficCondition) bool { return readyCond.Status == corev1.ConditionTrue } -func IsClusterCapacityReady(conditions 
[]shipper.ClusterCapacityCondition) (bool, string) { var readyCond shipper.ClusterCapacityCondition for _, c := range conditions { if c.Type == shipper.ClusterConditionTypeReady { @@ -30,7 +32,17 @@ func IsClusterCapacityReady(conditions []shipper.ClusterCapacityCondition) bool } } - return readyCond.Status == corev1.ConditionTrue + if readyCond.Status != corev1.ConditionTrue { + msg := readyCond.Reason + + if readyCond.Message != "" { + msg = fmt.Sprintf("%s %s", msg, readyCond.Message) + } + + return false, msg + } + + return true, "" } func IsClusterInstallationReady(conditions []shipper.ClusterInstallationCondition) bool {