diff --git a/crd/CapacityTarget-crd.yaml b/crd/CapacityTarget-crd.yaml index 97d637cb0..39c4954ff 100644 --- a/crd/CapacityTarget-crd.yaml +++ b/crd/CapacityTarget-crd.yaml @@ -4,6 +4,23 @@ metadata: # name must match the spec fields below, and be in the form: . name: capacitytargets.shipper.booking.com spec: + additionalPrinterColumns: + - JSONPath: .status.conditions[?(.type=="Operational")].status + description: Whether the capacity target is operational. + name: Operational + type: string + - JSONPath: .status.conditions[?(.type=="Ready")].status + description: Whether the capacity target is ready. + name: Ready + type: string + - JSONPath: .status.conditions[?(.status=="False")].message + description: Reason for the capactiy target to not be ready or operational. + name: Reason + type: string + - JSONPath: .metadata.creationTimestamp + description: The capacity target's age. + name: Age + type: date # group name to use for REST API: /apis// group: shipper.booking.com # version name to use for REST API: /apis// diff --git a/pkg/controller/capacity/capacity_controller.go b/pkg/controller/capacity/capacity_controller.go index b7a29a6c4..3c7064175 100644 --- a/pkg/controller/capacity/capacity_controller.go +++ b/pkg/controller/capacity/capacity_controller.go @@ -4,6 +4,7 @@ import ( "fmt" "reflect" "sort" + "strings" "time" appsv1 "k8s.io/api/apps/v1" @@ -309,10 +310,11 @@ func (c *Controller) processCapacityTargetOnCluster( } else if l := len(sadPods); l > 0 { // We ran out of conditions to look at, but we have pods that // aren't Ready, so that's one reason to be concerned. + summary := summarizeSadPods(sadPods) reason = PodsNotReady msg = fmt.Sprintf( - "%d out of %d pods are not Ready. this might require intervention, check SadPods in this object for more information", - l, desiredReplicas, + "%d/%d: %s", + l, desiredReplicas, summary, ) } else { // None of the existing pods are non-Ready, and we presumably @@ -400,19 +402,21 @@ func (c *Controller) processCapacityTarget(ct *shipper.CapacityTarget) (*shipper ct.Status.Clusters = newClusterStatuses ct.Status.ObservedGeneration = ct.Generation - clustersNotReady := []string{} + notReadyReasons := []string{} for _, clusterStatus := range ct.Status.Clusters { - if !clusterstatusutil.IsClusterCapacityReady(clusterStatus.Conditions) { - clustersNotReady = append(clustersNotReady, clusterStatus.Name) + ready, reason := clusterstatusutil.IsClusterCapacityReady(clusterStatus.Conditions) + if !ready { + notReadyReasons = append(notReadyReasons, + fmt.Sprintf("%s: %s", clusterStatus.Name, reason)) } } - if len(clustersNotReady) == 0 { + if len(notReadyReasons) == 0 { ct.Status.Conditions = targetutil.TransitionToReady(diff, ct.Status.Conditions) } else { ct.Status.Conditions = targetutil.TransitionToNotReady( diff, ct.Status.Conditions, - ClustersNotReady, fmt.Sprintf("%v", clustersNotReady)) + ClustersNotReady, strings.Join(notReadyReasons, "; ")) } return ct, clusterErrors.Flatten() diff --git a/pkg/controller/capacity/capacity_controller_test.go b/pkg/controller/capacity/capacity_controller_test.go index 2c7072458..96bc81943 100644 --- a/pkg/controller/capacity/capacity_controller_test.go +++ b/pkg/controller/capacity/capacity_controller_test.go @@ -148,7 +148,7 @@ func TestCapacityShiftingPodsNotSadButNotAvailable(t *testing.T) { Type: shipper.TargetConditionTypeReady, Status: corev1.ConditionFalse, Reason: ClustersNotReady, - Message: fmt.Sprintf("%v", []string{clusterA}), + Message: fmt.Sprintf("%s: InProgress", clusterA), }, }, } @@ -192,11 +192,25 @@ func TestCapacityShiftingSadPods(t *testing.T) { Owner: shipper.ClusterCapacityReportOwner{Name: ctName}, Breakdown: []shipper.ClusterCapacityReportBreakdown{ { - Type: "Ready", - Status: string(corev1.ConditionFalse), - Reason: "ExpectedFail", - Count: 1, - Containers: []shipper.ClusterCapacityReportContainerBreakdown{}, + Type: "Ready", + Status: string(corev1.ConditionFalse), + Reason: "ExpectedFail", + Count: 1, + Containers: []shipper.ClusterCapacityReportContainerBreakdown{ + { + Name: "app", + States: []shipper.ClusterCapacityReportContainerStateBreakdown{ + { + Count: 1, + Example: shipper.ClusterCapacityReportContainerBreakdownExample{ + Pod: "foobar-deadbeef", + }, + Reason: "ExpectedFail", + Type: "Waiting", + }, + }, + }, + }, }, }, }, @@ -213,13 +227,14 @@ func TestCapacityShiftingSadPods(t *testing.T) { Type: shipper.ClusterConditionTypeReady, Status: corev1.ConditionFalse, Reason: PodsNotReady, - Message: "1 out of 10 pods are not Ready. this might require intervention, check SadPods in this object for more information", + Message: `1/10: 1x"app" containers with [ExpectedFail]`, }, }, SadPods: []shipper.PodStatus{ { - Name: sadPod.Name, - Condition: sadPod.Status.Conditions[0], + Name: sadPod.Name, + Condition: sadPod.Status.Conditions[0], + Containers: sadPod.Status.ContainerStatuses, }, }, Reports: reports, @@ -231,7 +246,7 @@ func TestCapacityShiftingSadPods(t *testing.T) { Type: shipper.TargetConditionTypeReady, Status: corev1.ConditionFalse, Reason: ClustersNotReady, - Message: fmt.Sprintf("%v", []string{clusterA}), + Message: fmt.Sprintf(`%s: PodsNotReady 1/10: 1x"app" containers with [ExpectedFail]`, clusterA), }, }, } diff --git a/pkg/controller/capacity/deployment.go b/pkg/controller/capacity/deployment.go index 7b8d74e3d..8a18d5e26 100644 --- a/pkg/controller/capacity/deployment.go +++ b/pkg/controller/capacity/deployment.go @@ -4,6 +4,7 @@ import ( "fmt" "math" "sort" + "strings" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -94,3 +95,64 @@ func (c Controller) calculatePercentageFromAmount(total, amount int32) int32 { return int32(math.Ceil(result)) } + +type sadContainerSummary struct { + pods int + reasons map[string]struct{} +} + +func summarizeSadPods(sadPods []shipper.PodStatus) string { + summary := make(map[string]*sadContainerSummary) + + for _, sadPod := range sadPods { + summarizeContainers(sadPod.InitContainers, summary) + summarizeContainers(sadPod.Containers, summary) + } + + containers := make([]string, 0, len(summary)) + for c, _ := range summary { + containers = append(containers, c) + } + + sort.Strings(containers) + + summaryStrs := make([]string, 0, len(summary)) + for _, container := range containers { + summary := summary[container] + reasons := make([]string, 0, len(summary.reasons)) + for r, _ := range summary.reasons { + reasons = append(reasons, r) + } + + sort.Strings(reasons) + + summaryStrs = append(summaryStrs, fmt.Sprintf("%dx%q containers with %v", summary.pods, container, reasons)) + } + + return strings.Join(summaryStrs, "; ") +} + +func summarizeContainers(containers []corev1.ContainerStatus, summary map[string]*sadContainerSummary) { + for _, container := range containers { + if container.Ready { + continue + } + + sadContainer, ok := summary[container.Name] + if !ok { + sadContainer = &sadContainerSummary{ + reasons: make(map[string]struct{}), + } + + summary[container.Name] = sadContainer + } + + sadContainer.pods++ + + if state := container.State.Waiting; state != nil { + sadContainer.reasons[state.Reason] = struct{}{} + } else if state := container.State.Terminated; state != nil { + sadContainer.reasons[state.Reason] = struct{}{} + } + } +} diff --git a/pkg/controller/capacity/deployment_test.go b/pkg/controller/capacity/deployment_test.go new file mode 100644 index 000000000..16030a76b --- /dev/null +++ b/pkg/controller/capacity/deployment_test.go @@ -0,0 +1,87 @@ +package capacity + +import ( + "strings" + "testing" + + corev1 "k8s.io/api/core/v1" + + shipper "github.com/bookingcom/shipper/pkg/apis/shipper/v1alpha1" +) + +func TestSummarizeSadPods(t *testing.T) { + sadPods := []shipper.PodStatus{ + { + InitContainers: []corev1.ContainerStatus{ + { + Name: "ready-init-container", + Ready: true, + }, + { + Name: "waiting-init-container", + Ready: false, + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ImagePullBackOff", + }, + }, + }, + }, + Containers: []corev1.ContainerStatus{ + { + Name: "ready-container", + Ready: true, + }, + { + Name: "waiting-container", + Ready: false, + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ImagePullBackOff", + }, + }, + }, + { + Name: "terminated-container", + Ready: false, + State: corev1.ContainerState{ + Terminated: &corev1.ContainerStateTerminated{ + Reason: "Completed", + }, + }, + }, + }, + }, + { + InitContainers: []corev1.ContainerStatus{ + { + Name: "waiting-init-container", // but not really :D + Ready: true, + }, + }, + Containers: []corev1.ContainerStatus{ + { + Name: "waiting-container", + Ready: false, + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ErrImagePull", + }, + }, + }, + }, + }, + } + + expected := strings.Join([]string{ + `1x"terminated-container" containers with [Completed]`, + `2x"waiting-container" containers with [ErrImagePull ImagePullBackOff]`, + `1x"waiting-init-container" containers with [ImagePullBackOff]`, + }, "; ") + actual := summarizeSadPods(sadPods) + if expected != actual { + t.Fatalf( + "summary does not match.\nexpected: %s\nactual: %s", + expected, actual) + } +} diff --git a/pkg/controller/capacity/utils_test.go b/pkg/controller/capacity/utils_test.go index 4db509871..bc46f6c68 100644 --- a/pkg/controller/capacity/utils_test.go +++ b/pkg/controller/capacity/utils_test.go @@ -124,6 +124,17 @@ func buildSadPodForDeployment(deployment *appsv1.Deployment) *corev1.Pod { Message: "This failure is meant to happen!", }, }, + ContainerStatuses: []corev1.ContainerStatus{ + { + Name: "app", + Ready: false, + State: corev1.ContainerState{ + Waiting: &corev1.ContainerStateWaiting{ + Reason: "ExpectedFail", + }, + }, + }, + }, }, } } diff --git a/pkg/crds/capacitytarget.go b/pkg/crds/capacitytarget.go index 8fd8dc1b6..730985daa 100644 --- a/pkg/crds/capacitytarget.go +++ b/pkg/crds/capacitytarget.go @@ -64,5 +64,31 @@ var CapacityTarget = &apiextensionv1beta1.CustomResourceDefinition{ }, }, }, + AdditionalPrinterColumns: []apiextensionv1beta1.CustomResourceColumnDefinition{ + apiextensionv1beta1.CustomResourceColumnDefinition{ + Name: "Operational", + Type: "string", + Description: "Whether the capactiy target is operational.", + JSONPath: `.status.conditions[?(.type=="Operational")].status`, + }, + apiextensionv1beta1.CustomResourceColumnDefinition{ + Name: "Ready", + Type: "string", + Description: "Whether the capactiy target is ready.", + JSONPath: `.status.conditions[?(.type=="Ready")].status`, + }, + apiextensionv1beta1.CustomResourceColumnDefinition{ + Name: "Reason", + Type: "string", + Description: "Reason for the capactiy target to not be ready or operational.", + JSONPath: `.status.conditions[?(.status=="False")].message`, + }, + apiextensionv1beta1.CustomResourceColumnDefinition{ + Name: "Age", + Type: "date", + Description: "The capactiy target's age.", + JSONPath: ".metadata.creationTimestamp", + }, + }, }, } diff --git a/pkg/util/clusterstatus/conditions.go b/pkg/util/clusterstatus/conditions.go index 5b6839ea6..6f0ccd766 100644 --- a/pkg/util/clusterstatus/conditions.go +++ b/pkg/util/clusterstatus/conditions.go @@ -1,6 +1,8 @@ package clusterstatus import ( + "fmt" + shipper "github.com/bookingcom/shipper/pkg/apis/shipper/v1alpha1" corev1 "k8s.io/api/core/v1" ) @@ -21,7 +23,7 @@ func IsClusterTrafficReady(conditions []shipper.ClusterTrafficCondition) bool { return readyCond.Status == corev1.ConditionTrue } -func IsClusterCapacityReady(conditions []shipper.ClusterCapacityCondition) bool { +func IsClusterCapacityReady(conditions []shipper.ClusterCapacityCondition) (bool, string) { var readyCond shipper.ClusterCapacityCondition for _, c := range conditions { if c.Type == shipper.ClusterConditionTypeReady { @@ -30,7 +32,17 @@ func IsClusterCapacityReady(conditions []shipper.ClusterCapacityCondition) bool } } - return readyCond.Status == corev1.ConditionTrue + if readyCond.Status != corev1.ConditionTrue { + msg := readyCond.Reason + + if readyCond.Message != "" { + msg = fmt.Sprintf("%s %s", msg, readyCond.Message) + } + + return false, msg + } + + return true, "" } func IsClusterInstallationReady(conditions []shipper.ClusterInstallationCondition) bool {