Skip to content
This repository has been archived by the owner on May 3, 2022. It is now read-only.

Commit

Permalink
capacity controller: better summarization of conditions
Browse files Browse the repository at this point in the history
Before this, the capacity controller would only put a list of unready
clusters in the CapacityTarget's Ready condition when it would set it to
False. This requires users to go digging into each cluster condition,
and most likely they would only be directed to SadPods, where they could
finally get some useful information.

Now, that information is summarized in a very brief format, in the hopes
that users will have to do less jumping around when investigating why
their CapacityTarget is not progressing.

For instance, if the CapacityTarget is stuck because one container can't
pull its image, we'll now have the following in the CapacityTarget's
.stauts.conditions:

```
[
  {
    "lastTransitionTime": "2020-02-12T13:16:44Z",
    "status": "True",
    "type": "Operational"
  },
  {
    "lastTransitionTime": "2020-02-12T13:16:44Z",
    "message": "docker-desktop: PodsNotReady 3/3: 3x\"test-nginx\" containers with [ImagePullBackOff]",
    "reason": "ClustersNotReady",
    "status": "False",
    "type": "Ready"
  }
]
```

As a bonus, this is also shown on a `kubectl get ct`:

```
% kubectl get ct  snowflake-db84be2b-0
NAME                   OPERATIONAL   READY   REASON                                                                                AGE
snowflake-db84be2b-0   True          False   docker-desktop: PodsNotReady 3/3: 3x"test-nginx" containers with [ImagePullBackOff]   8d
```
  • Loading branch information
juliogreff authored and Oleg Sidorov committed Feb 24, 2020
1 parent 3e58d0a commit abe5790
Show file tree
Hide file tree
Showing 8 changed files with 253 additions and 19 deletions.
17 changes: 17 additions & 0 deletions crd/CapacityTarget-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,23 @@ metadata:
# name must match the spec fields below, and be in the form: <plural>.<group>
name: capacitytargets.shipper.booking.com
spec:
additionalPrinterColumns:
- JSONPath: .status.conditions[?(.type=="Operational")].status
description: Whether the capacity target is operational.
name: Operational
type: string
- JSONPath: .status.conditions[?(.type=="Ready")].status
description: Whether the capacity target is ready.
name: Ready
type: string
- JSONPath: .status.conditions[?(.status=="False")].message
description: Reason for the capactiy target to not be ready or operational.
name: Reason
type: string
- JSONPath: .metadata.creationTimestamp
description: The capacity target's age.
name: Age
type: date
# group name to use for REST API: /apis/<group>/<version>
group: shipper.booking.com
# version name to use for REST API: /apis/<group>/<version>
Expand Down
18 changes: 11 additions & 7 deletions pkg/controller/capacity/capacity_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"reflect"
"sort"
"strings"
"time"

appsv1 "k8s.io/api/apps/v1"
Expand Down Expand Up @@ -309,10 +310,11 @@ func (c *Controller) processCapacityTargetOnCluster(
} else if l := len(sadPods); l > 0 {
// We ran out of conditions to look at, but we have pods that
// aren't Ready, so that's one reason to be concerned.
summary := summarizeSadPods(sadPods)
reason = PodsNotReady
msg = fmt.Sprintf(
"%d out of %d pods are not Ready. this might require intervention, check SadPods in this object for more information",
l, desiredReplicas,
"%d/%d: %s",
l, desiredReplicas, summary,
)
} else {
// None of the existing pods are non-Ready, and we presumably
Expand Down Expand Up @@ -400,19 +402,21 @@ func (c *Controller) processCapacityTarget(ct *shipper.CapacityTarget) (*shipper
ct.Status.Clusters = newClusterStatuses
ct.Status.ObservedGeneration = ct.Generation

clustersNotReady := []string{}
notReadyReasons := []string{}
for _, clusterStatus := range ct.Status.Clusters {
if !clusterstatusutil.IsClusterCapacityReady(clusterStatus.Conditions) {
clustersNotReady = append(clustersNotReady, clusterStatus.Name)
ready, reason := clusterstatusutil.IsClusterCapacityReady(clusterStatus.Conditions)
if !ready {
notReadyReasons = append(notReadyReasons,
fmt.Sprintf("%s: %s", clusterStatus.Name, reason))
}
}

if len(clustersNotReady) == 0 {
if len(notReadyReasons) == 0 {
ct.Status.Conditions = targetutil.TransitionToReady(diff, ct.Status.Conditions)
} else {
ct.Status.Conditions = targetutil.TransitionToNotReady(
diff, ct.Status.Conditions,
ClustersNotReady, fmt.Sprintf("%v", clustersNotReady))
ClustersNotReady, strings.Join(notReadyReasons, "; "))
}

return ct, clusterErrors.Flatten()
Expand Down
35 changes: 25 additions & 10 deletions pkg/controller/capacity/capacity_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ func TestCapacityShiftingPodsNotSadButNotAvailable(t *testing.T) {
Type: shipper.TargetConditionTypeReady,
Status: corev1.ConditionFalse,
Reason: ClustersNotReady,
Message: fmt.Sprintf("%v", []string{clusterA}),
Message: fmt.Sprintf("%s: InProgress", clusterA),
},
},
}
Expand Down Expand Up @@ -192,11 +192,25 @@ func TestCapacityShiftingSadPods(t *testing.T) {
Owner: shipper.ClusterCapacityReportOwner{Name: ctName},
Breakdown: []shipper.ClusterCapacityReportBreakdown{
{
Type: "Ready",
Status: string(corev1.ConditionFalse),
Reason: "ExpectedFail",
Count: 1,
Containers: []shipper.ClusterCapacityReportContainerBreakdown{},
Type: "Ready",
Status: string(corev1.ConditionFalse),
Reason: "ExpectedFail",
Count: 1,
Containers: []shipper.ClusterCapacityReportContainerBreakdown{
{
Name: "app",
States: []shipper.ClusterCapacityReportContainerStateBreakdown{
{
Count: 1,
Example: shipper.ClusterCapacityReportContainerBreakdownExample{
Pod: "foobar-deadbeef",
},
Reason: "ExpectedFail",
Type: "Waiting",
},
},
},
},
},
},
},
Expand All @@ -213,13 +227,14 @@ func TestCapacityShiftingSadPods(t *testing.T) {
Type: shipper.ClusterConditionTypeReady,
Status: corev1.ConditionFalse,
Reason: PodsNotReady,
Message: "1 out of 10 pods are not Ready. this might require intervention, check SadPods in this object for more information",
Message: `1/10: 1x"app" containers with [ExpectedFail]`,
},
},
SadPods: []shipper.PodStatus{
{
Name: sadPod.Name,
Condition: sadPod.Status.Conditions[0],
Name: sadPod.Name,
Condition: sadPod.Status.Conditions[0],
Containers: sadPod.Status.ContainerStatuses,
},
},
Reports: reports,
Expand All @@ -231,7 +246,7 @@ func TestCapacityShiftingSadPods(t *testing.T) {
Type: shipper.TargetConditionTypeReady,
Status: corev1.ConditionFalse,
Reason: ClustersNotReady,
Message: fmt.Sprintf("%v", []string{clusterA}),
Message: fmt.Sprintf(`%s: PodsNotReady 1/10: 1x"app" containers with [ExpectedFail]`, clusterA),
},
},
}
Expand Down
62 changes: 62 additions & 0 deletions pkg/controller/capacity/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"fmt"
"math"
"sort"
"strings"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -94,3 +95,64 @@ func (c Controller) calculatePercentageFromAmount(total, amount int32) int32 {

return int32(math.Ceil(result))
}

type sadContainerSummary struct {
pods int
reasons map[string]struct{}
}

func summarizeSadPods(sadPods []shipper.PodStatus) string {
summary := make(map[string]*sadContainerSummary)

for _, sadPod := range sadPods {
summarizeContainers(sadPod.InitContainers, summary)
summarizeContainers(sadPod.Containers, summary)
}

containers := make([]string, 0, len(summary))
for c, _ := range summary {
containers = append(containers, c)
}

sort.Strings(containers)

summaryStrs := make([]string, 0, len(summary))
for _, container := range containers {
summary := summary[container]
reasons := make([]string, 0, len(summary.reasons))
for r, _ := range summary.reasons {
reasons = append(reasons, r)
}

sort.Strings(reasons)

summaryStrs = append(summaryStrs, fmt.Sprintf("%dx%q containers with %v", summary.pods, container, reasons))
}

return strings.Join(summaryStrs, "; ")
}

func summarizeContainers(containers []corev1.ContainerStatus, summary map[string]*sadContainerSummary) {
for _, container := range containers {
if container.Ready {
continue
}

sadContainer, ok := summary[container.Name]
if !ok {
sadContainer = &sadContainerSummary{
reasons: make(map[string]struct{}),
}

summary[container.Name] = sadContainer
}

sadContainer.pods++

if state := container.State.Waiting; state != nil {
sadContainer.reasons[state.Reason] = struct{}{}
} else if state := container.State.Terminated; state != nil {
sadContainer.reasons[state.Reason] = struct{}{}
}
}
}
87 changes: 87 additions & 0 deletions pkg/controller/capacity/deployment_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package capacity

import (
"strings"
"testing"

corev1 "k8s.io/api/core/v1"

shipper "github.com/bookingcom/shipper/pkg/apis/shipper/v1alpha1"
)

func TestSummarizeSadPods(t *testing.T) {
sadPods := []shipper.PodStatus{
{
InitContainers: []corev1.ContainerStatus{
{
Name: "ready-init-container",
Ready: true,
},
{
Name: "waiting-init-container",
Ready: false,
State: corev1.ContainerState{
Waiting: &corev1.ContainerStateWaiting{
Reason: "ImagePullBackOff",
},
},
},
},
Containers: []corev1.ContainerStatus{
{
Name: "ready-container",
Ready: true,
},
{
Name: "waiting-container",
Ready: false,
State: corev1.ContainerState{
Waiting: &corev1.ContainerStateWaiting{
Reason: "ImagePullBackOff",
},
},
},
{
Name: "terminated-container",
Ready: false,
State: corev1.ContainerState{
Terminated: &corev1.ContainerStateTerminated{
Reason: "Completed",
},
},
},
},
},
{
InitContainers: []corev1.ContainerStatus{
{
Name: "waiting-init-container", // but not really :D
Ready: true,
},
},
Containers: []corev1.ContainerStatus{
{
Name: "waiting-container",
Ready: false,
State: corev1.ContainerState{
Waiting: &corev1.ContainerStateWaiting{
Reason: "ErrImagePull",
},
},
},
},
},
}

expected := strings.Join([]string{
`1x"terminated-container" containers with [Completed]`,
`2x"waiting-container" containers with [ErrImagePull ImagePullBackOff]`,
`1x"waiting-init-container" containers with [ImagePullBackOff]`,
}, "; ")
actual := summarizeSadPods(sadPods)
if expected != actual {
t.Fatalf(
"summary does not match.\nexpected: %s\nactual: %s",
expected, actual)
}
}
11 changes: 11 additions & 0 deletions pkg/controller/capacity/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,17 @@ func buildSadPodForDeployment(deployment *appsv1.Deployment) *corev1.Pod {
Message: "This failure is meant to happen!",
},
},
ContainerStatuses: []corev1.ContainerStatus{
{
Name: "app",
Ready: false,
State: corev1.ContainerState{
Waiting: &corev1.ContainerStateWaiting{
Reason: "ExpectedFail",
},
},
},
},
},
}
}
26 changes: 26 additions & 0 deletions pkg/crds/capacitytarget.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,31 @@ var CapacityTarget = &apiextensionv1beta1.CustomResourceDefinition{
},
},
},
AdditionalPrinterColumns: []apiextensionv1beta1.CustomResourceColumnDefinition{
apiextensionv1beta1.CustomResourceColumnDefinition{
Name: "Operational",
Type: "string",
Description: "Whether the capactiy target is operational.",
JSONPath: `.status.conditions[?(.type=="Operational")].status`,
},
apiextensionv1beta1.CustomResourceColumnDefinition{
Name: "Ready",
Type: "string",
Description: "Whether the capactiy target is ready.",
JSONPath: `.status.conditions[?(.type=="Ready")].status`,
},
apiextensionv1beta1.CustomResourceColumnDefinition{
Name: "Reason",
Type: "string",
Description: "Reason for the capactiy target to not be ready or operational.",
JSONPath: `.status.conditions[?(.status=="False")].message`,
},
apiextensionv1beta1.CustomResourceColumnDefinition{
Name: "Age",
Type: "date",
Description: "The capactiy target's age.",
JSONPath: ".metadata.creationTimestamp",
},
},
},
}
16 changes: 14 additions & 2 deletions pkg/util/clusterstatus/conditions.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package clusterstatus

import (
"fmt"

shipper "github.com/bookingcom/shipper/pkg/apis/shipper/v1alpha1"
corev1 "k8s.io/api/core/v1"
)
Expand All @@ -21,7 +23,7 @@ func IsClusterTrafficReady(conditions []shipper.ClusterTrafficCondition) bool {
return readyCond.Status == corev1.ConditionTrue
}

func IsClusterCapacityReady(conditions []shipper.ClusterCapacityCondition) bool {
func IsClusterCapacityReady(conditions []shipper.ClusterCapacityCondition) (bool, string) {
var readyCond shipper.ClusterCapacityCondition
for _, c := range conditions {
if c.Type == shipper.ClusterConditionTypeReady {
Expand All @@ -30,7 +32,17 @@ func IsClusterCapacityReady(conditions []shipper.ClusterCapacityCondition) bool
}
}

return readyCond.Status == corev1.ConditionTrue
if readyCond.Status != corev1.ConditionTrue {
msg := readyCond.Reason

if readyCond.Message != "" {
msg = fmt.Sprintf("%s %s", msg, readyCond.Message)
}

return false, msg
}

return true, ""
}

func IsClusterInstallationReady(conditions []shipper.ClusterInstallationCondition) bool {
Expand Down

0 comments on commit abe5790

Please sign in to comment.