Skip to content

Commit

Permalink
Replacement logic should ignore process groups that are in maintenanc…
Browse files Browse the repository at this point in the history
…e mode (#1711)

* Operator replacement logic should ignore process groups that are in maintenance mode.
  • Loading branch information
sbodagala authored Jul 7, 2023
1 parent b096983 commit 4f14519
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 2 deletions.
5 changes: 5 additions & 0 deletions api/v1beta2/foundationdbcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -746,6 +746,11 @@ func (processGroupStatus *ProcessGroupStatus) GetConditionTime(conditionType Pro
return nil
}

// IsUnderMaintenance reports whether the process group belongs to the fault
// domain (zone) that is passed in as the zone under maintenance.
//
// An empty maintenanceZone is treated as "no maintenance active", and a
// process group whose FaultDomain is empty is never considered to be under
// maintenance. Without these guards two empty values would compare equal and
// the process group would wrongly be reported as being under maintenance.
func (processGroupStatus *ProcessGroupStatus) IsUnderMaintenance(maintenanceZone FaultDomain) bool {
	// An empty value on either side carries no zone information, so it must
	// not produce a match.
	if maintenanceZone == "" || processGroupStatus.FaultDomain == "" {
		return false
	}

	return processGroupStatus.FaultDomain == maintenanceZone
}

// GetCondition returns the ProcessGroupStatus's ProcessGroupCondition that matches the conditionType;
// It returns nil if the ProcessGroupStatus doesn't have a matching condition
func (processGroupStatus *ProcessGroupStatus) GetCondition(conditionType ProcessGroupConditionType) *ProcessGroupCondition {
Expand Down
2 changes: 1 addition & 1 deletion controllers/replace_failed_process_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func (c replaceFailedProcessGroups) reconcile(ctx context.Context, r *Foundation

// Only replace process groups without an address, if the cluster has the desired fault tolerance and is available.
hasDesiredFaultTolerance := fdbstatus.HasDesiredFaultToleranceFromStatus(logger, status, cluster)
if replacements.ReplaceFailedProcessGroups(logger, cluster, hasDesiredFaultTolerance) {
if replacements.ReplaceFailedProcessGroups(logger, cluster, status, hasDesiredFaultTolerance) {
err := r.updateOrApply(ctx, cluster)
if err != nil {
return &requeue{curError: err}
Expand Down
12 changes: 12 additions & 0 deletions controllers/replace_failed_process_groups_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -911,6 +911,18 @@ var _ = Describe("replace_failed_process_groups", func() {
})
})
})

// Puts a zone into maintenance before every spec of this context and checks
// that no process group ends up marked for removal.
Context("with maintenance mode enabled", func() {
BeforeEach(func() {
// The maintenance zone is set through the mock admin client created for
// the cluster under test.
adminClient, err := mock.NewMockAdminClientUncast(cluster, k8sClient)
Expect(err).NotTo(HaveOccurred())
// NOTE(review): the zone name presumably equals the fault domain of the
// failed process group prepared by an enclosing context, and the second
// argument (0) is presumably the maintenance duration — confirm against
// the mock admin client.
Expect(adminClient.SetMaintenanceZone("operator-test-1-storage-2", 0)).NotTo(HaveOccurred())
})

It("should not mark the process group for removal", func() {
// The list of process groups marked for removal must stay empty.
Expect(getRemovedProcessGroupIDs(cluster)).To(BeEmpty())
})
})
})

Context("with a process that has been missing for a brief time", func() {
Expand Down
10 changes: 9 additions & 1 deletion internal/replacements/replace_failed_process_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func getMaxReplacements(cluster *fdbv1beta2.FoundationDBCluster, maxReplacements

// ReplaceFailedProcessGroups flags failed processes groups for removal and returns an indicator
// of whether any processes were thus flagged.
func ReplaceFailedProcessGroups(log logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, hasDesiredFaultTolerance bool) bool {
func ReplaceFailedProcessGroups(log logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, hasDesiredFaultTolerance bool) bool {
// Automatic replacements are disabled, so we don't have to check anything further
if !cluster.GetEnableAutomaticReplacements() {
return false
Expand All @@ -62,6 +62,14 @@ ProcessGroupLoop:
continue
}

if processGroupStatus.IsUnderMaintenance(status.Cluster.MaintenanceZone) {
log.Info(
"Skip process group that is in maintenance zone",
"processGroupID", processGroupStatus.ProcessGroupID,
"maintenance zone", processGroupStatus.FaultDomain)
continue
}

canReplace := maxReplacements > 0

for _, targets := range crashLoopContainerProcessGroups {
Expand Down

0 comments on commit 4f14519

Please sign in to comment.