From 4f1451958ae527115bc2a95614c9d79145a6dd4d Mon Sep 17 00:00:00 2001 From: Sreenath Bodagala <82616783+sbodagala@users.noreply.github.com> Date: Fri, 7 Jul 2023 01:58:36 -0400 Subject: [PATCH] Replacement logic should ignore process groups that are in maintenance mode (#1711) * - Operator replacement logic should ignore process groups that are in maintenance mode. --- api/v1beta2/foundationdbcluster_types.go | 5 +++++ controllers/replace_failed_process_groups.go | 2 +- controllers/replace_failed_process_groups_test.go | 12 ++++++++++++ .../replacements/replace_failed_process_groups.go | 10 +++++++++- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/api/v1beta2/foundationdbcluster_types.go b/api/v1beta2/foundationdbcluster_types.go index 5cdf9b9ff..f88a8a8b2 100644 --- a/api/v1beta2/foundationdbcluster_types.go +++ b/api/v1beta2/foundationdbcluster_types.go @@ -746,6 +746,11 @@ func (processGroupStatus *ProcessGroupStatus) GetConditionTime(conditionType Pro return nil } +// IsUnderMaintenance checks if the process is in maintenance zone. +func (processGroupStatus *ProcessGroupStatus) IsUnderMaintenance(maintenanceZone FaultDomain) bool { + return processGroupStatus.FaultDomain == maintenanceZone +} + // GetCondition returns the ProcessGroupStatus's ProcessGroupCondition that matches the conditionType; // It returns nil if the ProcessGroupStatus doesn't have a matching condition func (processGroupStatus *ProcessGroupStatus) GetCondition(conditionType ProcessGroupConditionType) *ProcessGroupCondition { diff --git a/controllers/replace_failed_process_groups.go b/controllers/replace_failed_process_groups.go index ff2f324a6..813fe1848 100644 --- a/controllers/replace_failed_process_groups.go +++ b/controllers/replace_failed_process_groups.go @@ -59,7 +59,7 @@ func (c replaceFailedProcessGroups) reconcile(ctx context.Context, r *Foundation // Only replace process groups without an address, if the cluster has the desired fault tolerance and is available. hasDesiredFaultTolerance := fdbstatus.HasDesiredFaultToleranceFromStatus(logger, status, cluster) - if replacements.ReplaceFailedProcessGroups(logger, cluster, hasDesiredFaultTolerance) { + if replacements.ReplaceFailedProcessGroups(logger, cluster, status, hasDesiredFaultTolerance) { err := r.updateOrApply(ctx, cluster) if err != nil { return &requeue{curError: err} diff --git a/controllers/replace_failed_process_groups_test.go b/controllers/replace_failed_process_groups_test.go index 20f8c8b24..3afec59cd 100644 --- a/controllers/replace_failed_process_groups_test.go +++ b/controllers/replace_failed_process_groups_test.go @@ -911,6 +911,18 @@ var _ = Describe("replace_failed_process_groups", func() { }) }) }) + + Context("with maintenance mode enabled", func() { + BeforeEach(func() { + adminClient, err := mock.NewMockAdminClientUncast(cluster, k8sClient) + Expect(err).NotTo(HaveOccurred()) + Expect(adminClient.SetMaintenanceZone("operator-test-1-storage-2", 0)).NotTo(HaveOccurred()) + }) + + It("should not mark the process group for removal", func() { + Expect(getRemovedProcessGroupIDs(cluster)).To(BeEmpty()) + }) + }) }) Context("with a process that has been missing for a brief time", func() { diff --git a/internal/replacements/replace_failed_process_groups.go b/internal/replacements/replace_failed_process_groups.go index b4816eede..a321ad2ab 100644 --- a/internal/replacements/replace_failed_process_groups.go +++ b/internal/replacements/replace_failed_process_groups.go @@ -45,7 +45,7 @@ func getMaxReplacements(cluster *fdbv1beta2.FoundationDBCluster, maxReplacements // ReplaceFailedProcessGroups flags failed processes groups for removal and returns an indicator // of whether any processes were thus flagged. -func ReplaceFailedProcessGroups(log logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, hasDesiredFaultTolerance bool) bool { +func ReplaceFailedProcessGroups(log logr.Logger, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus, hasDesiredFaultTolerance bool) bool { // Automatic replacements are disabled, so we don't have to check anything further if !cluster.GetEnableAutomaticReplacements() { return false @@ -62,6 +62,14 @@ ProcessGroupLoop: continue } + if processGroupStatus.IsUnderMaintenance(status.Cluster.MaintenanceZone) { + log.Info( + "Skip process group that is in maintenance zone", + "processGroupID", processGroupStatus.ProcessGroupID, + "maintenance zone", processGroupStatus.FaultDomain) + continue + } + canReplace := maxReplacements > 0 for _, targets := range crashLoopContainerProcessGroups {