diff --git a/controllers/cluster_controller.go b/controllers/cluster_controller.go
index 1f1a2bfb..3b62a9e8 100644
--- a/controllers/cluster_controller.go
+++ b/controllers/cluster_controller.go
@@ -196,36 +196,44 @@ func (r *FoundationDBClusterReconciler) Reconcile(ctx context.Context, request c
 	originalGeneration := cluster.ObjectMeta.Generation
 	normalizedSpec := cluster.Spec.DeepCopy()
-	delayedRequeue := false
+	var delayedRequeueDuration time.Duration
+	var delayedRequeue bool
 
 	for _, subReconciler := range subReconcilers {
 		// We have to set the normalized spec here again otherwise any call to Update() for the status of the cluster
 		// will reset all normalized fields...
 		cluster.Spec = *(normalizedSpec.DeepCopy())
 
-		requeue := runClusterSubReconciler(ctx, clusterLog, subReconciler, r, cluster, status)
-		if requeue == nil {
+		req := runClusterSubReconciler(ctx, clusterLog, subReconciler, r, cluster, status)
+		if req == nil {
 			continue
 		}
 
-		if requeue.delayedRequeue {
+		if req.delayedRequeue {
 			clusterLog.Info("Delaying requeue for sub-reconciler",
 				"reconciler", fmt.Sprintf("%T", subReconciler),
-				"message", requeue.message,
-				"error", requeue.curError)
+				"message", req.message,
+				"delayedRequeueDuration", delayedRequeueDuration.String(),
+				"error", req.curError)
+			if delayedRequeueDuration < req.delay {
+				delayedRequeueDuration = req.delay
+			}
+
 			delayedRequeue = true
+
 			continue
 		}
 
-		return processRequeue(requeue, subReconciler, cluster, r.Recorder, clusterLog)
+		return processRequeue(req, subReconciler, cluster, r.Recorder, clusterLog)
 	}
 
-	if cluster.Status.Generations.Reconciled < originalGeneration || delayedRequeue {
+	if cluster.Status.Generations.Reconciled < originalGeneration || delayedRequeue || delayedRequeueDuration > 0 {
 		clusterLog.Info("Cluster was not fully reconciled by reconciliation process",
 			"status", cluster.Status.Generations,
 			"CurrentGeneration", cluster.Status.Generations.Reconciled,
-			"OriginalGeneration", originalGeneration, "DelayedRequeue", delayedRequeue)
+			"OriginalGeneration", originalGeneration,
+			"DelayedRequeue", delayedRequeueDuration.String())
 
-		return ctrl.Result{Requeue: true}, nil
+		return ctrl.Result{Requeue: true, RequeueAfter: delayedRequeueDuration}, nil
 	}
 
 	clusterLog.Info("Reconciliation complete", "generation", cluster.Status.Generations.Reconciled)
diff --git a/controllers/maintenance_mode_checker.go b/controllers/maintenance_mode_checker.go
index cddcc949..164081ad 100644
--- a/controllers/maintenance_mode_checker.go
+++ b/controllers/maintenance_mode_checker.go
@@ -53,7 +53,7 @@ func (maintenanceModeChecker) reconcile(_ context.Context, r *FoundationDBCluste
 
 	// If the cluster is not available we skip any further checks.
 	if !status.Client.DatabaseStatus.Available {
-		return &requeue{message: "cluster is not available", delayedRequeue: true}
+		return &requeue{message: "cluster is not available", delayedRequeue: true, delay: 5 * time.Second}
 	}
 
 	// Get all the processes that are currently under maintenance based on the information stored in FDB.
diff --git a/controllers/replace_failed_process_groups.go b/controllers/replace_failed_process_groups.go
index 8a59bebc..087f4f86 100644
--- a/controllers/replace_failed_process_groups.go
+++ b/controllers/replace_failed_process_groups.go
@@ -58,6 +58,13 @@ func (c replaceFailedProcessGroups) reconcile(ctx context.Context, r *Foundation
 		}
 	}
 
+	// If the database is unavailable, skip any further steps, as the operator is not able to make reliable decisions about automatic
+	// replacements.
+	if !status.Client.DatabaseStatus.Available {
+		logger.Info("Skipping replaceFailedProcessGroups reconciler as the database is not available.")
+		return &requeue{message: "cluster is not available", delayedRequeue: true, delay: 5 * time.Second}
+	}
+
 	// Only replace process groups without an address, if the cluster has the desired fault tolerance and is available.
 	hasDesiredFaultTolerance := fdbstatus.HasDesiredFaultToleranceFromStatus(logger, status, cluster)
 	hasReplacement, hasMoreFailedProcesses := replacements.ReplaceFailedProcessGroups(logger, cluster, status, hasDesiredFaultTolerance)
diff --git a/controllers/replace_failed_process_groups_test.go b/controllers/replace_failed_process_groups_test.go
index e8a5f72c..8b237935 100644
--- a/controllers/replace_failed_process_groups_test.go
+++ b/controllers/replace_failed_process_groups_test.go
@@ -442,7 +442,7 @@ var _ = Describe("replace_failed_process_groups", func() {
 			})
 
-			It("should return nil", func() {
-				Expect(result).To(BeNil())
+			It("should return a requeue", func() {
+				Expect(result).NotTo(BeNil())
 			})
 
 			It("should not mark the process group for removal", func() {
@@ -894,7 +894,7 @@ var _ = Describe("replace_failed_process_groups", func() {
 			})
 
-			It("should return nil", func() {
-				Expect(result).To(BeNil())
+			It("should return a requeue", func() {
+				Expect(result).NotTo(BeNil())
 			})
 
 			It("should not mark the process group for removal", func() {
diff --git a/controllers/update_database_configuration.go b/controllers/update_database_configuration.go
index 5032e72e..ea0182b3 100644
--- a/controllers/update_database_configuration.go
+++ b/controllers/update_database_configuration.go
@@ -61,9 +61,9 @@ func (u updateDatabaseConfiguration) reconcile(_ context.Context, r *FoundationD
 	}
 
 	initialConfig := !cluster.Status.Configured
-	if !(initialConfig || status.Client.DatabaseStatus.Available) {
+	if !initialConfig && !status.Client.DatabaseStatus.Available {
 		logger.Info("Skipping database configuration change because database is unavailable")
-		return nil
+		return &requeue{message: "cluster is not available", delayedRequeue: true, delay: 5 * time.Second}
 	}
 
 	desiredConfiguration := cluster.DesiredDatabaseConfiguration()
diff --git a/e2e/fixtures/fdb_cluster_specs.go b/e2e/fixtures/fdb_cluster_specs.go
index f06d6e5d..6be3de28 100644
--- a/e2e/fixtures/fdb_cluster_specs.go
+++ b/e2e/fixtures/fdb_cluster_specs.go
@@ -72,9 +72,9 @@ func (factory *Factory) createFDBClusterSpec(
 			WaitBetweenRemovalsSeconds: pointer.Int(0),
 			Replacements: fdbv1beta2.AutomaticReplacementOptions{
 				Enabled: pointer.Bool(true),
-				// Setting this to 5 minutes is reasonable to prevent the operator recreating Pods when they wait for
+				// Setting this to 10 minutes is reasonable to prevent the operator recreating Pods when they wait for
 				// new ec2 instances.
-				FailureDetectionTimeSeconds: pointer.Int(300),
+				FailureDetectionTimeSeconds: pointer.Int(600),
-				// Setting TaintReplacementTimeSeconds as half of FailureDetectionTimeSeconds to make taint replacement faster
+				// Setting TaintReplacementTimeSeconds lower than FailureDetectionTimeSeconds to make taint replacement faster
 				TaintReplacementTimeSeconds: pointer.Int(150),
 				MaxConcurrentReplacements:   pointer.Int(2),
diff --git a/pkg/fdbstatus/status_checks.go b/pkg/fdbstatus/status_checks.go
index e90b94d5..f292fb32 100644
--- a/pkg/fdbstatus/status_checks.go
+++ b/pkg/fdbstatus/status_checks.go
@@ -60,7 +60,7 @@ type StatusContextKey struct{}
 // processes that are fully excluded and don't serve any roles and processes that are not marked for
 // exclusion.
 type exclusionStatus struct {
-	// inProgress containms all addresses that are excluded in the cluster and the exclude command can be used to verify if it's safe to remove this address.
+	// inProgress contains all addresses that are excluded in the cluster and the exclude command can be used to verify if it's safe to remove this address.
 	inProgress []fdbv1beta2.ProcessAddress
 	// fullyExcluded contains all addresses that are excluded and don't have any roles assigned, this is a sign that the process is "fully" excluded and safe to remove.
 	fullyExcluded []fdbv1beta2.ProcessAddress
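
For context: the `requeue` struct is internal to the operator, and its new `delay` field only appears above through its call sites. The sketch below is a minimal, self-contained illustration of the aggregation the cluster controller now performs; the simplified `requeue` type and the sample sub-reconciler results are assumptions for illustration, not the operator's actual definitions.

```go
package main

import (
	"fmt"
	"time"
)

// requeue mirrors only the fields exercised in the diff above; the real
// struct in the operator carries more (e.g. curError).
type requeue struct {
	message        string
	delayedRequeue bool
	delay          time.Duration
}

func main() {
	// Hypothetical results from a chain of sub-reconcilers: nil means the
	// sub-reconciler finished, delayedRequeue means "run the rest of the
	// chain first, then requeue".
	results := []*requeue{
		nil,
		{message: "cluster is not available", delayedRequeue: true, delay: 5 * time.Second},
		{message: "waiting for exclusions", delayedRequeue: true, delay: 2 * time.Second},
	}

	var delayedRequeueDuration time.Duration
	var delayedRequeue bool

	for _, req := range results {
		if req == nil {
			continue
		}

		if req.delayedRequeue {
			// Keep the largest delay requested by any sub-reconciler, so the
			// controller waits long enough for the slowest condition to clear.
			if delayedRequeueDuration < req.delay {
				delayedRequeueDuration = req.delay
			}

			delayedRequeue = true
			continue
		}

		// A non-delayed requeue aborts the chain immediately (processRequeue
		// in the real controller).
		fmt.Println("immediate requeue:", req.message)
		return
	}

	if delayedRequeue {
		// Equivalent to: return ctrl.Result{Requeue: true, RequeueAfter: delayedRequeueDuration}, nil
		fmt.Println("requeue after", delayedRequeueDuration) // requeue after 5s
	}
}
```

Taking the maximum delay (rather than the first or last one seen) means a sub-reconciler asking for a 5s back-off is not overridden by a later one asking for 2s. And because controller-runtime treats `Requeue: true` with a zero `RequeueAfter` as an ordinary rate-limited requeue, the pre-existing generation-based requeue path keeps its old behavior.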