Skip to content

Commit

Permalink
Make sure we skip the automatic replacements if the cluster is unavai…
Browse files Browse the repository at this point in the history
…lable (#2054)

* Make sure we skip the automatic replacements if the cluster is unavailable
  • Loading branch information
johscheuer authored Jun 11, 2024
1 parent fc2f8d2 commit 58054cc
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 18 deletions.
28 changes: 18 additions & 10 deletions controllers/cluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,36 +196,44 @@ func (r *FoundationDBClusterReconciler) Reconcile(ctx context.Context, request c

originalGeneration := cluster.ObjectMeta.Generation
normalizedSpec := cluster.Spec.DeepCopy()
delayedRequeue := false
var delayedRequeueDuration time.Duration
var delayedRequeue bool

for _, subReconciler := range subReconcilers {
// We have to set the normalized spec here again otherwise any call to Update() for the status of the cluster
// will reset all normalized fields...
cluster.Spec = *(normalizedSpec.DeepCopy())

requeue := runClusterSubReconciler(ctx, clusterLog, subReconciler, r, cluster, status)
if requeue == nil {
req := runClusterSubReconciler(ctx, clusterLog, subReconciler, r, cluster, status)
if req == nil {
continue
}

if requeue.delayedRequeue {
if req.delayedRequeue {
clusterLog.Info("Delaying requeue for sub-reconciler",
"reconciler", fmt.Sprintf("%T", subReconciler),
"message", requeue.message,
"error", requeue.curError)
"message", req.message,
"delayedRequeueDuration", delayedRequeueDuration.String(),
"error", req.curError)
if delayedRequeueDuration < req.delay {
delayedRequeueDuration = req.delay
}

delayedRequeue = true

continue
}

return processRequeue(requeue, subReconciler, cluster, r.Recorder, clusterLog)
return processRequeue(req, subReconciler, cluster, r.Recorder, clusterLog)
}

if cluster.Status.Generations.Reconciled < originalGeneration || delayedRequeue {
if cluster.Status.Generations.Reconciled < originalGeneration || delayedRequeue || delayedRequeueDuration > 0 {
clusterLog.Info("Cluster was not fully reconciled by reconciliation process", "status", cluster.Status.Generations,
"CurrentGeneration", cluster.Status.Generations.Reconciled,
"OriginalGeneration", originalGeneration, "DelayedRequeue", delayedRequeue)
"OriginalGeneration", originalGeneration,
"DelayedRequeue", delayedRequeueDuration.String())

return ctrl.Result{Requeue: true}, nil
return ctrl.Result{Requeue: true, RequeueAfter: delayedRequeueDuration}, nil
}

clusterLog.Info("Reconciliation complete", "generation", cluster.Status.Generations.Reconciled)
Expand Down
2 changes: 1 addition & 1 deletion controllers/maintenance_mode_checker.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ func (maintenanceModeChecker) reconcile(_ context.Context, r *FoundationDBCluste

// If the cluster is not available we skip any further checks.
if !status.Client.DatabaseStatus.Available {
return &requeue{message: "cluster is not available", delayedRequeue: true}
return &requeue{message: "cluster is not available", delayedRequeue: true, delay: 5 * time.Second}
}

// Get all the processes that are currently under maintenance based on the information stored in FDB.
Expand Down
7 changes: 7 additions & 0 deletions controllers/replace_failed_process_groups.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ func (c replaceFailedProcessGroups) reconcile(ctx context.Context, r *Foundation
}
}

// If the database is unavailable skip further steps as the operator is not able to make any good decision about the automatic
// replacements.
if !status.Client.DatabaseStatus.Available {
logger.Info("Skipping replaceFailedProcessGroups reconciler as the database is not available.")
return &requeue{message: "cluster is not available", delayedRequeue: true, delay: 5 * time.Second}
}

// Only replace process groups without an address, if the cluster has the desired fault tolerance and is available.
hasDesiredFaultTolerance := fdbstatus.HasDesiredFaultToleranceFromStatus(logger, status, cluster)
hasReplacement, hasMoreFailedProcesses := replacements.ReplaceFailedProcessGroups(logger, cluster, status, hasDesiredFaultTolerance)
Expand Down
4 changes: 2 additions & 2 deletions controllers/replace_failed_process_groups_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ var _ = Describe("replace_failed_process_groups", func() {
})

It("should return nil", func() {
Expect(result).To(BeNil())
Expect(result).NotTo(BeNil())
})

It("should not mark the process group for removal", func() {
Expand Down Expand Up @@ -894,7 +894,7 @@ var _ = Describe("replace_failed_process_groups", func() {
})

It("should return nil", func() {
Expect(result).To(BeNil())
Expect(result).NotTo(BeNil())
})

It("should not mark the process group for removal", func() {
Expand Down
4 changes: 2 additions & 2 deletions controllers/update_database_configuration.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,9 @@ func (u updateDatabaseConfiguration) reconcile(_ context.Context, r *FoundationD
}

initialConfig := !cluster.Status.Configured
if !(initialConfig || status.Client.DatabaseStatus.Available) {
if !initialConfig && !status.Client.DatabaseStatus.Available {
logger.Info("Skipping database configuration change because database is unavailable")
return nil
return &requeue{message: "cluster is not available", delayedRequeue: true, delay: 5 * time.Second}
}

desiredConfiguration := cluster.DesiredDatabaseConfiguration()
Expand Down
4 changes: 2 additions & 2 deletions e2e/fixtures/fdb_cluster_specs.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ func (factory *Factory) createFDBClusterSpec(
WaitBetweenRemovalsSeconds: pointer.Int(0),
Replacements: fdbv1beta2.AutomaticReplacementOptions{
Enabled: pointer.Bool(true),
// Setting this to 5 minutes is reasonable to prevent the operator recreating Pods when they wait for
// Setting this to 10 minutes is reasonable to prevent the operator recreating Pods when they wait for
// new ec2 instances.
FailureDetectionTimeSeconds: pointer.Int(300),
FailureDetectionTimeSeconds: pointer.Int(600),
// Setting TaintReplacementTimeSeconds as half of FailureDetectionTimeSeconds to make taint replacement faster
TaintReplacementTimeSeconds: pointer.Int(150),
MaxConcurrentReplacements: pointer.Int(2),
Expand Down
2 changes: 1 addition & 1 deletion pkg/fdbstatus/status_checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ type StatusContextKey struct{}
// processes that are fully excluded and don't serve any roles and processes that are not marked for
// exclusion.
type exclusionStatus struct {
// inProgress containms all addresses that are excluded in the cluster and the exclude command can be used to verify if it's safe to remove this address.
// inProgress contains all addresses that are excluded in the cluster and the exclude command can be used to verify if it's safe to remove this address.
inProgress []fdbv1beta2.ProcessAddress
// fullyExcluded contains all addresses that are excluded and don't have any roles assigned, this is a sign that the process is "fully" excluded and safe to remove.
fullyExcluded []fdbv1beta2.ProcessAddress
Expand Down

0 comments on commit 58054cc

Please sign in to comment.