Skip to content

Commit

Permalink
Verify the processes that are excluded (#1752)
Browse files Browse the repository at this point in the history
* Verify that the processes assumed to be excluded based on the machine-readable status are also excluded according to the exclude command
  • Loading branch information
johscheuer authored Jul 28, 2023
1 parent d28bd2b commit a1e5ba0
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 24 deletions.
61 changes: 51 additions & 10 deletions fdbclient/admin_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -902,7 +902,7 @@ protocol fdb00b071010000`,
}

mockRunner = &mockCommandRunner{
mockedError: fdbv1beta2.TimeoutError{Err: fmt.Errorf("timed out")},
mockedError: nil,
mockedOutput: "",
}
})
Expand All @@ -926,8 +926,39 @@ protocol fdb00b071010000`,
Expect(err).NotTo(HaveOccurred())
})

It("should not issue an exclude command", func() {
Expect(mockRunner.receivedBinary).To(BeEmpty())
It("should issue an exclude command to verify the exclusion", func() {
Expect(mockRunner.receivedBinary).To(HaveSuffix(fdbcliStr))
Expect(mockRunner.receivedArgs).To(ContainElements("exclude 192.168.0.1:4500 192.168.0.2:4500"))
})
})

When("all provided processes are fully excluded in the status but the exclude command returns an error", func() {
BeforeEach(func() {
addressesToCheck = []fdbv1beta2.ProcessAddress{
{
IPAddress: net.ParseIP("192.168.0.1"),
Port: 4500,
},
{
IPAddress: net.ParseIP("192.168.0.2"),
Port: 4500,
},
}

mockRunner = &mockCommandRunner{
mockedError: fdbv1beta2.TimeoutError{Err: fmt.Errorf("timed out")},
mockedOutput: "",
}
})

It("should return an empty list and an error", func() {
Expect(result).To(HaveLen(0))
Expect(err).To(HaveOccurred())
})

It("should issue an exclude command to verify the exclusion", func() {
Expect(mockRunner.receivedBinary).To(HaveSuffix(fdbcliStr))
Expect(mockRunner.receivedArgs).To(ContainElements("exclude 192.168.0.1:4500 192.168.0.2:4500"))
})
})

Expand All @@ -947,6 +978,11 @@ protocol fdb00b071010000`,
Port: 4500,
},
}

mockRunner = &mockCommandRunner{
mockedError: nil,
mockedOutput: "",
}
})

It("should return the one process that is not excluded", func() {
Expand All @@ -957,8 +993,8 @@ protocol fdb00b071010000`,
Expect(err).NotTo(HaveOccurred())
})

It("should not issue an exclude command", func() {
Expect(mockRunner.receivedBinary).To(BeEmpty())
It("should issue an exclude command to verify the exclusion", func() {
Expect(mockRunner.receivedBinary).To(HaveSuffix(fdbcliStr))
})
})

Expand Down Expand Up @@ -988,8 +1024,8 @@ protocol fdb00b071010000`,
Expect(err).NotTo(HaveOccurred())
})

It("should not issue an exclude command", func() {
Expect(mockRunner.receivedBinary).To(BeEmpty())
It("should issue an exclude command to verify the exclusion", func() {
Expect(mockRunner.receivedBinary).To(HaveSuffix(fdbcliStr))
})
})

Expand Down Expand Up @@ -1028,12 +1064,12 @@ protocol fdb00b071010000`,
Expect(err).NotTo(HaveOccurred())
})

It("should not issue an exclude command", func() {
Expect(mockRunner.receivedBinary).To(BeEmpty())
It("should issue an exclude command to verify the exclusion", func() {
Expect(mockRunner.receivedBinary).To(HaveSuffix(fdbcliStr))
})
})

When("one process is missing in the cluster status", func() {
When("one process is missing in the cluster status and the exclude command times out", func() {
BeforeEach(func() {
addressesToCheck = []fdbv1beta2.ProcessAddress{
{
Expand All @@ -1049,6 +1085,11 @@ protocol fdb00b071010000`,
Port: 4500,
},
}

mockRunner = &mockCommandRunner{
mockedError: fdbv1beta2.TimeoutError{Err: fmt.Errorf("timed out")},
mockedOutput: "",
}
})

It("should return an empty list and no error", func() {
Expand Down
24 changes: 12 additions & 12 deletions internal/removals/remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ func GetProcessGroupsToRemove(removalMode fdbv1beta2.PodUpdateMode, removals map
// If the process group has no associated process in the cluster status, the zone will be UnknownZone.
// If the process group has the ResourcesTerminating condition, the zone will be TerminatingZone.
func GetZonedRemovals(status *fdbv1beta2.FoundationDBStatus, processGroupsToRemove []*fdbv1beta2.ProcessGroupStatus) (map[string][]fdbv1beta2.ProcessGroupID, int64, error) {
var lastestRemovalTimestamp int64
var latestRemovalTimestamp int64
// Convert the process list into a map with the process group ID as key.
processInfo := map[fdbv1beta2.ProcessGroupID]fdbv1beta2.FoundationDBStatusProcessInfo{}
for _, p := range status.Cluster.Processes {
Expand All @@ -98,8 +98,8 @@ func GetZonedRemovals(status *fdbv1beta2.FoundationDBStatus, processGroupsToRemo
// that state.
removalTimestamp := pointer.Int64Deref(pg.GetConditionTime(fdbv1beta2.ResourcesTerminating), 0)
if removalTimestamp > 0 {
if removalTimestamp > lastestRemovalTimestamp {
lastestRemovalTimestamp = removalTimestamp
if removalTimestamp > latestRemovalTimestamp {
latestRemovalTimestamp = removalTimestamp
}
zoneMap[TerminatingZone] = append(zoneMap[TerminatingZone], pg.ProcessGroupID)
continue
Expand All @@ -115,12 +115,11 @@ func GetZonedRemovals(status *fdbv1beta2.FoundationDBStatus, processGroupsToRemo
zoneMap[zone] = append(zoneMap[zone], pg.ProcessGroupID)
}

return zoneMap, lastestRemovalTimestamp, nil
return zoneMap, latestRemovalTimestamp, nil
}

// GetRemainingMap returns a map that indicates if a process group is fully excluded in the cluster.
func GetRemainingMap(logger logr.Logger, adminClient fdbadminclient.AdminClient, cluster *fdbv1beta2.FoundationDBCluster, status *fdbv1beta2.FoundationDBStatus) (map[string]bool, error) {
var err error
addresses := make([]fdbv1beta2.ProcessAddress, 0, len(cluster.Status.ProcessGroups))
for _, processGroup := range cluster.Status.ProcessGroups {
if !processGroup.IsMarkedForRemoval() || processGroup.IsExcluded() {
Expand All @@ -141,19 +140,20 @@ func GetRemainingMap(logger logr.Logger, adminClient fdbadminclient.AdminClient,
}
}

var remaining []fdbv1beta2.ProcessAddress
if len(addresses) > 0 {
remaining, err = fdbstatus.CanSafelyRemoveFromStatus(logger, adminClient, addresses, status)
if err != nil {
return map[string]bool{}, err
}
remainingMap := map[string]bool{}
if len(addresses) == 0 {
return remainingMap, nil
}

remaining, err := fdbstatus.CanSafelyRemoveFromStatus(logger, adminClient, addresses, status)
if err != nil {
return nil, err
}

if len(remaining) > 0 {
logger.Info("Exclusions to complete", "remainingServers", remaining)
}

remainingMap := make(map[string]bool, len(remaining))
for _, address := range addresses {
remainingMap[address.String()] = false
}
Expand Down
14 changes: 12 additions & 2 deletions pkg/fdbstatus/status_checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ func getRemainingAndExcludedFromStatus(logger logr.Logger, status *fdbv1beta2.Fo
}

if len(process.Roles) == 0 {
logger.Info("found fully excluded process without any roles", "process", process)
fullyExcludedAddresses[process.Address.MachineAddress()]++
}
}
Expand Down Expand Up @@ -216,12 +217,21 @@ func CanSafelyRemoveFromStatus(logger logr.Logger, client fdbadminclient.AdminCl
}
}

// Verify that all processes that are assumed to be fully excluded based on the machine-readable status are actually
// not serving any roles by running the exclude command again. If those processes are actually fully excluded and are not
// serving any roles, the exclude command should terminate quickly, otherwise we will hit a timeout, and we know that
// not all processes are fully excluded. This is meant to be an additional safeguard if the machine-readable status
// returns the wrong signals.
if len(exclusions.fullyExcluded) > 0 {
// When we hit a timeout error here, we know that at least one of the fullyExcluded processes is still not fully excluded.
return notSafeToDelete, client.ExcludeProcesses(exclusions.fullyExcluded)
}

// All processes that are either not yet marked as excluded or still serving at least one role, cannot be removed safely.
return notSafeToDelete, nil
}

// GetExclusions gets a list of the addresses currently excluded from the
// database, based on the provided status.
// GetExclusions gets a list of the addresses currently excluded from the database, based on the provided status.
func GetExclusions(status *fdbv1beta2.FoundationDBStatus) ([]fdbv1beta2.ProcessAddress, error) {
excludedServers := status.Cluster.DatabaseConfiguration.ExcludedServers
exclusions := make([]fdbv1beta2.ProcessAddress, 0, len(excludedServers))
Expand Down

0 comments on commit a1e5ba0

Please sign in to comment.