Skip to content

Commit 1ebb6f1

Browse files
authored
In the multi-region recover command, handle cases where the cluster entries cannot be resolved. (#2391)
* If none of the entries in the cluster file can be resolved, the multi-region recover command assumes that the cluster should be recovered.
1 parent d51fb89 commit 1ebb6f1

File tree

2 files changed

+30
-0
lines changed

2 files changed

+30
-0
lines changed

e2e/test_operator_plugin/operator_plugin_test.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"context"
2929
"fmt"
3030
"log"
31+
"strings"
3132
"time"
3233

3334
fdbv1beta2 "github.com/FoundationDB/fdb-kubernetes-operator/v2/api/v1beta2"
@@ -249,6 +250,11 @@ var _ = Describe("Operator Plugin", Label("e2e", "pr"), func() {
249250
false,
250251
)
251252
log.Println("stdout:", stdout, "stderr:", stderr)
253+
if strings.Contains(stderr, "Error determining public address") {
254+
Skip(
255+
"plugin was not able to determine public address, this means that all coordinators are probably gone",
256+
)
257+
}
252258
Expect(err).NotTo(HaveOccurred())
253259

254260
// Ensure the cluster is available again.

kubectl-fdb/cmd/recover_multi_region_cluster.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -732,6 +732,30 @@ func checkIfClusterIsUnavailableAndMajorityOfCoordinatorsAreUnreachable(
732732
}
733733

734734
log.Println("Getting the status from:", clientPod.Name)
735+
for retry := 0; retry < 5; retry++ {
736+
err = getStatusAndCheckIfClusterShouldBeRecovered(ctx, kubeClient, config, clientPod)
737+
if err == nil {
738+
break
739+
}
740+
741+
time.Sleep(5 * time.Second)
742+
}
743+
744+
// If DNS is used for the cluster file, we could hit cases where no DNS entry can be resolved; in this case we could
745+
// assume that the cluster is also down. The error from the client side is the following:
746+
// Error: error getting status: Error determining public address.
747+
// ERROR: Unable to bind to network (1512)
748+
if err != nil && strings.Contains(err.Error(), "Error determining public address") {
749+
return err
750+
}
751+
752+
return err
753+
}
754+
755+
func getStatusAndCheckIfClusterShouldBeRecovered(ctx context.Context,
756+
kubeClient client.Client,
757+
config *rest.Config,
758+
clientPod *corev1.Pod) error {
735759
status, err := getStatus(ctx, kubeClient, config, clientPod)
736760
if err != nil {
737761
return err

0 commit comments

Comments
 (0)