Skip to content

Commit 68a5c9f

Browse files
hors, gkech, egegunes, github-actions[bot]
authored
K8SPS-548 resolve gr-self-healing test failures (#1142)
* K8SPS-548 resolve gr-self-healing test failures Fix two critical issues causing gr-self-healing test failures: 1. False split-brain detection due to incorrect quorum calculation - CheckIfInPrimaryPartition now only counts ONLINE members for quorum - Prevents false "possible split brain!" errors from stale UNREACHABLE entries 2. Cluster creation failures from stale group_replication_group_seeds - Clear persisted seeds early in bootstrap process - Prevents "Peer address is not valid" errors after pod deletions * fix split-brain detection in liveness probe * improve test * Update e2e-tests/functions Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update e2e-tests/functions Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update e2e-tests/functions Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> * Update e2e-tests/functions Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --------- Co-authored-by: George Kechagias <george.kechagias@percona.com> Co-authored-by: Ege Güneş <ege.gunes@percona.com> Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent 020b888 commit 68a5c9f

File tree

2 files changed

+21
-4
lines changed

2 files changed

+21
-4
lines changed

cmd/bootstrap/gr/group_replication.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -408,6 +408,13 @@ func Bootstrap(ctx context.Context) error {
408408
return errors.Wrap(err, "connect to local")
409409
}
410410

411+
// Clear group_replication_group_seeds early to prevent any stale entries
412+
// from causing "Peer address is not valid" errors during cluster operations
413+
log.Printf("Clearing any stale group_replication_group_seeds")
414+
if err := localShell.setGroupSeeds(ctx, ""); err != nil {
415+
log.Printf("WARNING: failed to clear group_replication_group_seeds: %v", err)
416+
}
417+
411418
err = localShell.configureInstance(ctx)
412419
if err != nil {
413420
return err
@@ -463,6 +470,7 @@ func Bootstrap(ctx context.Context) error {
463470
}
464471
} else {
465472
log.Printf("Can't connect to any of the peers, we need to reboot")
473+
466474
if err := handleFullClusterCrash(ctx, mysqlshVer); err != nil {
467475
return errors.Wrap(err, "handle full cluster crash")
468476
}

e2e-tests/functions

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1302,6 +1302,8 @@ check_primary_chaos() {
13021302
local chaos_type=$1
13031303
local ns=$2
13041304
local primary_before_failure=$3
1305+
local primary_restart_count_before=$(kubectl get pod -n ${NAMESPACE} ${primary_before_failure} -o yaml \
1306+
| yq '.status.containerStatuses[] | select(.name=="mysql") | .restartCount')
13051307

13061308
local chaos_name
13071309
case ${chaos_type} in
@@ -1330,12 +1332,19 @@ check_primary_chaos() {
13301332

13311333
wait_cluster_consistency_gr "$(get_cluster_name)" 3 3
13321334

1333-
primary_after_failure=$(get_primary_from_group_replication)
1334-
uri=$(get_mysqlsh_uri_for_pod ${primary_after_failure})
1335+
local primary_restart_count_after=$(kubectl get pod -n ${NAMESPACE} ${primary_before_failure} -o yaml \
1336+
| yq '.status.containerStatuses[] | select(.name=="mysql") | .restartCount')
1337+
local primary_after_failure=$(get_primary_from_group_replication)
1338+
local uri=$(get_mysqlsh_uri_for_pod ${primary_after_failure})
13351339
wait_until_innodb_ok ${uri}
13361340

1337-
if [[ ${primary_before_failure} == "${primary_after_failure}" ]]; then
1338-
echo "primary pod was not killed! something went wrong."
1341+
if [[ ${chaos_type} != "full-cluster-crash" && ${primary_before_failure} == "${primary_after_failure}" ]]; then
1342+
echo "primary pod was not changed! something went wrong re-election."
1343+
exit 1
1344+
fi
1345+
1346+
if [[ ${chaos_type} == "network" && ${primary_restart_count_before} == ${primary_restart_count_after} ]]; then
1347+
echo "old primary pod was not restarted! something went wrong with liveness probe."
13391348
exit 1
13401349
fi
13411350

0 commit comments

Comments
 (0)