Skip to content

Commit a9f82d2

Browse files
authored
[Bugfix] Ensure PDBs Consistency (#1221)
1 parent 880ae23 commit a9f82d2

File tree

3 files changed

+87
-88
lines changed

3 files changed

+87
-88
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
- (Bugfix) Recover from locked ShuttingDown state
77
- (Feature) Add tolerations runtime rotation
88
- (Feature) Promote Version Check Feature
9+
- (Bugfix) Ensure PDBs Consistency
910

1011
## [1.2.22](https://github.com/arangodb/kube-arangodb/tree/1.2.22) (2022-12-13)
1112
- (Bugfix) Do not manage ports in managed ExternalAccess mode

pkg/deployment/resources/inspector/inspector.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,8 @@ func (i *inspectorState) refreshInThreads(ctx context.Context, threads int, load
411411

412412
i.throttles = n.throttles
413413

414+
i.versionInfo = n.versionInfo
415+
414416
i.last = time.Now()
415417
i.initialised = true
416418

pkg/deployment/resources/pdbs.go

Lines changed: 84 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ package resources
2323
import (
2424
"context"
2525
"fmt"
26-
"time"
2726

2827
policyv1 "k8s.io/api/policy/v1"
2928
policyv1beta1 "k8s.io/api/policy/v1beta1"
@@ -35,7 +34,6 @@ import (
3534
"github.com/arangodb/kube-arangodb/pkg/util/globals"
3635
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3736
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/kerrors"
38-
"github.com/arangodb/kube-arangodb/pkg/util/timer"
3937
)
4038

4139
func min(a int, b int) int {
@@ -50,36 +48,45 @@ func (r *Resources) EnsurePDBs(ctx context.Context) error {
5048

5149
// Only in Cluster and Production Mode
5250
spec := r.context.GetSpec()
51+
status := r.context.GetStatus()
5352
if spec.IsProduction() && spec.GetMode().IsCluster() {
5453

5554
// We want to lose at most one agent and dbserver.
5655
// Coordinators are not that critical. To keep the service available two should be enough
5756
minAgents := spec.GetServerGroupSpec(api.ServerGroupAgents).GetCount() - 1
57+
currAgents := status.Members.Agents.MembersReady()
58+
5859
minDBServers := spec.GetServerGroupSpec(api.ServerGroupDBServers).GetCount() - 1
60+
currDBServers := status.Members.DBServers.MembersReady()
61+
5962
minCoordinators := min(spec.GetServerGroupSpec(api.ServerGroupCoordinators).GetCount()-1, 2)
63+
currCoordinators := status.Members.Coordinators.MembersReady()
6064

6165
// Setting those to zero triggers a remove of the PDB
62-
minSyncMaster := 0
63-
minSyncWorker := 0
66+
minSyncMaster, currSyncMaster := 0, 0
67+
minSyncWorker, currSyncWorker := 0, 0
6468
if r.context.IsSyncEnabled() {
6569
minSyncMaster = spec.GetServerGroupSpec(api.ServerGroupSyncMasters).GetCount() - 1
70+
currSyncMaster = status.Members.SyncMasters.MembersReady()
71+
6672
minSyncWorker = spec.GetServerGroupSpec(api.ServerGroupSyncWorkers).GetCount() - 1
73+
currSyncWorker = status.Members.SyncWorkers.MembersReady()
6774
}
6875

6976
// Ensure all PDBs as calculated
70-
if err := r.ensurePDBForGroup(ctx, api.ServerGroupAgents, minAgents); err != nil {
77+
if err := r.ensurePDBForGroup(ctx, api.ServerGroupAgents, minAgents, currAgents); err != nil {
7178
return err
7279
}
73-
if err := r.ensurePDBForGroup(ctx, api.ServerGroupDBServers, minDBServers); err != nil {
80+
if err := r.ensurePDBForGroup(ctx, api.ServerGroupDBServers, minDBServers, currDBServers); err != nil {
7481
return err
7582
}
76-
if err := r.ensurePDBForGroup(ctx, api.ServerGroupCoordinators, minCoordinators); err != nil {
83+
if err := r.ensurePDBForGroup(ctx, api.ServerGroupCoordinators, minCoordinators, currCoordinators); err != nil {
7784
return err
7885
}
79-
if err := r.ensurePDBForGroup(ctx, api.ServerGroupSyncMasters, minSyncMaster); err != nil {
86+
if err := r.ensurePDBForGroup(ctx, api.ServerGroupSyncMasters, minSyncMaster, currSyncMaster); err != nil {
8087
return err
8188
}
82-
if err := r.ensurePDBForGroup(ctx, api.ServerGroupSyncWorkers, minSyncWorker); err != nil {
89+
if err := r.ensurePDBForGroup(ctx, api.ServerGroupSyncWorkers, minSyncWorker, currSyncWorker); err != nil {
8390
return err
8491
}
8592
}
@@ -122,7 +129,7 @@ func newPDBV1(minAvail int, deplname string, group api.ServerGroup, owner meta.O
122129
}
123130

124131
// ensurePDBForGroup ensures the PDB for a specific server group. If wantedMinAvail is zero or less, the PDB is removed and not recreated.
125-
func (r *Resources) ensurePDBForGroup(ctx context.Context, group api.ServerGroup, wantedMinAvail int) error {
132+
func (r *Resources) ensurePDBForGroup(ctx context.Context, group api.ServerGroup, wantedMinAvail, current int) error {
126133
if wantedMinAvail < 0 {
127134
// Enforce removal
128135
wantedMinAvail = 0
@@ -134,101 +141,90 @@ func (r *Resources) ensurePDBForGroup(ctx context.Context, group api.ServerGroup
134141
cache := r.context.ACS().CurrentClusterCache()
135142
pdbMod := cache.PodDisruptionBudgetsModInterface()
136143

137-
for {
138-
var minAvailable *intstr.IntOrString
139-
var deletionTimestamp *meta.Time
144+
var minAvailable *intstr.IntOrString
145+
var deletionTimestamp *meta.Time
140146

141-
err := globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error {
142-
if inspector, err := cache.PodDisruptionBudget().V1(); err == nil {
143-
if pdb, err := inspector.Read().Get(ctxChild, pdbName, meta.GetOptions{}); err != nil {
144-
return err
145-
} else {
146-
minAvailable = pdb.Spec.MinAvailable
147-
deletionTimestamp = pdb.GetDeletionTimestamp()
148-
}
149-
} else if inspector, err := cache.PodDisruptionBudget().V1Beta1(); err == nil {
150-
if pdb, err := inspector.Read().Get(ctxChild, pdbName, meta.GetOptions{}); err != nil {
151-
return err
152-
} else {
153-
minAvailable = pdb.Spec.MinAvailable
154-
deletionTimestamp = pdb.GetDeletionTimestamp()
155-
}
147+
err := globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error {
148+
if inspector, err := cache.PodDisruptionBudget().V1(); err == nil {
149+
if pdb, err := inspector.Read().Get(ctxChild, pdbName, meta.GetOptions{}); err != nil {
150+
return err
156151
} else {
157-
return errors.WithStack(err)
152+
minAvailable = pdb.Spec.MinAvailable
153+
deletionTimestamp = pdb.GetDeletionTimestamp()
158154
}
159-
160-
return nil
161-
})
162-
163-
if kerrors.IsNotFound(err) {
164-
if wantedMinAvail != 0 {
165-
// No PDB found - create new.
166-
log.Debug("Creating new PDB")
167-
err = globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error {
168-
var errInternal error
169-
170-
if cache.PodDisruptionBudget().Version().IsV1() {
171-
pdb := newPDBV1(wantedMinAvail, deplName, group, r.context.GetAPIObject().AsOwner())
172-
_, errInternal = pdbMod.V1().Create(ctxChild, pdb, meta.CreateOptions{})
173-
} else {
174-
pdb := newPDBV1Beta1(wantedMinAvail, deplName, group, r.context.GetAPIObject().AsOwner())
175-
_, errInternal = pdbMod.V1Beta1().Create(ctxChild, pdb, meta.CreateOptions{})
176-
}
177-
178-
return errInternal
179-
})
180-
181-
if err != nil {
182-
log.Err(err).Error("failed to create PDB")
183-
return errors.WithStack(err)
184-
}
155+
} else if inspector, err := cache.PodDisruptionBudget().V1Beta1(); err == nil {
156+
if pdb, err := inspector.Read().Get(ctxChild, pdbName, meta.GetOptions{}); err != nil {
157+
return err
158+
} else {
159+
minAvailable = pdb.Spec.MinAvailable
160+
deletionTimestamp = pdb.GetDeletionTimestamp()
185161
}
186-
187-
return nil
188-
} else if err != nil {
189-
// Some other error than not found.
162+
} else {
190163
return errors.WithStack(err)
191164
}
192165

193-
// PDB v1 or v1beta1 is here.
194-
if minAvailable.IntValue() == wantedMinAvail && wantedMinAvail != 0 {
195-
return nil
196-
}
197-
// Update for PDBs is forbidden, thus one has to delete it and then create it again
198-
// Otherwise delete it if wantedMinAvail is zero
199-
log.Int("wanted-min-avail", wantedMinAvail).
200-
Int("current-min-avail", minAvailable.IntValue()).
201-
Debug("Recreating PDB")
202-
203-
// Trigger deletion only if not already deleted.
204-
if deletionTimestamp == nil {
205-
// Update the PDB.
206-
err := globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error {
166+
return nil
167+
})
168+
169+
if kerrors.IsNotFound(err) {
170+
if wantedMinAvail != 0 && wantedMinAvail < current {
171+
// No PDB found - create new.
172+
log.Debug("Creating new PDB")
173+
err = globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error {
174+
var errInternal error
175+
207176
if cache.PodDisruptionBudget().Version().IsV1() {
208-
return pdbMod.V1().Delete(ctxChild, pdbName, meta.DeleteOptions{})
177+
pdb := newPDBV1(wantedMinAvail, deplName, group, r.context.GetAPIObject().AsOwner())
178+
_, errInternal = pdbMod.V1().Create(ctxChild, pdb, meta.CreateOptions{})
179+
} else {
180+
pdb := newPDBV1Beta1(wantedMinAvail, deplName, group, r.context.GetAPIObject().AsOwner())
181+
_, errInternal = pdbMod.V1Beta1().Create(ctxChild, pdb, meta.CreateOptions{})
209182
}
210183

211-
return pdbMod.V1Beta1().Delete(ctxChild, pdbName, meta.DeleteOptions{})
184+
return errInternal
212185
})
213-
if err != nil && !kerrors.IsNotFound(err) {
214-
log.Err(err).Error("PDB deletion failed")
186+
187+
if err != nil {
188+
log.Err(err).Error("failed to create PDB")
215189
return errors.WithStack(err)
216190
}
217-
} else {
218-
log.Debug("PDB already deleted")
219-
}
220-
// Exit here if deletion was intended
221-
if wantedMinAvail == 0 {
222-
return nil
223191
}
224192

225-
log.Debug("Retry loop for PDB")
226-
select {
227-
case <-ctx.Done():
228-
return ctx.Err()
229-
case <-timer.After(time.Second):
193+
return nil
194+
} else if err != nil {
195+
// Some other error than not found.
196+
return errors.WithStack(err)
197+
}
198+
199+
// PDB v1 or v1beta1 is here.
200+
if minAvailable.IntValue() == wantedMinAvail && wantedMinAvail != 0 {
201+
return nil
202+
}
203+
// Update for PDBs is forbidden, thus one has to delete it and then create it again
204+
// Otherwise delete it if wantedMinAvail is zero
205+
log.Int("wanted-min-avail", wantedMinAvail).
206+
Int("current-min-avail", minAvailable.IntValue()).
207+
Debug("Recreating PDB")
208+
209+
// Trigger deletion only if not already deleted.
210+
if deletionTimestamp == nil {
211+
// Delete the PDB (updating a PDB in place is forbidden).
212+
err := globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error {
213+
if cache.PodDisruptionBudget().Version().IsV1() {
214+
return pdbMod.V1().Delete(ctxChild, pdbName, meta.DeleteOptions{})
215+
}
216+
217+
return pdbMod.V1Beta1().Delete(ctxChild, pdbName, meta.DeleteOptions{})
218+
})
219+
if err != nil && !kerrors.IsNotFound(err) {
220+
log.Err(err).Error("PDB deletion failed")
221+
return errors.WithStack(err)
230222
}
223+
} else {
224+
log.Debug("PDB already deleted")
231225
}
226+
227+
return nil
232228
}
233229

234230
func newFromInt(v int) *intstr.IntOrString {

0 commit comments

Comments
 (0)