Skip to content

Commit c0afad6

Browse files
authored
GT-449 Block traffic on the services if there is more than 1 active leader in ActiveFailover mode (#1337)
1 parent 1f80b97 commit c0afad6

File tree

4 files changed

+25
-19
lines changed

4 files changed

+25
-19
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Change Log
22

33
## [master](https://github.com/arangodb/kube-arangodb/tree/master) (N/A)
4+
- (Improvement) Block traffic on the services if there is more than 1 active leader in ActiveFailover mode
45

56
## [1.2.30](https://github.com/arangodb/kube-arangodb/tree/1.2.30) (2023-06-16)
67
- (Feature) AgencyCache Interface

pkg/deployment/resources/logger.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,6 @@ var (
3030
logger = logging.Global().RegisterAndGetLogger("deployment-resources", logging.Info)
3131
)
3232

33-
func (d *Resources) WrapLogger(in *zerolog.Event) *zerolog.Event {
34-
return in.Str("namespace", d.namespace).Str("name", d.name)
33+
func (r *Resources) WrapLogger(in *zerolog.Event) *zerolog.Event {
34+
return in.Str("namespace", r.namespace).Str("name", r.name)
3535
}

pkg/deployment/resources/metrics.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -168,29 +168,29 @@ type MetricMemberRestarts map[int32]MetricMemberRestartReason
168168

169169
type MetricMemberRestartReason map[string]uint64
170170

171-
func (d *Resources) CollectMetrics(m metrics.PushMetric) {
172-
for member, info := range d.metrics.Members {
171+
func (r *Resources) CollectMetrics(m metrics.PushMetric) {
172+
for member, info := range r.metrics.Members {
173173
// Containers
174174
for container, restarts := range info.ContainerRestarts {
175175
for code, reasons := range restarts {
176176
for reason, count := range reasons {
177-
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "container", fmt.Sprintf("%d", code), reason))
177+
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), r.namespace, r.name, member, container, "container", fmt.Sprintf("%d", code), reason))
178178
}
179179
}
180180
}
181181
// InitContainers
182182
for container, restarts := range info.InitContainerRestarts {
183183
for code, reasons := range restarts {
184184
for reason, count := range reasons {
185-
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "initContainer", fmt.Sprintf("%d", code), reason))
185+
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), r.namespace, r.name, member, container, "initContainer", fmt.Sprintf("%d", code), reason))
186186
}
187187
}
188188
}
189189
// EphemeralContainers
190190
for container, restarts := range info.EphemeralContainerRestarts {
191191
for code, reasons := range restarts {
192192
for reason, count := range reasons {
193-
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), d.namespace, d.name, member, container, "ephemeralContainer", fmt.Sprintf("%d", code), reason))
193+
m.Push(metric_descriptions.ArangodbOperatorMembersUnexpectedContainerExitCodesCounter(float64(count), r.namespace, r.name, member, container, "ephemeralContainer", fmt.Sprintf("%d", code), reason))
194194
}
195195
}
196196
}

pkg/deployment/resources/pod_leader.go

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ package resources
2222

2323
import (
2424
"context"
25+
"strings"
2526
"sync"
2627

2728
core "k8s.io/api/core/v1"
@@ -150,11 +151,11 @@ func (r *Resources) EnsureLeader(ctx context.Context, cachedStatus inspectorInte
150151
return errors.Reconcile()
151152
}
152153

153-
// getSingleServerLeaderID returns id of a single server leader.
154-
func (r *Resources) getSingleServerLeaderID(ctx context.Context) (string, error) {
154+
// getSingleServerLeaderID returns the IDs of all single server leaders.
155+
func (r *Resources) getSingleServerLeaderID(ctx context.Context) ([]string, error) {
155156
status := r.context.GetStatus()
156157
var mutex sync.Mutex
157-
var leaderID string
158+
var leaderIDs []string
158159
var anyError error
159160

160161
ctxCancel, cancel := context.WithCancel(ctx)
@@ -177,10 +178,8 @@ func (r *Resources) getSingleServerLeaderID(ctx context.Context) (string, error)
177178
return errors.New("not available")
178179
}
179180

180-
// Other requests can be interrupted, because a leader is known already.
181-
cancel()
182181
mutex.Lock()
183-
leaderID = id
182+
leaderIDs = append(leaderIDs, id)
184183
mutex.Unlock()
185184
return nil
186185
})
@@ -194,15 +193,15 @@ func (r *Resources) getSingleServerLeaderID(ctx context.Context) (string, error)
194193
}
195194
wg.Wait()
196195

197-
if len(leaderID) > 0 {
198-
return leaderID, nil
196+
if len(leaderIDs) > 0 {
197+
return leaderIDs, nil
199198
}
200199

201200
if anyError != nil {
202-
return "", errors.WithMessagef(anyError, "unable to get a leader")
201+
return nil, errors.WithMessagef(anyError, "unable to get a leader")
203202
}
204203

205-
return "", errors.New("unable to get a leader")
204+
return nil, errors.New("unable to get a leader")
206205
}
207206

208207
// ensureSingleServerLeader adds or removes the leadership label on a single server pod.
@@ -212,10 +211,16 @@ func (r *Resources) ensureSingleServerLeader(ctx context.Context, cachedStatus i
212211
enabled := features.FailoverLeadership().Enabled()
213212
var leaderID string
214213
if enabled {
215-
var err error
216-
if leaderID, err = r.getSingleServerLeaderID(ctx); err != nil {
214+
leaderIDs, err := r.getSingleServerLeaderID(ctx)
215+
if err != nil {
217216
return err
218217
}
218+
219+
if len(leaderIDs) == 1 {
220+
leaderID = leaderIDs[0]
221+
} else if len(leaderIDs) > 1 {
222+
r.log.Error("multiple leaders found: %s. Blocking traffic to the deployment services", strings.Join(leaderIDs, ", "))
223+
}
219224
}
220225

221226
status := r.context.GetStatus()

0 commit comments

Comments
 (0)