Skip to content

Commit 4936851

Browse files
authored
[Bugfix] Add Panic Handler (#1050)
1 parent 8ca0835 commit 4936851

File tree

15 files changed

+704
-178
lines changed

15 files changed

+704
-178
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
- (Bugfix) Add DistributeShardsLike support
3131
- (Feature) Member restarts metric
3232
- (Bugfix) Infinite loop fix in ArangoD AsyncClient
33+
- (Bugfix) Add Panic Handler
3334

3435
## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07)
3536
- (Bugfix) Fix arangosync members state inspection

docs/generated/metrics/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
| [arangodb_operator_agency_cache_member_serving](./arangodb_operator_agency_cache_member_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency member is reachable |
1515
| [arangodb_operator_agency_cache_present](./arangodb_operator_agency_cache_present.md) | arangodb_operator | agency_cache | Gauge | Determines if local agency cache is present |
1616
| [arangodb_operator_agency_cache_serving](./arangodb_operator_agency_cache_serving.md) | arangodb_operator | agency_cache | Gauge | Determines if agency is serving |
17+
| [arangodb_operator_engine_panics_recovered](./arangodb_operator_engine_panics_recovered.md) | arangodb_operator | engine | Counter | Number of Panics recovered inside Operator reconciliation loop |
1718
| [arangodb_operator_members_unexpected_container_exit_codes](./arangodb_operator_members_unexpected_container_exit_codes.md) | arangodb_operator | members | Counter | Counter of unexpected restarts in pod (Containers/InitContainers/EphemeralContainers) |
1819
| [arangodb_operator_rebalancer_enabled](./arangodb_operator_rebalancer_enabled.md) | arangodb_operator | rebalancer | Gauge | Determines if rebalancer is enabled |
1920
| [arangodb_operator_rebalancer_moves_current](./arangodb_operator_rebalancer_moves_current.md) | arangodb_operator | rebalancer | Gauge | Define how many moves are currently in progress |
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# arangodb_operator_engine_panics_recovered (Counter)
2+
3+
## Description
4+
5+
Number of panics recovered inside the Operator reconciliation loop. The `section` label identifies the recovery section in which the panic was caught.
6+
7+
## Labels
8+
9+
| Label | Description |
10+
|:-------:|:--------------|
11+
| section | Panic Section |

internal/metrics.yaml

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,4 +166,13 @@ namespaces:
166166
- key: container_type
167167
description: "Container/InitContainer/EphemeralContainer"
168168
- key: code
169-
description: "ExitCode"
169+
description: "ExitCode"
170+
engine:
171+
panics_recovered:
172+
shortDescription: "Number of Panics recovered inside Operator reconciliation loop"
173+
description: "Number of Panics recovered inside Operator reconciliation loop. Section represents recovery section"
174+
type: "Counter"
175+
labels:
176+
- key: section
177+
description: "Panic Section"
178+

pkg/deployment/reconcile/plan_executor.go

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ import (
3030
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
3131
"github.com/arangodb/kube-arangodb/pkg/metrics"
3232
"github.com/arangodb/kube-arangodb/pkg/util/errors"
33+
"github.com/arangodb/kube-arangodb/pkg/util/errors/panics"
3334
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
3435
)
3536

@@ -305,7 +306,7 @@ func (d *Reconciler) executeAction(ctx context.Context, planAction api.Action, a
305306

306307
if !planAction.IsStarted() {
307308
// Not started yet
308-
ready, err := action.Start(ctx)
309+
ready, err := d.executeActionStart(ctx, action)
309310
if err != nil {
310311
if g := getStartFailureGracePeriod(action); g > 0 && !planAction.CreationTime.IsZero() {
311312
if time.Since(planAction.CreationTime.Time) < g {
@@ -333,7 +334,7 @@ func (d *Reconciler) executeAction(ctx context.Context, planAction api.Action, a
333334
}
334335

335336
// First action of plan has been started, check its progress
336-
ready, abort, err := action.CheckProgress(ctx)
337+
ready, abort, err := d.executeActionCheckProgress(ctx, action)
337338
if err != nil {
338339
log.Err(err).Debug("Failed to check action progress")
339340
return false, false, false, false, errors.WithStack(err)
@@ -362,6 +363,24 @@ func (d *Reconciler) executeAction(ctx context.Context, planAction api.Action, a
362363
return false, false, true, false, nil
363364
}
364365

366+
func (d *Reconciler) executeActionCheckProgress(ctx context.Context, action Action) (ready bool, abort bool, retErr error) {
367+
retErr = panics.RecoverWithSection("ActionProgress", func() (err error) {
368+
ready, abort, err = action.CheckProgress(ctx)
369+
return
370+
})
371+
372+
return
373+
}
374+
375+
func (d *Reconciler) executeActionStart(ctx context.Context, action Action) (done bool, retErr error) {
376+
retErr = panics.RecoverWithSection("ActionStart", func() (err error) {
377+
done, err = action.Start(ctx)
378+
return
379+
})
380+
381+
return
382+
}
383+
365384
// createAction creates an action object based on the action type
366385
func (d *Reconciler) createAction(action api.Action) (Action, ActionContext) {
367386
actionCtx := newActionContext(d.log, d.context, &d.metrics)

pkg/generated/metric_descriptions/arangodb_operator_engine_panics_recovered.go

Lines changed: 39 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/logging/logger_test.go

Lines changed: 0 additions & 175 deletions
This file was deleted.

0 commit comments

Comments
 (0)