Skip to content

Commit 1f7d272

Browse files
authored
[Feature] Graceful Shutdown (#824)
1 parent 6c98029 commit 1f7d272

32 files changed

+497
-146
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
- Split & Unify Lifecycle management functionality
55
- Drop support for ArangoDB <= 3.5 (versions already EOL)
66
- Add new admin commands to fetch agency dump and agency state
7+
- Add Graceful shutdown as finalizer (supports kubectl delete)
78

89
## [1.2.4](https://github.com/arangodb/kube-arangodb/tree/1.2.4) (2021-10-22)
910
- Replace `beta.kubernetes.io/arch` Pod label with `kubernetes.io/arch` using Silent Rotation

go.mod

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ module github.com/arangodb/kube-arangodb
33
go 1.16
44

55
replace (
6-
github.com/arangodb/go-driver => github.com/arangodb/go-driver v0.0.0-20210621075908-e7a6fa0cbd18
6+
github.com/arangodb/go-driver => github.com/arangodb/go-driver v1.2.1
77
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring => github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.46.0
88
github.com/prometheus-operator/prometheus-operator/pkg/client => github.com/prometheus-operator/prometheus-operator/pkg/client v0.46.0
99
github.com/stretchr/testify => github.com/stretchr/testify v1.5.1
@@ -25,8 +25,8 @@ replace (
2525
require (
2626
github.com/arangodb-helper/go-certificates v0.0.0-20180821055445-9fca24fc2680
2727
github.com/arangodb/arangosync-client v0.7.0
28-
github.com/arangodb/go-driver v0.0.0-20210621075908-e7a6fa0cbd18
29-
github.com/arangodb/go-driver/v2 v2.0.0-20211001173946-eafa9b638e13
28+
github.com/arangodb/go-driver v1.2.1
29+
github.com/arangodb/go-driver/v2 v2.0.0-20211021031401-d92dcd5a4c83
3030
github.com/arangodb/go-upgrade-rules v0.0.0-20180809110947-031b4774ff21
3131
github.com/cenkalti/backoff v2.2.1+incompatible
3232
github.com/dchest/uniuri v0.0.0-20160212164326-8902c56451e9

go.sum

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,10 @@ github.com/arangodb/arangosync-client v0.7.0 h1:3vLOVnMyr5vGlPA0OHxJL9Wyy49JJwN0
4444
github.com/arangodb/arangosync-client v0.7.0/go.mod h1:g+JcxH3C63wKaJPnPr9nggYoGbt/bYCWpfcRG0NSodY=
4545
github.com/arangodb/go-driver v0.0.0-20210621075908-e7a6fa0cbd18 h1:3J0tqp5eQ8ptGOeeu7vo92RKf24bOA7MFy0z3uPiTWg=
4646
github.com/arangodb/go-driver v0.0.0-20210621075908-e7a6fa0cbd18/go.mod h1:3NUekcRLpgheFIGEwcOvxilEW73MV1queNKW58k7sdc=
47-
github.com/arangodb/go-driver/v2 v2.0.0-20211001173946-eafa9b638e13 h1:5egTRo3Met3xXUVj/Pbn1gXeY2C4bQZycJoHSnndfig=
48-
github.com/arangodb/go-driver/v2 v2.0.0-20211001173946-eafa9b638e13/go.mod h1:X3uG4XbfQS35AjsFJLwNLyA6UZofNV5ufe2KoNxcMO0=
47+
github.com/arangodb/go-driver v1.2.1 h1:HREDHhDmzdIWxHmfkfTESbYUnRjESjPh4WUuXq7FZa8=
48+
github.com/arangodb/go-driver v1.2.1/go.mod h1:zdDkJJnCj8DAkfbtIjIXnsTrWIiy6VhP3Vy14p+uQeY=
49+
github.com/arangodb/go-driver/v2 v2.0.0-20211021031401-d92dcd5a4c83 h1:PCbi3alUFastUw6InBKGEXqniveJJcQuMYspubJMRS8=
50+
github.com/arangodb/go-driver/v2 v2.0.0-20211021031401-d92dcd5a4c83/go.mod h1:B8byYwvt1mDOQzpjiMuDTP5jOif/Y5dcEJtkdvPB7HY=
4951
github.com/arangodb/go-upgrade-rules v0.0.0-20180809110947-031b4774ff21 h1:+W7D5ttxi/Ygh/39vialtypE23p9KI7P0J2qtoqUV4w=
5052
github.com/arangodb/go-upgrade-rules v0.0.0-20180809110947-031b4774ff21/go.mod h1:RkPIG6JJ2pcJUoymc18NxAJGraZd+iAEVnOTDjZey/w=
5153
github.com/arangodb/go-velocypack v0.0.0-20200318135517-5af53c29c67e h1:Xg+hGrY2LcQBbxd0ZFdbGSyRKTYMZCfBbw/pMJFOk1g=
@@ -184,6 +186,7 @@ github.com/gogo/protobuf v1.3.1/go.mod h1:SlYgWuQ5SjCEi6WLHjHCa1yvBfUnHcTbrrZtXP
184186
github.com/golang-jwt/jwt v3.2.1+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
185187
github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY=
186188
github.com/golang-jwt/jwt v3.2.2+incompatible/go.mod h1:8pz2t5EyA70fFQQSrl6XZXzqecmYZeUEB8OUGHkxJ+I=
189+
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b h1:VKtxabqXZkF25pY9ekfRL6a582T4P37/31XEstQ5p58=
187190
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
188191
github.com/golang/groupcache v0.0.0-20160516000752-02826c3e7903/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
189192
github.com/golang/groupcache v0.0.0-20190129154638-5b532d6fd5ef/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=

lifecycle.go

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,16 @@ package main
2525
import (
2626
"context"
2727
"io"
28+
"net"
2829
"os"
2930
"path/filepath"
31+
"strconv"
3032
"time"
3133

34+
"github.com/arangodb/kube-arangodb/pkg/backup/utils"
35+
36+
"github.com/arangodb/kube-arangodb/pkg/util/retry"
37+
3238
"github.com/arangodb/kube-arangodb/pkg/version"
3339

3440
"github.com/spf13/cobra"
@@ -54,6 +60,10 @@ var (
5460
Run: cmdLifecyclePreStopRunFinalizer,
5561
Hidden: true,
5662
}
63+
cmdLifecyclePreStopPort = &cobra.Command{
64+
Use: "port",
65+
Hidden: true,
66+
}
5767
cmdLifecycleCopy = &cobra.Command{
5868
Use: "copy",
5969
Run: cmdLifecycleCopyRun,
@@ -68,7 +78,15 @@ var (
6878
func init() {
6979
cmdMain.AddCommand(cmdLifecycle)
7080

71-
cmdLifecyclePreStop.AddCommand(cmdLifecyclePreStopFinalizers)
81+
var preStopPort cmdLifecyclePreStopRunPort
82+
83+
cmdLifecyclePreStopPort.RunE = preStopPort.run
84+
85+
f := cmdLifecyclePreStopPort.Flags()
86+
87+
f.DurationVar(&preStopPort.timeout, "timeout", 6*60*time.Minute, "PreStopTimeout")
88+
89+
cmdLifecyclePreStop.AddCommand(cmdLifecyclePreStopFinalizers, cmdLifecyclePreStopPort)
7290

7391
cmdLifecycle.AddCommand(cmdLifecyclePreStop)
7492
cmdLifecycle.AddCommand(cmdLifecycleCopy)
@@ -162,3 +180,63 @@ func cmdLifecycleCopyRun(cmd *cobra.Command, args []string) {
162180

163181
cliLog.Info().Msgf("Executable copied to %s", targetPath)
164182
}
183+
184+
type cmdLifecyclePreStopRunPort struct {
185+
timeout time.Duration
186+
}
187+
188+
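// run polls the local ArangoDB port (k8sutil.ArangoPort) and stops waiting once the port no longer accepts connections or the pod no longer carries the graceful shutdown finalizer.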
189+
func (c *cmdLifecyclePreStopRunPort) run(cmd *cobra.Command, args []string) error {
190+
address := net.JoinHostPort("127.0.0.1", strconv.Itoa(k8sutil.ArangoPort))
191+
192+
// Get environment
193+
namespace := os.Getenv(constants.EnvOperatorPodNamespace)
194+
if len(namespace) == 0 {
195+
cliLog.Fatal().Msgf("%s environment variable missing", constants.EnvOperatorPodNamespace)
196+
}
197+
name := os.Getenv(constants.EnvOperatorPodName)
198+
if len(name) == 0 {
199+
cliLog.Fatal().Msgf("%s environment variable missing", constants.EnvOperatorPodName)
200+
}
201+
202+
// Create kubernetes client
203+
kubecli, err := k8sutil.NewKubeClient()
204+
if err != nil {
205+
cliLog.Fatal().Err(err).Msg("Failed to create Kubernetes client")
206+
}
207+
208+
pods := kubecli.CoreV1().Pods(namespace)
209+
210+
recentErrors := 0
211+
212+
return retry.NewTimeout(func() error {
213+
conn, err := net.DialTimeout("tcp", address, 500*time.Millisecond)
214+
215+
if err != nil {
216+
return retry.Interrput()
217+
}
218+
219+
conn.Close()
220+
221+
p, err := pods.Get(context.Background(), name, metav1.GetOptions{})
222+
if k8sutil.IsNotFound(err) {
223+
cliLog.Warn().Msg("Pod not found")
224+
return nil
225+
} else if err != nil {
226+
recentErrors++
227+
cliLog.Error().Err(err).Msg("Failed to get pod")
228+
if recentErrors > 20 {
229+
cliLog.Fatal().Err(err).Msg("Too many recent errors")
230+
return nil
231+
}
232+
} else {
233+
// We got our pod
234+
finalizers := utils.StringList(p.GetFinalizers())
235+
if !finalizers.Has(constants.FinalizerPodGracefulShutdown) {
236+
return retry.Interrput()
237+
}
238+
}
239+
240+
return nil
241+
}).Timeout(125*time.Millisecond, c.timeout)
242+
}

pkg/apis/deployment/v1/plan.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ const (
7373
ActionTypeShutdownMember ActionType = "ShutdownMember"
7474
// ActionTypeResignLeadership causes a member to resign leadership.
7575
ActionTypeResignLeadership ActionType = "ResignLeadership"
76+
// ActionTypeKillMemberPod causes a pod to receive a delete request. It also waits until the Delay finalizer is removed.
77+
ActionTypeKillMemberPod ActionType = "KillMemberPod"
7678
// ActionTypeRotateMember causes a member to be shut down and have its pod removed.
7779
ActionTypeRotateMember ActionType = "RotateMember"
7880
// ActionTypeRotateStartMember causes a member to be shut down and have its pod removed. It does not wait for the pod to recover.

pkg/apis/deployment/v1/server_group.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,8 @@ func (g ServerGroup) DefaultTerminationGracePeriod() time.Duration {
148148
return time.Minute
149149
case ServerGroupDBServers:
150150
return time.Hour
151+
case ServerGroupCoordinators:
152+
return time.Hour
151153
default:
152154
return time.Second * 30
153155
}

pkg/apis/deployment/v1/server_group_spec.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,3 +682,16 @@ func (s *ServerGroupSpec) GetEntrypoint(defaultEntrypoint string) string {
682682

683683
return *s.Entrypoint
684684
}
685+
686+
// GetShutdownDelay returns the ShutdownDelay configured for the group, or the group's default when none is set, in seconds.
687+
func (s ServerGroupSpec) GetShutdownDelay(group ServerGroup) int {
688+
if s.ShutdownDelay == nil {
689+
switch group {
690+
case ServerGroupCoordinators:
691+
return 3
692+
default:
693+
return 0
694+
}
695+
}
696+
return *s.ShutdownDelay
697+
}

pkg/apis/deployment/v2alpha1/deployment_metrics_spec.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ func (m MetricsMode) New() *MetricsMode {
4141
return &m
4242
}
4343

44+
// deprecated
4445
func (m MetricsMode) GetMetricsEndpoint() string {
4546
switch m {
4647
case MetricsModeInternal:
@@ -51,9 +52,12 @@ func (m MetricsMode) GetMetricsEndpoint() string {
5152
}
5253

5354
const (
55+
// deprecated
5456
// MetricsModeExporter exporter mode for old exporter type
5557
MetricsModeExporter MetricsMode = "exporter"
56-
MetricsModeSidecar MetricsMode = "sidecar"
58+
// deprecated
59+
MetricsModeSidecar MetricsMode = "sidecar"
60+
// deprecated
5761
MetricsModeInternal MetricsMode = "internal"
5862
)
5963

@@ -67,12 +71,14 @@ func (m *MetricsMode) Get() MetricsMode {
6771

6872
// MetricsSpec contains spec for arangodb exporter
6973
type MetricsSpec struct {
70-
Enabled *bool `json:"enabled,omitempty"`
74+
Enabled *bool `json:"enabled,omitempty"`
75+
// deprecated
7176
Image *string `json:"image,omitempty"`
7277
Authentication MetricsAuthenticationSpec `json:"authentication,omitempty"`
7378
Resources v1.ResourceRequirements `json:"resources,omitempty"`
74-
Mode *MetricsMode `json:"mode,omitempty"`
75-
TLS *bool `json:"tls,omitempty"`
79+
// deprecated
80+
Mode *MetricsMode `json:"mode,omitempty"`
81+
TLS *bool `json:"tls,omitempty"`
7682

7783
ServiceMonitor *MetricsServiceMonitorSpec `json:"serviceMonitor,omitempty"`
7884

@@ -100,11 +106,13 @@ func (s *MetricsSpec) IsEnabled() bool {
100106
return util.BoolOrDefault(s.Enabled, false)
101107
}
102108

109+
// deprecated
103110
// HasImage returns whether an image was specified or not
104111
func (s *MetricsSpec) HasImage() bool {
105112
return s.Image != nil
106113
}
107114

115+
// deprecated
108116
// GetImage returns the Image or empty string
109117
func (s *MetricsSpec) GetImage() string {
110118
return util.StringOrDefault(s.Image)

pkg/apis/deployment/v2alpha1/plan.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ const (
7373
ActionTypeShutdownMember ActionType = "ShutdownMember"
7474
// ActionTypeResignLeadership causes a member to resign leadership.
7575
ActionTypeResignLeadership ActionType = "ResignLeadership"
76+
// ActionTypeKillMemberPod causes a pod to receive a delete request. It also waits until the Delay finalizer is removed.
77+
ActionTypeKillMemberPod ActionType = "KillMemberPod"
7678
// ActionTypeRotateMember causes a member to be shut down and have its pod removed.
7779
ActionTypeRotateMember ActionType = "RotateMember"
7880
// ActionTypeRotateStartMember causes a member to be shut down and have its pod removed. It does not wait for the pod to recover.

pkg/apis/deployment/v2alpha1/server_group.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,8 @@ func (g ServerGroup) DefaultTerminationGracePeriod() time.Duration {
148148
return time.Minute
149149
case ServerGroupDBServers:
150150
return time.Hour
151+
case ServerGroupCoordinators:
152+
return time.Hour
151153
default:
152154
return time.Second * 30
153155
}

0 commit comments

Comments
 (0)