Skip to content

Commit 0165f98

Browse files
authored
Merge pull request #236 from arangodb/bugfix/safeguard-current-image
Use new CurrentImage field to prevent unintended upgrades.
2 parents d4c6a11 + e4736f1 commit 0165f98

File tree

10 files changed

+237
-41
lines changed

10 files changed

+237
-41
lines changed

pkg/apis/deployment/v1alpha/deployment_status.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ type DeploymentStatus struct {
3838

3939
// Images holds a list of ArangoDB images with their ID and ArangoDB version.
4040
Images ImageInfoList `json:"arangodb-images,omitempty"`
41+
// Image that is currently being used when new pods are created
42+
CurrentImage *ImageInfo `json:"current-image,omitempty"`
4143

4244
// Members holds the status for all members in all server groups
4345
Members DeploymentStatusMembers `json:"members"`

pkg/apis/deployment/v1alpha/plan.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ const (
4949
ActionTypeRenewTLSCertificate ActionType = "RenewTLSCertificate"
5050
// ActionTypeRenewTLSCACertificate causes the TLS CA certificate of the entire deployment to be renewed.
5151
ActionTypeRenewTLSCACertificate ActionType = "RenewTLSCACertificate"
52+
// ActionTypeSetCurrentImage causes status.CurrentImage to be updated to the image given in the action.
53+
ActionTypeSetCurrentImage ActionType = "SetCurrentImage"
5254
)
5355

5456
const (
@@ -73,6 +75,8 @@ type Action struct {
7375
StartTime *metav1.Time `json:"startTime,omitempty"`
7476
// Reason for this action
7577
Reason string `json:"reason,omitempty"`
78+
// Image used in can of a SetCurrentImage action.
79+
Image string `json:"image,omitempty"`
7680
}
7781

7882
// NewAction instantiates a new Action.
@@ -90,6 +94,13 @@ func NewAction(actionType ActionType, group ServerGroup, memberID string, reason
9094
return a
9195
}
9296

97+
// SetImage sets the Image field to the given value and returns the modified
98+
// action.
99+
func (a Action) SetImage(image string) Action {
100+
a.Image = image
101+
return a
102+
}
103+
93104
// Plan is a list of actions that will be taken to update a deployment.
94105
// Only 1 action is in progress at a time. The operator will wait for that
95106
// action to be completely and then remove the action.

pkg/apis/deployment/v1alpha/zz_generated.deepcopy.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,15 @@ func (in *DeploymentStatus) DeepCopyInto(out *DeploymentStatus) {
295295
*out = make(ImageInfoList, len(*in))
296296
copy(*out, *in)
297297
}
298+
if in.CurrentImage != nil {
299+
in, out := &in.CurrentImage, &out.CurrentImage
300+
if *in == nil {
301+
*out = nil
302+
} else {
303+
*out = new(ImageInfo)
304+
**out = **in
305+
}
306+
}
298307
in.Members.DeepCopyInto(&out.Members)
299308
if in.Conditions != nil {
300309
in, out := &in.Conditions, &out.Conditions

pkg/deployment/reconcile/action_context.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,12 @@ type ActionContext interface {
7878
DeleteTLSKeyfile(group api.ServerGroup, member api.MemberStatus) error
7979
// DeleteTLSCASecret removes the Secret containing the TLS CA certificate.
8080
DeleteTLSCASecret() error
81+
// GetImageInfo returns the image info for an image with given name.
82+
// Returns: (info, infoFound)
83+
GetImageInfo(imageName string) (api.ImageInfo, bool)
84+
// SetCurrentImage changes the CurrentImage field in the deployment
85+
// status to the given image.
86+
SetCurrentImage(imageInfo api.ImageInfo) error
8187
}
8288

8389
// newActionContext creates a new ActionContext implementation.
@@ -260,3 +266,21 @@ func (ac *actionContext) DeleteTLSCASecret() error {
260266
}
261267
return nil
262268
}
269+
270+
// GetImageInfo returns the image info for an image with given name.
271+
// Returns: (info, infoFound)
272+
func (ac *actionContext) GetImageInfo(imageName string) (api.ImageInfo, bool) {
273+
status, _ := ac.context.GetStatus()
274+
return status.Images.GetByImage(imageName)
275+
}
276+
277+
// SetCurrentImage changes the CurrentImage field in the deployment
278+
// status to the given image.
279+
func (ac *actionContext) SetCurrentImage(imageInfo api.ImageInfo) error {
280+
status, lastVersion := ac.context.GetStatus()
281+
status.CurrentImage = &imageInfo
282+
if err := ac.context.UpdateStatus(status, lastVersion); err != nil {
283+
return maskAny(err)
284+
}
285+
return nil
286+
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2018 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Ewout Prangsma
21+
//
22+
23+
package reconcile
24+
25+
import (
26+
"context"
27+
"time"
28+
29+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1alpha"
30+
"github.com/rs/zerolog"
31+
)
32+
33+
// NewSetCurrentImageAction creates a new Action that implements the given
34+
// planned SetCurrentImage action.
35+
func NewSetCurrentImageAction(log zerolog.Logger, action api.Action, actionCtx ActionContext) Action {
36+
return &setCurrentImageAction{
37+
log: log,
38+
action: action,
39+
actionCtx: actionCtx,
40+
}
41+
}
42+
43+
// setCurrentImageAction implements an SetCurrentImage.
44+
type setCurrentImageAction struct {
45+
log zerolog.Logger
46+
action api.Action
47+
actionCtx ActionContext
48+
}
49+
50+
// Start performs the start of the action.
51+
// Returns true if the action is completely finished, false in case
52+
// the start time needs to be recorded and a ready condition needs to be checked.
53+
func (a *setCurrentImageAction) Start(ctx context.Context) (bool, error) {
54+
ready, _, err := a.CheckProgress(ctx)
55+
if err != nil {
56+
return false, maskAny(err)
57+
}
58+
return ready, nil
59+
}
60+
61+
// CheckProgress checks the progress of the action.
62+
// Returns true if the action is completely finished, false otherwise.
63+
func (a *setCurrentImageAction) CheckProgress(ctx context.Context) (bool, bool, error) {
64+
log := a.log
65+
66+
imageInfo, found := a.actionCtx.GetImageInfo(a.action.Image)
67+
if !found {
68+
return false, false, nil
69+
}
70+
if err := a.actionCtx.SetCurrentImage(imageInfo); err != nil {
71+
return false, false, maskAny(err)
72+
}
73+
log.Info().Str("image", a.action.Image).Msg("Changed current image")
74+
return true, false, nil
75+
}
76+
77+
// Timeout returns the amount of time after which this action will timeout.
78+
func (a *setCurrentImageAction) Timeout() time.Duration {
79+
return upgradeMemberTimeout
80+
}
81+
82+
// Return the MemberID used / created in this action
83+
func (a *setCurrentImageAction) MemberID() string {
84+
return ""
85+
}

pkg/deployment/reconcile/plan_builder.go

Lines changed: 65 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ package reconcile
2525
import (
2626
"strings"
2727

28+
driver "github.com/arangodb/go-driver"
2829
upgraderules "github.com/arangodb/go-upgrade-rules"
2930
"github.com/rs/zerolog"
3031
"github.com/rs/zerolog/log"
@@ -37,6 +38,8 @@ import (
3738

3839
// upgradeDecision is the result of an upgrade check.
3940
type upgradeDecision struct {
41+
FromVersion driver.Version
42+
ToVersion driver.Version
4043
UpgradeNeeded bool // If set, the image version has changed
4144
UpgradeAllowed bool // If set, it is an allowed version change
4245
AutoUpgradeNeeded bool // If set, the database must be started with `--database.auto-upgrade` once
@@ -149,33 +152,56 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
149152
}
150153
return nil
151154
}
152-
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
153-
for _, m := range members {
154-
if len(plan) > 0 {
155-
// Only 1 change at a time
156-
continue
157-
}
158-
if m.Phase != api.MemberPhaseCreated {
159-
// Only rotate when phase is created
160-
continue
161-
}
162-
if podName := m.PodName; podName != "" {
163-
if p := getPod(podName); p != nil {
164-
// Got pod, compare it with what it should be
165-
decision := podNeedsUpgrading(*p, spec, status.Images)
166-
if decision.UpgradeNeeded && decision.UpgradeAllowed {
167-
plan = append(plan, createUpgradeMemberPlan(log, m, group, "Version upgrade")...)
168-
} else {
169-
rotNeeded, reason := podNeedsRotation(log, *p, apiObject, spec, group, status.Members.Agents, m.ID, context)
170-
if rotNeeded {
171-
plan = append(plan, createRotateMemberPlan(log, m, group, reason)...)
155+
// createRotateOrUpgradePlan goes over all pods to check if an upgrade or rotate
156+
// is needed. If an upgrade is needed but not allowed, the second return value
157+
// will be true.
158+
// Returns: (newPlan, upgradeNotAllowed)
159+
createRotateOrUpgradePlan := func() (api.Plan, bool, driver.Version, driver.Version) {
160+
var newPlan api.Plan
161+
upgradeNotAllowed := false
162+
var fromVersion, toVersion driver.Version
163+
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
164+
for _, m := range members {
165+
if m.Phase != api.MemberPhaseCreated {
166+
// Only rotate when phase is created
167+
continue
168+
}
169+
if podName := m.PodName; podName != "" {
170+
if p := getPod(podName); p != nil {
171+
// Got pod, compare it with what it should be
172+
decision := podNeedsUpgrading(*p, spec, status.Images)
173+
if decision.UpgradeNeeded && !decision.UpgradeAllowed {
174+
// Oops, upgrade is not allowed
175+
upgradeNotAllowed = true
176+
fromVersion = decision.FromVersion
177+
toVersion = decision.ToVersion
178+
return nil
179+
} else if len(newPlan) == 0 {
180+
// Only rotate/upgrade 1 pod at a time
181+
if decision.UpgradeNeeded {
182+
// Yes, upgrade is needed (and allowed)
183+
newPlan = createUpgradeMemberPlan(log, m, group, "Version upgrade", spec.GetImage(), status)
184+
} else {
185+
// Upgrade is not needed, see if rotation is needed
186+
if rotNeeded, reason := podNeedsRotation(log, *p, apiObject, spec, group, status.Members.Agents, m.ID, context); rotNeeded {
187+
newPlan = createRotateMemberPlan(log, m, group, reason)
188+
}
189+
}
172190
}
173191
}
174192
}
175193
}
176-
}
177-
return nil
178-
})
194+
return nil
195+
})
196+
return newPlan, upgradeNotAllowed, fromVersion, toVersion
197+
}
198+
if newPlan, upgradeNotAllowed, fromVersion, toVersion := createRotateOrUpgradePlan(); upgradeNotAllowed {
199+
// Upgrade is needed, but not allowed
200+
context.CreateEvent(k8sutil.NewUpgradeNotAllowedEvent(apiObject, fromVersion, toVersion))
201+
} else {
202+
// Use the new plan
203+
plan = newPlan
204+
}
179205
}
180206

181207
// Check for the need to rotate TLS certificate of a members
@@ -218,18 +244,27 @@ func podNeedsUpgrading(p v1.Pod, spec api.DeploymentSpec, images api.ImageInfoLi
218244
podVersion := podImageInfo.ArangoDBVersion
219245
if err := upgraderules.CheckUpgradeRules(podVersion, specVersion); err != nil {
220246
// E.g. 3.x -> 4.x, we cannot allow automatically
221-
return upgradeDecision{UpgradeNeeded: true, UpgradeAllowed: false}
247+
return upgradeDecision{
248+
FromVersion: podVersion,
249+
ToVersion: specVersion,
250+
UpgradeNeeded: true,
251+
UpgradeAllowed: false,
252+
}
222253
}
223254
if specVersion.Major() != podVersion.Major() || specVersion.Minor() != podVersion.Minor() {
224255
// Is allowed, with `--database.auto-upgrade`
225256
return upgradeDecision{
257+
FromVersion: podVersion,
258+
ToVersion: specVersion,
226259
UpgradeNeeded: true,
227260
UpgradeAllowed: true,
228261
AutoUpgradeNeeded: true,
229262
}
230263
}
231264
// Patch version change, rotate only
232265
return upgradeDecision{
266+
FromVersion: podVersion,
267+
ToVersion: specVersion,
233268
UpgradeNeeded: true,
234269
UpgradeAllowed: true,
235270
AutoUpgradeNeeded: false,
@@ -343,7 +378,7 @@ func createRotateMemberPlan(log zerolog.Logger, member api.MemberStatus,
343378
// createUpgradeMemberPlan creates a plan to upgrade (stop-recreateWithAutoUpgrade-stop-start) an existing
344379
// member.
345380
func createUpgradeMemberPlan(log zerolog.Logger, member api.MemberStatus,
346-
group api.ServerGroup, reason string) api.Plan {
381+
group api.ServerGroup, reason string, imageName string, status api.DeploymentStatus) api.Plan {
347382
log.Debug().
348383
Str("id", member.ID).
349384
Str("role", group.AsRole()).
@@ -353,6 +388,11 @@ func createUpgradeMemberPlan(log zerolog.Logger, member api.MemberStatus,
353388
api.NewAction(api.ActionTypeUpgradeMember, group, member.ID, reason),
354389
api.NewAction(api.ActionTypeWaitForMemberUp, group, member.ID),
355390
}
391+
if status.CurrentImage == nil || status.CurrentImage.Image != imageName {
392+
plan = append(api.Plan{
393+
api.NewAction(api.ActionTypeSetCurrentImage, group, "", reason).SetImage(imageName),
394+
}, plan...)
395+
}
356396
return plan
357397
}
358398

pkg/deployment/reconcile/plan_executor.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,8 @@ func (d *Reconciler) createAction(ctx context.Context, log zerolog.Logger, actio
181181
return NewRenewTLSCertificateAction(log, action, actionCtx)
182182
case api.ActionTypeRenewTLSCACertificate:
183183
return NewRenewTLSCACertificateAction(log, action, actionCtx)
184+
case api.ActionTypeSetCurrentImage:
185+
return NewSetCurrentImageAction(log, action, actionCtx)
184186
default:
185187
panic(fmt.Sprintf("Unknown action type '%s'", action.Type))
186188
}

pkg/deployment/resources/pod_creator.go

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -459,13 +459,23 @@ func (r *Resources) createPodForMember(spec api.DeploymentSpec, memberID string,
459459
podSuffix := createPodSuffix(spec)
460460
m.PodName = k8sutil.CreatePodName(apiObject.GetName(), roleAbbr, m.ID, podSuffix)
461461
newPhase := api.MemberPhaseCreated
462-
// Find image ID
463-
imageInfo, imageFound := status.Images.GetByImage(spec.GetImage())
464-
if !imageFound {
465-
imageNotFoundOnce.Do(func() {
466-
log.Debug().Str("image", spec.GetImage()).Msg("Image ID is not known yet for image")
467-
})
468-
return nil
462+
// Select image
463+
var imageInfo api.ImageInfo
464+
if current := status.CurrentImage; current != nil {
465+
// Use current image
466+
imageInfo = *current
467+
} else {
468+
// Find image ID
469+
info, imageFound := status.Images.GetByImage(spec.GetImage())
470+
if !imageFound {
471+
imageNotFoundOnce.Do(func() {
472+
log.Debug().Str("image", spec.GetImage()).Msg("Image ID is not known yet for image")
473+
})
474+
return nil
475+
}
476+
imageInfo = info
477+
// Save image as current image
478+
status.CurrentImage = &info
469479
}
470480
// Create pod
471481
if group.IsArangod() {

pkg/deployment/server_api.go

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -218,17 +218,15 @@ func (d *Deployment) DatabaseURL() string {
218218
// DatabaseVersion returns the version used by the deployment
219219
// Returns versionNumber, licenseType
220220
func (d *Deployment) DatabaseVersion() (string, string) {
221-
image := d.GetSpec().GetImage()
222221
status, _ := d.GetStatus()
223-
info, found := status.Images.GetByImage(image)
224-
if !found {
225-
return "", ""
226-
}
227-
license := "community"
228-
if info.Enterprise {
229-
license = "enterprise"
222+
if current := status.CurrentImage; current != nil {
223+
license := "community"
224+
if current.Enterprise {
225+
license = "enterprise"
226+
}
227+
return string(current.ArangoDBVersion), license
230228
}
231-
return string(info.ArangoDBVersion), license
229+
return "", ""
232230
}
233231

234232
// Members returns all members of the deployment by role.

0 commit comments

Comments
 (0)