Skip to content

Commit f63f444

Browse files
authored
✨ KCP: implement trigger in-place update (#12897)
* KCP: implement trigger in-place update Signed-off-by: Stefan Büringer buringerst@vmware.com * Fix review findings --------- Signed-off-by: Stefan Büringer buringerst@vmware.com
1 parent fbdba20 commit f63f444

File tree

10 files changed

+758
-17
lines changed

10 files changed

+758
-17
lines changed

controlplane/kubeadm/internal/control_plane.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,10 @@ import (
3333
bootstrapv1 "sigs.k8s.io/cluster-api/api/bootstrap/kubeadm/v1beta2"
3434
controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2"
3535
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
36+
runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1"
3637
"sigs.k8s.io/cluster-api/controllers/external"
3738
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
39+
"sigs.k8s.io/cluster-api/internal/hooks"
3840
"sigs.k8s.io/cluster-api/util/collections"
3941
"sigs.k8s.io/cluster-api/util/conditions"
4042
"sigs.k8s.io/cluster-api/util/failuredomains"
@@ -185,6 +187,26 @@ func (c *ControlPlane) MachineWithDeleteAnnotation(machines collections.Machines
185187
return annotatedMachines
186188
}
187189

190+
// MachinesToCompleteTriggerInPlaceUpdate returns Machines for which we have to complete triggering
191+
// the in-place update. This can become necessary if triggering the in-place update fails after
192+
// we added UpdateInProgressAnnotation and before we marked the UpdateMachine hook as pending.
193+
func (c *ControlPlane) MachinesToCompleteTriggerInPlaceUpdate() collections.Machines {
194+
return c.Machines.Filter(func(machine *clusterv1.Machine) bool {
195+
_, ok := machine.Annotations[clusterv1.UpdateInProgressAnnotation]
196+
return ok && !hooks.IsPending(runtimehooksv1.UpdateMachine, machine)
197+
})
198+
}
199+
200+
// MachinesToCompleteInPlaceUpdate returns Machines that still have to complete their in-place update.
201+
func (c *ControlPlane) MachinesToCompleteInPlaceUpdate() collections.Machines {
202+
return c.Machines.Filter(func(machine *clusterv1.Machine) bool {
203+
// Note: Checking both annotations here to make this slightly more robust.
204+
// Theoretically only checking for IsPending would have been enough.
205+
_, ok := machine.Annotations[clusterv1.UpdateInProgressAnnotation]
206+
return ok || hooks.IsPending(runtimehooksv1.UpdateMachine, machine)
207+
})
208+
}
209+
188210
// FailureDomainWithMostMachines returns the fd with most machines in it and at least one eligible machine in it.
189211
// Note: if there are eligibleMachines machines in failure domain that do not exist anymore, cleaning up those failure domains takes precedence.
190212
func (c *ControlPlane) FailureDomainWithMostMachines(ctx context.Context, eligibleMachines collections.Machines) string {

controlplane/kubeadm/internal/control_plane_test.go

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ import (
2727

2828
controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2"
2929
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
30+
runtimev1 "sigs.k8s.io/cluster-api/api/runtime/v1beta2"
3031
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/etcd"
3132
"sigs.k8s.io/cluster-api/util/collections"
3233
)
@@ -293,6 +294,160 @@ func TestHasHealthyMachineStillProvisioning(t *testing.T) {
293294
})
294295
}
295296

297+
func TestMachinesToCompleteTriggerInPlaceUpdate(t *testing.T) {
298+
machineWithoutAnnotations := &clusterv1.Machine{
299+
ObjectMeta: metav1.ObjectMeta{
300+
Name: "machineWithoutAnnotations",
301+
},
302+
}
303+
machineWithUpdateInProgressAnnotation := &clusterv1.Machine{
304+
ObjectMeta: metav1.ObjectMeta{
305+
Name: "machineWithUpdateInProgressAnnotation",
306+
Annotations: map[string]string{
307+
clusterv1.UpdateInProgressAnnotation: "",
308+
},
309+
},
310+
}
311+
machineWithPendingHooksAnnotation := &clusterv1.Machine{
312+
ObjectMeta: metav1.ObjectMeta{
313+
Name: "machineWithPendingHooksAnnotation",
314+
Annotations: map[string]string{
315+
runtimev1.PendingHooksAnnotation: "UpdateMachine",
316+
},
317+
},
318+
}
319+
machineWithBothAnnotations := &clusterv1.Machine{
320+
ObjectMeta: metav1.ObjectMeta{
321+
Name: "machineWithBothAnnotations",
322+
Annotations: map[string]string{
323+
clusterv1.UpdateInProgressAnnotation: "",
324+
runtimev1.PendingHooksAnnotation: "UpdateMachine",
325+
},
326+
},
327+
}
328+
329+
tests := []struct {
330+
name string
331+
machine *clusterv1.Machine
332+
completeTriggerInPlaceUpdate bool
333+
}{
334+
{
335+
name: "machineWithoutAnnotations => false",
336+
machine: machineWithoutAnnotations,
337+
completeTriggerInPlaceUpdate: false,
338+
},
339+
{
340+
name: "machineWithUpdateInProgressAnnotation => true",
341+
machine: machineWithUpdateInProgressAnnotation,
342+
completeTriggerInPlaceUpdate: true,
343+
},
344+
{
345+
name: "machineWithPendingHooksAnnotation => false",
346+
machine: machineWithPendingHooksAnnotation,
347+
completeTriggerInPlaceUpdate: false,
348+
},
349+
{
350+
name: "machineWithBothAnnotations => false",
351+
machine: machineWithBothAnnotations,
352+
completeTriggerInPlaceUpdate: false,
353+
},
354+
}
355+
356+
for _, tt := range tests {
357+
t.Run(tt.name, func(t *testing.T) {
358+
g := NewWithT(t)
359+
360+
c := ControlPlane{
361+
Machines: collections.FromMachines(tt.machine),
362+
}
363+
364+
if tt.completeTriggerInPlaceUpdate {
365+
g.Expect(c.MachinesToCompleteTriggerInPlaceUpdate().Len()).To(Equal(1))
366+
g.Expect(c.MachinesToCompleteTriggerInPlaceUpdate().Has(tt.machine)).To(BeTrue())
367+
} else {
368+
g.Expect(c.MachinesToCompleteTriggerInPlaceUpdate().Len()).To(Equal(0))
369+
}
370+
})
371+
}
372+
}
373+
374+
func TestMachinesToCompleteInPlaceUpdate(t *testing.T) {
375+
machineWithoutAnnotations := &clusterv1.Machine{
376+
ObjectMeta: metav1.ObjectMeta{
377+
Name: "machineWithoutAnnotations",
378+
},
379+
}
380+
machineWithUpdateInProgressAnnotation := &clusterv1.Machine{
381+
ObjectMeta: metav1.ObjectMeta{
382+
Name: "machineWithUpdateInProgressAnnotation",
383+
Annotations: map[string]string{
384+
clusterv1.UpdateInProgressAnnotation: "",
385+
},
386+
},
387+
}
388+
machineWithPendingHooksAnnotation := &clusterv1.Machine{
389+
ObjectMeta: metav1.ObjectMeta{
390+
Name: "machineWithPendingHooksAnnotation",
391+
Annotations: map[string]string{
392+
runtimev1.PendingHooksAnnotation: "UpdateMachine",
393+
},
394+
},
395+
}
396+
machineWithBothAnnotations := &clusterv1.Machine{
397+
ObjectMeta: metav1.ObjectMeta{
398+
Name: "machineWithBothAnnotations",
399+
Annotations: map[string]string{
400+
clusterv1.UpdateInProgressAnnotation: "",
401+
runtimev1.PendingHooksAnnotation: "UpdateMachine",
402+
},
403+
},
404+
}
405+
406+
tests := []struct {
407+
name string
408+
machine *clusterv1.Machine
409+
completeInPlaceUpdate bool
410+
}{
411+
{
412+
name: "machineWithoutAnnotations => false",
413+
machine: machineWithoutAnnotations,
414+
completeInPlaceUpdate: false,
415+
},
416+
{
417+
name: "machineWithUpdateInProgressAnnotation => true",
418+
machine: machineWithUpdateInProgressAnnotation,
419+
completeInPlaceUpdate: true,
420+
},
421+
{
422+
name: "machineWithPendingHooksAnnotation => true",
423+
machine: machineWithPendingHooksAnnotation,
424+
completeInPlaceUpdate: true,
425+
},
426+
{
427+
name: "machineWithBothAnnotations => true",
428+
machine: machineWithBothAnnotations,
429+
completeInPlaceUpdate: true,
430+
},
431+
}
432+
433+
for _, tt := range tests {
434+
t.Run(tt.name, func(t *testing.T) {
435+
g := NewWithT(t)
436+
437+
c := ControlPlane{
438+
Machines: collections.FromMachines(tt.machine),
439+
}
440+
441+
if tt.completeInPlaceUpdate {
442+
g.Expect(c.MachinesToCompleteInPlaceUpdate().Len()).To(Equal(1))
443+
g.Expect(c.MachinesToCompleteInPlaceUpdate().Has(tt.machine)).To(BeTrue())
444+
} else {
445+
g.Expect(c.MachinesToCompleteInPlaceUpdate().Len()).To(Equal(0))
446+
}
447+
})
448+
}
449+
}
450+
296451
func TestStatusToLogKeyAndValues(t *testing.T) {
297452
healthyMachine := &clusterv1.Machine{
298453
ObjectMeta: metav1.ObjectMeta{Name: "healthy"},

controlplane/kubeadm/internal/controllers/controller.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ type KubeadmControlPlaneReconciler struct {
107107
overridePreflightChecksFunc func(ctx context.Context, controlPlane *internal.ControlPlane, excludeFor ...*clusterv1.Machine) ctrl.Result
108108
overrideCanUpdateMachineFunc func(ctx context.Context, machine *clusterv1.Machine, machineUpToDateResult internal.UpToDateResult) (bool, error)
109109
overrideCanExtensionsUpdateMachine func(ctx context.Context, machine *clusterv1.Machine, machineUpToDateResult internal.UpToDateResult, extensionHandlers []string) (bool, []string, error)
110+
overrideTriggerInPlaceUpdate func(ctx context.Context, machine *clusterv1.Machine, machineUpToDateResult internal.UpToDateResult) error
110111
// Note: This field is only used for unit tests that use fake client because the fake client does not properly set resourceVersion
111112
// on BootstrapConfig/InfraMachine after ssa.Patch and then ssa.RemoveManagedFieldsForLabelsAndAnnotations would fail.
112113
disableRemoveManagedFieldsForLabelsAndAnnotations bool
@@ -479,12 +480,33 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl
479480
return result, err
480481
}
481482

483+
// Complete triggering in-place update if necessary, for reentrancy if triggerInPlaceUpdate failed
484+
// when triggering the in-place update initially.
485+
if machines := controlPlane.MachinesToCompleteTriggerInPlaceUpdate(); len(machines) > 0 {
486+
_, machinesUpToDateResults := controlPlane.NotUpToDateMachines()
487+
for _, m := range machines {
488+
if err := r.triggerInPlaceUpdate(ctx, m, machinesUpToDateResults[m.Name]); err != nil {
489+
return ctrl.Result{}, err
490+
}
491+
}
492+
return ctrl.Result{}, nil // Note: Changes to Machines trigger another reconcile.
493+
}
494+
482495
// Reconcile unhealthy machines by triggering deletion and requeue if it is considered safe to remediate,
483496
// otherwise continue with the other KCP operations.
484497
if result, err := r.reconcileUnhealthyMachines(ctx, controlPlane); err != nil || !result.IsZero() {
485498
return result, err
486499
}
487500

501+
// Wait for in-place update to complete.
502+
// Note: If a Machine becomes unhealthy during in-place update reconcileUnhealthyMachines above remediates it.
503+
// Note: We have to wait here even if there are no more Machines that need rollout (in-place update in
504+
// progress is not counted as needs rollout).
505+
if machines := controlPlane.MachinesToCompleteInPlaceUpdate(); machines.Len() > 0 {
506+
log.Info("Waiting for in-place update to complete", "machines", strings.Join(machines.Names(), ", "))
507+
return ctrl.Result{}, nil // Note: Changes to Machines trigger another reconcile.
508+
}
509+
488510
// Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
489511
machinesNeedingRollout, machinesUpToDateResults := controlPlane.MachinesNeedingRollout()
490512
switch {

controlplane/kubeadm/internal/controllers/inplace.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@ func (r *KubeadmControlPlaneReconciler) tryInPlaceUpdate(
4848
return false, resultForAllMachines, nil
4949
}
5050

51+
// Note: Usually canUpdateMachine is only called once for a single Machine rollout.
52+
// If it returns true, the code below will mark the in-place update as in progress via
53+
// UpdateInProgressAnnotation. From this point forward we are not going to call canUpdateMachine again.
54+
// If it returns false, we are going to fall back to scale down which will delete the Machine.
55+
// We only have to repeat the canUpdateMachine call if the write call to set UpdateInProgressAnnotation
56+
// fails or if we fail to delete the Machine.
5157
canUpdate, err := r.canUpdateMachine(ctx, machineToInPlaceUpdate, machineUpToDateResult)
5258
if err != nil {
5359
return false, ctrl.Result{}, errors.Wrapf(err, "failed to determine if Machine %s can be updated in-place", machineToInPlaceUpdate.Name)
@@ -57,6 +63,5 @@ func (r *KubeadmControlPlaneReconciler) tryInPlaceUpdate(
5763
return true, ctrl.Result{}, nil
5864
}
5965

60-
// Always fallback to scale down until triggering in-place updates is implemented.
61-
return true, ctrl.Result{}, nil
66+
return false, ctrl.Result{}, r.triggerInPlaceUpdate(ctx, machineToInPlaceUpdate, machineUpToDateResult)
6267
}

controlplane/kubeadm/internal/controllers/inplace_canupdatemachine_test.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,7 @@ func Test_createRequest(t *testing.T) {
785785
},
786786
},
787787
{
788-
name: "Should prepare all objects for diff: desiredInfraMachine picks up changes from currentMachine via SSA dry-run",
788+
name: "Should prepare all objects for diff: desiredInfraMachine picks up changes from currentInfraMachine via SSA dry-run",
789789
currentMachine: currentMachine,
790790
currentInfraMachine: currentInfraMachine,
791791
currentKubeadmConfig: currentKubeadmConfig,
@@ -815,7 +815,7 @@ func Test_createRequest(t *testing.T) {
815815
},
816816
},
817817
{
818-
name: "Should prepare all objects for diff: currentKubeadmConfig & currentKubeadmConfig are prepared for diff",
818+
name: "Should prepare all objects for diff: currentKubeadmConfig & desiredKubeadmConfig are prepared for diff",
819819
currentMachine: currentMachine,
820820
currentInfraMachine: currentInfraMachine,
821821
currentKubeadmConfig: currentKubeadmConfigWithInitConfiguration,
@@ -845,7 +845,6 @@ func Test_createRequest(t *testing.T) {
845845

846846
// Create Machine (same as in createMachine)
847847
currentMachineForPatch := tt.currentMachine.DeepCopy()
848-
currentMachineForPatch.SetGroupVersionKind(clusterv1.GroupVersion.WithKind("Machine")) // Has to be set for ssa.Patch
849848
g.Expect(ssa.Patch(ctx, env.Client, kcpManagerName, currentMachineForPatch)).To(Succeed())
850849
t.Cleanup(func() {
851850
g.Expect(env.CleanupAndWait(context.Background(), tt.currentMachine)).To(Succeed())
@@ -861,7 +860,6 @@ func Test_createRequest(t *testing.T) {
861860

862861
// Create KubeadmConfig (same as in createKubeadmConfig)
863862
currentKubeadmConfigForPatch := tt.currentKubeadmConfig.DeepCopy()
864-
currentKubeadmConfigForPatch.SetGroupVersionKind(bootstrapv1.GroupVersion.WithKind("KubeadmConfig")) // Has to be set for ssa.Patch
865863
g.Expect(ssa.Patch(ctx, env.Client, kcpManagerName, currentKubeadmConfigForPatch)).To(Succeed())
866864
g.Expect(ssa.RemoveManagedFieldsForLabelsAndAnnotations(ctx, env.Client, env.GetAPIReader(), currentKubeadmConfigForPatch, kcpManagerName)).To(Succeed())
867865
t.Cleanup(func() {

controlplane/kubeadm/internal/controllers/inplace_test.go

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,15 @@ func Test_tryInPlaceUpdate(t *testing.T) {
3737
}
3838

3939
tests := []struct {
40-
name string
41-
preflightChecksFunc func(ctx context.Context, controlPlane *internal.ControlPlane, excludeFor ...*clusterv1.Machine) ctrl.Result
42-
canUpdateMachineFunc func(ctx context.Context, machine *clusterv1.Machine, machineUpToDateResult internal.UpToDateResult) (bool, error)
43-
wantCanUpdateMachineCalled bool
44-
wantFallbackToScaleDown bool
45-
wantError bool
46-
wantErrorMessage string
47-
wantRes ctrl.Result
40+
name string
41+
preflightChecksFunc func(ctx context.Context, controlPlane *internal.ControlPlane, excludeFor ...*clusterv1.Machine) ctrl.Result
42+
canUpdateMachineFunc func(ctx context.Context, machine *clusterv1.Machine, machineUpToDateResult internal.UpToDateResult) (bool, error)
43+
wantCanUpdateMachineCalled bool
44+
wantTriggerInPlaceUpdateCalled bool
45+
wantFallbackToScaleDown bool
46+
wantError bool
47+
wantErrorMessage string
48+
wantRes ctrl.Result
4849
}{
4950
{
5051
name: "Requeue if preflight checks for all Machines failed",
@@ -94,16 +95,17 @@ func Test_tryInPlaceUpdate(t *testing.T) {
9495
canUpdateMachineFunc: func(_ context.Context, _ *clusterv1.Machine, _ internal.UpToDateResult) (bool, error) {
9596
return true, nil
9697
},
97-
wantCanUpdateMachineCalled: true,
98-
// TODO(in-place): Will be modified once tryInPlaceUpdate triggers in-place updates.
99-
wantFallbackToScaleDown: true,
98+
wantCanUpdateMachineCalled: true,
99+
wantTriggerInPlaceUpdateCalled: true,
100+
wantFallbackToScaleDown: false,
100101
},
101102
}
102103
for _, tt := range tests {
103104
t.Run(tt.name, func(t *testing.T) {
104105
g := NewWithT(t)
105106

106107
var canUpdateMachineCalled bool
108+
var triggerInPlaceUpdateCalled bool
107109
r := &KubeadmControlPlaneReconciler{
108110
overridePreflightChecksFunc: func(ctx context.Context, controlPlane *internal.ControlPlane, excludeFor ...*clusterv1.Machine) ctrl.Result {
109111
return tt.preflightChecksFunc(ctx, controlPlane, excludeFor...)
@@ -112,6 +114,10 @@ func Test_tryInPlaceUpdate(t *testing.T) {
112114
canUpdateMachineCalled = true
113115
return tt.canUpdateMachineFunc(ctx, machine, machineUpToDateResult)
114116
},
117+
overrideTriggerInPlaceUpdate: func(_ context.Context, _ *clusterv1.Machine, _ internal.UpToDateResult) error {
118+
triggerInPlaceUpdateCalled = true
119+
return nil
120+
},
115121
}
116122

117123
fallbackToScaleDown, res, err := r.tryInPlaceUpdate(ctx, nil, machineToInPlaceUpdate, internal.UpToDateResult{})
@@ -125,6 +131,7 @@ func Test_tryInPlaceUpdate(t *testing.T) {
125131
g.Expect(fallbackToScaleDown).To(Equal(tt.wantFallbackToScaleDown))
126132

127133
g.Expect(canUpdateMachineCalled).To(Equal(tt.wantCanUpdateMachineCalled), "canUpdateMachineCalled: actual: %t expected: %t", canUpdateMachineCalled, tt.wantCanUpdateMachineCalled)
134+
g.Expect(triggerInPlaceUpdateCalled).To(Equal(tt.wantTriggerInPlaceUpdateCalled), "triggerInPlaceUpdateCalled: actual: %t expected: %t", triggerInPlaceUpdateCalled, tt.wantTriggerInPlaceUpdateCalled)
128135
})
129136
}
130137
}

0 commit comments

Comments
 (0)