Skip to content

Commit 7847c39

Browse files
authored
Merge pull request #12840 from sbueringer/pr-kcp-rollout-logic
✨ KCP: Extend rollout logic for in-place updates
2 parents db83f64 + 835429d commit 7847c39

File tree

10 files changed

+464
-78
lines changed

10 files changed

+464
-78
lines changed

controlplane/kubeadm/internal/control_plane.go

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,12 @@ type ControlPlane struct {
5151
machinesPatchHelpers map[string]*patch.Helper
5252

5353
// MachinesNotUpToDate is the source of truth for Machines that are not up-to-date.
54-
// It should be used to check if a Machine is up-to-date (not machinesNotUpToDateResults).
54+
// It should be used to check if a Machine is up-to-date (not machinesUpToDateResults).
5555
MachinesNotUpToDate collections.Machines
56-
// machinesNotUpToDateResults is used to store the result of the UpToDate call for all Machines
56+
// machinesUpToDateResults is used to store the result of the UpToDate call for all Machines
5757
// (even for Machines that are up-to-date).
5858
// MachinesNotUpToDate should always be used instead to check if a Machine is up-to-date.
59-
machinesNotUpToDateResults map[string]NotUpToDateResult
59+
machinesUpToDateResults map[string]UpToDateResult
6060

6161
// reconciliationTime is the time of the current reconciliation, and should be used for all "now" calculations
6262
reconciliationTime metav1.Time
@@ -122,9 +122,9 @@ func NewControlPlane(ctx context.Context, managementCluster ManagementCluster, c
122122
// Select machines that should be rolled out because of an outdated configuration or because rolloutAfter/Before expired.
123123
reconciliationTime := metav1.Now()
124124
machinesNotUptoDate := make(collections.Machines, len(ownedMachines))
125-
machinesNotUpToDateResults := map[string]NotUpToDateResult{}
125+
machinesUpToDateResults := map[string]UpToDateResult{}
126126
for _, m := range ownedMachines {
127-
upToDate, notUpToDateResult, err := UpToDate(ctx, client, cluster, m, kcp, &reconciliationTime, infraMachines, kubeadmConfigs)
127+
upToDate, upToDateResult, err := UpToDate(ctx, client, cluster, m, kcp, &reconciliationTime, infraMachines, kubeadmConfigs)
128128
if err != nil {
129129
return nil, err
130130
}
@@ -133,20 +133,20 @@ func NewControlPlane(ctx context.Context, managementCluster ManagementCluster, c
133133
}
134134
// Set this even if machine is UpToDate. This is needed to complete triggering in-place updates
135135
// MachinesNotUpToDate should always be used instead to check if a Machine is up-to-date.
136-
machinesNotUpToDateResults[m.Name] = *notUpToDateResult
136+
machinesUpToDateResults[m.Name] = *upToDateResult
137137
}
138138

139139
return &ControlPlane{
140-
KCP: kcp,
141-
Cluster: cluster,
142-
Machines: ownedMachines,
143-
machinesPatchHelpers: patchHelpers,
144-
MachinesNotUpToDate: machinesNotUptoDate,
145-
machinesNotUpToDateResults: machinesNotUpToDateResults,
146-
KubeadmConfigs: kubeadmConfigs,
147-
InfraResources: infraMachines,
148-
reconciliationTime: reconciliationTime,
149-
managementCluster: managementCluster,
140+
KCP: kcp,
141+
Cluster: cluster,
142+
Machines: ownedMachines,
143+
machinesPatchHelpers: patchHelpers,
144+
MachinesNotUpToDate: machinesNotUptoDate,
145+
machinesUpToDateResults: machinesUpToDateResults,
146+
KubeadmConfigs: kubeadmConfigs,
147+
InfraResources: infraMachines,
148+
reconciliationTime: reconciliationTime,
149+
managementCluster: managementCluster,
150150
}, nil
151151
}
152152

@@ -240,15 +240,15 @@ func (c *ControlPlane) GetKubeadmConfig(machineName string) (*bootstrapv1.Kubead
240240
}
241241

242242
// MachinesNeedingRollout returns a list of machines that need to be rolled out.
243-
func (c *ControlPlane) MachinesNeedingRollout() (collections.Machines, map[string]NotUpToDateResult) {
243+
func (c *ControlPlane) MachinesNeedingRollout() (collections.Machines, map[string]UpToDateResult) {
244244
// Note: Machines already deleted are dropped because they will be replaced by new machines after deletion completes.
245-
return c.MachinesNotUpToDate.Filter(collections.Not(collections.HasDeletionTimestamp)), c.machinesNotUpToDateResults
245+
return c.MachinesNotUpToDate.Filter(collections.Not(collections.HasDeletionTimestamp)), c.machinesUpToDateResults
246246
}
247247

248248
// NotUpToDateMachines returns a list of machines that are not up to date with the control
249249
// plane's configuration.
250-
func (c *ControlPlane) NotUpToDateMachines() (collections.Machines, map[string]NotUpToDateResult) {
251-
return c.MachinesNotUpToDate, c.machinesNotUpToDateResults
250+
func (c *ControlPlane) NotUpToDateMachines() (collections.Machines, map[string]UpToDateResult) {
251+
return c.MachinesNotUpToDate, c.machinesUpToDateResults
252252
}
253253

254254
// UpToDateMachines returns the machines that are up to date with the control

controlplane/kubeadm/internal/control_plane_test.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -122,19 +122,19 @@ func TestControlPlane(t *testing.T) {
122122

123123
g.Expect(controlPlane.Machines).To(HaveLen(5))
124124

125-
machinesNotUptoDate, machinesNotUpToDateResults := controlPlane.NotUpToDateMachines()
125+
machinesNotUptoDate, machinesUpToDateResults := controlPlane.NotUpToDateMachines()
126126
g.Expect(machinesNotUptoDate.Names()).To(ConsistOf("m2", "m3"))
127-
// machinesNotUpToDateResults contains results for all Machines (including up-to-date Machines).
128-
g.Expect(machinesNotUpToDateResults).To(HaveLen(5))
129-
g.Expect(machinesNotUpToDateResults["m2"].ConditionMessages).To(Equal([]string{"Version v1.29.0, v1.31.0 required"}))
130-
g.Expect(machinesNotUpToDateResults["m3"].ConditionMessages).To(Equal([]string{"Version v1.29.3, v1.31.0 required"}))
127+
// machinesUpToDateResults contains results for all Machines (including up-to-date Machines).
128+
g.Expect(machinesUpToDateResults).To(HaveLen(5))
129+
g.Expect(machinesUpToDateResults["m2"].ConditionMessages).To(Equal([]string{"Version v1.29.0, v1.31.0 required"}))
130+
g.Expect(machinesUpToDateResults["m3"].ConditionMessages).To(Equal([]string{"Version v1.29.3, v1.31.0 required"}))
131131

132-
machinesNeedingRollout, machinesNotUpToDateResults := controlPlane.MachinesNeedingRollout()
132+
machinesNeedingRollout, machinesUpToDateResults := controlPlane.MachinesNeedingRollout()
133133
g.Expect(machinesNeedingRollout.Names()).To(ConsistOf("m2"))
134-
// machinesNotUpToDateResults contains results for all Machines (including up-to-date Machines).
135-
g.Expect(machinesNotUpToDateResults).To(HaveLen(5))
136-
g.Expect(machinesNotUpToDateResults["m2"].LogMessages).To(Equal([]string{"Machine version \"v1.29.0\" is not equal to KCP version \"v1.31.0\""}))
137-
g.Expect(machinesNotUpToDateResults["m3"].LogMessages).To(Equal([]string{"Machine version \"v1.29.3\" is not equal to KCP version \"v1.31.0\""}))
134+
// machinesUpToDateResults contains results for all Machines (including up-to-date Machines).
135+
g.Expect(machinesUpToDateResults).To(HaveLen(5))
136+
g.Expect(machinesUpToDateResults["m2"].LogMessages).To(Equal([]string{"Machine version \"v1.29.0\" is not equal to KCP version \"v1.31.0\""}))
137+
g.Expect(machinesUpToDateResults["m3"].LogMessages).To(Equal([]string{"Machine version \"v1.29.3\" is not equal to KCP version \"v1.31.0\""}))
138138

139139
upToDateMachines := controlPlane.UpToDateMachines()
140140
g.Expect(upToDateMachines).To(HaveLen(3))

controlplane/kubeadm/internal/controllers/controller.go

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,11 @@ type KubeadmControlPlaneReconciler struct {
9696
managementCluster internal.ManagementCluster
9797
managementClusterUncached internal.ManagementCluster
9898
ssaCache ssa.Cache
99+
100+
// Only used for testing
101+
overrideTryInPlaceUpdateFunc func(ctx context.Context, controlPlane *internal.ControlPlane, machineToInPlaceUpdate *clusterv1.Machine, machineUpToDateResult internal.UpToDateResult) (bool, ctrl.Result, error)
102+
overrideScaleUpControlPlaneFunc func(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error)
103+
overrideScaleDownControlPlaneFunc func(ctx context.Context, controlPlane *internal.ControlPlane, machineToDelete *clusterv1.Machine) (ctrl.Result, error)
99104
}
100105

101106
func (r *KubeadmControlPlaneReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
@@ -469,16 +474,16 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl
469474
}
470475

471476
// Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
472-
machinesNeedingRollout, machinesNeedingRolloutResults := controlPlane.MachinesNeedingRollout()
477+
machinesNeedingRollout, machinesUpToDateResults := controlPlane.MachinesNeedingRollout()
473478
switch {
474479
case len(machinesNeedingRollout) > 0:
475480
var allMessages []string
476-
for machine, machinesNeedingRolloutResult := range machinesNeedingRolloutResults {
477-
allMessages = append(allMessages, fmt.Sprintf("Machine %s needs rollout: %s", machine, strings.Join(machinesNeedingRolloutResult.LogMessages, ",")))
481+
for machine, machineUpToDateResult := range machinesUpToDateResults {
482+
allMessages = append(allMessages, fmt.Sprintf("Machine %s needs rollout: %s", machine, strings.Join(machineUpToDateResult.LogMessages, ",")))
478483
}
479484
log.Info(fmt.Sprintf("Rolling out Control Plane machines: %s", strings.Join(allMessages, ",")), "machinesNeedingRollout", machinesNeedingRollout.Names())
480485
v1beta1conditions.MarkFalse(controlPlane.KCP, controlplanev1.MachinesSpecUpToDateV1Beta1Condition, controlplanev1.RollingUpdateInProgressV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Rolling %d replicas with outdated spec (%d replicas up to date)", len(machinesNeedingRollout), len(controlPlane.Machines)-len(machinesNeedingRollout))
481-
return r.upgradeControlPlane(ctx, controlPlane, machinesNeedingRollout)
486+
return r.updateControlPlane(ctx, controlPlane, machinesNeedingRollout, machinesUpToDateResults)
482487
default:
483488
// make sure last upgrade operation is marked as completed.
484489
// NOTE: we are checking that the condition already exists in order to avoid setting this condition at the first
@@ -508,7 +513,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl
508513
case numMachines > desiredReplicas:
509514
log.Info("Scaling down control plane", "desired", desiredReplicas, "existing", numMachines)
510515
// The last parameter (i.e. machines needing to be rolled out) should always be empty here.
511-
return r.scaleDownControlPlane(ctx, controlPlane, collections.Machines{})
516+
// Pick the Machine that we should scale down.
517+
machineToDelete, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, controlPlane, collections.Machines{})
518+
if err != nil {
519+
return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down")
520+
}
521+
return r.scaleDownControlPlane(ctx, controlPlane, machineToDelete)
512522
}
513523

514524
// Get the workload cluster client.
@@ -977,16 +987,16 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneAndMachinesConditio
977987
}
978988

979989
func reconcileMachineUpToDateCondition(_ context.Context, controlPlane *internal.ControlPlane) {
980-
machinesNotUptoDate, machinesNotUpToDateResults := controlPlane.NotUpToDateMachines()
990+
machinesNotUptoDate, machinesUpToDateResults := controlPlane.NotUpToDateMachines()
981991
machinesNotUptoDateNames := sets.New(machinesNotUptoDate.Names()...)
982992

983993
for _, machine := range controlPlane.Machines {
984994
if machinesNotUptoDateNames.Has(machine.Name) {
985995
// Note: the code computing the message for KCP's RollingOut condition is making assumptions on the format/content of this message.
986996
message := ""
987-
if machinesNotUpToDateResult, ok := machinesNotUpToDateResults[machine.Name]; ok && len(machinesNotUpToDateResult.ConditionMessages) > 0 {
997+
if machineUpToDateResult, ok := machinesUpToDateResults[machine.Name]; ok && len(machineUpToDateResult.ConditionMessages) > 0 {
988998
var reasons []string
989-
for _, conditionMessage := range machinesNotUpToDateResult.ConditionMessages {
999+
for _, conditionMessage := range machineUpToDateResult.ConditionMessages {
9901000
reasons = append(reasons, fmt.Sprintf("* %s", conditionMessage))
9911001
}
9921002
message = strings.Join(reasons, "\n")
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
Copyright 2025 The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package controllers
18+
19+
import (
20+
"context"
21+
22+
ctrl "sigs.k8s.io/controller-runtime"
23+
24+
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
25+
"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
26+
)
27+
28+
func (r *KubeadmControlPlaneReconciler) tryInPlaceUpdate(
29+
ctx context.Context,
30+
controlPlane *internal.ControlPlane,
31+
machineToInPlaceUpdate *clusterv1.Machine,
32+
machineUpToDateResult internal.UpToDateResult,
33+
) (fallbackToScaleDown bool, _ ctrl.Result, _ error) {
34+
if r.overrideTryInPlaceUpdateFunc != nil {
35+
return r.overrideTryInPlaceUpdateFunc(ctx, controlPlane, machineToInPlaceUpdate, machineUpToDateResult)
36+
}
37+
38+
// Always fall back to scale down until in-place updates are implemented.
39+
return true, ctrl.Result{}, nil
40+
}

controlplane/kubeadm/internal/controllers/scale.go

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte
6363
}
6464

6565
func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
66+
if r.overrideScaleUpControlPlaneFunc != nil {
67+
return r.overrideScaleUpControlPlaneFunc(ctx, controlPlane)
68+
}
69+
6670
log := ctrl.LoggerFrom(ctx)
6771

6872
// Run preflight checks to ensure that the control plane is stable before proceeding with a scale up/scale down operation; if not, wait.
@@ -95,16 +99,14 @@ func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context,
9599
func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
96100
ctx context.Context,
97101
controlPlane *internal.ControlPlane,
98-
outdatedMachines collections.Machines,
102+
machineToDelete *clusterv1.Machine,
99103
) (ctrl.Result, error) {
100-
log := ctrl.LoggerFrom(ctx)
101-
102-
// Pick the Machine that we should scale down.
103-
machineToDelete, err := selectMachineForScaleDown(ctx, controlPlane, outdatedMachines)
104-
if err != nil {
105-
return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down")
104+
if r.overrideScaleDownControlPlaneFunc != nil {
105+
return r.overrideScaleDownControlPlaneFunc(ctx, controlPlane, machineToDelete)
106106
}
107107

108+
log := ctrl.LoggerFrom(ctx)
109+
108110
// Run preflight checks ensuring the control plane is stable before proceeding with a scale up/scale down operation; if not, wait.
109111
// Given that we're scaling down, we can exclude the machineToDelete from the preflight checks.
110112
if result, err := r.preflightChecks(ctx, controlPlane, machineToDelete); err != nil || !result.IsZero() {
@@ -265,7 +267,8 @@ func preflightCheckCondition(kind string, obj *clusterv1.Machine, conditionType
265267
return nil
266268
}
267269

268-
// selectMachineForScaleDown select a machine candidate for scaling down. The selection is a two phase process:
270+
// selectMachineForInPlaceUpdateOrScaleDown selects a machine candidate for scaling down or for in-place update.
271+
// The selection is a two phase process:
269272
//
270273
// In the first phase it selects a subset of machines eligible for deletion:
271274
// - if there are outdated machines with the delete machine annotation, use them as eligible subset (priority to user requests, part 1)
@@ -276,18 +279,20 @@ func preflightCheckCondition(kind string, obj *clusterv1.Machine, conditionType
276279
//
277280
// Once the subset of machines eligible for deletion is identified, one machine is picked out of this subset by
278281
// selecting the machine in the failure domain with most machines (including both eligible and not eligible machines).
279-
func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.ControlPlane, outdatedMachines collections.Machines) (*clusterv1.Machine, error) {
282+
func selectMachineForInPlaceUpdateOrScaleDown(ctx context.Context, controlPlane *internal.ControlPlane, outdatedMachines collections.Machines) (*clusterv1.Machine, error) {
280283
// Select the subset of machines eligible for scale down.
281-
eligibleMachines := controlPlane.Machines
284+
var eligibleMachines collections.Machines
282285
switch {
283286
case controlPlane.MachineWithDeleteAnnotation(outdatedMachines).Len() > 0:
284287
eligibleMachines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines)
285-
case controlPlane.MachineWithDeleteAnnotation(eligibleMachines).Len() > 0:
286-
eligibleMachines = controlPlane.MachineWithDeleteAnnotation(eligibleMachines)
288+
case controlPlane.MachineWithDeleteAnnotation(controlPlane.Machines).Len() > 0:
289+
eligibleMachines = controlPlane.MachineWithDeleteAnnotation(controlPlane.Machines)
287290
case controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0:
288291
eligibleMachines = controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines)
289292
case outdatedMachines.Len() > 0:
290293
eligibleMachines = outdatedMachines
294+
default:
295+
eligibleMachines = controlPlane.Machines
291296
}
292297

293298
// Pick an eligible machine from the failure domain with most machines in (including both eligible and not eligible machines)

controlplane/kubeadm/internal/controllers/scale_test.go

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,9 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
284284
}
285285
controlPlane.InjectTestManagementCluster(r.managementCluster)
286286

287-
result, err := r.scaleDownControlPlane(context.Background(), controlPlane, controlPlane.Machines)
287+
machineToDelete, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, controlPlane, controlPlane.Machines)
288+
g.Expect(err).ToNot(HaveOccurred())
289+
result, err := r.scaleDownControlPlane(context.Background(), controlPlane, machineToDelete)
288290
g.Expect(err).ToNot(HaveOccurred())
289291
g.Expect(result).To(BeComparableTo(ctrl.Result{Requeue: true}))
290292

@@ -326,7 +328,9 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
326328
}
327329
controlPlane.InjectTestManagementCluster(r.managementCluster)
328330

329-
result, err := r.scaleDownControlPlane(context.Background(), controlPlane, controlPlane.Machines)
331+
machineToDelete, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, controlPlane, controlPlane.Machines)
332+
g.Expect(err).ToNot(HaveOccurred())
333+
result, err := r.scaleDownControlPlane(context.Background(), controlPlane, machineToDelete)
330334
g.Expect(err).ToNot(HaveOccurred())
331335
g.Expect(result).To(BeComparableTo(ctrl.Result{Requeue: true}))
332336

@@ -364,7 +368,9 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
364368
}
365369
controlPlane.InjectTestManagementCluster(r.managementCluster)
366370

367-
result, err := r.scaleDownControlPlane(context.Background(), controlPlane, controlPlane.Machines)
371+
machineToDelete, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, controlPlane, controlPlane.Machines)
372+
g.Expect(err).ToNot(HaveOccurred())
373+
result, err := r.scaleDownControlPlane(context.Background(), controlPlane, machineToDelete)
368374
g.Expect(err).ToNot(HaveOccurred())
369375
g.Expect(result).To(BeComparableTo(ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}))
370376

@@ -374,7 +380,7 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
374380
})
375381
}
376382

377-
func TestSelectMachineForScaleDown(t *testing.T) {
383+
func TestSelectMachineForInPlaceUpdateOrScaleDown(t *testing.T) {
378384
kcp := controlplanev1.KubeadmControlPlane{
379385
Spec: controlplanev1.KubeadmControlPlaneSpec{},
380386
}
@@ -503,7 +509,7 @@ func TestSelectMachineForScaleDown(t *testing.T) {
503509
t.Run(tc.name, func(t *testing.T) {
504510
g := NewWithT(t)
505511

506-
selectedMachine, err := selectMachineForScaleDown(ctx, tc.cp, tc.outDatedMachines)
512+
selectedMachine, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, tc.cp, tc.outDatedMachines)
507513

508514
if tc.expectErr {
509515
g.Expect(err).To(HaveOccurred())

0 commit comments

Comments
 (0)