@@ -27,13 +27,15 @@ import (
2727 bootstrapv1 "sigs.k8s.io/cluster-api/api/bootstrap/kubeadm/v1beta2"
2828 controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2"
2929 "sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
30+ "sigs.k8s.io/cluster-api/feature"
3031 "sigs.k8s.io/cluster-api/util/collections"
3132)
3233
33- func (r * KubeadmControlPlaneReconciler ) upgradeControlPlane (
34+ func (r * KubeadmControlPlaneReconciler ) updateControlPlane (
3435 ctx context.Context ,
3536 controlPlane * internal.ControlPlane ,
36- machinesRequireUpgrade collections.Machines ,
37+ machinesNeedingRollout collections.Machines ,
38+ machinesNeedingRolloutResults map [string ]internal.NotUpToDateResult ,
3739) (ctrl.Result , error ) {
3840 log := ctrl .LoggerFrom (ctx )
3941
@@ -42,17 +44,17 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane(
4244 workloadCluster , err := controlPlane .GetWorkloadCluster (ctx )
4345 if err != nil {
4446 log .Error (err , "failed to get remote client for workload cluster" , "Cluster" , klog .KObj (controlPlane .Cluster ))
45- return ctrl.Result {}, err
47+ return ctrl.Result {}, errors . Wrapf ( err , "failed to update control plane" )
4648 }
4749
4850 parsedVersion , err := semver .ParseTolerant (controlPlane .KCP .Spec .Version )
4951 if err != nil {
50- return ctrl.Result {}, errors .Wrapf (err , "failed to parse kubernetes version %q" , controlPlane .KCP .Spec .Version )
52+ return ctrl.Result {}, errors .Wrapf (err , "failed to update control plane: failed to parse Kubernetes version %q" , controlPlane .KCP .Spec .Version )
5153 }
5254
5355 // Ensure kubeadm clusterRoleBinding for v1.29+ as per https://github.com/kubernetes/kubernetes/pull/121305
5456 if err := workloadCluster .AllowClusterAdminPermissions (ctx , parsedVersion ); err != nil {
55- return ctrl.Result {}, errors .Wrap (err , "failed to set cluster-admin ClusterRoleBinding for kubeadm" )
57+ return ctrl.Result {}, errors .Wrap (err , "failed to update control plane: failed to set cluster-admin ClusterRoleBinding for kubeadm" )
5658 }
5759
5860 kubeadmCMMutators := make ([]func (* bootstrapv1.ClusterConfiguration ), 0 )
@@ -81,21 +83,76 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane(
8183
8284 // collectively update Kubeadm config map
8385 if err = workloadCluster .UpdateClusterConfiguration (ctx , parsedVersion , kubeadmCMMutators ... ); err != nil {
84- return ctrl.Result {}, err
86+ return ctrl.Result {}, errors . Wrapf ( err , "failed to update control plane" )
8587 }
8688
8789 switch controlPlane .KCP .Spec .Rollout .Strategy .Type {
8890 case controlplanev1 .RollingUpdateStrategyType :
8991 // RolloutStrategy is currently defaulted and validated to be RollingUpdate
90- // We can ignore MaxUnavailable because we are enforcing health checks before we get here.
91- maxNodes := * controlPlane .KCP .Spec .Replicas + int32 (controlPlane .KCP .Spec .Rollout .Strategy .RollingUpdate .MaxSurge .IntValue ())
92- if int32 (controlPlane .Machines .Len ()) < maxNodes {
93- // scaleUp ensures that we don't continue scaling up while waiting for Machines to have NodeRefs
94- return r .scaleUpControlPlane (ctx , controlPlane )
92+ res , err := r .rollingUpdate (ctx , controlPlane , machinesNeedingRollout , machinesNeedingRolloutResults )
93+ if err != nil {
94+ return ctrl.Result {}, errors .Wrapf (err , "failed to update control plane" )
9595 }
96- return r . scaleDownControlPlane ( ctx , controlPlane , machinesRequireUpgrade )
96+ return res , nil
9797 default :
9898 log .Info ("RolloutStrategy type is not set to RollingUpdate, unable to determine the strategy for rolling out machines" )
9999 return ctrl.Result {}, nil
100100 }
101101}
102+
// rollingUpdate performs one step of a rolling update of the control plane per call.
// While the number of Machines is below Spec.Replicas + MaxSurge it delegates to scaleUpControlPlane.
// Otherwise it picks one Machine via selectMachineForInPlaceUpdateOrScaleDown and either tries an
// in-place update of that Machine or delegates to scaleDownControlPlane for it.
// An in-place update is only tried if the InPlaceUpdates feature gate is enabled, the Machine's entry
// in machinesNeedingRolloutResults has EligibleForInPlaceUpdate set, and there are fewer up-to-date
// Machines than Spec.Replicas.
// machinesNeedingRolloutResults is looked up by Machine name; a missing entry is returned as an error.
103+ func (r * KubeadmControlPlaneReconciler ) rollingUpdate (
104+ ctx context.Context ,
105+ controlPlane * internal.ControlPlane ,
106+ machinesNeedingRollout collections.Machines ,
107+ machinesNeedingRolloutResults map [string ]internal.NotUpToDateResult ,
108+ ) (ctrl.Result , error ) {
109+ currentReplicas := int32 (controlPlane .Machines .Len ())
110+ currentUpToDateReplicas := int32 (controlPlane .UpToDateMachines ().Len ())
111+ desiredReplicas := * controlPlane .KCP .Spec .Replicas
112+ maxSurge := int32 (controlPlane .KCP .Spec .Rollout .Strategy .RollingUpdate .MaxSurge .IntValue ())
113+ // Note: As MaxSurge is validated to be either 0 or 1, maxReplicas will be either desiredReplicas or desiredReplicas+1.
114+ maxReplicas := desiredReplicas + maxSurge
115+
116+ // If currentReplicas < maxReplicas we have to scale up.
117+ // Note: This is done to ensure we have as many Machines as allowed during rollout to maximize fault tolerance.
118+ if currentReplicas < maxReplicas {
119+ // Note: scaleUpControlPlane ensures that we don't continue scaling up while waiting for Machines to have NodeRefs.
120+ return r .scaleUpControlPlane (ctx , controlPlane )
121+ }
122+
123+ // If currentReplicas >= maxReplicas we have to in-place update or scale down a Machine.
124+ // Note: If we are already at or above the maximum Machines we have to in-place update or delete a Machine
125+ // to make progress with the update (as we cannot create additional new Machines above the maximum).
126+
127+ // Pick the Machine that we should in-place update or scale down.
128+ machineToInPlaceUpdateOrScaleDown , err := selectMachineForInPlaceUpdateOrScaleDown (ctx , controlPlane , machinesNeedingRollout )
129+ if err != nil {
130+ return ctrl.Result {}, errors .Wrap (err , "failed to select next Machine for rollout" )
131+ }
// NOTE(review): assumes selectMachineForInPlaceUpdateOrScaleDown never returns a nil Machine together
// with a nil error; otherwise the .Name access below would panic — confirm against the selector.
132+ machinesNeedingRolloutResult , ok := machinesNeedingRolloutResults [machineToInPlaceUpdateOrScaleDown .Name ]
133+ if ! ok {
134+ // Note: This should never happen as we store results for all Machines in machinesNeedingRolloutResults.
135+ return ctrl.Result {}, errors .Errorf ("failed to check if Machine %s is UpToDate" , machineToInPlaceUpdateOrScaleDown .Name )
136+ }
137+
138+ // If the selected Machine is eligible for in-place update and we don't already have enough up-to-date replicas, try in-place update.
139+ // Note: To be safe we only try an in-place update when we would otherwise delete a Machine. This ensures we can
140+ // afford the Machine becoming unavailable if the in-place update fails (eventually MHC kicks in and the Machine is recreated).
141+ if feature .Gates .Enabled (feature .InPlaceUpdates ) &&
142+ machinesNeedingRolloutResult .EligibleForInPlaceUpdate &&
143+ currentUpToDateReplicas < desiredReplicas {
144+ fallbackToScaleDown , res , err := r .tryInPlaceUpdate (ctx , controlPlane , machineToInPlaceUpdateOrScaleDown , machinesNeedingRolloutResult )
145+ if err != nil {
146+ return ctrl.Result {}, err
147+ }
// tryInPlaceUpdate returned a non-zero Result; hand it back to the caller as-is.
148+ if ! res .IsZero () {
149+ return res , nil
150+ }
// tryInPlaceUpdate asked to fall back: scale down the selected Machine instead.
151+ if fallbackToScaleDown {
152+ return r .scaleDownControlPlane (ctx , controlPlane , machineToInPlaceUpdateOrScaleDown )
153+ }
154+ // In-place update was triggered.
155+ return ctrl.Result {}, nil // Note: Requeue is not needed, changes to Machines trigger another reconcile.
156+ }
// In-place update is not applicable for the selected Machine: scale it down.
157+ return r .scaleDownControlPlane (ctx , controlPlane , machineToInPlaceUpdateOrScaleDown )
158+ }
0 commit comments