Skip to content

Commit 45159ff

Browse files
✨ Call new lifecycle hooks for chained-upgrades (#12891)
* Call new lifecycle hooks for chained-upgrades * Address comments * More feedback
1 parent c6df77f commit 45159ff

File tree

11 files changed

+1792
-174
lines changed

11 files changed

+1792
-174
lines changed

exp/topology/desiredstate/desired_state.go

Lines changed: 93 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1"
3838
"sigs.k8s.io/cluster-api/controllers/clustercache"
3939
"sigs.k8s.io/cluster-api/controllers/external"
40+
runtimecatalog "sigs.k8s.io/cluster-api/exp/runtime/catalog"
4041
runtimeclient "sigs.k8s.io/cluster-api/exp/runtime/client"
4142
"sigs.k8s.io/cluster-api/exp/topology/scope"
4243
"sigs.k8s.io/cluster-api/feature"
@@ -552,23 +553,29 @@ func (g *generator) computeControlPlaneVersion(ctx context.Context, s *scope.Sco
552553
}
553554

554555
// if the control plane is not upgrading, before making further considerations about whether to pick up another version,
555-
// we should call the AfterControlPlaneUpgrade hook if not already done.
556+
// we should call the AfterControlPlaneUpgrade and the BeforeWorkersUpgrade hooks if not already done.
556557
if feature.Gates.Enabled(feature.RuntimeSDK) {
557-
hookCompleted, err := g.callAfterControlPlaneUpgradeHook(ctx, s, currentVersion, topologyVersion)
558+
// Note: calling the AfterControlPlaneUpgrade is the final step of a control plane upgrade.
559+
hookCompleted, err := g.callAfterControlPlaneUpgradeHook(ctx, s, currentVersion)
558560
if err != nil {
559561
return "", err
560562
}
561563
if !hookCompleted {
562564
return *currentVersion, nil
563565
}
564-
}
565-
566-
// At this stage, we can assume the previous control plane upgrade is fully complete (including calling the AfterControlPlaneUpgrade).
567-
// It is now possible to start making considerations if to pick up another version.
568566

569-
// If the control plane is not pending upgrade, then it is already at the desired version and there is no other version to pick up.
570-
if !s.UpgradeTracker.ControlPlane.IsPendingUpgrade {
571-
return *currentVersion, nil
567+
// Note: calling the BeforeWorkersUpgrade is the first part of the execution of a worker upgrade step from the upgrade plan.
568+
// The call to this hook is implemented in this function in order to ensure the hook is called
569+
// after AfterControlPlaneUpgrade unblocks, and also to ensure that BeforeWorkersUpgrade
570+
// can block the control plane upgrade from proceeding in the upgrade plan.
571+
// Note: this operation is a no-op if workers are not required to upgrade to the current control plane version.
572+
hookCompleted, err = g.callBeforeWorkersUpgradeHook(ctx, s, &s.UpgradeTracker.MinWorkersVersion, *currentVersion)
573+
if err != nil {
574+
return "", err
575+
}
576+
if !hookCompleted {
577+
return *currentVersion, nil
578+
}
572579
}
573580

574581
// Before considering picking up the next control plane version, check if workers are required
@@ -595,31 +602,87 @@ func (g *generator) computeControlPlaneVersion(ctx context.Context, s *scope.Sco
595602

596603
// At this point we can assume the control plane is stable and also MachineDeployments/MachinePools
597604
// are not upgrading/are not required to upgrade.
605+
606+
// If not already done, call the AfterWorkersUpgrade hook before picking up the desired version.
607+
// (this is the last step of the previous upgrade).
608+
if feature.Gates.Enabled(feature.RuntimeSDK) {
609+
// Note: calling the AfterWorkersUpgrade is the last step of workers upgrade.
610+
// The call to this hook is implemented in this function in order to ensure that AfterWorkersUpgrade
611+
// can block the control plane upgrade from proceeding in the upgrade plan.
612+
// Note: this operation is a no-op if workers are not required to upgrade to the current control plane version.
613+
hookCompleted, err := g.callAfterWorkersUpgradeHook(ctx, s, currentVersion)
614+
if err != nil {
615+
return "", err
616+
}
617+
if !hookCompleted {
618+
return *currentVersion, nil
619+
}
620+
}
621+
622+
// At this stage, we can assume the previous control plane upgrade is fully complete (including calling the AfterControlPlaneUpgrade).
623+
// It is now possible to start making considerations if to pick up another version.
624+
625+
// If the control plane is not pending upgrade, then it is already at the desired version and there is no other version to pick up.
626+
if !s.UpgradeTracker.ControlPlane.IsPendingUpgrade {
627+
return *currentVersion, nil
628+
}
629+
598630
// If not already done, call the BeforeClusterUpgrade hook before picking up the desired version.
599631
if feature.Gates.Enabled(feature.RuntimeSDK) {
632+
// Note: calling the BeforeClusterUpgrade is the first step of an upgrade plan;
633+
// this operation is a no-op for intermediate steps of an upgrade plan.
600634
hookCompleted, err := g.callBeforeClusterUpgradeHook(ctx, s, currentVersion, topologyVersion)
601635
if err != nil {
602636
return "", err
603637
}
604638
if !hookCompleted {
605639
return *currentVersion, nil
606640
}
641+
642+
// After BeforeClusterUpgrade unblocked the upgrade, consider the upgrade started.
643+
// As a consequence, the system starts tracking the intent of calling AfterClusterUpgrade once the upgrade is complete.
644+
// Note: this also prevents BeforeClusterUpgrade from being called again (until after the upgrade is completed).
645+
if err := hooks.MarkAsPending(ctx, g.Client, s.Current.Cluster, runtimehooksv1.AfterClusterUpgrade); err != nil {
646+
return "", err
647+
}
607648
}
608649

609-
// Control plane and machine deployments are stable. All the required hooks are called.
650+
// Control plane and machine deployments are stable. The BeforeClusterUpgrade hook has been called.
610651
// Ready to pick up the next version in the upgrade plan.
611652

612-
// Track the intent of calling the AfterControlPlaneUpgrade and the AfterClusterUpgrade hooks once we are done with the upgrade.
613-
if err := hooks.MarkAsPending(ctx, g.Client, s.Current.Cluster, runtimehooksv1.AfterControlPlaneUpgrade, runtimehooksv1.AfterClusterUpgrade); err != nil {
614-
return "", err
615-
}
616-
617-
// Pick up the new version
653+
// Select the next version for the control plane
618654
if len(s.UpgradeTracker.ControlPlane.UpgradePlan) == 0 {
619655
return "", errors.New("cannot compute the control plane version if the control plane is pending upgrade and the upgrade plan is not set")
620656
}
621657
nextVersion := s.UpgradeTracker.ControlPlane.UpgradePlan[0]
622658

659+
if feature.Gates.Enabled(feature.RuntimeSDK) {
660+
// Note: calling the BeforeControlPlaneUpgrade is the first step of a control plane upgrade step from the upgrade plan.
661+
hookCompleted, err := g.callBeforeControlPlaneUpgradeHook(ctx, s, currentVersion, nextVersion)
662+
if err != nil {
663+
return "", err
664+
}
665+
if !hookCompleted {
666+
return *currentVersion, nil
667+
}
668+
669+
// After BeforeControlPlaneUpgrade unblocked the upgrade step, consider the upgrade step started.
670+
// As a consequence, the system starts tracking the intent of calling other hooks for this upgrade step:
671+
// - AfterControlPlaneUpgrade hook to be called after the control plane completes the upgrade step.
672+
// - If workers are required to upgrade to the current control plane version:
673+
// - BeforeWorkersUpgrade hook to be called before workers start the upgrade step.
674+
// - AfterWorkersUpgrade hook to be called after workers complete the upgrade step.
675+
hooksToBeCalled := []runtimecatalog.Hook{runtimehooksv1.AfterControlPlaneUpgrade}
676+
machineDeploymentPendingUpgrade := len(s.UpgradeTracker.MachineDeployments.UpgradePlan) > 0 && s.UpgradeTracker.MachineDeployments.UpgradePlan[0] == nextVersion
677+
machinePoolPendingUpgrade := len(s.UpgradeTracker.MachinePools.UpgradePlan) > 0 && s.UpgradeTracker.MachinePools.UpgradePlan[0] == nextVersion
678+
if machineDeploymentPendingUpgrade || machinePoolPendingUpgrade {
679+
hooksToBeCalled = append(hooksToBeCalled, runtimehooksv1.BeforeWorkersUpgrade, runtimehooksv1.AfterWorkersUpgrade)
680+
}
681+
if err := hooks.MarkAsPending(ctx, g.Client, s.Current.Cluster, hooksToBeCalled...); err != nil {
682+
return "", err
683+
}
684+
}
685+
623686
// The upgrade is now starting in this reconcile and not pending anymore.
624687
// Note: it is important to unset IsPendingUpgrade, otherwise reconcileState will assume that we are still waiting for another upgrade (and thus defer the one we are starting).
625688
s.UpgradeTracker.ControlPlane.IsStartingUpgrade = true
@@ -979,7 +1042,7 @@ func (g *generator) computeMachineDeploymentVersion(s *scope.Scope, machineDeplo
9791042
// Example: join could fail if the load balancers are slow in detecting when CP machines are
9801043
// being deleted.
9811044
if currentMDState == nil || currentMDState.Object == nil {
982-
if !s.UpgradeTracker.ControlPlane.IsControlPlaneStable() || s.HookResponseTracker.IsBlocking(runtimehooksv1.AfterControlPlaneUpgrade) {
1045+
if !s.UpgradeTracker.ControlPlane.IsControlPlaneStable() || s.HookResponseTracker.IsBlocking(runtimehooksv1.AfterControlPlaneUpgrade) || s.HookResponseTracker.IsBlocking(runtimehooksv1.BeforeWorkersUpgrade) {
9831046
s.UpgradeTracker.MachineDeployments.MarkPendingCreate(machineDeploymentTopology.Name)
9841047
}
9851048
return topologyVersion, nil
@@ -1007,6 +1070,12 @@ func (g *generator) computeMachineDeploymentVersion(s *scope.Scope, machineDeplo
10071070
return currentVersion, nil
10081071
}
10091072

1073+
// Return early if the BeforeWorkersUpgrade hook returns a blocking response.
1074+
if s.HookResponseTracker.IsBlocking(runtimehooksv1.BeforeWorkersUpgrade) {
1075+
s.UpgradeTracker.MachineDeployments.MarkPendingUpgrade(currentMDState.Object.Name)
1076+
return currentVersion, nil
1077+
}
1078+
10101079
// Return early if the upgrade concurrency is reached.
10111080
if s.UpgradeTracker.MachineDeployments.UpgradeConcurrencyReached() {
10121081
s.UpgradeTracker.MachineDeployments.MarkPendingUpgrade(currentMDState.Object.Name)
@@ -1293,7 +1362,7 @@ func (g *generator) computeMachinePoolVersion(s *scope.Scope, machinePoolTopolog
12931362
// Example: join could fail if the load balancers are slow in detecting when CP machines are
12941363
// being deleted.
12951364
if currentMPState == nil || currentMPState.Object == nil {
1296-
if !s.UpgradeTracker.ControlPlane.IsControlPlaneStable() || s.HookResponseTracker.IsBlocking(runtimehooksv1.AfterControlPlaneUpgrade) {
1365+
if !s.UpgradeTracker.ControlPlane.IsControlPlaneStable() || s.HookResponseTracker.IsBlocking(runtimehooksv1.AfterControlPlaneUpgrade) || s.HookResponseTracker.IsBlocking(runtimehooksv1.BeforeWorkersUpgrade) {
12971366
s.UpgradeTracker.MachinePools.MarkPendingCreate(machinePoolTopology.Name)
12981367
}
12991368
return topologyVersion, nil
@@ -1321,6 +1390,12 @@ func (g *generator) computeMachinePoolVersion(s *scope.Scope, machinePoolTopolog
13211390
return currentVersion, nil
13221391
}
13231392

1393+
// Return early if the BeforeWorkersUpgrade hook returns a blocking response.
1394+
if s.HookResponseTracker.IsBlocking(runtimehooksv1.BeforeWorkersUpgrade) {
1395+
s.UpgradeTracker.MachinePools.MarkPendingUpgrade(currentMPState.Object.Name)
1396+
return currentVersion, nil
1397+
}
1398+
13241399
// Return early if the upgrade concurrency is reached.
13251400
if s.UpgradeTracker.MachinePools.UpgradeConcurrencyReached() {
13261401
s.UpgradeTracker.MachinePools.MarkPendingUpgrade(currentMPState.Object.Name)

0 commit comments

Comments
 (0)