Skip to content

Commit 67cc286

Browse files
Improve chained upgrade observability
1 parent 376d30b commit 67cc286

17 files changed

+1284
-817
lines changed

api/core/v1beta2/cluster_types.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,15 +82,18 @@ const (
8282

8383
// ClusterTopologyReconciledControlPlaneUpgradePendingReason documents reconciliation of a Cluster topology
8484
// not yet completed because Control Plane is not yet updated to match the desired topology spec.
85+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingReason instead.
8586
ClusterTopologyReconciledControlPlaneUpgradePendingReason = "ControlPlaneUpgradePending"
8687

8788
// ClusterTopologyReconciledMachineDeploymentsCreatePendingReason documents reconciliation of a Cluster topology
8889
// not yet completed because at least one of the MachineDeployments is yet to be created.
8990
// This generally happens because new MachineDeployment creations are held off while the ControlPlane is not stable.
91+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingReason instead.
9092
ClusterTopologyReconciledMachineDeploymentsCreatePendingReason = "MachineDeploymentsCreatePending"
9193

9294
// ClusterTopologyReconciledMachineDeploymentsUpgradePendingReason documents reconciliation of a Cluster topology
9395
// not yet completed because at least one of the MachineDeployments is not yet updated to match the desired topology spec.
96+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingReason instead.
9497
ClusterTopologyReconciledMachineDeploymentsUpgradePendingReason = "MachineDeploymentsUpgradePending"
9598

9699
// ClusterTopologyReconciledMachineDeploymentsUpgradeDeferredReason documents reconciliation of a Cluster topology
@@ -99,21 +102,29 @@ const (
99102

100103
// ClusterTopologyReconciledMachinePoolsUpgradePendingReason documents reconciliation of a Cluster topology
101104
// not yet completed because at least one of the MachinePools is not yet updated to match the desired topology spec.
105+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingReason instead.
102106
ClusterTopologyReconciledMachinePoolsUpgradePendingReason = "MachinePoolsUpgradePending"
103107

104108
// ClusterTopologyReconciledMachinePoolsCreatePendingReason documents reconciliation of a Cluster topology
105109
// not yet completed because at least one of the MachinePools is yet to be created.
106110
// This generally happens because new MachinePool creations are held off while the ControlPlane is not stable.
111+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingReason instead.
107112
ClusterTopologyReconciledMachinePoolsCreatePendingReason = "MachinePoolsCreatePending"
108113

109114
// ClusterTopologyReconciledMachinePoolsUpgradeDeferredReason documents reconciliation of a Cluster topology
110115
// not yet completed because the upgrade for at least one of the MachinePools has been deferred.
116+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingReason instead.
111117
ClusterTopologyReconciledMachinePoolsUpgradeDeferredReason = "MachinePoolsUpgradeDeferred"
112118

113119
// ClusterTopologyReconciledHookBlockingReason documents reconciliation of a Cluster topology
114120
// not yet completed because at least one of the lifecycle hooks is blocking.
121+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingReason instead.
115122
ClusterTopologyReconciledHookBlockingReason = "LifecycleHookBlocking"
116123

124+
// ClusterTopologyReconciledClusterUpgradingReason documents reconciliation of a Cluster topology
125+
// not yet completed because a cluster upgrade is still in progress.
126+
ClusterTopologyReconciledClusterUpgradingReason = "ClusterUpgrading"
127+
117128
// ClusterTopologyReconciledClusterClassNotReconciledReason documents reconciliation of a Cluster topology not
118129
// yet completed because the ClusterClass has not reconciled yet. If this condition persists there may be an issue
119130
// with the ClusterClass surfaced in the ClusterClass status or controller logs.

api/core/v1beta2/v1beta1_condition_consts.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,15 +302,18 @@ const (
302302

303303
// TopologyReconciledControlPlaneUpgradePendingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
304304
// not yet completed because Control Plane is not yet updated to match the desired topology spec.
305+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingV1Beta1Reason instead.
305306
TopologyReconciledControlPlaneUpgradePendingV1Beta1Reason = "ControlPlaneUpgradePending"
306307

307308
// TopologyReconciledMachineDeploymentsCreatePendingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
308309
// not yet completed because at least one of the MachineDeployments is yet to be created.
309310
// This generally happens because new MachineDeployment creations are held off while the ControlPlane is not stable.
311+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingV1Beta1Reason instead.
310312
TopologyReconciledMachineDeploymentsCreatePendingV1Beta1Reason = "MachineDeploymentsCreatePending"
311313

312314
// TopologyReconciledMachineDeploymentsUpgradePendingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
313315
// not yet completed because at least one of the MachineDeployments is not yet updated to match the desired topology spec.
316+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingV1Beta1Reason instead.
314317
TopologyReconciledMachineDeploymentsUpgradePendingV1Beta1Reason = "MachineDeploymentsUpgradePending"
315318

316319
// TopologyReconciledMachineDeploymentsUpgradeDeferredV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
@@ -319,11 +322,13 @@ const (
319322

320323
// TopologyReconciledMachinePoolsUpgradePendingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
321324
// not yet completed because at least one of the MachinePools is not yet updated to match the desired topology spec.
325+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingV1Beta1Reason instead.
322326
TopologyReconciledMachinePoolsUpgradePendingV1Beta1Reason = "MachinePoolsUpgradePending"
323327

324328
// TopologyReconciledMachinePoolsCreatePendingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
325329
// not yet completed because at least one of the MachinePools is yet to be created.
326330
// This generally happens because new MachinePool creations are held off while the ControlPlane is not stable.
331+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingV1Beta1Reason instead.
327332
TopologyReconciledMachinePoolsCreatePendingV1Beta1Reason = "MachinePoolsCreatePending"
328333

329334
// TopologyReconciledMachinePoolsUpgradeDeferredV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
@@ -332,8 +337,13 @@ const (
332337

333338
// TopologyReconciledHookBlockingV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology
334339
// not yet completed because at least one of the lifecycle hooks is blocking.
340+
// Deprecated: please use ClusterTopologyReconciledClusterUpgradingV1Beta1Reason instead.
335341
TopologyReconciledHookBlockingV1Beta1Reason = "LifecycleHookBlocking"
336342

343+
// ClusterTopologyReconciledClusterUpgradingV1Beta1Reason documents reconciliation of a Cluster topology
344+
// not yet completed because a cluster upgrade is still in progress.
345+
ClusterTopologyReconciledClusterUpgradingV1Beta1Reason = "ClusterUpgrading"
346+
337347
// TopologyReconciledClusterClassNotReconciledV1Beta1Reason (Severity=Info) documents reconciliation of a Cluster topology not
338348
// yet completed because the ClusterClass has not reconciled yet. If this condition persists there may be an issue
339349
// with the ClusterClass surfaced in the ClusterClass status or controller logs.

exp/topology/desiredstate/desired_state.go

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package desiredstate
1919

2020
import (
2121
"context"
22+
"fmt"
2223
"maps"
2324
"reflect"
2425
"time"
@@ -30,6 +31,7 @@ import (
3031
"k8s.io/apimachinery/pkg/runtime/schema"
3132
"k8s.io/klog/v2"
3233
"k8s.io/utils/ptr"
34+
ctrl "sigs.k8s.io/controller-runtime"
3335
"sigs.k8s.io/controller-runtime/pkg/client"
3436

3537
clusterv1beta1 "sigs.k8s.io/cluster-api/api/core/v1beta1"
@@ -507,6 +509,8 @@ func (g *generator) computeControlPlane(ctx context.Context, s *scope.Scope, inf
507509
// The version is calculated using the state of the current machine deployments, the current control plane
508510
// and the version defined in the topology.
509511
func (g *generator) computeControlPlaneVersion(ctx context.Context, s *scope.Scope) (string, error) {
512+
log := ctrl.LoggerFrom(ctx)
513+
510514
topologyVersion := s.Blueprint.Topology.Version
511515
// If we are creating the control plane object (current control plane is nil), use version from topology.
512516
if s.Current.ControlPlane == nil || s.Current.ControlPlane.Object == nil {
@@ -599,8 +603,7 @@ func (g *generator) computeControlPlaneVersion(ctx context.Context, s *scope.Sco
599603
// Also check if MachineDeployments/MachinePools are already upgrading.
600604
// If the MachineDeployments/MachinePools are upgrading, then do not pick up the next control plane version yet.
601605
// We will pick up the new version after the MachineDeployments/MachinePools finish upgrading.
602-
if len(s.UpgradeTracker.MachineDeployments.UpgradingNames()) > 0 ||
603-
len(s.UpgradeTracker.MachinePools.UpgradingNames()) > 0 {
606+
if s.UpgradeTracker.MachineDeployments.IsAnyUpgrading() || s.UpgradeTracker.MachinePools.IsAnyUpgrading() {
604607
return *currentVersion, nil
605608
}
606609

@@ -692,6 +695,11 @@ func (g *generator) computeControlPlaneVersion(ctx context.Context, s *scope.Sco
692695
s.UpgradeTracker.ControlPlane.IsStartingUpgrade = true
693696
s.UpgradeTracker.ControlPlane.IsPendingUpgrade = false
694697

698+
log.Info(fmt.Sprintf("Control plane %s upgraded from version %s to version %s", klog.KObj(s.Current.ControlPlane.Object), *currentVersion, nextVersion),
699+
"ControlPlaneUpgrades", toUpgradeStep(s.UpgradeTracker.ControlPlane.UpgradePlan),
700+
"WorkersUpgrades", toUpgradeStep(s.UpgradeTracker.MachineDeployments.UpgradePlan, s.UpgradeTracker.MachinePools.UpgradePlan),
701+
s.Current.ControlPlane.Object.GetKind(), klog.KObj(s.Current.ControlPlane.Object),
702+
)
695703
return nextVersion, nil
696704
}
697705

@@ -857,7 +865,7 @@ func (g *generator) computeMachineDeployment(ctx context.Context, s *scope.Scope
857865
// Add ClusterTopologyMachineDeploymentLabel to the generated InfrastructureMachine template
858866
infraMachineTemplateLabels[clusterv1.ClusterTopologyMachineDeploymentNameLabel] = machineDeploymentTopology.Name
859867
desiredMachineDeployment.InfrastructureMachineTemplate.SetLabels(infraMachineTemplateLabels)
860-
version, err := g.computeMachineDeploymentVersion(s, machineDeploymentTopology, currentMachineDeployment)
868+
version, err := g.computeMachineDeploymentVersion(ctx, s, machineDeploymentTopology, currentMachineDeployment)
861869
if err != nil {
862870
return nil, err
863871
}
@@ -1039,7 +1047,9 @@ func (g *generator) computeMachineDeployment(ctx context.Context, s *scope.Scope
10391047
// computeMachineDeploymentVersion calculates the version of the desired machine deployment.
10401048
// The version is calculated using the state of the current machine deployments,
10411049
// the current control plane and the version defined in the topology.
1042-
func (g *generator) computeMachineDeploymentVersion(s *scope.Scope, machineDeploymentTopology clusterv1.MachineDeploymentTopology, currentMDState *scope.MachineDeploymentState) (string, error) {
1050+
func (g *generator) computeMachineDeploymentVersion(ctx context.Context, s *scope.Scope, machineDeploymentTopology clusterv1.MachineDeploymentTopology, currentMDState *scope.MachineDeploymentState) (string, error) {
1051+
log := ctrl.LoggerFrom(ctx)
1052+
10431053
topologyVersion := s.Blueprint.Topology.Version
10441054
// If creating a new machine deployment, mark it as pending if the control plane is not
10451055
// yet stable. Creating a new MD while the control plane is upgrading can lead to unexpected race conditions.
@@ -1111,6 +1121,12 @@ func (g *generator) computeMachineDeploymentVersion(s *scope.Scope, machineDeplo
11111121
s.UpgradeTracker.MachineDeployments.MarkUpgrading(currentMDState.Object.Name)
11121122

11131123
nextVersion := s.UpgradeTracker.MachineDeployments.UpgradePlan[0]
1124+
1125+
log.Info(fmt.Sprintf("MachineDeployment %s upgraded from version %s to version %s", klog.KObj(currentMDState.Object), currentVersion, nextVersion),
1126+
"ControlPlaneUpgrades", toUpgradeStep(s.UpgradeTracker.ControlPlane.UpgradePlan),
1127+
"WorkersUpgrades", toUpgradeStep(s.UpgradeTracker.MachineDeployments.UpgradePlan, s.UpgradeTracker.MachinePools.UpgradePlan),
1128+
"MachineDeployment", klog.KObj(currentMDState.Object),
1129+
)
11141130
return nextVersion, nil
11151131
}
11161132

@@ -1165,7 +1181,7 @@ func (g *generator) computeMachinePools(ctx context.Context, s *scope.Scope) (sc
11651181
// computeMachinePool computes the desired state for a MachinePoolTopology.
11661182
// The generated machinePool object is calculated using the values from the machinePoolTopology and
11671183
// the machinePool class.
1168-
func (g *generator) computeMachinePool(_ context.Context, s *scope.Scope, machinePoolTopology clusterv1.MachinePoolTopology) (*scope.MachinePoolState, error) {
1184+
func (g *generator) computeMachinePool(ctx context.Context, s *scope.Scope, machinePoolTopology clusterv1.MachinePoolTopology) (*scope.MachinePoolState, error) {
11691185
desiredMachinePool := &scope.MachinePoolState{}
11701186

11711187
// Gets the blueprint for the MachinePool class.
@@ -1243,7 +1259,7 @@ func (g *generator) computeMachinePool(_ context.Context, s *scope.Scope, machin
12431259
// Add ClusterTopologyMachinePoolLabel to the generated InfrastructureMachinePool object
12441260
infraMachinePoolObjectLabels[clusterv1.ClusterTopologyMachinePoolNameLabel] = machinePoolTopology.Name
12451261
desiredMachinePool.InfrastructureMachinePoolObject.SetLabels(infraMachinePoolObjectLabels)
1246-
version, err := g.computeMachinePoolVersion(s, machinePoolTopology, currentMachinePool)
1262+
version, err := g.computeMachinePoolVersion(ctx, s, machinePoolTopology, currentMachinePool)
12471263
if err != nil {
12481264
return nil, err
12491265
}
@@ -1359,7 +1375,9 @@ func (g *generator) computeMachinePool(_ context.Context, s *scope.Scope, machin
13591375
// computeMachinePoolVersion calculates the version of the desired machine pool.
13601376
// The version is calculated using the state of the current machine pools,
13611377
// the current control plane and the version defined in the topology.
1362-
func (g *generator) computeMachinePoolVersion(s *scope.Scope, machinePoolTopology clusterv1.MachinePoolTopology, currentMPState *scope.MachinePoolState) (string, error) {
1378+
func (g *generator) computeMachinePoolVersion(ctx context.Context, s *scope.Scope, machinePoolTopology clusterv1.MachinePoolTopology, currentMPState *scope.MachinePoolState) (string, error) {
1379+
log := ctrl.LoggerFrom(ctx)
1380+
13631381
topologyVersion := s.Blueprint.Topology.Version
13641382
// If creating a new machine pool, mark it as pending if the control plane is not
13651383
// yet stable. Creating a new MP while the control plane is upgrading can lead to unexpected race conditions.
@@ -1431,6 +1449,12 @@ func (g *generator) computeMachinePoolVersion(s *scope.Scope, machinePoolTopolog
14311449
s.UpgradeTracker.MachinePools.MarkUpgrading(currentMPState.Object.Name)
14321450

14331451
nextVersion := s.UpgradeTracker.MachinePools.UpgradePlan[0]
1452+
1453+
log.Info(fmt.Sprintf("MachinePool %s upgraded from version %s to version %s", klog.KObj(currentMPState.Object), currentVersion, nextVersion),
1454+
"ControlPlaneUpgrades", toUpgradeStep(s.UpgradeTracker.ControlPlane.UpgradePlan),
1455+
"WorkersUpgrades", toUpgradeStep(s.UpgradeTracker.MachineDeployments.UpgradePlan, s.UpgradeTracker.MachinePools.UpgradePlan),
1456+
"MachinePool", klog.KObj(currentMPState.Object),
1457+
)
14341458
return nextVersion, nil
14351459
}
14361460

exp/topology/desiredstate/desired_state_test.go

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1062,11 +1062,13 @@ func TestComputeControlPlaneVersion(t *testing.T) {
10621062

10631063
catalog := runtimecatalog.New()
10641064
_ = runtimehooksv1.AddToCatalog(catalog)
1065+
beforeClusterUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.BeforeClusterUpgrade)
1066+
beforeControlPlaneUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.BeforeControlPlaneUpgrade)
1067+
afterControlPlaneUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.AfterControlPlaneUpgrade)
1068+
beforeWorkersUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.BeforeWorkersUpgrade)
1069+
afterWorkersUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.AfterWorkersUpgrade)
1070+
afterClusterUpgradeGVH, _ := catalog.GroupVersionHook(runtimehooksv1.AfterClusterUpgrade)
10651071

1066-
beforeClusterUpgradeGVH, err := catalog.GroupVersionHook(runtimehooksv1.BeforeClusterUpgrade)
1067-
if err != nil {
1068-
panic("unable to compute GVH")
1069-
}
10701072
nonBlockingBeforeClusterUpgradeResponse := &runtimehooksv1.BeforeClusterUpgradeResponse{
10711073
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
10721074
CommonResponse: runtimehooksv1.CommonResponse{
@@ -1090,10 +1092,6 @@ func TestComputeControlPlaneVersion(t *testing.T) {
10901092
},
10911093
}
10921094

1093-
beforeControlPlaneUpgradeGVH, err := catalog.GroupVersionHook(runtimehooksv1.BeforeControlPlaneUpgrade)
1094-
if err != nil {
1095-
panic("unable to compute GVH")
1096-
}
10971095
nonBlockingBeforeControlPlaneUpgradeResponse := &runtimehooksv1.BeforeControlPlaneUpgradeResponse{
10981096
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
10991097
CommonResponse: runtimehooksv1.CommonResponse{
@@ -1117,10 +1115,6 @@ func TestComputeControlPlaneVersion(t *testing.T) {
11171115
},
11181116
}
11191117

1120-
beforeWorkersUpgradeGVH, err := catalog.GroupVersionHook(runtimehooksv1.BeforeWorkersUpgrade)
1121-
if err != nil {
1122-
panic("unable to compute GVH")
1123-
}
11241118
nonBlockingBeforeWorkersUpgradeResponse := &runtimehooksv1.BeforeWorkersUpgradeResponse{
11251119
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
11261120
CommonResponse: runtimehooksv1.CommonResponse{
@@ -1144,10 +1138,6 @@ func TestComputeControlPlaneVersion(t *testing.T) {
11441138
},
11451139
}
11461140

1147-
afterWorkersUpgradeGVH, err := catalog.GroupVersionHook(runtimehooksv1.AfterWorkersUpgrade)
1148-
if err != nil {
1149-
panic("unable to compute GVH")
1150-
}
11511141
nonBlockingAfterWorkersUpgradeResponse := &runtimehooksv1.AfterWorkersUpgradeResponse{
11521142
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
11531143
CommonResponse: runtimehooksv1.CommonResponse{
@@ -1705,6 +1695,14 @@ func TestComputeControlPlaneVersion(t *testing.T) {
17051695

17061696
runtimeClient := fakeruntimeclient.NewRuntimeClientBuilder().
17071697
WithCatalog(catalog).
1698+
WithGetAllExtensionResponses(map[runtimecatalog.GroupVersionHook][]string{
1699+
beforeClusterUpgradeGVH: {"foo"},
1700+
beforeControlPlaneUpgradeGVH: {"foo"},
1701+
afterControlPlaneUpgradeGVH: {"foo"},
1702+
beforeWorkersUpgradeGVH: {"foo"},
1703+
afterWorkersUpgradeGVH: {"foo"},
1704+
afterClusterUpgradeGVH: {"foo"},
1705+
}).
17081706
WithCallAllExtensionResponses(map[runtimecatalog.GroupVersionHook]runtimehooksv1.ResponseObject{
17091707
beforeClusterUpgradeGVH: tt.beforeClusterUpgradeResponse,
17101708
beforeControlPlaneUpgradeGVH: tt.beforeControlPlaneUpgradeResponse,
@@ -2969,7 +2967,7 @@ func TestComputeMachineDeploymentVersion(t *testing.T) {
29692967

29702968
e := generator{}
29712969

2972-
version, err := e.computeMachineDeploymentVersion(s, tt.machineDeploymentTopology, tt.currentMachineDeploymentState)
2970+
version, err := e.computeMachineDeploymentVersion(ctx, s, tt.machineDeploymentTopology, tt.currentMachineDeploymentState)
29732971
g.Expect(err).NotTo(HaveOccurred())
29742972
g.Expect(version).To(Equal(tt.expectedVersion))
29752973

@@ -3214,7 +3212,7 @@ func TestComputeMachinePoolVersion(t *testing.T) {
32143212

32153213
e := generator{}
32163214

3217-
version, err := e.computeMachinePoolVersion(s, tt.machinePoolTopology, tt.currentMachinePoolState)
3215+
version, err := e.computeMachinePoolVersion(ctx, s, tt.machinePoolTopology, tt.currentMachinePoolState)
32183216
g.Expect(err).NotTo(HaveOccurred())
32193217
g.Expect(version).To(Equal(tt.expectedVersion))
32203218

0 commit comments

Comments
 (0)