Skip to content

Commit 61e7368

Browse files
⚠️ Make the AfterClusterUpgrade hook blocking (#12984)
* Make the AfterClusterUpgrade hook blocking
  # Conflicts: # internal/webhooks/cluster.go
* Address comments
1 parent 6dd963e commit 61e7368

File tree

12 files changed

+218
-75
lines changed

12 files changed

+218
-75
lines changed

api/runtime/hooks/v1alpha1/lifecyclehooks_types.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -322,8 +322,8 @@ var _ ResponseObject = &AfterClusterUpgradeResponse{}
322322
type AfterClusterUpgradeResponse struct {
323323
metav1.TypeMeta `json:",inline"`
324324

325-
// CommonResponse contains Status and Message fields common to all response types.
326-
CommonResponse `json:",inline"`
325+
// CommonRetryResponse contains Status, Message and RetryAfterSeconds fields.
326+
CommonRetryResponse `json:",inline"`
327327
}
328328

329329
// AfterClusterUpgrade is the hook that is called after the entire cluster is updated
@@ -458,7 +458,7 @@ func init() {
458458
"Notes:\n" +
459459
"- This hook will be called only for Clusters with a managed topology\n" +
460460
"- The call's request contains the Cluster object and the Kubernetes version we upgraded to \n" +
461-
"- This is a non-blocking hook",
461+
"- This is a blocking hook; Runtime Extension implementers can use this hook to prevent the next upgrade to start.\n",
462462
})
463463

464464
catalogBuilder.RegisterHook(BeforeClusterDelete, &runtimecatalog.HookMeta{

api/runtime/hooks/v1alpha1/zz_generated.deepcopy.go

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

api/runtime/hooks/v1alpha1/zz_generated.openapi.go

Lines changed: 9 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

exp/topology/scope/hookresponsetracker_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ func TestHookResponseTracker_IsBlocking(t *testing.T) {
164164
},
165165
}
166166

167-
afterClusterUpgradeResponse := &runtimehooksv1.AfterClusterUpgradeResponse{
167+
afterAfterControlPlaneInitializedResponse := &runtimehooksv1.AfterControlPlaneInitializedResponse{
168168
CommonResponse: runtimehooksv1.CommonResponse{},
169169
}
170170

@@ -195,8 +195,8 @@ func TestHookResponseTracker_IsBlocking(t *testing.T) {
195195
t.Run("should return false if the hook is non-blocking", func(t *testing.T) {
196196
g := NewWithT(t)
197197
hrt := NewHookResponseTracker()
198-
// AfterClusterUpgradeHook is non-blocking.
199-
hrt.Add(runtimehooksv1.AfterClusterUpgrade, afterClusterUpgradeResponse)
200-
g.Expect(hrt.IsBlocking(runtimehooksv1.AfterClusterUpgrade)).To(BeFalse())
198+
// AfterControlPlaneInitialized is non-blocking.
199+
hrt.Add(runtimehooksv1.AfterControlPlaneInitialized, afterAfterControlPlaneInitializedResponse)
200+
g.Expect(hrt.IsBlocking(runtimehooksv1.AfterControlPlaneInitialized)).To(BeFalse())
201201
})
202202
}

internal/controllers/topology/cluster/cluster_controller.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -580,7 +580,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, s *scope.Scope) (ctrl.
580580
if err := hooks.MarkAsOkToDelete(ctx, r.Client, cluster); err != nil {
581581
return ctrl.Result{}, err
582582
}
583-
log.Info(fmt.Sprintf("Cluster deletion js unblocked by %s hook", runtimecatalog.HookName(runtimehooksv1.BeforeClusterDelete)))
583+
log.Info(fmt.Sprintf("Cluster deletion is unblocked by %s hook", runtimecatalog.HookName(runtimehooksv1.BeforeClusterDelete)))
584584
}
585585
}
586586
return ctrl.Result{}, nil

internal/controllers/topology/cluster/conditions_test.go

Lines changed: 40 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -919,55 +919,52 @@ func TestReconcileTopologyReconciledCondition(t *testing.T) {
919919
wantConditionMessage: "Cluster is upgrading to v1.22.0\n" +
920920
" * Following hooks are blocking upgrade: AfterWorkersUpgrade: msg",
921921
},
922-
// TODO(chained): uncomment this as soon as AfterClusterUpgrade will be blocking
923-
/*
924-
{
925-
name: "should set the condition to false if AfterClusterUpgrade hook is blocking",
926-
reconcileErr: nil,
927-
s: &scope.Scope{
928-
Current: &scope.ClusterState{
929-
Cluster: &clusterv1.Cluster{
930-
ObjectMeta: metav1.ObjectMeta{
931-
Annotations: map[string]string{
932-
runtimev1.PendingHooksAnnotation: "AfterClusterUpgrade",
933-
},
934-
},
935-
Spec: clusterv1.ClusterSpec{
936-
ControlPlaneRef: clusterv1.ContractVersionedObjectReference{Name: "controlplane1"},
937-
InfrastructureRef: clusterv1.ContractVersionedObjectReference{Name: "infra1"},
938-
Topology: clusterv1.Topology{
939-
Version: "v1.22.0",
940-
},
922+
{
923+
name: "should set the condition to false if AfterClusterUpgrade hook is blocking",
924+
reconcileErr: nil,
925+
s: &scope.Scope{
926+
Current: &scope.ClusterState{
927+
Cluster: &clusterv1.Cluster{
928+
ObjectMeta: metav1.ObjectMeta{
929+
Annotations: map[string]string{
930+
runtimev1.PendingHooksAnnotation: "AfterClusterUpgrade",
941931
},
942932
},
943-
ControlPlane: &scope.ControlPlaneState{
944-
Object: builder.ControlPlane("ns1", "controlplane1").WithVersion("v1.22.0").Build(),
933+
Spec: clusterv1.ClusterSpec{
934+
ControlPlaneRef: clusterv1.ContractVersionedObjectReference{Name: "controlplane1"},
935+
InfrastructureRef: clusterv1.ContractVersionedObjectReference{Name: "infra1"},
936+
Topology: clusterv1.Topology{
937+
Version: "v1.22.0",
938+
},
945939
},
946940
},
947-
UpgradeTracker: scope.NewUpgradeTracker(),
948-
HookResponseTracker: func() *scope.HookResponseTracker {
949-
hrt := scope.NewHookResponseTracker()
950-
hrt.Add(runtimehooksv1.AfterClusterUpgrade, &runtimehooksv1.AfterClusterUpgradeResponse{
951-
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
952-
CommonResponse: runtimehooksv1.CommonResponse{
953-
Message: "msg",
954-
},
955-
RetryAfterSeconds: 10,
956-
},
957-
})
958-
return hrt
959-
}(),
941+
ControlPlane: &scope.ControlPlaneState{
942+
Object: builder.ControlPlane("ns1", "controlplane1").WithVersion("v1.22.0").Build(),
943+
},
960944
},
961-
wantV1Beta1ConditionStatus: corev1.ConditionFalse,
962-
wantV1Beta1ConditionReason: clusterv1.TopologyReconciledClusterUpgradingV1Beta1Reason,
963-
wantV1Beta1ConditionMessage: "Cluster is upgrading to v1.22.0\n" +
964-
" * Following hooks are blocking upgrade: AfterClusterUpgrade: msg",
965-
wantConditionStatus: metav1.ConditionFalse,
966-
wantConditionReason: clusterv1.ClusterTopologyReconciledClusterUpgradingReason,
967-
wantConditionMessage: "Cluster is upgrading to v1.22.0\n" +
968-
" * Following hooks are blocking upgrade: AfterClusterUpgrade: msg",
945+
UpgradeTracker: scope.NewUpgradeTracker(),
946+
HookResponseTracker: func() *scope.HookResponseTracker {
947+
hrt := scope.NewHookResponseTracker()
948+
hrt.Add(runtimehooksv1.AfterClusterUpgrade, &runtimehooksv1.AfterClusterUpgradeResponse{
949+
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
950+
CommonResponse: runtimehooksv1.CommonResponse{
951+
Message: "msg",
952+
},
953+
RetryAfterSeconds: 10,
954+
},
955+
})
956+
return hrt
957+
}(),
969958
},
970-
*/
959+
wantV1Beta1ConditionStatus: corev1.ConditionFalse,
960+
wantV1Beta1ConditionReason: clusterv1.TopologyReconciledClusterUpgradingV1Beta1Reason,
961+
wantV1Beta1ConditionMessage: "Cluster is upgrading to v1.22.0\n" +
962+
" * Following hooks are blocking upgrade: AfterClusterUpgrade: msg",
963+
wantConditionStatus: metav1.ConditionFalse,
964+
wantConditionReason: clusterv1.ClusterTopologyReconciledClusterUpgradingReason,
965+
wantConditionMessage: "Cluster is upgrading to v1.22.0\n" +
966+
" * Following hooks are blocking upgrade: AfterClusterUpgrade: msg",
967+
},
971968

972969
// Hold & defer upgrade
973970

internal/controllers/topology/cluster/reconcile_state.go

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -279,14 +279,10 @@ func (r *Reconciler) callAfterClusterUpgrade(ctx context.Context, s *scope.Scope
279279
}
280280
s.HookResponseTracker.Add(runtimehooksv1.AfterClusterUpgrade, hookResponse)
281281

282-
// TODO(chained): uncomment this as soon as AfterClusterUpgrade will be blocking
283-
//nolint:gocritic
284-
/*
285-
if hookResponse.RetryAfterSeconds != 0 {
286-
log.Info(fmt.Sprintf("Cluster upgrade to version %s completed but %s hook is still blocking", hookRequest.KubernetesVersion, runtimecatalog.HookName(runtimehooksv1.AfterClusterUpgrade)))
287-
return nil
288-
}
289-
*/
282+
if hookResponse.RetryAfterSeconds != 0 {
283+
log.Info(fmt.Sprintf("Cluster upgrade to version %s completed but next upgrades are blocked by %s hook", hookRequest.KubernetesVersion, runtimecatalog.HookName(runtimehooksv1.AfterClusterUpgrade)))
284+
return nil
285+
}
290286

291287
// The hook is successfully called; we can remove this hook from the list of pending-hooks.
292288
if err := hooks.MarkAsDone(ctx, r.Client, s.Current.Cluster, runtimehooksv1.AfterClusterUpgrade); err != nil {

internal/controllers/topology/cluster/reconcile_state_test.go

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"net/http"
2424
"regexp"
2525
"testing"
26+
"time"
2627

2728
"github.com/google/go-cmp/cmp"
2829
. "github.com/onsi/gomega"
@@ -558,13 +559,26 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) {
558559
}
559560

560561
successResponse := &runtimehooksv1.AfterClusterUpgradeResponse{
561-
CommonResponse: runtimehooksv1.CommonResponse{
562-
Status: runtimehooksv1.ResponseStatusSuccess,
562+
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
563+
CommonResponse: runtimehooksv1.CommonResponse{
564+
Status: runtimehooksv1.ResponseStatusSuccess,
565+
},
563566
},
564567
}
565568
failureResponse := &runtimehooksv1.AfterClusterUpgradeResponse{
566-
CommonResponse: runtimehooksv1.CommonResponse{
567-
Status: runtimehooksv1.ResponseStatusFailure,
569+
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
570+
CommonResponse: runtimehooksv1.CommonResponse{
571+
Status: runtimehooksv1.ResponseStatusFailure,
572+
},
573+
},
574+
}
575+
blockingResponse := &runtimehooksv1.AfterClusterUpgradeResponse{
576+
CommonRetryResponse: runtimehooksv1.CommonRetryResponse{
577+
CommonResponse: runtimehooksv1.CommonResponse{
578+
Status: runtimehooksv1.ResponseStatusSuccess,
579+
Message: "foo",
580+
},
581+
RetryAfterSeconds: 60,
568582
},
569583
}
570584

@@ -578,6 +592,7 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) {
578592
hookResponse *runtimehooksv1.AfterClusterUpgradeResponse
579593
wantMarked bool
580594
wantHookToBeCalled bool
595+
wantRetryAfter time.Duration
581596
wantError bool
582597
}{
583598
{
@@ -1172,6 +1187,44 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) {
11721187
wantHookToBeCalled: true,
11731188
wantError: false,
11741189
},
1190+
{
1191+
name: "hook should be called if the control plane, MDs, and MPs are stable at the topology version - retry response should leave the hook marked",
1192+
s: &scope.Scope{
1193+
Blueprint: &scope.ClusterBlueprint{
1194+
Topology: clusterv1.Topology{
1195+
ControlPlane: clusterv1.ControlPlaneTopology{
1196+
Replicas: ptr.To[int32](2),
1197+
},
1198+
},
1199+
},
1200+
Current: &scope.ClusterState{
1201+
Cluster: &clusterv1.Cluster{
1202+
ObjectMeta: metav1.ObjectMeta{
1203+
Name: "test-cluster",
1204+
Namespace: "test-ns",
1205+
Annotations: map[string]string{
1206+
runtimev1.PendingHooksAnnotation: "AfterClusterUpgrade",
1207+
},
1208+
},
1209+
Spec: clusterv1.ClusterSpec{
1210+
Topology: clusterv1.Topology{
1211+
Version: topologyVersion,
1212+
},
1213+
},
1214+
},
1215+
ControlPlane: &scope.ControlPlaneState{
1216+
Object: controlPlaneObj,
1217+
},
1218+
},
1219+
HookResponseTracker: scope.NewHookResponseTracker(),
1220+
UpgradeTracker: scope.NewUpgradeTracker(),
1221+
},
1222+
wantMarked: true,
1223+
hookResponse: blockingResponse,
1224+
wantHookToBeCalled: true,
1225+
wantRetryAfter: time.Second * time.Duration(blockingResponse.RetryAfterSeconds),
1226+
wantError: false,
1227+
},
11751228
{
11761229
name: "hook should be called if the control plane, MDs, and MPs are stable at the topology version - failure response should leave the hook marked",
11771230
s: &scope.Scope{
@@ -1265,6 +1318,7 @@ func TestReconcile_callAfterClusterUpgrade(t *testing.T) {
12651318
g.Expect(err).ToNot(HaveOccurred())
12661319
}
12671320

1321+
g.Expect(tt.s.HookResponseTracker.AggregateRetryAfter()).To(Equal(tt.wantRetryAfter))
12681322
if tt.wantHookToBeCalled {
12691323
g.Expect(fakeRuntimeClient.CallAllCount(runtimehooksv1.AfterClusterUpgrade)).To(Equal(1), "Expected hook to be called once")
12701324
} else {

internal/webhooks/cluster.go

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
apierrors "k8s.io/apimachinery/pkg/api/errors"
3030
"k8s.io/apimachinery/pkg/runtime"
3131
kerrors "k8s.io/apimachinery/pkg/util/errors"
32+
"k8s.io/apimachinery/pkg/util/sets"
3233
"k8s.io/apimachinery/pkg/util/validation"
3334
"k8s.io/apimachinery/pkg/util/validation/field"
3435
"k8s.io/apimachinery/pkg/util/wait"
@@ -38,7 +39,10 @@ import (
3839
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
3940

4041
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
42+
runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1"
43+
runtimev1 "sigs.k8s.io/cluster-api/api/runtime/v1beta2"
4144
"sigs.k8s.io/cluster-api/controllers/external"
45+
runtimecatalog "sigs.k8s.io/cluster-api/exp/runtime/catalog"
4246
"sigs.k8s.io/cluster-api/feature"
4347
"sigs.k8s.io/cluster-api/internal/contract"
4448
"sigs.k8s.io/cluster-api/internal/topology/check"
@@ -400,7 +404,7 @@ func (webhook *Cluster) validateTopology(ctx context.Context, oldCluster, newClu
400404
// If a generateUpgradePlan extension is defined, we assume that additionally a Cluster validating webhook is implemented
401405
// that validates Cluster.spec.topology.version in a way that matches with GenerateUpgradePlan responses.
402406
shouldValidateVersionCeiling := len(clusterClass.Spec.KubernetesVersions) == 0 && clusterClass.Spec.Upgrade.External.GenerateUpgradePlanExtension == ""
403-
if err := webhook.validateTopologyVersionUpdate(ctx, fldPath.Child("version"), newCluster.Spec.Topology.Version, inVersion, oldVersion, oldCluster, shouldValidateVersionCeiling); err != nil {
407+
if err := webhook.validateTopologyVersionUpdate(ctx, fldPath.Child("version"), newCluster.Spec.Topology.Version, inVersion, oldVersion, newCluster, oldCluster, shouldValidateVersionCeiling); err != nil {
404408
allErrs = append(allErrs, err)
405409
}
406410
}
@@ -430,7 +434,7 @@ func (webhook *Cluster) validateTopology(ctx context.Context, oldCluster, newClu
430434
return allWarnings, allErrs
431435
}
432436

433-
func (webhook *Cluster) validateTopologyVersionUpdate(ctx context.Context, fldPath *field.Path, fldValue string, inVersion, oldVersion semver.Version, oldCluster *clusterv1.Cluster, shouldValidateCeiling bool) *field.Error {
437+
func (webhook *Cluster) validateTopologyVersionUpdate(ctx context.Context, fldPath *field.Path, fldValue string, inVersion, oldVersion semver.Version, newCluster, oldCluster *clusterv1.Cluster, shouldValidateCeiling bool) *field.Error {
434438
// Nothing to do if the version doesn't change.
435439
if inVersion.String() == oldVersion.String() {
436440
return nil
@@ -461,6 +465,15 @@ func (webhook *Cluster) validateTopologyVersionUpdate(ctx context.Context, fldPa
461465
}
462466
}
463467

468+
// Cannot upgrade when lifecycle hooks are still being completed for the previous upgrade.
469+
if IsPending(runtimehooksv1.AfterClusterUpgrade, newCluster) {
470+
return field.Invalid(
471+
fldPath,
472+
fldValue,
473+
fmt.Sprintf("version cannot be changed when the %q hook is still blocking", runtimecatalog.HookName(runtimehooksv1.AfterClusterUpgrade)),
474+
)
475+
}
476+
464477
allErrs := []error{}
465478
// minor version cannot be increased if control plane is upgrading or not yet on the current version
466479
if err := validateTopologyControlPlaneVersion(ctx, webhook.Client, oldCluster, oldVersion); err != nil {
@@ -1070,3 +1083,27 @@ func validateAutoscalerAnnotationsForCluster(cluster *clusterv1.Cluster, cluster
10701083
}
10711084
return allErrs
10721085
}
1086+
1087+
// Note: code duplicated from internal/hooks/tracking.go to avoid a circular dependency when running tests
1088+
// # sigs.k8s.io/cluster-api/util/patch
1089+
// package sigs.k8s.io/cluster-api/util/patch
1090+
// imports sigs.k8s.io/cluster-api/internal/test/envtest from suite_test.go
1091+
// imports sigs.k8s.io/cluster-api/internal/webhooks from environment.go
1092+
// imports sigs.k8s.io/cluster-api/internal/hooks from cluster.go
1093+
// imports sigs.k8s.io/cluster-api/util/patch from tracking.go: import cycle not allowed in test
1094+
// TODO: investigate.
1095+
1096+
// IsPending returns true if there is an intent to call a hook being tracked in the object's PendingHooksAnnotation.
1097+
func IsPending(hook runtimecatalog.Hook, obj client.Object) bool {
1098+
hookName := runtimecatalog.HookName(hook)
1099+
annotations := obj.GetAnnotations()
1100+
if annotations == nil {
1101+
return false
1102+
}
1103+
return isInCommaSeparatedList(annotations[runtimev1.PendingHooksAnnotation], hookName)
1104+
}
1105+
1106+
func isInCommaSeparatedList(list, item string) bool {
1107+
set := sets.Set[string]{}.Insert(strings.Split(list, ",")...)
1108+
return set.Has(item)
1109+
}

0 commit comments

Comments
 (0)