From dbf82575d686f34f9f1304450f2ced065723ead3 Mon Sep 17 00:00:00 2001 From: Stefan Bueringer Date: Sat, 25 Oct 2025 09:38:12 +0200 Subject: [PATCH] Simplify Cluster webhook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Stefan Büringer buringerst@vmware.com --- internal/webhooks/cluster.go | 152 ++++++++++++++---------------- internal/webhooks/cluster_test.go | 4 +- 2 files changed, 74 insertions(+), 82 deletions(-) diff --git a/internal/webhooks/cluster.go b/internal/webhooks/cluster.go index f93c24e6c265..efe70245b70b 100644 --- a/internal/webhooks/cluster.go +++ b/internal/webhooks/cluster.go @@ -79,8 +79,6 @@ type Cluster struct { var _ webhook.CustomDefaulter = &Cluster{} var _ webhook.CustomValidator = &Cluster{} -var errClusterClassNotReconciled = errors.New("ClusterClass is not successfully reconciled") - // Default satisfies the defaulting webhook interface. func (webhook *Cluster) Default(ctx context.Context, obj runtime.Object) error { // We gather all defaulting errors and return them together. @@ -109,14 +107,15 @@ func (webhook *Cluster) Default(ctx context.Context, obj runtime.Object) error { return apierrors.NewInvalid(clusterv1.GroupVersion.WithKind("Cluster").GroupKind(), cluster.Name, allErrs) } - clusterClass, err := webhook.pollClusterClassForCluster(ctx, cluster) + clusterClass, clusterClassNotReconciled, clusterClassNotFound, err := webhook.pollClusterClassForCluster(ctx, cluster) if err != nil { - // If the ClusterClass can't be found or is not up to date ignore the error. - if apierrors.IsNotFound(err) || errors.Is(err, errClusterClassNotReconciled) { - return nil - } return apierrors.NewInternalError(errors.Wrapf(err, "Cluster %s can't be defaulted. ClusterClass %s can not be retrieved", cluster.Name, cluster.GetClassKey().Name)) } + if clusterClassNotReconciled || clusterClassNotFound { + // If the ClusterClass can't be found or is not reconciled, return as we shouldn't + // default and validate variables in that case. + return nil + } // Validate cluster class variables transitions that may be enforced by CEL validation rules on variables. // If no request found in context, then this has not come via a webhook request, so skip validation of old cluster. @@ -292,6 +291,8 @@ func (webhook *Cluster) validateTopology(ctx context.Context, oldCluster, newClu // metadata in topology should be valid allErrs = append(allErrs, validateTopologyMetadata(newCluster.Spec.Topology, fldPath)...) + allErrs = append(allErrs, validateTopologyRollout(newCluster.Spec.Topology, fldPath)...) + // upgrade concurrency should be a numeric value. if concurrency, ok := newCluster.Annotations[clusterv1.ClusterTopologyUpgradeConcurrencyAnnotation]; ok { concurrencyAnnotationField := field.NewPath("metadata", "annotations", clusterv1.ClusterTopologyUpgradeConcurrencyAnnotation) @@ -312,39 +313,36 @@ func (webhook *Cluster) validateTopology(ctx context.Context, oldCluster, newClu } // Get the ClusterClass referenced in the Cluster. - clusterClass, warnings, clusterClassPollErr := webhook.validateClusterClassExistsAndIsReconciled(ctx, newCluster) - // If the error is anything other than "NotFound" or "NotReconciled" return all errors. - if clusterClassPollErr != nil && (!apierrors.IsNotFound(clusterClassPollErr) && !errors.Is(clusterClassPollErr, errClusterClassNotReconciled)) { - allErrs = append( - allErrs, field.InternalError( - fldPath.Child("class"), - clusterClassPollErr)) - return allWarnings, allErrs + // Note: If the ClusterClass is not found, a warning and no err is returned and the ClusterClass is nil. + // Note: If the ClusterClass is not reconciled, a warning and no err is returned and the ClusterClass is returned. + clusterClass, warnings, err := webhook.validateClusterClassExistsAndIsReconciled(ctx, newCluster) + if err != nil { + return allWarnings, append(allErrs, field.InternalError(fldPath.Child("class"), err)) } - - // Add the warnings if no error was returned. allWarnings = append(allWarnings, warnings...) - // If there's no error validate the Cluster based on the ClusterClass. - if clusterClassPollErr == nil { + // If we could get the ClusterClass validate the Cluster based on the ClusterClass. + if clusterClass != nil { allErrs = append(allErrs, ValidateClusterForClusterClass(newCluster, clusterClass)...) } // Validate the Cluster and associated ClusterClass' autoscaler annotations. + // Note: ClusterClass validation is only run if clusterClass is not nil. allErrs = append(allErrs, validateAutoscalerAnnotationsForCluster(newCluster, clusterClass)...) if oldCluster != nil { // On update - // The ClusterClass must exist to proceed with update validation. Return an error if the ClusterClass was - // not found. - if apierrors.IsNotFound(clusterClassPollErr) { + // The ClusterClass must exist to proceed with update validation as in that case the ClusterClass + // should always be there (vs. during create it could be racy if Cluster and ClusterClass are created + // at the same time). Return an error if the ClusterClass was not found. + if clusterClass == nil { allErrs = append( allErrs, field.InternalError( fldPath.Child("class"), - clusterClassPollErr)) + errors.Errorf("ClusterClass %s not found", newCluster.GetClassKey()))) return allWarnings, allErrs } - // Topology or Class can not be added on update unless ClusterTopologyUnsafeUpdateClassNameAnnotation is set. + // Topology or Class can not be added or update unless ClusterTopologyUnsafeUpdateClassNameAnnotation is set. if !oldCluster.Spec.Topology.IsDefined() || oldCluster.GetClassKey().Name == "" { if _, ok := newCluster.Annotations[clusterv1.ClusterTopologyUnsafeUpdateClassNameAnnotation]; ok { return allWarnings, allErrs @@ -402,29 +400,21 @@ func (webhook *Cluster) validateTopology(ctx context.Context, oldCluster, newClu // If the ClusterClass referenced in the Topology has changed compatibility checks are needed. if oldCluster.GetClassKey() != newCluster.GetClassKey() { - if clusterClassPollErr != nil { - allErrs = append( - allErrs, field.Forbidden( - fldPath.Child("class"), - fmt.Sprintf("cannot rebase to ClusterClass %q: %s", - newCluster.GetClassKey(), clusterClassPollErr.Error()))) - // Return early with errors if the new ClusterClass can't be retrieved. - return allWarnings, allErrs - } - // Check to see if the ClusterClass referenced in the old version of the Cluster exists. - oldClusterClass, err := webhook.pollClusterClassForCluster(ctx, oldCluster) - if err != nil { + // Return early with errors if the old ClusterClass can't be retrieved. + oldClusterClass := &clusterv1.ClusterClass{} + if err := webhook.Client.Get(ctx, oldCluster.GetClassKey(), oldClusterClass); err != nil { allErrs = append( allErrs, field.Forbidden( fldPath.Child("class"), fmt.Sprintf("valid ClusterClass with name %q could not be retrieved, change from class %[1]q to class %q cannot be validated. Error: %s", oldCluster.GetClassKey(), newCluster.GetClassKey(), err.Error()))) - - // Return early with errors if the old ClusterClass can't be retrieved. return allWarnings, allErrs } + // Note: We don't care if the old ClusterClass is reconciled as the validation below doesn't need it + // and we want to allow to rebase away from a broken ClusterClass. + // Check if the new and old ClusterClasses are compatible with one another. allErrs = append(allErrs, check.ClusterClassesAreCompatible(oldClusterClass, clusterClass)...) } @@ -639,11 +629,11 @@ func validateTopologyMachinePoolVersions(ctx context.Context, ctrlClient client. return nil } -func validateClusterRollout(cluster *clusterv1.Cluster) field.ErrorList { +func validateTopologyRollout(topology clusterv1.Topology, fldPath *field.Path) field.ErrorList { var allErrs field.ErrorList - for _, md := range cluster.Spec.Topology.Workers.MachineDeployments { - fldPath := field.NewPath("spec", "topology", "workers", "machineDeployments").Key(md.Name).Child("rollout") + for _, md := range topology.Workers.MachineDeployments { + fldPath := fldPath.Child("workers", "machineDeployments").Key(md.Name).Child("rollout") allErrs = append(allErrs, validateRolloutStrategy(fldPath.Child("strategy"), md.Rollout.Strategy.RollingUpdate.MaxUnavailable, md.Rollout.Strategy.RollingUpdate.MaxSurge)...) } @@ -917,8 +907,6 @@ func ValidateClusterForClusterClass(cluster *clusterv1.Cluster, clusterClass *cl allErrs = append(allErrs, check.MachinePoolTopologiesAreValidAndDefinedInClusterClass(cluster, clusterClass)...) - allErrs = append(allErrs, validateClusterRollout(cluster)...) - // Validate the MachineHealthChecks defined in the cluster topology. allErrs = append(allErrs, validateMachineHealthChecks(cluster, clusterClass)...) return allErrs @@ -928,38 +916,37 @@ func ValidateClusterForClusterClass(cluster *clusterv1.Cluster, clusterClass *cl // In any other case it will return an error. func (webhook *Cluster) validateClusterClassExistsAndIsReconciled(ctx context.Context, newCluster *clusterv1.Cluster) (*clusterv1.ClusterClass, admission.Warnings, error) { var allWarnings admission.Warnings - clusterClass, clusterClassPollErr := webhook.pollClusterClassForCluster(ctx, newCluster) - if clusterClassPollErr != nil { - // Add a warning if the Class does not exist or if it has not been successfully reconciled. - switch { - case apierrors.IsNotFound(clusterClassPollErr): - allWarnings = append(allWarnings, - fmt.Sprintf( - "Cluster refers to ClusterClass %s, but this ClusterClass does not exist. "+ - "Cluster topology has not been fully validated. "+ - "The ClusterClass must be created to reconcile the Cluster", newCluster.GetClassKey()), - ) - case errors.Is(clusterClassPollErr, errClusterClassNotReconciled): - allWarnings = append(allWarnings, - fmt.Sprintf( - "Cluster refers to ClusterClass %s, but this ClusterClass hasn't been successfully reconciled. "+ - "Cluster topology has not been fully validated. "+ - "Please take a look at the ClusterClass status", newCluster.GetClassKey()), - ) - // If there's any other error return a generic warning with the error message. - default: - allWarnings = append(allWarnings, - fmt.Sprintf( - "Cluster refers to ClusterClass %s, but this ClusterClass could not be retrieved. "+ - "Cluster topology has not been fully validated: %s", newCluster.GetClassKey(), clusterClassPollErr.Error()), - ) - } + clusterClass, clusterClassNotReconciled, clusterClassNotFound, err := webhook.pollClusterClassForCluster(ctx, newCluster) + // Add a warning if the Class does not exist or if it has not been successfully reconciled. + switch { + case err != nil: + allWarnings = append(allWarnings, + fmt.Sprintf( + "Cluster refers to ClusterClass %s, but this ClusterClass could not be retrieved. "+ + "Cluster topology has not been fully validated: %s", newCluster.GetClassKey(), err.Error()), + ) + case clusterClassNotFound: + allWarnings = append(allWarnings, + fmt.Sprintf( + "Cluster refers to ClusterClass %s, but this ClusterClass does not exist. "+ + "Cluster topology has not been fully validated. "+ + "The ClusterClass must be created to reconcile the Cluster", newCluster.GetClassKey()), + ) + case clusterClassNotReconciled: + allWarnings = append(allWarnings, + fmt.Sprintf( + "Cluster refers to ClusterClass %s, but this ClusterClass hasn't been successfully reconciled. "+ + "Cluster topology has not been fully validated. "+ + "Please take a look at the ClusterClass status", newCluster.GetClassKey()), + ) } - return clusterClass, allWarnings, clusterClassPollErr + return clusterClass, allWarnings, err } // pollClusterClassForCluster will retry getting the ClusterClass referenced in the Cluster for two seconds. -func (webhook *Cluster) pollClusterClassForCluster(ctx context.Context, cluster *clusterv1.Cluster) (*clusterv1.ClusterClass, error) { +func (webhook *Cluster) pollClusterClassForCluster(ctx context.Context, cluster *clusterv1.Cluster) (_ *clusterv1.ClusterClass, clusterClassNotReconciled, clusterClassNotFound bool, _ error) { + var errClusterClassNotReconciled = errors.New("ClusterClass is not successfully reconciled") + clusterClass := &clusterv1.ClusterClass{} var clusterClassPollErr error _ = wait.PollUntilContextTimeout(ctx, 200*time.Millisecond, 2*time.Second, true, func(ctx context.Context) (bool, error) { @@ -967,35 +954,40 @@ func (webhook *Cluster) pollClusterClassForCluster(ctx context.Context, cluster return false, nil //nolint:nilerr } - if clusterClassPollErr = clusterClassIsReconciled(clusterClass); clusterClassPollErr != nil { - return false, nil //nolint:nilerr + if !clusterClassIsReconciled(clusterClass) { + clusterClassPollErr = errClusterClassNotReconciled + return false, nil } + clusterClassPollErr = nil return true, nil }) if clusterClassPollErr != nil { + if apierrors.IsNotFound(clusterClassPollErr) { + return nil, false, true, nil + } if errors.Is(clusterClassPollErr, errClusterClassNotReconciled) { // Return ClusterClass if we were able to get it and it's just not reconciled. - return clusterClass, clusterClassPollErr + return clusterClass, true, false, nil } - return nil, clusterClassPollErr + return nil, false, false, clusterClassPollErr } - return clusterClass, nil + return clusterClass, false, false, nil } // clusterClassIsReconciled returns errClusterClassNotReconciled if the ClusterClass has not successfully reconciled or if the // ClusterClass variables have not been successfully reconciled. -func clusterClassIsReconciled(clusterClass *clusterv1.ClusterClass) error { +func clusterClassIsReconciled(clusterClass *clusterv1.ClusterClass) bool { // If the clusterClass metadata generation does not match the status observed generation, the ClusterClass has not been successfully reconciled. if clusterClass.Generation != clusterClass.Status.ObservedGeneration { - return errClusterClassNotReconciled + return false } // If the clusterClass does not have ClusterClassVariablesReconciled==True, the ClusterClass has not been successfully reconciled. if !conditions.Has(clusterClass, clusterv1.ClusterClassVariablesReadyCondition) || conditions.IsFalse(clusterClass, clusterv1.ClusterClassVariablesReadyCondition) { - return errClusterClassNotReconciled + return false } - return nil + return true } func validateTopologyMetadata(topology clusterv1.Topology, fldPath *field.Path) field.ErrorList { diff --git a/internal/webhooks/cluster_test.go b/internal/webhooks/cluster_test.go index 208a47a1fc91..50f4a98e3e07 100644 --- a/internal/webhooks/cluster_test.go +++ b/internal/webhooks/cluster_test.go @@ -3450,11 +3450,11 @@ func TestClusterClassPollingErrors(t *testing.T) { wantErr: false, }, { - name: "Fail on update if oldCluster ClusterClass generation does not match observedGeneration", + name: "Pass on update if oldCluster ClusterClass generation does not match observedGeneration", cluster: builder.Cluster(metav1.NamespaceDefault, "cluster1").WithTopology(secondTopology).Build(), oldCluster: builder.Cluster(metav1.NamespaceDefault, "cluster1").WithTopology(topology).Build(), clusterClasses: []*clusterv1.ClusterClass{ccGenerationMismatch, secondFullyReconciled}, - wantErr: true, + wantErr: false, }, { name: "Fail on update if old Cluster ClusterClass is not found",