From c0657c79010a965ba30ec5669722ca1c2b1c9034 Mon Sep 17 00:00:00 2001 From: Liangquan Li Date: Thu, 13 Nov 2025 23:24:21 +0800 Subject: [PATCH 1/2] feat: implement auto-population of AWSMachineTemplate capacity and nodeInfo Add AWSMachineTemplateReconciler to automatically populate capacity and node info fields by querying AWS EC2 API. This completes the autoscaling from zero implementation by ensuring the required metadata is available without manual configuration. Changes include: - Add NodeInfo struct with Architecture and OperatingSystem fields to AWSMachineTemplate status - Implement controller that queries EC2 API for instance type specifications - Auto-populate CPU, memory, pods, and ephemeral storage capacity - Auto-detect architecture (amd64/arm64) and OS (linux/windows) from AMI - Add conversion logic for backward compatibility with v1beta1 - Enable status subresource on AWSMachineTemplate CRD - Add comprehensive unit tests (351 lines) covering various scenarios - Add RBAC permissions for controller operations The controller automatically populates these fields when an AWSMachineTemplate is created or updated, eliminating the need for manual configuration and enabling Cluster Autoscaler to make informed scaling decisions from zero nodes. Related: https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md Squashed from 5 commits: - 9a92a4352 Implement autoscaling from zero by auto-populating AWSMachineTemplate capacity - 86fe07247 add AWSMachineTemplate NodeInfo - ddaf62cdd Fix review comments - 4ea52c8f2 Fix review comments 2 - b398ffcfb Fix review comments 3 --- api/v1beta1/awsmachine_conversion.go | 3 + api/v1beta1/conversion.go | 5 + api/v1beta1/zz_generated.conversion.go | 6 +- api/v1beta2/awsmachinetemplate_types.go | 43 ++ api/v1beta2/zz_generated.deepcopy.go | 20 + ....cluster.x-k8s.io_awsmachinetemplates.yaml | 25 ++ config/rbac/role.yaml | 1 + controllers/awsmachinetemplate_controller.go | 411 ++++++++++++++++++ ...awsmachinetemplate_controller_unit_test.go | 351 +++++++++++++++ main.go | 10 + .../unmanaged/unmanaged_functional_test.go | 28 ++ 11 files changed, 898 insertions(+), 5 deletions(-) create mode 100644 controllers/awsmachinetemplate_controller.go create mode 100644 controllers/awsmachinetemplate_controller_unit_test.go diff --git a/api/v1beta1/awsmachine_conversion.go b/api/v1beta1/awsmachine_conversion.go index 85f7a1c36b..f7f55af221 100644 --- a/api/v1beta1/awsmachine_conversion.go +++ b/api/v1beta1/awsmachine_conversion.go @@ -136,6 +136,9 @@ func (r *AWSMachineTemplate) ConvertTo(dstRaw conversion.Hub) error { } } + // Restore Status fields that don't exist in v1beta1. + dst.Status.NodeInfo = restored.Status.NodeInfo + return nil } diff --git a/api/v1beta1/conversion.go b/api/v1beta1/conversion.go index c3000ee97c..0f3833fba8 100644 --- a/api/v1beta1/conversion.go +++ b/api/v1beta1/conversion.go @@ -108,3 +108,8 @@ func Convert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in *v1beta2.AW // Note: DedicatedHostID is not present in v1beta1, so it will be dropped during conversion return autoConvert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in, out, s) } + +func Convert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { + // NodeInfo field is ignored (dropped) as it doesn't exist in v1beta1 + return autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in, out, s) +} diff --git a/api/v1beta1/zz_generated.conversion.go b/api/v1beta1/zz_generated.conversion.go index ebd82d07d9..19b919643e 100644 --- a/api/v1beta1/zz_generated.conversion.go +++ b/api/v1beta1/zz_generated.conversion.go @@ -1625,14 +1625,10 @@ func Convert_v1beta1_AWSMachineTemplateStatus_To_v1beta2_AWSMachineTemplateStatu func autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { out.Capacity = *(*v1.ResourceList)(unsafe.Pointer(&in.Capacity)) + // WARNING: in.NodeInfo requires manual conversion: does not exist in peer-type return nil } -// Convert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus is an autogenerated conversion function. -func Convert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { - return autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in, out, s) -} - func autoConvert_v1beta1_AWSResourceReference_To_v1beta2_AWSResourceReference(in *AWSResourceReference, out *v1beta2.AWSResourceReference, s conversion.Scope) error { out.ID = (*string)(unsafe.Pointer(in.ID)) // WARNING: in.ARN requires manual conversion: does not exist in peer-type diff --git a/api/v1beta2/awsmachinetemplate_types.go b/api/v1beta2/awsmachinetemplate_types.go index 50d8dda22d..d9d94dc73d 100644 --- a/api/v1beta2/awsmachinetemplate_types.go +++ b/api/v1beta2/awsmachinetemplate_types.go @@ -23,6 +23,42 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" ) +// Architecture represents the CPU architecture of the node. +// Its underlying type is a string and its value can be any of amd64, arm64. +type Architecture string + +// Architecture constants. +const ( + ArchitectureAmd64 Architecture = "amd64" + ArchitectureArm64 Architecture = "arm64" +) + +// OperatingSystem represents the operating system of the node. +// Its underlying type is a string and its value can be any of linux, windows. +type OperatingSystem string + +// Operating system constants. +const ( + // OperatingSystemLinux represents the Linux operating system. + OperatingSystemLinux OperatingSystem = "linux" + // OperatingSystemWindows represents the Windows operating system. + OperatingSystemWindows OperatingSystem = "windows" +) + +// NodeInfo contains information about the node's architecture and operating system. +type NodeInfo struct { + // Architecture is the CPU architecture of the node. + // Its underlying type is a string and its value can be any of amd64, arm64. + // +kubebuilder:validation:Enum=amd64;arm64 + // +optional + Architecture Architecture `json:"architecture,omitempty"` + // OperatingSystem is the operating system of the node. + // Its underlying type is a string and its value can be any of linux, windows. + // +kubebuilder:validation:Enum=linux;windows + // +optional + OperatingSystem OperatingSystem `json:"operatingSystem,omitempty"` +} + // AWSMachineTemplateStatus defines a status for an AWSMachineTemplate. type AWSMachineTemplateStatus struct { // Capacity defines the resource capacity for this machine. @@ -30,6 +66,12 @@ type AWSMachineTemplateStatus struct { // https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md // +optional Capacity corev1.ResourceList `json:"capacity,omitempty"` + + // NodeInfo contains information about the node's architecture and operating system. + // This value is used for autoscaling from zero operations as defined in: + // https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md + // +optional + NodeInfo *NodeInfo `json:"nodeInfo,omitempty"` } // AWSMachineTemplateSpec defines the desired state of AWSMachineTemplate. @@ -40,6 +82,7 @@ type AWSMachineTemplateSpec struct { // +kubebuilder:object:root=true // +kubebuilder:resource:path=awsmachinetemplates,scope=Namespaced,categories=cluster-api,shortName=awsmt // +kubebuilder:storageversion +// +kubebuilder:subresource:status // +k8s:defaulter-gen=true // AWSMachineTemplate is the schema for the Amazon EC2 Machine Templates API. diff --git a/api/v1beta2/zz_generated.deepcopy.go b/api/v1beta2/zz_generated.deepcopy.go index 1bd4313128..6a30f71412 100644 --- a/api/v1beta2/zz_generated.deepcopy.go +++ b/api/v1beta2/zz_generated.deepcopy.go @@ -948,6 +948,11 @@ func (in *AWSMachineTemplateStatus) DeepCopyInto(out *AWSMachineTemplateStatus) (*out)[key] = val.DeepCopy() } } + if in.NodeInfo != nil { + in, out := &in.NodeInfo, &out.NodeInfo + *out = new(NodeInfo) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AWSMachineTemplateStatus. @@ -2013,6 +2018,21 @@ func (in *NetworkStatus) DeepCopy() *NetworkStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeInfo) DeepCopyInto(out *NodeInfo) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeInfo. +func (in *NodeInfo) DeepCopy() *NodeInfo { + if in == nil { + return nil + } + out := new(NodeInfo) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *PrivateDNSName) DeepCopyInto(out *PrivateDNSName) { *out = *in diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml index 1d3f40efed..ca52869005 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml @@ -1156,7 +1156,32 @@ spec: This value is used for autoscaling from zero operations as defined in: https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md type: object + nodeInfo: + description: |- + NodeInfo contains information about the node's architecture and operating system. + This value is used for autoscaling from zero operations as defined in: + https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md + properties: + architecture: + description: |- + Architecture is the CPU architecture of the node. + Its underlying type is a string and its value can be any of amd64, arm64. + enum: + - amd64 + - arm64 + type: string + operatingSystem: + description: |- + OperatingSystem is the operating system of the node. + Its underlying type is a string and its value can be any of linux, windows. + enum: + - linux + - windows + type: string + type: object type: object type: object served: true storage: true + subresources: + status: {} diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 9fc33ff9ea..bb3bdb7546 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -175,6 +175,7 @@ rules: resources: - awsclusters/status - awsfargateprofiles/status + - awsmachinetemplates/status - rosaclusters/status - rosanetworks/status - rosaroleconfigs/status diff --git a/controllers/awsmachinetemplate_controller.go b/controllers/awsmachinetemplate_controller.go new file mode 100644 index 0000000000..c22c1e8afc --- /dev/null +++ b/controllers/awsmachinetemplate_controller.go @@ -0,0 +1,411 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "fmt" + "strings" + + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + + infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/v2/controlplane/eks/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope" + ec2service "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services/ec2" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/logger" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" + "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/predicates" +) + +// AWSMachineTemplateReconciler reconciles AWSMachineTemplate objects. +// +// This controller automatically populates capacity information for AWSMachineTemplate resources +// to enable autoscaling from zero. +// +// See: https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md +type AWSMachineTemplateReconciler struct { + client.Client + WatchFilterValue string +} + +// SetupWithManager sets up the controller with the Manager. +func (r *AWSMachineTemplateReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { + log := logger.FromContext(ctx) + + return ctrl.NewControllerManagedBy(mgr). + For(&infrav1.AWSMachineTemplate{}). + WithOptions(options). + WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(mgr.GetScheme(), log.GetLogger(), r.WatchFilterValue)). + Complete(r) +} + +// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachinetemplates,verbs=get;list;watch +// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachinetemplates/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsclusters,verbs=get;list;watch +// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters,verbs=get;list;watch +// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinedeployments,verbs=get;list;watch +// +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch + +// Reconcile populates capacity information for AWSMachineTemplate. +func (r *AWSMachineTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := logger.FromContext(ctx) + + // Fetch the AWSMachineTemplate + awsMachineTemplate := &infrav1.AWSMachineTemplate{} + if err := r.Get(ctx, req.NamespacedName, awsMachineTemplate); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + // Get instance type from spec + instanceType := awsMachineTemplate.Spec.Template.Spec.InstanceType + if instanceType == "" { + return ctrl.Result{}, nil + } + + // Check if capacity and nodeInfo are already populated + // This avoids unnecessary AWS API calls when the status is already populated + if len(awsMachineTemplate.Status.Capacity) > 0 && + awsMachineTemplate.Status.NodeInfo != nil && awsMachineTemplate.Status.NodeInfo.OperatingSystem != "" && awsMachineTemplate.Status.NodeInfo.Architecture != "" { + return ctrl.Result{}, nil + } + + // Find the region by checking ownerReferences + region, err := r.getRegion(ctx, awsMachineTemplate) + if err != nil { + return ctrl.Result{}, err + } + if region == "" { + return ctrl.Result{}, nil + } + + // Create global scope for this region + // Reference: exp/instancestate/awsinstancestate_controller.go:68-76 + globalScope, err := scope.NewGlobalScope(scope.GlobalScopeParams{ + ControllerName: "awsmachinetemplate", + Region: region, + }) + if err != nil { + record.Warnf(awsMachineTemplate, "AWSSessionFailed", "Failed to create AWS session for region %q: %v", region, err) + return ctrl.Result{}, nil + } + + // Create EC2 client from global scope + ec2Client := ec2.NewFromConfig(globalScope.Session()) + + // Query instance type capacity + capacity, err := r.getInstanceTypeCapacity(ctx, ec2Client, instanceType) + if err != nil { + record.Warnf(awsMachineTemplate, "CapacityQueryFailed", "Failed to query capacity for instance type %q: %v", instanceType, err) + return ctrl.Result{}, nil + } + + // Query node info (architecture and OS) + nodeInfo, err := r.getNodeInfo(ctx, ec2Client, awsMachineTemplate, instanceType) + if err != nil { + record.Warnf(awsMachineTemplate, "NodeInfoQueryFailed", "Failed to query node info for instance type %q: %v", instanceType, err) + return ctrl.Result{}, nil + } + + // Save original before modifying, then update all status fields at once + original := awsMachineTemplate.DeepCopy() + if len(capacity) > 0 { + awsMachineTemplate.Status.Capacity = capacity + } + if nodeInfo != nil && (nodeInfo.Architecture != "" || nodeInfo.OperatingSystem != "") { + awsMachineTemplate.Status.NodeInfo = nodeInfo + } + if err := r.Status().Patch(ctx, awsMachineTemplate, client.MergeFrom(original)); err != nil { + return ctrl.Result{}, errors.Wrap(err, "failed to update AWSMachineTemplate status") + } + + log.Info("Successfully populated capacity and nodeInfo", "instanceType", instanceType, "region", region, "capacity", capacity, "nodeInfo", nodeInfo) + return ctrl.Result{}, nil +} + +// getRegion finds the region by checking the template's owner cluster reference. +func (r *AWSMachineTemplateReconciler) getRegion(ctx context.Context, template *infrav1.AWSMachineTemplate) (string, error) { + // Get the owner cluster + cluster, err := util.GetOwnerCluster(ctx, r.Client, template.ObjectMeta) + if err != nil { + return "", err + } + if cluster == nil { + return "", errors.New("no owner cluster found") + } + + // Get region from AWSCluster (standard EC2-based cluster) + if cluster.Spec.InfrastructureRef != nil && cluster.Spec.InfrastructureRef.Kind == "AWSCluster" { + awsCluster := &infrav1.AWSCluster{} + if err := r.Get(ctx, client.ObjectKey{ + Namespace: cluster.Namespace, + Name: cluster.Spec.InfrastructureRef.Name, + }, awsCluster); err != nil { + if !apierrors.IsNotFound(err) { + return "", errors.Wrapf(err, "failed to get AWSCluster %s/%s", cluster.Namespace, cluster.Spec.InfrastructureRef.Name) + } + } else if awsCluster.Spec.Region != "" { + return awsCluster.Spec.Region, nil + } + } + + // Get region from AWSManagedControlPlane (EKS cluster) + if cluster.Spec.ControlPlaneRef != nil && cluster.Spec.ControlPlaneRef.Kind == "AWSManagedControlPlane" { + awsManagedCP := &ekscontrolplanev1.AWSManagedControlPlane{} + if err := r.Get(ctx, client.ObjectKey{ + Namespace: cluster.Namespace, + Name: cluster.Spec.ControlPlaneRef.Name, + }, awsManagedCP); err != nil { + if !apierrors.IsNotFound(err) { + return "", errors.Wrapf(err, "failed to get AWSManagedControlPlane %s/%s", cluster.Namespace, cluster.Spec.ControlPlaneRef.Name) + } + } else if awsManagedCP.Spec.Region != "" { + return awsManagedCP.Spec.Region, nil + } + } + + return "", nil +} + +// getInstanceTypeCapacity queries AWS EC2 API for instance type capacity information. +// Returns the resource list (CPU, Memory). +func (r *AWSMachineTemplateReconciler) getInstanceTypeCapacity(ctx context.Context, ec2Client *ec2.Client, instanceType string) (corev1.ResourceList, error) { + // Query instance type information + input := &ec2.DescribeInstanceTypesInput{ + InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(instanceType)}, + } + + result, err := ec2Client.DescribeInstanceTypes(ctx, input) + if err != nil { + return nil, errors.Wrapf(err, "failed to describe instance type %q", instanceType) + } + + if len(result.InstanceTypes) == 0 { + return nil, errors.Errorf("no information found for instance type %q", instanceType) + } + + // Extract capacity information + info := result.InstanceTypes[0] + resourceList := corev1.ResourceList{} + + // CPU + if info.VCpuInfo != nil && info.VCpuInfo.DefaultVCpus != nil { + resourceList[corev1.ResourceCPU] = *resource.NewQuantity(int64(*info.VCpuInfo.DefaultVCpus), resource.DecimalSI) + } + + // Memory + if info.MemoryInfo != nil && info.MemoryInfo.SizeInMiB != nil { + resourceList[corev1.ResourceMemory] = resource.MustParse(fmt.Sprintf("%dMi", *info.MemoryInfo.SizeInMiB)) + } + + return resourceList, nil +} + +// getNodeInfo queries node information (architecture and OS) for the AWSMachineTemplate. +// It attempts to resolve nodeInfo using three strategies in order of priority: +// 1. Directly from explicitly specified AMI ID +// 2. From default AMI lookup (requires Kubernetes version from owner MachineDeployment/KubeadmControlPlane) +// 3. From instance type architecture (OS cannot be determined, only architecture) +func (r *AWSMachineTemplateReconciler) getNodeInfo(ctx context.Context, ec2Client *ec2.Client, template *infrav1.AWSMachineTemplate, instanceType string) (*infrav1.NodeInfo, error) { + // Strategy 1: Extract nodeInfo from the AMI if an ID is set. + if amiID := ptr.Deref(template.Spec.Template.Spec.AMI.ID, ""); amiID != "" { + result, err := ec2Client.DescribeImages(ctx, &ec2.DescribeImagesInput{ + ImageIds: []string{amiID}, + }) + if err != nil { + return nil, errors.Wrapf(err, "failed to describe AMI %q", amiID) + } + if len(result.Images) == 0 { + return nil, errors.Errorf("no information found for AMI %q", amiID) + } + // Extract nodeInfo directly from the image object (no additional API call needed) + return r.extractNodeInfoFromImage(result.Images[0]), nil + } + + // No explicit AMI ID specified, query instance type to determine architecture + // This architecture will be used to lookup default AMI (Strategy 2) or as fallback (Strategy 3) + result, err := ec2Client.DescribeInstanceTypes(ctx, &ec2.DescribeInstanceTypesInput{ + InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(instanceType)}, + }) + if err != nil { + return nil, errors.Wrapf(err, "failed to describe instance type %q", instanceType) + } + + if len(result.InstanceTypes) == 0 { + return nil, errors.Errorf("no information found for instance type %q", instanceType) + } + + instanceTypeInfo := result.InstanceTypes[0] + + // Instance type must support exactly one architecture + if instanceTypeInfo.ProcessorInfo == nil || len(instanceTypeInfo.ProcessorInfo.SupportedArchitectures) != 1 { + return nil, errors.Errorf("instance type must support exactly one architecture, got %d", len(instanceTypeInfo.ProcessorInfo.SupportedArchitectures)) + } + + // Map EC2 architecture type to architecture tag for AMI lookup + var architecture string + var nodeInfoArch infrav1.Architecture + switch instanceTypeInfo.ProcessorInfo.SupportedArchitectures[0] { + case ec2types.ArchitectureTypeX8664: + architecture = ec2service.Amd64ArchitectureTag + nodeInfoArch = infrav1.ArchitectureAmd64 + case ec2types.ArchitectureTypeArm64: + architecture = ec2service.Arm64ArchitectureTag + nodeInfoArch = infrav1.ArchitectureArm64 + default: + return nil, errors.Errorf("unsupported architecture: %v", instanceTypeInfo.ProcessorInfo.SupportedArchitectures[0]) + } + + // Strategy 2: Try to get Kubernetes version and lookup default AMI + kubernetesVersion, err := r.getKubernetesVersion(ctx, template) + if err == nil && kubernetesVersion != "" { + // Attempt AMI lookup with the version + image, err := ec2service.DefaultAMILookup( + ec2Client, + template.Spec.Template.Spec.ImageLookupOrg, + template.Spec.Template.Spec.ImageLookupBaseOS, + kubernetesVersion, + architecture, + template.Spec.Template.Spec.ImageLookupFormat, + ) + if err == nil && image != nil { + // Successfully found AMI, extract accurate nodeInfo from it + return r.extractNodeInfoFromImage(*image), nil + } + // AMI lookup failed, fall through to Strategy 3 + } + + // Strategy 3: Fallback to instance type architecture only + // Note: OS cannot be determined from instance type alone, only architecture + return &infrav1.NodeInfo{ + Architecture: nodeInfoArch, + }, nil +} + +// extractNodeInfoFromImage extracts nodeInfo (architecture and OS) from an EC2 image. +// This is a pure function with no AWS API calls. +func (r *AWSMachineTemplateReconciler) extractNodeInfoFromImage(image ec2types.Image) *infrav1.NodeInfo { + nodeInfo := &infrav1.NodeInfo{} + + // Extract architecture from AMI + switch image.Architecture { + case ec2types.ArchitectureValuesX8664: + nodeInfo.Architecture = infrav1.ArchitectureAmd64 + case ec2types.ArchitectureValuesArm64: + nodeInfo.Architecture = infrav1.ArchitectureArm64 + } + + // Determine OS - default to Linux, change to Windows if detected + // Most AMIs are Linux-based, so we initialize with Linux as the default + nodeInfo.OperatingSystem = infrav1.OperatingSystemLinux + + // Check Platform field (most reliable for Windows detection) + if image.Platform == ec2types.PlatformValuesWindows { + nodeInfo.OperatingSystem = infrav1.OperatingSystemWindows + return nodeInfo + } + + // Check PlatformDetails field for Windows indication + if image.PlatformDetails != nil { + platformDetails := strings.ToLower(*image.PlatformDetails) + if strings.Contains(platformDetails, string(infrav1.OperatingSystemWindows)) { + nodeInfo.OperatingSystem = infrav1.OperatingSystemWindows + } + } + + return nodeInfo +} + +// getKubernetesVersion attempts to find the Kubernetes version by querying MachineDeployments +// or KubeadmControlPlanes that reference this AWSMachineTemplate. +func (r *AWSMachineTemplateReconciler) getKubernetesVersion(ctx context.Context, template *infrav1.AWSMachineTemplate) (string, error) { + listOpts, err := getParentListOptions(template.ObjectMeta) + if err != nil { + return "", errors.Wrap(err, "failed to get parent list options") + } + + // Try to find version from MachineDeployment first + machineDeploymentList := &clusterv1.MachineDeploymentList{} + if err := r.List(ctx, machineDeploymentList, listOpts...); err != nil { + return "", errors.Wrap(err, "failed to list MachineDeployments") + } + + // Find MachineDeployments that reference this AWSMachineTemplate + for _, md := range machineDeploymentList.Items { + if version := ptr.Deref(md.Spec.Template.Spec.Version, ""); md.Spec.Template.Spec.InfrastructureRef.Kind == "AWSMachineTemplate" && + md.Spec.Template.Spec.InfrastructureRef.Name == template.Name && version != "" { + return version, nil + } + } + + // If not found in MachineDeployment, try KubeadmControlPlane + kcpList := &controlplanev1.KubeadmControlPlaneList{} + if err := r.List(ctx, kcpList, listOpts...); err != nil { + return "", errors.Wrap(err, "failed to list KubeadmControlPlanes") + } + + // Find KubeadmControlPlanes that reference this AWSMachineTemplate + for _, kcp := range kcpList.Items { + if kcp.Spec.MachineTemplate.InfrastructureRef.Kind == "AWSMachineTemplate" && + kcp.Spec.MachineTemplate.InfrastructureRef.Name == template.Name && + kcp.Spec.Version != "" { + return kcp.Spec.Version, nil + } + } + + return "", errors.New("no MachineDeployment or KubeadmControlPlane found referencing this AWSMachineTemplate with a version") +} + +func getParentListOptions(obj metav1.ObjectMeta) ([]client.ListOption, error) { + listOpts := []client.ListOption{ + client.InNamespace(obj.Namespace), + } + + for _, ref := range obj.GetOwnerReferences() { + if ref.Kind != "Cluster" { + continue + } + gv, err := schema.ParseGroupVersion(ref.APIVersion) + if err != nil { + return nil, errors.WithStack(err) + } + if gv.Group == clusterv1.GroupVersion.Group { + listOpts = append(listOpts, client.MatchingLabels{ + clusterv1.ClusterNameLabel: ref.Name, + }) + break + } + } + return listOpts, nil +} diff --git a/controllers/awsmachinetemplate_controller_unit_test.go b/controllers/awsmachinetemplate_controller_unit_test.go new file mode 100644 index 0000000000..322e423a54 --- /dev/null +++ b/controllers/awsmachinetemplate_controller_unit_test.go @@ -0,0 +1,351 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "testing" + + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" +) + +func TestAWSMachineTemplateReconciler(t *testing.T) { + setupScheme := func() *runtime.Scheme { + scheme := runtime.NewScheme() + _ = infrav1.AddToScheme(scheme) + _ = clusterv1.AddToScheme(scheme) + _ = corev1.AddToScheme(scheme) + return scheme + } + + newFakeClient := func(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(setupScheme()). + WithObjects(objs...). + WithStatusSubresource(&infrav1.AWSMachineTemplate{}). + Build() + } + + newAWSMachineTemplate := func(name string) *infrav1.AWSMachineTemplate { + return &infrav1.AWSMachineTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "default", + }, + Spec: infrav1.AWSMachineTemplateSpec{ + Template: infrav1.AWSMachineTemplateResource{ + Spec: infrav1.AWSMachineSpec{ + InstanceType: "t3.medium", + }, + }, + }, + } + } + + t.Run("getRegion", func(t *testing.T) { + t.Run("should get region from AWSCluster", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + awsCluster := &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-cluster", + Namespace: "default", + }, + Spec: infrav1.AWSClusterSpec{ + Region: "us-west-2", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster, awsCluster), + } + + region, err := reconciler.getRegion(context.Background(), template) + + g.Expect(err).To(BeNil()) + g.Expect(region).To(Equal("us-west-2")) + }) + + t.Run("should return error when no owner cluster found", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + region, err := reconciler.getRegion(context.Background(), template) + + g.Expect(err).ToNot(BeNil()) + g.Expect(err.Error()).To(ContainSubstring("no owner cluster found")) + g.Expect(region).To(Equal("")) + }) + + t.Run("should return empty when cluster has no infrastructure ref", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster), + } + + region, err := reconciler.getRegion(context.Background(), template) + + g.Expect(err).To(BeNil()) + g.Expect(region).To(Equal("")) + }) + + t.Run("should return empty when AWSCluster not found", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster), + } + + region, err := reconciler.getRegion(context.Background(), template) + + g.Expect(err).To(BeNil()) + g.Expect(region).To(Equal("")) + }) + }) + + // Note: getInstanceTypeInfo tests are skipped as they require EC2 client injection + // which would need significant refactoring. The function is tested indirectly through + // integration tests. + + t.Run("Reconcile", func(t *testing.T) { + t.Run("should skip reconcile when capacity and nodeInfo are already populated", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Status.Capacity = corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + corev1.ResourceMemory: resource.MustParse("4Gi"), + } + template.Status.NodeInfo = &infrav1.NodeInfo{ + Architecture: infrav1.ArchitectureAmd64, + OperatingSystem: infrav1.OperatingSystemLinux, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + // Should skip reconcile and return early without calling AWS APIs + // No need to set up owner cluster or region since the early return happens before that + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should reconcile when capacity set but nodeInfo is not", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Status.Capacity = corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + } + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + awsCluster := &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-cluster", + Namespace: "default", + }, + Spec: infrav1.AWSClusterSpec{ + Region: "us-west-2", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster, awsCluster), + } + + // This will fail at AWS API call, but demonstrates that reconcile proceeds + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should skip when instance type is empty", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Spec.Template.Spec.InstanceType = "" + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should return error when no owner cluster", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).ToNot(BeNil()) + g.Expect(err.Error()).To(ContainSubstring("no owner cluster found")) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should skip when region is empty", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should return nil when template not found", func(t *testing.T) { + g := NewWithT(t) + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKey{ + Namespace: "default", + Name: "nonexistent", + }, + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + }) +} diff --git a/main.go b/main.go index 785d1e7969..eb985017eb 100644 --- a/main.go +++ b/main.go @@ -360,9 +360,19 @@ func setupReconcilersAndWebhooks(ctx context.Context, mgr ctrl.Manager, setupLog.Error(err, "unable to create controller", "controller", "AWSCluster") os.Exit(1) } + + setupLog.Info("enabling AWSMachineTemplate controller") + if err := (&controllers.AWSMachineTemplateReconciler{ + Client: mgr.GetClient(), + WatchFilterValue: watchFilterValue, + }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: awsClusterConcurrency, RecoverPanic: ptr.To[bool](true)}); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "AWSMachineTemplate") + os.Exit(1) + } } else { setupLog.Info("controller disabled", "controller", "AWSMachine", "controller-group", controllers.Unmanaged) setupLog.Info("controller disabled", "controller", "AWSCluster", "controller-group", controllers.Unmanaged) + setupLog.Info("controller disabled", "controller", "AWSMachineTemplate", "controller-group", controllers.Unmanaged) } if feature.Gates.Enabled(feature.MachinePool) { diff --git a/test/e2e/suites/unmanaged/unmanaged_functional_test.go b/test/e2e/suites/unmanaged/unmanaged_functional_test.go index f4d6d42e94..ac4e4d321a 100644 --- a/test/e2e/suites/unmanaged/unmanaged_functional_test.go +++ b/test/e2e/suites/unmanaged/unmanaged_functional_test.go @@ -352,6 +352,34 @@ var _ = ginkgo.Context("[unmanaged] [functional]", func() { }) Expect(len(workerMachines)).To(Equal(1)) Expect(len(controlPlaneMachines)).To(Equal(1)) + + ginkgo.By("Verifying AWSMachineTemplate capacity is populated for autoscaling from zero") + awsMachineTemplateList := &infrav1.AWSMachineTemplateList{} + err := e2eCtx.Environment.BootstrapClusterProxy.GetClient().List(ctx, awsMachineTemplateList, client.InNamespace(namespace.Name)) + Expect(err).To(BeNil()) + Expect(len(awsMachineTemplateList.Items)).To(BeNumerically(">", 0), "Expected at least one AWSMachineTemplate") + + foundTemplateWithCapacity := false + foundTemplateWithNodeInfo := false + for _, template := range awsMachineTemplateList.Items { + if len(template.Status.Capacity) > 0 { + foundTemplateWithCapacity = true + ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has capacity populated: %v", template.Name, template.Status.Capacity)) + Expect(template.Status.Capacity).To(HaveKey(corev1.ResourceCPU), "Expected CPU to be set in capacity") + Expect(template.Status.Capacity).To(HaveKey(corev1.ResourceMemory), "Expected Memory to be set in capacity") + } + if template.Status.NodeInfo != nil { + foundTemplateWithNodeInfo = true + ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has nodeInfo populated: %v", template.Name, template.Status.NodeInfo)) + // Verify architecture is set (should be either amd64 or arm64 for AWS) + Expect(template.Status.NodeInfo.Architecture).ToNot(BeEmpty(), "Expected architecture to be set in nodeInfo") + Expect(string(template.Status.NodeInfo.Architecture)).To(MatchRegexp("^(amd64|arm64)$"), "Expected architecture to be amd64 or arm64") + // Verify operating system is set + Expect(template.Status.NodeInfo.OperatingSystem).ToNot(BeEmpty(), "Expected operatingSystem to be set in nodeInfo") + } + } + Expect(foundTemplateWithCapacity).To(BeTrue(), "Expected at least one AWSMachineTemplate to have capacity populated") + Expect(foundTemplateWithNodeInfo).To(BeTrue(), "Expected at least one AWSMachineTemplate to have nodeInfo populated") }) }) From 03683f1998ae12b8631525d5c0346f991310dd69 Mon Sep 17 00:00:00 2001 From: Liangquan Li Date: Fri, 14 Nov 2025 00:38:06 +0800 Subject: [PATCH 2/2] Fix review comments 4 --- api/v1beta1/awsmachine_conversion.go | 1 + api/v1beta1/conversion.go | 2 +- api/v1beta1/zz_generated.conversion.go | 1 + api/v1beta2/awsmachinetemplate_types.go | 14 ++ api/v1beta2/zz_generated.deepcopy.go | 7 + ....cluster.x-k8s.io_awsmachinetemplates.yaml | 52 +++++++ controllers/awsmachinetemplate_controller.go | 24 ++- ...awsmachinetemplate_controller_unit_test.go | 140 +++++++++++++----- 8 files changed, 196 insertions(+), 45 deletions(-) diff --git a/api/v1beta1/awsmachine_conversion.go b/api/v1beta1/awsmachine_conversion.go index f7f55af221..6e87547918 100644 --- a/api/v1beta1/awsmachine_conversion.go +++ b/api/v1beta1/awsmachine_conversion.go @@ -138,6 +138,7 @@ func (r *AWSMachineTemplate) ConvertTo(dstRaw conversion.Hub) error { // Restore Status fields that don't exist in v1beta1. dst.Status.NodeInfo = restored.Status.NodeInfo + dst.Status.Conditions = restored.Status.Conditions return nil } diff --git a/api/v1beta1/conversion.go b/api/v1beta1/conversion.go index 0f3833fba8..a2a895bfd1 100644 --- a/api/v1beta1/conversion.go +++ b/api/v1beta1/conversion.go @@ -110,6 +110,6 @@ func Convert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in *v1beta2.AW } func Convert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { - // NodeInfo field is ignored (dropped) as it doesn't exist in v1beta1 + // NodeInfo and Conditions fields are ignored (dropped) as they don't exist in v1beta1 return autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in, out, s) } diff --git a/api/v1beta1/zz_generated.conversion.go b/api/v1beta1/zz_generated.conversion.go index 19b919643e..aa024f661b 100644 --- a/api/v1beta1/zz_generated.conversion.go +++ b/api/v1beta1/zz_generated.conversion.go @@ -1626,6 +1626,7 @@ func Convert_v1beta1_AWSMachineTemplateStatus_To_v1beta2_AWSMachineTemplateStatu func autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { out.Capacity = *(*v1.ResourceList)(unsafe.Pointer(&in.Capacity)) // WARNING: in.NodeInfo requires manual conversion: does not exist in peer-type + // WARNING: in.Conditions requires manual conversion: does not exist in peer-type return nil } diff --git a/api/v1beta2/awsmachinetemplate_types.go b/api/v1beta2/awsmachinetemplate_types.go index d9d94dc73d..cbbfa15492 100644 --- a/api/v1beta2/awsmachinetemplate_types.go +++ b/api/v1beta2/awsmachinetemplate_types.go @@ -72,6 +72,10 @@ type AWSMachineTemplateStatus struct { // https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md // +optional NodeInfo *NodeInfo `json:"nodeInfo,omitempty"` + + // Conditions defines current service state of the AWSMachineTemplate. + // +optional + Conditions clusterv1.Conditions `json:"conditions,omitempty"` } // AWSMachineTemplateSpec defines the desired state of AWSMachineTemplate. @@ -114,6 +118,16 @@ type AWSMachineTemplateResource struct { Spec AWSMachineSpec `json:"spec"` } +// GetConditions returns the observations of the operational state of the AWSMachineTemplate resource. +func (r *AWSMachineTemplate) GetConditions() clusterv1.Conditions { + return r.Status.Conditions +} + +// SetConditions sets the underlying service state of the AWSMachineTemplate to the predescribed clusterv1.Conditions. +func (r *AWSMachineTemplate) SetConditions(conditions clusterv1.Conditions) { + r.Status.Conditions = conditions +} + func init() { SchemeBuilder.Register(&AWSMachineTemplate{}, &AWSMachineTemplateList{}) } diff --git a/api/v1beta2/zz_generated.deepcopy.go b/api/v1beta2/zz_generated.deepcopy.go index 6a30f71412..c17702d3a1 100644 --- a/api/v1beta2/zz_generated.deepcopy.go +++ b/api/v1beta2/zz_generated.deepcopy.go @@ -953,6 +953,13 @@ func (in *AWSMachineTemplateStatus) DeepCopyInto(out *AWSMachineTemplateStatus) *out = new(NodeInfo) **out = **in } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make(v1beta1.Conditions, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AWSMachineTemplateStatus. diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml index ca52869005..f8f0b32215 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml @@ -1156,6 +1156,58 @@ spec: This value is used for autoscaling from zero operations as defined in: https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md type: object + conditions: + description: Conditions defines current service state of the AWSMachineTemplate. + items: + description: Condition defines an observation of a Cluster API resource + operational state. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when + the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This field may be empty. + maxLength: 10240 + minLength: 1 + type: string + reason: + description: |- + reason is the reason for the condition's last transition in CamelCase. + The specific API may choose whether or not this field is considered a guaranteed API. + This field may be empty. + maxLength: 256 + minLength: 1 + type: string + severity: + description: |- + severity provides an explicit classification of Reason code, so the users or machines can immediately + understand the current situation and act accordingly. + The Severity field MUST be set only when Status=False. + maxLength: 32 + type: string + status: + description: status of the condition, one of True, False, Unknown. + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions + can be useful (see .node.status.conditions), the ability to deconflict is important. + maxLength: 256 + minLength: 1 + type: string + required: + - lastTransitionTime + - status + - type + type: object + type: array nodeInfo: description: |- NodeInfo contains information about the node's architecture and operating system. diff --git a/controllers/awsmachinetemplate_controller.go b/controllers/awsmachinetemplate_controller.go index c22c1e8afc..cc19e0eaf3 100644 --- a/controllers/awsmachinetemplate_controller.go +++ b/controllers/awsmachinetemplate_controller.go @@ -40,6 +40,7 @@ import ( ec2service "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services/ec2" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/logger" "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record" + "sigs.k8s.io/cluster-api-provider-aws/v2/util/paused" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" "sigs.k8s.io/cluster-api/util" @@ -101,8 +102,22 @@ func (r *AWSMachineTemplateReconciler) Reconcile(ctx context.Context, req ctrl.R return ctrl.Result{}, nil } + // Get the owner cluster + cluster, err := util.GetOwnerCluster(ctx, r.Client, awsMachineTemplate.ObjectMeta) + if err != nil { + return ctrl.Result{}, err + } + if cluster == nil { + return ctrl.Result{}, nil + } + + // Check if the resource is paused + if isPaused, conditionChanged, err := paused.EnsurePausedCondition(ctx, r.Client, cluster, awsMachineTemplate); err != nil || isPaused || conditionChanged { + return ctrl.Result{}, err + } + // Find the region by checking ownerReferences - region, err := r.getRegion(ctx, awsMachineTemplate) + region, err := r.getRegion(ctx, cluster) if err != nil { return ctrl.Result{}, err } @@ -155,12 +170,7 @@ func (r *AWSMachineTemplateReconciler) Reconcile(ctx context.Context, req ctrl.R } // getRegion finds the region by checking the template's owner cluster reference. -func (r *AWSMachineTemplateReconciler) getRegion(ctx context.Context, template *infrav1.AWSMachineTemplate) (string, error) { - // Get the owner cluster - cluster, err := util.GetOwnerCluster(ctx, r.Client, template.ObjectMeta) - if err != nil { - return "", err - } +func (r *AWSMachineTemplateReconciler) getRegion(ctx context.Context, cluster *clusterv1.Cluster) (string, error) { if cluster == nil { return "", errors.New("no owner cluster found") } diff --git a/controllers/awsmachinetemplate_controller_unit_test.go b/controllers/awsmachinetemplate_controller_unit_test.go index 322e423a54..19482990ba 100644 --- a/controllers/awsmachinetemplate_controller_unit_test.go +++ b/controllers/awsmachinetemplate_controller_unit_test.go @@ -69,14 +69,6 @@ func TestAWSMachineTemplateReconciler(t *testing.T) { t.Run("getRegion", func(t *testing.T) { t.Run("should get region from AWSCluster", func(t *testing.T) { g := NewWithT(t) - template := newAWSMachineTemplate("test-template") - template.OwnerReferences = []metav1.OwnerReference{ - { - APIVersion: clusterv1.GroupVersion.String(), - Kind: "Cluster", - Name: "test-cluster", - }, - } cluster := &clusterv1.Cluster{ ObjectMeta: metav1.ObjectMeta{ Name: "test-cluster", @@ -101,24 +93,23 @@ func TestAWSMachineTemplateReconciler(t *testing.T) { } reconciler := &AWSMachineTemplateReconciler{ - Client: newFakeClient(template, cluster, awsCluster), + Client: newFakeClient(cluster, awsCluster), } - region, err := reconciler.getRegion(context.Background(), template) + region, err := reconciler.getRegion(context.Background(), cluster) g.Expect(err).To(BeNil()) g.Expect(region).To(Equal("us-west-2")) }) - t.Run("should return error when no owner cluster found", func(t *testing.T) { + t.Run("should return error when cluster is nil", func(t *testing.T) { g := NewWithT(t) - template := newAWSMachineTemplate("test-template") reconciler := &AWSMachineTemplateReconciler{ - Client: newFakeClient(template), + Client: newFakeClient(), } - region, err := reconciler.getRegion(context.Background(), template) + region, err := reconciler.getRegion(context.Background(), nil) g.Expect(err).ToNot(BeNil()) g.Expect(err.Error()).To(ContainSubstring("no owner cluster found")) @@ -127,14 +118,6 @@ func TestAWSMachineTemplateReconciler(t *testing.T) { t.Run("should return empty when cluster has no infrastructure ref", func(t *testing.T) { g := NewWithT(t) - template := newAWSMachineTemplate("test-template") - template.OwnerReferences = []metav1.OwnerReference{ - { - APIVersion: clusterv1.GroupVersion.String(), - Kind: "Cluster", - Name: "test-cluster", - }, - } cluster := &clusterv1.Cluster{ ObjectMeta: metav1.ObjectMeta{ Name: "test-cluster", @@ -143,10 +126,10 @@ func TestAWSMachineTemplateReconciler(t *testing.T) { } reconciler := &AWSMachineTemplateReconciler{ - Client: newFakeClient(template, cluster), + Client: newFakeClient(cluster), } - region, err := reconciler.getRegion(context.Background(), template) + region, err := reconciler.getRegion(context.Background(), cluster) g.Expect(err).To(BeNil()) g.Expect(region).To(Equal("")) @@ -154,14 +137,6 @@ func TestAWSMachineTemplateReconciler(t *testing.T) { t.Run("should return empty when AWSCluster not found", func(t *testing.T) { g := NewWithT(t) - template := newAWSMachineTemplate("test-template") - template.OwnerReferences = []metav1.OwnerReference{ - { - APIVersion: clusterv1.GroupVersion.String(), - Kind: "Cluster", - Name: "test-cluster", - }, - } cluster := &clusterv1.Cluster{ ObjectMeta: metav1.ObjectMeta{ Name: "test-cluster", @@ -177,10 +152,10 @@ func TestAWSMachineTemplateReconciler(t *testing.T) { } reconciler := &AWSMachineTemplateReconciler{ - Client: newFakeClient(template, cluster), + Client: newFakeClient(cluster), } - region, err := reconciler.getRegion(context.Background(), template) + region, err := reconciler.getRegion(context.Background(), cluster) g.Expect(err).To(BeNil()) g.Expect(region).To(Equal("")) @@ -284,7 +259,99 @@ func TestAWSMachineTemplateReconciler(t *testing.T) { g.Expect(result.Requeue).To(BeFalse()) }) - t.Run("should return error when no owner cluster", func(t *testing.T) { + t.Run("should not reconcile when cluster is paused", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + Paused: true, + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + awsCluster := &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-cluster", + Namespace: "default", + }, + Spec: infrav1.AWSClusterSpec{ + Region: "us-west-2", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster, awsCluster), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should not reconcile when template has paused annotation", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Annotations = map[string]string{clusterv1.PausedAnnotation: ""} + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + awsCluster := &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-cluster", + Namespace: "default", + }, + Spec: infrav1.AWSClusterSpec{ + Region: "us-west-2", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster, awsCluster), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should skip when no owner cluster", func(t *testing.T) { g := NewWithT(t) template := newAWSMachineTemplate("test-template") @@ -296,8 +363,7 @@ func TestAWSMachineTemplateReconciler(t *testing.T) { NamespacedName: client.ObjectKeyFromObject(template), }) - g.Expect(err).ToNot(BeNil()) - g.Expect(err.Error()).To(ContainSubstring("no owner cluster found")) + g.Expect(err).To(BeNil()) g.Expect(result.Requeue).To(BeFalse()) })