diff --git a/api/v1beta1/awsmachine_conversion.go b/api/v1beta1/awsmachine_conversion.go index 85f7a1c36b..6e87547918 100644 --- a/api/v1beta1/awsmachine_conversion.go +++ b/api/v1beta1/awsmachine_conversion.go @@ -136,6 +136,10 @@ func (r *AWSMachineTemplate) ConvertTo(dstRaw conversion.Hub) error { } } + // Restore Status fields that don't exist in v1beta1. + dst.Status.NodeInfo = restored.Status.NodeInfo + dst.Status.Conditions = restored.Status.Conditions + return nil } diff --git a/api/v1beta1/conversion.go b/api/v1beta1/conversion.go index c3000ee97c..a2a895bfd1 100644 --- a/api/v1beta1/conversion.go +++ b/api/v1beta1/conversion.go @@ -108,3 +108,8 @@ func Convert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in *v1beta2.AW // Note: DedicatedHostID is not present in v1beta1, so it will be dropped during conversion return autoConvert_v1beta2_AWSMachineStatus_To_v1beta1_AWSMachineStatus(in, out, s) } + +func Convert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { + // NodeInfo and Conditions fields are ignored (dropped) as they don't exist in v1beta1 + return autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in, out, s) +} diff --git a/api/v1beta1/zz_generated.conversion.go b/api/v1beta1/zz_generated.conversion.go index ebd82d07d9..aa024f661b 100644 --- a/api/v1beta1/zz_generated.conversion.go +++ b/api/v1beta1/zz_generated.conversion.go @@ -1625,14 +1625,11 @@ func Convert_v1beta1_AWSMachineTemplateStatus_To_v1beta2_AWSMachineTemplateStatu func autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { out.Capacity = *(*v1.ResourceList)(unsafe.Pointer(&in.Capacity)) + // WARNING: in.NodeInfo requires manual conversion: does not exist in peer-type + // WARNING: in.Conditions requires manual conversion: does not exist in peer-type return nil } -// Convert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus is an autogenerated conversion function. -func Convert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in *v1beta2.AWSMachineTemplateStatus, out *AWSMachineTemplateStatus, s conversion.Scope) error { - return autoConvert_v1beta2_AWSMachineTemplateStatus_To_v1beta1_AWSMachineTemplateStatus(in, out, s) -} - func autoConvert_v1beta1_AWSResourceReference_To_v1beta2_AWSResourceReference(in *AWSResourceReference, out *v1beta2.AWSResourceReference, s conversion.Scope) error { out.ID = (*string)(unsafe.Pointer(in.ID)) // WARNING: in.ARN requires manual conversion: does not exist in peer-type diff --git a/api/v1beta2/awsmachinetemplate_types.go b/api/v1beta2/awsmachinetemplate_types.go index 50d8dda22d..cbbfa15492 100644 --- a/api/v1beta2/awsmachinetemplate_types.go +++ b/api/v1beta2/awsmachinetemplate_types.go @@ -23,6 +23,42 @@ import ( clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" ) +// Architecture represents the CPU architecture of the node. +// Its underlying type is a string and its value can be any of amd64, arm64. +type Architecture string + +// Architecture constants. +const ( + ArchitectureAmd64 Architecture = "amd64" + ArchitectureArm64 Architecture = "arm64" +) + +// OperatingSystem represents the operating system of the node. +// Its underlying type is a string and its value can be any of linux, windows. +type OperatingSystem string + +// Operating system constants. +const ( + // OperatingSystemLinux represents the Linux operating system. + OperatingSystemLinux OperatingSystem = "linux" + // OperatingSystemWindows represents the Windows operating system. + OperatingSystemWindows OperatingSystem = "windows" +) + +// NodeInfo contains information about the node's architecture and operating system. +type NodeInfo struct { + // Architecture is the CPU architecture of the node. + // Its underlying type is a string and its value can be any of amd64, arm64. + // +kubebuilder:validation:Enum=amd64;arm64 + // +optional + Architecture Architecture `json:"architecture,omitempty"` + // OperatingSystem is the operating system of the node. + // Its underlying type is a string and its value can be any of linux, windows. + // +kubebuilder:validation:Enum=linux;windows + // +optional + OperatingSystem OperatingSystem `json:"operatingSystem,omitempty"` +} + // AWSMachineTemplateStatus defines a status for an AWSMachineTemplate. type AWSMachineTemplateStatus struct { // Capacity defines the resource capacity for this machine. @@ -30,6 +66,16 @@ type AWSMachineTemplateStatus struct { // https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md // +optional Capacity corev1.ResourceList `json:"capacity,omitempty"` + + // NodeInfo contains information about the node's architecture and operating system. + // This value is used for autoscaling from zero operations as defined in: + // https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md + // +optional + NodeInfo *NodeInfo `json:"nodeInfo,omitempty"` + + // Conditions defines current service state of the AWSMachineTemplate. + // +optional + Conditions clusterv1.Conditions `json:"conditions,omitempty"` } // AWSMachineTemplateSpec defines the desired state of AWSMachineTemplate. @@ -40,6 +86,7 @@ type AWSMachineTemplateSpec struct { // +kubebuilder:object:root=true // +kubebuilder:resource:path=awsmachinetemplates,scope=Namespaced,categories=cluster-api,shortName=awsmt // +kubebuilder:storageversion +// +kubebuilder:subresource:status // +k8s:defaulter-gen=true // AWSMachineTemplate is the schema for the Amazon EC2 Machine Templates API. @@ -71,6 +118,16 @@ type AWSMachineTemplateResource struct { Spec AWSMachineSpec `json:"spec"` } +// GetConditions returns the observations of the operational state of the AWSMachineTemplate resource. +func (r *AWSMachineTemplate) GetConditions() clusterv1.Conditions { + return r.Status.Conditions +} + +// SetConditions sets the underlying service state of the AWSMachineTemplate to the predescribed clusterv1.Conditions. +func (r *AWSMachineTemplate) SetConditions(conditions clusterv1.Conditions) { + r.Status.Conditions = conditions +} + func init() { SchemeBuilder.Register(&AWSMachineTemplate{}, &AWSMachineTemplateList{}) } diff --git a/api/v1beta2/zz_generated.deepcopy.go b/api/v1beta2/zz_generated.deepcopy.go index 1bd4313128..c17702d3a1 100644 --- a/api/v1beta2/zz_generated.deepcopy.go +++ b/api/v1beta2/zz_generated.deepcopy.go @@ -948,6 +948,18 @@ func (in *AWSMachineTemplateStatus) DeepCopyInto(out *AWSMachineTemplateStatus) (*out)[key] = val.DeepCopy() } } + if in.NodeInfo != nil { + in, out := &in.NodeInfo, &out.NodeInfo + *out = new(NodeInfo) + **out = **in + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make(v1beta1.Conditions, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AWSMachineTemplateStatus. @@ -2013,6 +2025,21 @@ func (in *NetworkStatus) DeepCopy() *NetworkStatus { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *NodeInfo) DeepCopyInto(out *NodeInfo) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeInfo. +func (in *NodeInfo) DeepCopy() *NodeInfo { + if in == nil { + return nil + } + out := new(NodeInfo) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *PrivateDNSName) DeepCopyInto(out *PrivateDNSName) { *out = *in diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml index 1d3f40efed..f8f0b32215 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_awsmachinetemplates.yaml @@ -1156,7 +1156,84 @@ spec: This value is used for autoscaling from zero operations as defined in: https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md type: object + conditions: + description: Conditions defines current service state of the AWSMachineTemplate. + items: + description: Condition defines an observation of a Cluster API resource + operational state. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when + the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This field may be empty. + maxLength: 10240 + minLength: 1 + type: string + reason: + description: |- + reason is the reason for the condition's last transition in CamelCase. + The specific API may choose whether or not this field is considered a guaranteed API. + This field may be empty. + maxLength: 256 + minLength: 1 + type: string + severity: + description: |- + severity provides an explicit classification of Reason code, so the users or machines can immediately + understand the current situation and act accordingly. + The Severity field MUST be set only when Status=False. + maxLength: 32 + type: string + status: + description: status of the condition, one of True, False, Unknown. + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions + can be useful (see .node.status.conditions), the ability to deconflict is important. + maxLength: 256 + minLength: 1 + type: string + required: + - lastTransitionTime + - status + - type + type: object + type: array + nodeInfo: + description: |- + NodeInfo contains information about the node's architecture and operating system. + This value is used for autoscaling from zero operations as defined in: + https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md + properties: + architecture: + description: |- + Architecture is the CPU architecture of the node. + Its underlying type is a string and its value can be any of amd64, arm64. + enum: + - amd64 + - arm64 + type: string + operatingSystem: + description: |- + OperatingSystem is the operating system of the node. + Its underlying type is a string and its value can be any of linux, windows. + enum: + - linux + - windows + type: string + type: object type: object type: object served: true storage: true + subresources: + status: {} diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 9fc33ff9ea..bb3bdb7546 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -175,6 +175,7 @@ rules: resources: - awsclusters/status - awsfargateprofiles/status + - awsmachinetemplates/status - rosaclusters/status - rosanetworks/status - rosaroleconfigs/status diff --git a/controllers/awsmachinetemplate_controller.go b/controllers/awsmachinetemplate_controller.go new file mode 100644 index 0000000000..cc19e0eaf3 --- /dev/null +++ b/controllers/awsmachinetemplate_controller.go @@ -0,0 +1,421 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "fmt" + "strings" + + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + + infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + ekscontrolplanev1 "sigs.k8s.io/cluster-api-provider-aws/v2/controlplane/eks/api/v1beta2" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/scope" + ec2service "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/cloud/services/ec2" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/logger" + "sigs.k8s.io/cluster-api-provider-aws/v2/pkg/record" + "sigs.k8s.io/cluster-api-provider-aws/v2/util/paused" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1" + "sigs.k8s.io/cluster-api/util" + "sigs.k8s.io/cluster-api/util/predicates" +) + +// AWSMachineTemplateReconciler reconciles AWSMachineTemplate objects. +// +// This controller automatically populates capacity information for AWSMachineTemplate resources +// to enable autoscaling from zero. +// +// See: https://github.com/kubernetes-sigs/cluster-api/blob/main/docs/proposals/20210310-opt-in-autoscaling-from-zero.md +type AWSMachineTemplateReconciler struct { + client.Client + WatchFilterValue string +} + +// SetupWithManager sets up the controller with the Manager. +func (r *AWSMachineTemplateReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { + log := logger.FromContext(ctx) + + return ctrl.NewControllerManagedBy(mgr). + For(&infrav1.AWSMachineTemplate{}). + WithOptions(options). + WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(mgr.GetScheme(), log.GetLogger(), r.WatchFilterValue)). + Complete(r) +} + +// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachinetemplates,verbs=get;list;watch +// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsmachinetemplates/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=awsclusters,verbs=get;list;watch +// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=clusters,verbs=get;list;watch +// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machinedeployments,verbs=get;list;watch +// +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch + +// Reconcile populates capacity information for AWSMachineTemplate. +func (r *AWSMachineTemplateReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := logger.FromContext(ctx) + + // Fetch the AWSMachineTemplate + awsMachineTemplate := &infrav1.AWSMachineTemplate{} + if err := r.Get(ctx, req.NamespacedName, awsMachineTemplate); err != nil { + if apierrors.IsNotFound(err) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, err + } + + // Get instance type from spec + instanceType := awsMachineTemplate.Spec.Template.Spec.InstanceType + if instanceType == "" { + return ctrl.Result{}, nil + } + + // Check if capacity and nodeInfo are already populated + // This avoids unnecessary AWS API calls when the status is already populated + if len(awsMachineTemplate.Status.Capacity) > 0 && + awsMachineTemplate.Status.NodeInfo != nil && awsMachineTemplate.Status.NodeInfo.OperatingSystem != "" && awsMachineTemplate.Status.NodeInfo.Architecture != "" { + return ctrl.Result{}, nil + } + + // Get the owner cluster + cluster, err := util.GetOwnerCluster(ctx, r.Client, awsMachineTemplate.ObjectMeta) + if err != nil { + return ctrl.Result{}, err + } + if cluster == nil { + return ctrl.Result{}, nil + } + + // Check if the resource is paused + if isPaused, conditionChanged, err := paused.EnsurePausedCondition(ctx, r.Client, cluster, awsMachineTemplate); err != nil || isPaused || conditionChanged { + return ctrl.Result{}, err + } + + // Find the region by checking ownerReferences + region, err := r.getRegion(ctx, cluster) + if err != nil { + return ctrl.Result{}, err + } + if region == "" { + return ctrl.Result{}, nil + } + + // Create global scope for this region + // Reference: exp/instancestate/awsinstancestate_controller.go:68-76 + globalScope, err := scope.NewGlobalScope(scope.GlobalScopeParams{ + ControllerName: "awsmachinetemplate", + Region: region, + }) + if err != nil { + record.Warnf(awsMachineTemplate, "AWSSessionFailed", "Failed to create AWS session for region %q: %v", region, err) + return ctrl.Result{}, nil + } + + // Create EC2 client from global scope + ec2Client := ec2.NewFromConfig(globalScope.Session()) + + // Query instance type capacity + capacity, err := r.getInstanceTypeCapacity(ctx, ec2Client, instanceType) + if err != nil { + record.Warnf(awsMachineTemplate, "CapacityQueryFailed", "Failed to query capacity for instance type %q: %v", instanceType, err) + return ctrl.Result{}, nil + } + + // Query node info (architecture and OS) + nodeInfo, err := r.getNodeInfo(ctx, ec2Client, awsMachineTemplate, instanceType) + if err != nil { + record.Warnf(awsMachineTemplate, "NodeInfoQueryFailed", "Failed to query node info for instance type %q: %v", instanceType, err) + return ctrl.Result{}, nil + } + + // Save original before modifying, then update all status fields at once + original := awsMachineTemplate.DeepCopy() + if len(capacity) > 0 { + awsMachineTemplate.Status.Capacity = capacity + } + if nodeInfo != nil && (nodeInfo.Architecture != "" || nodeInfo.OperatingSystem != "") { + awsMachineTemplate.Status.NodeInfo = nodeInfo + } + if err := r.Status().Patch(ctx, awsMachineTemplate, client.MergeFrom(original)); err != nil { + return ctrl.Result{}, errors.Wrap(err, "failed to update AWSMachineTemplate status") + } + + log.Info("Successfully populated capacity and nodeInfo", "instanceType", instanceType, "region", region, "capacity", capacity, "nodeInfo", nodeInfo) + return ctrl.Result{}, nil +} + +// getRegion finds the region by checking the template's owner cluster reference. +func (r *AWSMachineTemplateReconciler) getRegion(ctx context.Context, cluster *clusterv1.Cluster) (string, error) { + if cluster == nil { + return "", errors.New("no owner cluster found") + } + + // Get region from AWSCluster (standard EC2-based cluster) + if cluster.Spec.InfrastructureRef != nil && cluster.Spec.InfrastructureRef.Kind == "AWSCluster" { + awsCluster := &infrav1.AWSCluster{} + if err := r.Get(ctx, client.ObjectKey{ + Namespace: cluster.Namespace, + Name: cluster.Spec.InfrastructureRef.Name, + }, awsCluster); err != nil { + if !apierrors.IsNotFound(err) { + return "", errors.Wrapf(err, "failed to get AWSCluster %s/%s", cluster.Namespace, cluster.Spec.InfrastructureRef.Name) + } + } else if awsCluster.Spec.Region != "" { + return awsCluster.Spec.Region, nil + } + } + + // Get region from AWSManagedControlPlane (EKS cluster) + if cluster.Spec.ControlPlaneRef != nil && cluster.Spec.ControlPlaneRef.Kind == "AWSManagedControlPlane" { + awsManagedCP := &ekscontrolplanev1.AWSManagedControlPlane{} + if err := r.Get(ctx, client.ObjectKey{ + Namespace: cluster.Namespace, + Name: cluster.Spec.ControlPlaneRef.Name, + }, awsManagedCP); err != nil { + if !apierrors.IsNotFound(err) { + return "", errors.Wrapf(err, "failed to get AWSManagedControlPlane %s/%s", cluster.Namespace, cluster.Spec.ControlPlaneRef.Name) + } + } else if awsManagedCP.Spec.Region != "" { + return awsManagedCP.Spec.Region, nil + } + } + + return "", nil +} + +// getInstanceTypeCapacity queries AWS EC2 API for instance type capacity information. +// Returns the resource list (CPU, Memory). +func (r *AWSMachineTemplateReconciler) getInstanceTypeCapacity(ctx context.Context, ec2Client *ec2.Client, instanceType string) (corev1.ResourceList, error) { + // Query instance type information + input := &ec2.DescribeInstanceTypesInput{ + InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(instanceType)}, + } + + result, err := ec2Client.DescribeInstanceTypes(ctx, input) + if err != nil { + return nil, errors.Wrapf(err, "failed to describe instance type %q", instanceType) + } + + if len(result.InstanceTypes) == 0 { + return nil, errors.Errorf("no information found for instance type %q", instanceType) + } + + // Extract capacity information + info := result.InstanceTypes[0] + resourceList := corev1.ResourceList{} + + // CPU + if info.VCpuInfo != nil && info.VCpuInfo.DefaultVCpus != nil { + resourceList[corev1.ResourceCPU] = *resource.NewQuantity(int64(*info.VCpuInfo.DefaultVCpus), resource.DecimalSI) + } + + // Memory + if info.MemoryInfo != nil && info.MemoryInfo.SizeInMiB != nil { + resourceList[corev1.ResourceMemory] = resource.MustParse(fmt.Sprintf("%dMi", *info.MemoryInfo.SizeInMiB)) + } + + return resourceList, nil +} + +// getNodeInfo queries node information (architecture and OS) for the AWSMachineTemplate. +// It attempts to resolve nodeInfo using three strategies in order of priority: +// 1. Directly from explicitly specified AMI ID +// 2. From default AMI lookup (requires Kubernetes version from owner MachineDeployment/KubeadmControlPlane) +// 3. From instance type architecture (OS cannot be determined, only architecture) +func (r *AWSMachineTemplateReconciler) getNodeInfo(ctx context.Context, ec2Client *ec2.Client, template *infrav1.AWSMachineTemplate, instanceType string) (*infrav1.NodeInfo, error) { + // Strategy 1: Extract nodeInfo from the AMI if an ID is set. + if amiID := ptr.Deref(template.Spec.Template.Spec.AMI.ID, ""); amiID != "" { + result, err := ec2Client.DescribeImages(ctx, &ec2.DescribeImagesInput{ + ImageIds: []string{amiID}, + }) + if err != nil { + return nil, errors.Wrapf(err, "failed to describe AMI %q", amiID) + } + if len(result.Images) == 0 { + return nil, errors.Errorf("no information found for AMI %q", amiID) + } + // Extract nodeInfo directly from the image object (no additional API call needed) + return r.extractNodeInfoFromImage(result.Images[0]), nil + } + + // No explicit AMI ID specified, query instance type to determine architecture + // This architecture will be used to lookup default AMI (Strategy 2) or as fallback (Strategy 3) + result, err := ec2Client.DescribeInstanceTypes(ctx, &ec2.DescribeInstanceTypesInput{ + InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(instanceType)}, + }) + if err != nil { + return nil, errors.Wrapf(err, "failed to describe instance type %q", instanceType) + } + + if len(result.InstanceTypes) == 0 { + return nil, errors.Errorf("no information found for instance type %q", instanceType) + } + + instanceTypeInfo := result.InstanceTypes[0] + + // Instance type must support exactly one architecture + if instanceTypeInfo.ProcessorInfo == nil || len(instanceTypeInfo.ProcessorInfo.SupportedArchitectures) != 1 { + return nil, errors.Errorf("instance type must support exactly one architecture, got %d", len(instanceTypeInfo.ProcessorInfo.SupportedArchitectures)) + } + + // Map EC2 architecture type to architecture tag for AMI lookup + var architecture string + var nodeInfoArch infrav1.Architecture + switch instanceTypeInfo.ProcessorInfo.SupportedArchitectures[0] { + case ec2types.ArchitectureTypeX8664: + architecture = ec2service.Amd64ArchitectureTag + nodeInfoArch = infrav1.ArchitectureAmd64 + case ec2types.ArchitectureTypeArm64: + architecture = ec2service.Arm64ArchitectureTag + nodeInfoArch = infrav1.ArchitectureArm64 + default: + return nil, errors.Errorf("unsupported architecture: %v", instanceTypeInfo.ProcessorInfo.SupportedArchitectures[0]) + } + + // Strategy 2: Try to get Kubernetes version and lookup default AMI + kubernetesVersion, err := r.getKubernetesVersion(ctx, template) + if err == nil && kubernetesVersion != "" { + // Attempt AMI lookup with the version + image, err := ec2service.DefaultAMILookup( + ec2Client, + template.Spec.Template.Spec.ImageLookupOrg, + template.Spec.Template.Spec.ImageLookupBaseOS, + kubernetesVersion, + architecture, + template.Spec.Template.Spec.ImageLookupFormat, + ) + if err == nil && image != nil { + // Successfully found AMI, extract accurate nodeInfo from it + return r.extractNodeInfoFromImage(*image), nil + } + // AMI lookup failed, fall through to Strategy 3 + } + + // Strategy 3: Fallback to instance type architecture only + // Note: OS cannot be determined from instance type alone, only architecture + return &infrav1.NodeInfo{ + Architecture: nodeInfoArch, + }, nil +} + +// extractNodeInfoFromImage extracts nodeInfo (architecture and OS) from an EC2 image. +// This is a pure function with no AWS API calls. +func (r *AWSMachineTemplateReconciler) extractNodeInfoFromImage(image ec2types.Image) *infrav1.NodeInfo { + nodeInfo := &infrav1.NodeInfo{} + + // Extract architecture from AMI + switch image.Architecture { + case ec2types.ArchitectureValuesX8664: + nodeInfo.Architecture = infrav1.ArchitectureAmd64 + case ec2types.ArchitectureValuesArm64: + nodeInfo.Architecture = infrav1.ArchitectureArm64 + } + + // Determine OS - default to Linux, change to Windows if detected + // Most AMIs are Linux-based, so we initialize with Linux as the default + nodeInfo.OperatingSystem = infrav1.OperatingSystemLinux + + // Check Platform field (most reliable for Windows detection) + if image.Platform == ec2types.PlatformValuesWindows { + nodeInfo.OperatingSystem = infrav1.OperatingSystemWindows + return nodeInfo + } + + // Check PlatformDetails field for Windows indication + if image.PlatformDetails != nil { + platformDetails := strings.ToLower(*image.PlatformDetails) + if strings.Contains(platformDetails, string(infrav1.OperatingSystemWindows)) { + nodeInfo.OperatingSystem = infrav1.OperatingSystemWindows + } + } + + return nodeInfo +} + +// getKubernetesVersion attempts to find the Kubernetes version by querying MachineDeployments +// or KubeadmControlPlanes that reference this AWSMachineTemplate. +func (r *AWSMachineTemplateReconciler) getKubernetesVersion(ctx context.Context, template *infrav1.AWSMachineTemplate) (string, error) { + listOpts, err := getParentListOptions(template.ObjectMeta) + if err != nil { + return "", errors.Wrap(err, "failed to get parent list options") + } + + // Try to find version from MachineDeployment first + machineDeploymentList := &clusterv1.MachineDeploymentList{} + if err := r.List(ctx, machineDeploymentList, listOpts...); err != nil { + return "", errors.Wrap(err, "failed to list MachineDeployments") + } + + // Find MachineDeployments that reference this AWSMachineTemplate + for _, md := range machineDeploymentList.Items { + if version := ptr.Deref(md.Spec.Template.Spec.Version, ""); md.Spec.Template.Spec.InfrastructureRef.Kind == "AWSMachineTemplate" && + md.Spec.Template.Spec.InfrastructureRef.Name == template.Name && version != "" { + return version, nil + } + } + + // If not found in MachineDeployment, try KubeadmControlPlane + kcpList := &controlplanev1.KubeadmControlPlaneList{} + if err := r.List(ctx, kcpList, listOpts...); err != nil { + return "", errors.Wrap(err, "failed to list KubeadmControlPlanes") + } + + // Find KubeadmControlPlanes that reference this AWSMachineTemplate + for _, kcp := range kcpList.Items { + if kcp.Spec.MachineTemplate.InfrastructureRef.Kind == "AWSMachineTemplate" && + kcp.Spec.MachineTemplate.InfrastructureRef.Name == template.Name && + kcp.Spec.Version != "" { + return kcp.Spec.Version, nil + } + } + + return "", errors.New("no MachineDeployment or KubeadmControlPlane found referencing this AWSMachineTemplate with a version") +} + +func getParentListOptions(obj metav1.ObjectMeta) ([]client.ListOption, error) { + listOpts := []client.ListOption{ + client.InNamespace(obj.Namespace), + } + + for _, ref := range obj.GetOwnerReferences() { + if ref.Kind != "Cluster" { + continue + } + gv, err := schema.ParseGroupVersion(ref.APIVersion) + if err != nil { + return nil, errors.WithStack(err) + } + if gv.Group == clusterv1.GroupVersion.Group { + listOpts = append(listOpts, client.MatchingLabels{ + clusterv1.ClusterNameLabel: ref.Name, + }) + break + } + } + return listOpts, nil +} diff --git a/controllers/awsmachinetemplate_controller_unit_test.go b/controllers/awsmachinetemplate_controller_unit_test.go new file mode 100644 index 0000000000..19482990ba --- /dev/null +++ b/controllers/awsmachinetemplate_controller_unit_test.go @@ -0,0 +1,417 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + "context" + "testing" + + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + infrav1 "sigs.k8s.io/cluster-api-provider-aws/v2/api/v1beta2" + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" +) + +func TestAWSMachineTemplateReconciler(t *testing.T) { + setupScheme := func() *runtime.Scheme { + scheme := runtime.NewScheme() + _ = infrav1.AddToScheme(scheme) + _ = clusterv1.AddToScheme(scheme) + _ = corev1.AddToScheme(scheme) + return scheme + } + + newFakeClient := func(objs ...client.Object) client.Client { + return fake.NewClientBuilder(). + WithScheme(setupScheme()). + WithObjects(objs...). + WithStatusSubresource(&infrav1.AWSMachineTemplate{}). + Build() + } + + newAWSMachineTemplate := func(name string) *infrav1.AWSMachineTemplate { + return &infrav1.AWSMachineTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: "default", + }, + Spec: infrav1.AWSMachineTemplateSpec{ + Template: infrav1.AWSMachineTemplateResource{ + Spec: infrav1.AWSMachineSpec{ + InstanceType: "t3.medium", + }, + }, + }, + } + } + + t.Run("getRegion", func(t *testing.T) { + t.Run("should get region from AWSCluster", func(t *testing.T) { + g := NewWithT(t) + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + awsCluster := &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-cluster", + Namespace: "default", + }, + Spec: infrav1.AWSClusterSpec{ + Region: "us-west-2", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(cluster, awsCluster), + } + + region, err := reconciler.getRegion(context.Background(), cluster) + + g.Expect(err).To(BeNil()) + g.Expect(region).To(Equal("us-west-2")) + }) + + t.Run("should return error when cluster is nil", func(t *testing.T) { + g := NewWithT(t) + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(), + } + + region, err := reconciler.getRegion(context.Background(), nil) + + g.Expect(err).ToNot(BeNil()) + g.Expect(err.Error()).To(ContainSubstring("no owner cluster found")) + g.Expect(region).To(Equal("")) + }) + + t.Run("should return empty when cluster has no infrastructure ref", func(t *testing.T) { + g := NewWithT(t) + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(cluster), + } + + region, err := reconciler.getRegion(context.Background(), cluster) + + g.Expect(err).To(BeNil()) + g.Expect(region).To(Equal("")) + }) + + t.Run("should return empty when AWSCluster not found", func(t *testing.T) { + g := NewWithT(t) + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(cluster), + } + + region, err := reconciler.getRegion(context.Background(), cluster) + + g.Expect(err).To(BeNil()) + g.Expect(region).To(Equal("")) + }) + }) + + // Note: getInstanceTypeInfo tests are skipped as they require EC2 client injection + // which would need significant refactoring. The function is tested indirectly through + // integration tests. + + t.Run("Reconcile", func(t *testing.T) { + t.Run("should skip reconcile when capacity and nodeInfo are already populated", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Status.Capacity = corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + corev1.ResourceMemory: resource.MustParse("4Gi"), + } + template.Status.NodeInfo = &infrav1.NodeInfo{ + Architecture: infrav1.ArchitectureAmd64, + OperatingSystem: infrav1.OperatingSystemLinux, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + // Should skip reconcile and return early without calling AWS APIs + // No need to set up owner cluster or region since the early return happens before that + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should reconcile when capacity set but nodeInfo is not", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Status.Capacity = corev1.ResourceList{ + corev1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), + } + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + awsCluster := &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-cluster", + Namespace: "default", + }, + Spec: infrav1.AWSClusterSpec{ + Region: "us-west-2", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster, awsCluster), + } + + // This will fail at AWS API call, but demonstrates that reconcile proceeds + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should skip when instance type is empty", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Spec.Template.Spec.InstanceType = "" + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should not reconcile when cluster is paused", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + Paused: true, + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + awsCluster := &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-cluster", + Namespace: "default", + }, + Spec: infrav1.AWSClusterSpec{ + Region: "us-west-2", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster, awsCluster), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should not reconcile when template has paused annotation", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.Annotations = map[string]string{clusterv1.PausedAnnotation: ""} + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + Spec: clusterv1.ClusterSpec{ + InfrastructureRef: &corev1.ObjectReference{ + Kind: "AWSCluster", + Name: "test-aws-cluster", + Namespace: "default", + }, + }, + } + awsCluster := &infrav1.AWSCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-aws-cluster", + Namespace: "default", + }, + Spec: infrav1.AWSClusterSpec{ + Region: "us-west-2", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster, awsCluster), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should skip when no owner cluster", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should skip when region is empty", func(t *testing.T) { + g := NewWithT(t) + template := newAWSMachineTemplate("test-template") + template.OwnerReferences = []metav1.OwnerReference{ + { + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Cluster", + Name: "test-cluster", + }, + } + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-cluster", + Namespace: "default", + }, + } + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(template, cluster), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKeyFromObject(template), + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + + t.Run("should return nil when template not found", func(t *testing.T) { + g := NewWithT(t) + + reconciler := &AWSMachineTemplateReconciler{ + Client: newFakeClient(), + } + + result, err := reconciler.Reconcile(context.Background(), ctrl.Request{ + NamespacedName: client.ObjectKey{ + Namespace: "default", + Name: "nonexistent", + }, + }) + + g.Expect(err).To(BeNil()) + g.Expect(result.Requeue).To(BeFalse()) + }) + }) +} diff --git a/main.go b/main.go index 785d1e7969..eb985017eb 100644 --- a/main.go +++ b/main.go @@ -360,9 +360,19 @@ func setupReconcilersAndWebhooks(ctx context.Context, mgr ctrl.Manager, setupLog.Error(err, "unable to create controller", "controller", "AWSCluster") os.Exit(1) } + + setupLog.Info("enabling AWSMachineTemplate controller") + if err := (&controllers.AWSMachineTemplateReconciler{ + Client: mgr.GetClient(), + WatchFilterValue: watchFilterValue, + }).SetupWithManager(ctx, mgr, controller.Options{MaxConcurrentReconciles: awsClusterConcurrency, RecoverPanic: ptr.To[bool](true)}); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "AWSMachineTemplate") + os.Exit(1) + } } else { setupLog.Info("controller disabled", "controller", "AWSMachine", "controller-group", controllers.Unmanaged) setupLog.Info("controller disabled", "controller", "AWSCluster", "controller-group", controllers.Unmanaged) + setupLog.Info("controller disabled", "controller", "AWSMachineTemplate", "controller-group", controllers.Unmanaged) } if feature.Gates.Enabled(feature.MachinePool) { diff --git a/test/e2e/suites/unmanaged/unmanaged_functional_test.go b/test/e2e/suites/unmanaged/unmanaged_functional_test.go index f4d6d42e94..ac4e4d321a 100644 --- a/test/e2e/suites/unmanaged/unmanaged_functional_test.go +++ b/test/e2e/suites/unmanaged/unmanaged_functional_test.go @@ -352,6 +352,34 @@ var _ = ginkgo.Context("[unmanaged] [functional]", func() { }) Expect(len(workerMachines)).To(Equal(1)) Expect(len(controlPlaneMachines)).To(Equal(1)) + + ginkgo.By("Verifying AWSMachineTemplate capacity is populated for autoscaling from zero") + awsMachineTemplateList := &infrav1.AWSMachineTemplateList{} + err := e2eCtx.Environment.BootstrapClusterProxy.GetClient().List(ctx, awsMachineTemplateList, client.InNamespace(namespace.Name)) + Expect(err).To(BeNil()) + Expect(len(awsMachineTemplateList.Items)).To(BeNumerically(">", 0), "Expected at least one AWSMachineTemplate") + + foundTemplateWithCapacity := false + foundTemplateWithNodeInfo := false + for _, template := range awsMachineTemplateList.Items { + if len(template.Status.Capacity) > 0 { + foundTemplateWithCapacity = true + ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has capacity populated: %v", template.Name, template.Status.Capacity)) + Expect(template.Status.Capacity).To(HaveKey(corev1.ResourceCPU), "Expected CPU to be set in capacity") + Expect(template.Status.Capacity).To(HaveKey(corev1.ResourceMemory), "Expected Memory to be set in capacity") + } + if template.Status.NodeInfo != nil { + foundTemplateWithNodeInfo = true + ginkgo.By(fmt.Sprintf("AWSMachineTemplate %s has nodeInfo populated: %v", template.Name, template.Status.NodeInfo)) + // Verify architecture is set (should be either amd64 or arm64 for AWS) + Expect(template.Status.NodeInfo.Architecture).ToNot(BeEmpty(), "Expected architecture to be set in nodeInfo") + Expect(string(template.Status.NodeInfo.Architecture)).To(MatchRegexp("^(amd64|arm64)$"), "Expected architecture to be amd64 or arm64") + // Verify operating system is set + Expect(template.Status.NodeInfo.OperatingSystem).ToNot(BeEmpty(), "Expected operatingSystem to be set in nodeInfo") + } + } + Expect(foundTemplateWithCapacity).To(BeTrue(), "Expected at least one AWSMachineTemplate to have capacity populated") + Expect(foundTemplateWithNodeInfo).To(BeTrue(), "Expected at least one AWSMachineTemplate to have nodeInfo populated") }) })