From a5399b73664b26340177461f7dfcbb0be1656e9f Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 24 Oct 2025 12:52:35 -0400 Subject: [PATCH 1/9] api: Add AvailabilityZones field to LoadBalancerSpec Add support for configuring availability zones on load balancers to enable zone-redundant configurations for high availability. For internal load balancers, zones are set directly on the frontend IP configuration. For public load balancers, zones should be set on the associated public IP addresses. The field supports up to 3 zones and uses a set list type to prevent duplicates. Zones are immutable after creation per Azure platform requirements. --- api/v1beta1/types.go | 8 ++++++++ api/v1beta1/zz_generated.deepcopy.go | 5 +++++ 2 files changed, 13 insertions(+) diff --git a/api/v1beta1/types.go b/api/v1beta1/types.go index 3423976cbdd..d20c1f9539c 100644 --- a/api/v1beta1/types.go +++ b/api/v1beta1/types.go @@ -364,6 +364,14 @@ type LoadBalancerSpec struct { // BackendPool describes the backend pool of the load balancer. // +optional BackendPool BackendPool `json:"backendPool,omitempty"` + // AvailabilityZones is a list of availability zones for the load balancer. + // When specified for an internal load balancer, the frontend IP configuration + // will be zone-redundant across the specified zones. + // For public load balancers, this should be set on the associated public IP addresses instead. + // +optional + // +listType=set + // +kubebuilder:validation:MaxItems=3 + AvailabilityZones []string `json:"availabilityZones,omitempty"` LoadBalancerClassSpec `json:",inline"` } diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 71792a2ce24..a281b35dc05 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -3441,6 +3441,11 @@ func (in *LoadBalancerSpec) DeepCopyInto(out *LoadBalancerSpec) { **out = **in } out.BackendPool = in.BackendPool + if in.AvailabilityZones != nil { + in, out := &in.AvailabilityZones, &out.AvailabilityZones + *out = make([]string, len(*in)) + copy(*out, *in) + } in.LoadBalancerClassSpec.DeepCopyInto(&out.LoadBalancerClassSpec) } From 884ff40b2578342d48ea7a6ef1dfb97e35536a10 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 24 Oct 2025 12:52:44 -0400 Subject: [PATCH 2/9] azure: Implement zone-redundant load balancer support Implement the service layer changes to support zone-redundant load balancers: - Update LBSpec to include AvailabilityZones field - Modify getFrontendIPConfigs to set zones on frontend IP configurations - Update all four load balancer specs in cluster scope to pass zones: - APIServerLB - NodeOutboundLB - ControlPlaneOutboundLB - ControlPlaneInternalLB Zones are converted from []string to []*string for Azure SDK compatibility and applied to frontend IP configurations for zone redundancy. 
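For illustration, the conversion introduced in getFrontendIPConfigs (see the
spec.go hunk below) is equivalent to:

    var zones []*string
    if len(lbSpec.AvailabilityZones) > 0 {
        zones = make([]*string, len(lbSpec.AvailabilityZones))
        for i, zone := range lbSpec.AvailabilityZones {
            zones[i] = ptr.To(zone)
        }
    }
    // zones is then assigned to each armnetwork.FrontendIPConfiguration's Zones
    // field; when no zones are configured, zones stays nil and behavior is unchanged.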
--- azure/scope/cluster.go | 4 ++++ azure/services/loadbalancers/spec.go | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/azure/scope/cluster.go b/azure/scope/cluster.go index c9760b5ad7e..0c00daceb45 100644 --- a/azure/scope/cluster.go +++ b/azure/scope/cluster.go @@ -267,6 +267,7 @@ func (s *ClusterScope) LBSpecs() []azure.ResourceSpecGetter { IdleTimeoutInMinutes: s.APIServerLB().IdleTimeoutInMinutes, AdditionalTags: s.AdditionalTags(), AdditionalPorts: s.AdditionalAPIServerLBPorts(), + AvailabilityZones: s.APIServerLB().AvailabilityZones, } if s.APIServerLB().FrontendIPs != nil { @@ -301,6 +302,7 @@ func (s *ClusterScope) LBSpecs() []azure.ResourceSpecGetter { IdleTimeoutInMinutes: s.APIServerLB().IdleTimeoutInMinutes, AdditionalTags: s.AdditionalTags(), AdditionalPorts: s.AdditionalAPIServerLBPorts(), + AvailabilityZones: s.APIServerLB().AvailabilityZones, } privateIPFound := false @@ -348,6 +350,7 @@ func (s *ClusterScope) LBSpecs() []azure.ResourceSpecGetter { IdleTimeoutInMinutes: s.NodeOutboundLB().IdleTimeoutInMinutes, Role: infrav1.NodeOutboundRole, AdditionalTags: s.AdditionalTags(), + AvailabilityZones: s.NodeOutboundLB().AvailabilityZones, }) } @@ -369,6 +372,7 @@ func (s *ClusterScope) LBSpecs() []azure.ResourceSpecGetter { IdleTimeoutInMinutes: s.ControlPlaneOutboundLB().IdleTimeoutInMinutes, Role: infrav1.ControlPlaneOutboundRole, AdditionalTags: s.AdditionalTags(), + AvailabilityZones: s.ControlPlaneOutboundLB().AvailabilityZones, }) } diff --git a/azure/services/loadbalancers/spec.go b/azure/services/loadbalancers/spec.go index 5fd3cedf9f2..625fc010b4e 100644 --- a/azure/services/loadbalancers/spec.go +++ b/azure/services/loadbalancers/spec.go @@ -48,6 +48,7 @@ type LBSpec struct { IdleTimeoutInMinutes *int32 AdditionalTags map[string]string AdditionalPorts []infrav1.LoadBalancerPort + AvailabilityZones []string } // ResourceName returns the name of the load balancer. @@ -167,6 +168,16 @@ func (s *LBSpec) Parameters(_ context.Context, existing interface{}) (parameters func getFrontendIPConfigs(lbSpec LBSpec) ([]*armnetwork.FrontendIPConfiguration, []*armnetwork.SubResource) { frontendIPConfigurations := make([]*armnetwork.FrontendIPConfiguration, 0) frontendIDs := make([]*armnetwork.SubResource, 0) + + // Convert availability zones to []*string for Azure SDK + var zones []*string + if len(lbSpec.AvailabilityZones) > 0 { + zones = make([]*string, len(lbSpec.AvailabilityZones)) + for i, zone := range lbSpec.AvailabilityZones { + zones[i] = ptr.To(zone) + } + } + for _, ipConfig := range lbSpec.FrontendIPConfigs { var properties armnetwork.FrontendIPConfigurationPropertiesFormat if lbSpec.Type == infrav1.Internal { @@ -187,6 +198,7 @@ func getFrontendIPConfigs(lbSpec LBSpec) ([]*armnetwork.FrontendIPConfiguration, frontendIPConfigurations = append(frontendIPConfigurations, &armnetwork.FrontendIPConfiguration{ Properties: &properties, Name: ptr.To(ipConfig.Name), + Zones: zones, }) frontendIDs = append(frontendIDs, &armnetwork.SubResource{ ID: ptr.To(azure.FrontendIPConfigID(lbSpec.SubscriptionID, lbSpec.ResourceGroup, lbSpec.Name, ipConfig.Name)), From 82c79a292e8aee64fd0d2f36b5094e4cab951fa0 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 24 Oct 2025 12:52:50 -0400 Subject: [PATCH 3/9] webhook: Add validation for load balancer zone immutability Add webhook validation to enforce Azure's requirement that availability zones cannot be changed after a load balancer is created. 
The validation checks all three load balancer types: - APIServerLB - NodeOutboundLB - ControlPlaneOutboundLB Any attempt to modify zones on an existing load balancer will be rejected at admission time with a clear error message, preventing users from attempting operations that would fail at the Azure API level. --- api/v1beta1/azurecluster_webhook.go | 37 +++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/api/v1beta1/azurecluster_webhook.go b/api/v1beta1/azurecluster_webhook.go index def1503c053..816d1a0afdd 100644 --- a/api/v1beta1/azurecluster_webhook.go +++ b/api/v1beta1/azurecluster_webhook.go @@ -169,6 +169,43 @@ func (*AzureClusterWebhook) ValidateUpdate(_ context.Context, oldRaw, newObj run allErrs = append(allErrs, err) } + // Validate availability zones are immutable for load balancers + if c.Spec.NetworkSpec.APIServerLB != nil && old.Spec.NetworkSpec.APIServerLB != nil { + if !webhookutils.EnsureStringSlicesAreEquivalent( + c.Spec.NetworkSpec.APIServerLB.AvailabilityZones, + old.Spec.NetworkSpec.APIServerLB.AvailabilityZones) { + allErrs = append(allErrs, + field.Invalid( + field.NewPath("spec", "networkSpec", "apiServerLB", "availabilityZones"), + c.Spec.NetworkSpec.APIServerLB.AvailabilityZones, + "field is immutable")) + } + } + + if c.Spec.NetworkSpec.NodeOutboundLB != nil && old.Spec.NetworkSpec.NodeOutboundLB != nil { + if !webhookutils.EnsureStringSlicesAreEquivalent( + c.Spec.NetworkSpec.NodeOutboundLB.AvailabilityZones, + old.Spec.NetworkSpec.NodeOutboundLB.AvailabilityZones) { + allErrs = append(allErrs, + field.Invalid( + field.NewPath("spec", "networkSpec", "nodeOutboundLB", "availabilityZones"), + c.Spec.NetworkSpec.NodeOutboundLB.AvailabilityZones, + "field is immutable")) + } + } + + if c.Spec.NetworkSpec.ControlPlaneOutboundLB != nil && old.Spec.NetworkSpec.ControlPlaneOutboundLB != nil { + if !webhookutils.EnsureStringSlicesAreEquivalent( + c.Spec.NetworkSpec.ControlPlaneOutboundLB.AvailabilityZones, + old.Spec.NetworkSpec.ControlPlaneOutboundLB.AvailabilityZones) { + allErrs = append(allErrs, + field.Invalid( + field.NewPath("spec", "networkSpec", "controlPlaneOutboundLB", "availabilityZones"), + c.Spec.NetworkSpec.ControlPlaneOutboundLB.AvailabilityZones, + "field is immutable")) + } + } + allErrs = append(allErrs, c.validateSubnetUpdate(old)...) if len(allErrs) == 0 { From bd84259fa30f4d9f6c7bf4321a83182df70fa20a Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 24 Oct 2025 12:52:57 -0400 Subject: [PATCH 4/9] test: Add unit tests for zone-redundant load balancers Add comprehensive unit tests for zone-redundant load balancer functionality: - Add test fixture (fakeInternalAPILBSpecWithZones) with zone configuration - Add test case to verify zones are correctly set on frontend IP configs - Validate that zones array contains all expected zone values (1, 2, 3) - Ensure zones are properly converted to Azure SDK format The tests verify that the service layer correctly translates the API spec into Azure SDK structures with zones on frontend IP configurations. 
--- .../loadbalancers/loadbalancers_test.go | 24 +++++++++++++++++++ azure/services/loadbalancers/spec_test.go | 16 +++++++++++++ 2 files changed, 40 insertions(+) diff --git a/azure/services/loadbalancers/loadbalancers_test.go b/azure/services/loadbalancers/loadbalancers_test.go index 3a61789a507..f6233a6a120 100644 --- a/azure/services/loadbalancers/loadbalancers_test.go +++ b/azure/services/loadbalancers/loadbalancers_test.go @@ -111,6 +111,30 @@ var ( APIServerPort: 6443, } + fakeInternalAPILBSpecWithZones = LBSpec{ + Name: "my-private-lb", + ResourceGroup: "my-rg", + SubscriptionID: "123", + ClusterName: "my-cluster", + Location: "my-location", + Role: infrav1.APIServerRole, + Type: infrav1.Internal, + SKU: infrav1.SKUStandard, + SubnetName: "my-cp-subnet", + BackendPoolName: "my-private-lb-backendPool", + IdleTimeoutInMinutes: ptr.To[int32](4), + AvailabilityZones: []string{"1", "2", "3"}, + FrontendIPConfigs: []infrav1.FrontendIP{ + { + Name: "my-private-lb-frontEnd", + FrontendIPClass: infrav1.FrontendIPClass{ + PrivateIPAddress: "10.0.0.10", + }, + }, + }, + APIServerPort: 6443, + } + fakeNodeOutboundLBSpec = LBSpec{ Name: "my-cluster", ResourceGroup: "my-rg", diff --git a/azure/services/loadbalancers/spec_test.go b/azure/services/loadbalancers/spec_test.go index 9e75779a7c1..14f2f2a3082 100644 --- a/azure/services/loadbalancers/spec_test.go +++ b/azure/services/loadbalancers/spec_test.go @@ -178,6 +178,22 @@ func TestParameters(t *testing.T) { }, expectedError: "", }, + { + name: "internal load balancer with availability zones", + spec: &fakeInternalAPILBSpecWithZones, + existing: nil, + expect: func(g *WithT, result interface{}) { + g.Expect(result).To(BeAssignableToTypeOf(armnetwork.LoadBalancer{})) + lb := result.(armnetwork.LoadBalancer) + // Verify zones are set on frontend IP configuration + g.Expect(lb.Properties.FrontendIPConfigurations).To(HaveLen(1)) + g.Expect(lb.Properties.FrontendIPConfigurations[0].Zones).To(HaveLen(3)) + g.Expect(*lb.Properties.FrontendIPConfigurations[0].Zones[0]).To(Equal("1")) + g.Expect(*lb.Properties.FrontendIPConfigurations[0].Zones[1]).To(Equal("2")) + g.Expect(*lb.Properties.FrontendIPConfigurations[0].Zones[2]).To(Equal("3")) + }, + expectedError: "", + }, } for _, tc := range testcases { t.Run(tc.name, func(t *testing.T) { From da9aae2e08df0fcc463faa730450c589c966dfa1 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 24 Oct 2025 12:53:04 -0400 Subject: [PATCH 5/9] manifests: Update generated CRD for LoadBalancerSpec zones Update generated CRD manifests to include the availabilityZones field on LoadBalancerSpec with proper validation: - Type: array of strings - List type: set (prevents duplicates) - Max items: 3 (Azure supports up to 3 zones per region) This is the result of running 'make generate-manifests' after adding the AvailabilityZones field to the API types. --- ...ucture.cluster.x-k8s.io_azureclusters.yaml | 33 +++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml b/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml index 1edca6b7df9..29df5d71e00 100644 --- a/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml +++ b/config/crd/bases/infrastructure.cluster.x-k8s.io_azureclusters.yaml @@ -689,6 +689,17 @@ spec: description: APIServerLB is the configuration for the control-plane load balancer. properties: + availabilityZones: + description: |- + AvailabilityZones is a list of availability zones for the load balancer. 
+ When specified for an internal load balancer, the frontend IP configuration + will be zone-redundant across the specified zones. + For public load balancers, this should be set on the associated public IP addresses instead. + items: + type: string + maxItems: 3 + type: array + x-kubernetes-list-type: set backendPool: description: BackendPool describes the backend pool of the load balancer. @@ -772,6 +783,17 @@ spec: ControlPlaneOutboundLB is the configuration for the control-plane outbound load balancer. This is different from APIServerLB, and is used only in private clusters (optionally) for enabling outbound traffic. properties: + availabilityZones: + description: |- + AvailabilityZones is a list of availability zones for the load balancer. + When specified for an internal load balancer, the frontend IP configuration + will be zone-redundant across the specified zones. + For public load balancers, this should be set on the associated public IP addresses instead. + items: + type: string + maxItems: 3 + type: array + x-kubernetes-list-type: set backendPool: description: BackendPool describes the backend pool of the load balancer. @@ -854,6 +876,17 @@ spec: description: NodeOutboundLB is the configuration for the node outbound load balancer. properties: + availabilityZones: + description: |- + AvailabilityZones is a list of availability zones for the load balancer. + When specified for an internal load balancer, the frontend IP configuration + will be zone-redundant across the specified zones. + For public load balancers, this should be set on the associated public IP addresses instead. + items: + type: string + maxItems: 3 + type: array + x-kubernetes-list-type: set backendPool: description: BackendPool describes the backend pool of the load balancer. 
From d114d7d5cc778615ebc59c9a1da150844829e797 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 24 Oct 2025 12:53:11 -0400 Subject: [PATCH 6/9] docs: Add load balancer zone redundancy documentation Add comprehensive documentation for zone-redundant load balancer feature: - Explain Azure zone redundancy concepts for load balancers - Provide configuration examples for all load balancer types: - Internal load balancers (API server) - Public load balancers - Node outbound load balancers - Control plane outbound load balancers - Include complete highly available cluster example - Document important considerations: - Immutability of zones after creation - Region support requirements - Standard SKU requirement - Backend pool placement best practices - Provide migration guidance for existing clusters - Add troubleshooting section - Document best practices --- docs/book/src/SUMMARY.md | 1 + .../load-balancer-zone-redundancy.md | 294 ++++++++++++++++++ 2 files changed, 295 insertions(+) create mode 100644 docs/book/src/self-managed/load-balancer-zone-redundancy.md diff --git a/docs/book/src/SUMMARY.md b/docs/book/src/SUMMARY.md index b437fc7b3e3..a4f14b92d0e 100644 --- a/docs/book/src/SUMMARY.md +++ b/docs/book/src/SUMMARY.md @@ -35,6 +35,7 @@ - [Externally managed Azure infrastructure](./self-managed/externally-managed-azure-infrastructure.md) - [Failure Domains](./self-managed/failure-domains.md) - [Flatcar](./self-managed/flatcar.md) + - [Load Balancer Zone Redundancy](./self-managed/load-balancer-zone-redundancy.md) - [GPU-enabled Clusters](./self-managed/gpu.md) - [IPv6](./self-managed/ipv6.md) - [Machine Pools (VMSS)](./self-managed/machinepools.md) diff --git a/docs/book/src/self-managed/load-balancer-zone-redundancy.md b/docs/book/src/self-managed/load-balancer-zone-redundancy.md new file mode 100644 index 00000000000..8e4ea121fe9 --- /dev/null +++ b/docs/book/src/self-managed/load-balancer-zone-redundancy.md @@ -0,0 +1,294 @@ +# Load Balancer Zone Redundancy + +## Zone Redundancy for Load Balancers in Azure + +Azure Load Balancers can be configured as zone-redundant to ensure high availability across multiple availability zones within a region. A zone-redundant load balancer distributes traffic across all zones, providing resilience against zone failures. + +**Key concepts:** +- Zone redundancy for load balancers is configured through the **frontend IP configuration** +- For **internal load balancers**, zones are set directly on the frontend IP configuration +- For **public load balancers**, zones are inherited from the zone configuration of the public IP address +- **Zones are immutable** - once created, they cannot be changed, added, or removed + +Full details can be found in the [Azure Load Balancer reliability documentation](https://learn.microsoft.com/azure/reliability/reliability-load-balancer). + +## Configuring Zone-Redundant Load Balancers + +CAPZ exposes the `availabilityZones` field on load balancer specifications to enable zone redundancy. 
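+
+For reference, the field is defined on `LoadBalancerSpec` in `api/v1beta1/types.go` as follows (up to three zones, treated as a set, immutable after creation):
+
+```go
+// AvailabilityZones is a list of availability zones for the load balancer.
+// When specified for an internal load balancer, the frontend IP configuration
+// will be zone-redundant across the specified zones.
+// For public load balancers, this should be set on the associated public IP addresses instead.
+// +optional
+// +listType=set
+// +kubebuilder:validation:MaxItems=3
+AvailabilityZones []string `json:"availabilityZones,omitempty"`
+```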
+
+### Internal Load Balancers
+
+For internal load balancers (such as a private API server), you can configure availability zones directly on the load balancer spec:
+
+```yaml
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: AzureCluster
+metadata:
+  name: my-cluster
+  namespace: default
+spec:
+  location: eastus
+  networkSpec:
+    apiServerLB:
+      type: Internal
+      availabilityZones:
+        - "1"
+        - "2"
+        - "3"
+```
+
+This configuration creates an internal load balancer whose frontend IP configuration is zone-redundant across zones 1, 2, and 3.
+
+### Public Load Balancers
+
+For public load balancers, zone redundancy is primarily controlled by the public IP addresses. However, you can still set `availabilityZones` on the load balancer for consistency:
+
+```yaml
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: AzureCluster
+metadata:
+  name: my-cluster
+  namespace: default
+spec:
+  location: eastus
+  networkSpec:
+    apiServerLB:
+      type: Public
+      availabilityZones:
+        - "1"
+        - "2"
+        - "3"
+```
+
+> **Note**: For public load balancers, ensure that the associated public IP addresses are also zone-redundant for complete zone redundancy.
+
+### Node Outbound Load Balancer
+
+You can also configure zone redundancy for node outbound load balancers:
+
+```yaml
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: AzureCluster
+metadata:
+  name: my-cluster
+  namespace: default
+spec:
+  location: westus2
+  networkSpec:
+    nodeOutboundLB:
+      type: Public
+      availabilityZones:
+        - "1"
+        - "2"
+        - "3"
+      frontendIPs:
+        - name: node-outbound-ip
+          publicIP:
+            name: node-outbound-publicip
+```
+
+### Control Plane Outbound Load Balancer
+
+For clusters with private API servers, you can configure the control plane outbound load balancer:
+
+```yaml
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: AzureCluster
+metadata:
+  name: my-cluster
+  namespace: default
+spec:
+  location: eastus
+  networkSpec:
+    apiServerLB:
+      type: Internal
+      availabilityZones:
+        - "1"
+        - "2"
+        - "3"
+    controlPlaneOutboundLB:
+      availabilityZones:
+        - "1"
+        - "2"
+        - "3"
+      frontendIPs:
+        - name: controlplane-outbound-ip
+          publicIP:
+            name: controlplane-outbound-publicip
+```
+
+## Complete Example: Highly Available Cluster
+
+Here's a complete example of a highly available cluster with zone-redundant load balancers:
+
+```yaml
+apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
+kind: AzureCluster
+metadata:
+  name: ha-cluster
+  namespace: default
+spec:
+  location: eastus
+  resourceGroup: ha-cluster-rg
+  networkSpec:
+    # Zone-redundant internal API server load balancer
+    apiServerLB:
+      type: Internal
+      name: ha-cluster-internal-lb
+      availabilityZones:
+        - "1"
+        - "2"
+        - "3"
+      frontendIPs:
+        - name: api-server-internal-ip
+          privateIP: "10.0.0.100"
+
+    # Zone-redundant control plane outbound load balancer
+    controlPlaneOutboundLB:
+      name: ha-cluster-cp-outbound-lb
+      availabilityZones:
+        - "1"
+        - "2"
+        - "3"
+      frontendIPs:
+        - name: cp-outbound-ip
+          publicIP:
+            name: cp-outbound-publicip
+
+    # Zone-redundant node outbound load balancer
+    nodeOutboundLB:
+      name: ha-cluster-node-outbound-lb
+      availabilityZones:
+        - "1"
+        - "2"
+        - "3"
+      frontendIPs:
+        - name: node-outbound-ip
+          publicIP:
+            name: node-outbound-publicip
+
+    # Custom VNet configuration
+    vnet:
+      name: ha-cluster-vnet
+      cidrBlocks:
+        - "10.0.0.0/16"
+
+    subnets:
+      - name: control-plane-subnet
+        role: control-plane
+        cidrBlocks:
+          - "10.0.0.0/24"
+      - name: node-subnet
+        role: node
+        cidrBlocks:
+          - "10.0.1.0/24"
+```
+
+## Important Considerations
+
+### Immutability
+
+Once a load balancer is created with availability zones, the zone configuration **cannot be changed**. This is an Azure platform limitation. To change zones, you must:
+
+1. Delete the load balancer
+2. Recreate it with the new zone configuration
+
+> **Warning**: Changing load balancer zones requires recreating the cluster's load balancers, which will cause service interruption.
+
+### Region Support
+
+Not all Azure regions support availability zones. Before configuring zone-redundant load balancers, verify that your target region supports zones:
+
+```bash
+az vm list-skus -l <location> --zone -o table
+```
+
+### Standard SKU Requirement
+
+Zone-redundant load balancers require the **Standard SKU**. CAPZ uses Standard SKU by default, so no additional configuration is needed.
+
+### Backend Pool Placement
+
+For optimal high availability:
+- Spread your control plane nodes across all availability zones
+- Spread your worker nodes across all availability zones
+- Ensure backend pool members exist in the same zones as the load balancer
+
+See the [Failure Domains](failure-domains.md) documentation for details on distributing VMs across zones.
+
+## Migration from Non-Zone-Redundant Load Balancers
+
+If you have an existing cluster without zone-redundant load balancers, migration requires careful planning:
+
+### For New Clusters
+
+When creating a new cluster, simply include the `availabilityZones` field in your `AzureCluster` specification from the start.
+
+### For Existing Clusters
+
+**Migration is not straightforward** because:
+1. Azure does not allow modifying zones on existing load balancers
+2. CAPZ's webhook validation prevents zone changes to enforce this immutability
+3. Load balancer recreation requires cluster downtime
+
+**Recommended approach for existing clusters:**
+1. Create a new cluster with zone-redundant configuration
+2. Migrate workloads to the new cluster
+3. Decommission the old cluster
+
+**Alternative for development/test clusters:**
+1. Delete the `AzureCluster` resource (this will delete the infrastructure)
+2. Recreate the `AzureCluster` with `availabilityZones` configured
+3. Reconcile the cluster
+
+> **Important**: The alternative approach causes significant downtime and should only be used in non-production environments.
+
+## Troubleshooting
+
+### Load Balancer Not Zone-Redundant
+
+If your load balancer is not zone-redundant despite configuration:
+
+1. **Verify the zones are set in spec:**
+   ```bash
+   kubectl get azurecluster <cluster-name> -o jsonpath='{.spec.networkSpec.apiServerLB.availabilityZones}'
+   ```
+
+2. **Check the Azure load balancer frontend configuration:**
+   ```bash
+   az network lb frontend-ip show \
+     --lb-name <load-balancer-name> \
+     --name <frontend-ip-name> \
+     --resource-group <resource-group-name> \
+     --query zones
+   ```
+
+3. **Verify the region supports zones:**
+   ```bash
+   az vm list-skus -l <location> --zone -o table | grep -i standardsku
+   ```
+
+### Validation Errors
+
+If you encounter validation errors when updating `availabilityZones`:
+
+```
+field is immutable
+```
+
+This is expected behavior. Zones cannot be modified after creation. You must recreate the load balancer with the desired configuration.
+
+## Best Practices
+
+1. **Enable zone redundancy from the start** when creating new clusters in zone-capable regions
+2. **Use all available zones** in the region (typically 3 zones) for maximum resilience
+3. **Spread backend pools** across all zones configured on the load balancer
+4. **Monitor zone health** and be prepared to handle zone failures
+5. 
**Test failover scenarios** to ensure your cluster can survive zone outages +6. **Document your zone configuration** for disaster recovery procedures + +## Related Documentation + +- [Failure Domains](failure-domains.md) - Configure VMs across availability zones +- [API Server Endpoint](api-server-endpoint.md) - API server load balancer configuration +- [Azure Load Balancer Reliability](https://learn.microsoft.com/azure/reliability/reliability-load-balancer) - Azure official documentation From edde0e06fd73ef5f876bc9437e8bcde18c958b81 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Fri, 24 Oct 2025 12:53:20 -0400 Subject: [PATCH 7/9] test: Add E2E test for zone-redundant load balancers Add dedicated end-to-end test to verify zone-redundant load balancer functionality in real Azure environments. The test: - Creates a cluster with zone-redundant load balancers configured - Uses the apiserver-ilb flavor with zones set to 1,2,3 - Verifies zones are correctly set in AzureCluster spec - Validates Azure resources have zones on frontend IP configurations - Tests all three load balancer types: - API Server Load Balancer (internal) - Node Outbound Load Balancer - Control Plane Outbound Load Balancer This is an optional test that validates the complete feature works end-to-end by creating actual Azure infrastructure and verifying the zone configuration. --- test/e2e/azure_test.go | 153 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/test/e2e/azure_test.go b/test/e2e/azure_test.go index 97eff3cfebf..e895ef2f20d 100644 --- a/test/e2e/azure_test.go +++ b/test/e2e/azure_test.go @@ -27,6 +27,8 @@ import ( "strings" "time" + "github.com/Azure/azure-sdk-for-go/sdk/azidentity" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v4" "github.com/Azure/azure-service-operator/v2/pkg/common/config" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -37,6 +39,8 @@ import ( "sigs.k8s.io/cluster-api/test/framework/clusterctl" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/controller-runtime/pkg/client" + + infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1" ) var _ = Describe("Workload cluster creation", func() { @@ -1427,5 +1431,154 @@ var _ = Describe("Workload cluster creation", func() { }) }) + Context("Creating a cluster with zone-redundant load balancers [OPTIONAL]", func() { + It("with zone-redundant API server, node outbound, and control plane outbound load balancers", func() { + clusterName = getClusterName(clusterNamePrefix, "lb-zones") + + // Set up zone-redundant load balancer configuration + Expect(os.Setenv("EXP_APISERVER_ILB", "true")).To(Succeed()) + Expect(os.Setenv("AZURE_INTERNAL_LB_PRIVATE_IP", "40.0.0.100")).To(Succeed()) + Expect(os.Setenv("AZURE_VNET_CIDR", "40.0.0.0/8")).To(Succeed()) + Expect(os.Setenv("AZURE_CP_SUBNET_CIDR", "40.0.0.0/16")).To(Succeed()) + Expect(os.Setenv("AZURE_NODE_SUBNET_CIDR", "40.1.0.0/16")).To(Succeed()) + Expect(os.Setenv("AZURE_LB_ZONES", "1,2,3")).To(Succeed()) + + clusterctl.ApplyClusterTemplateAndWait(ctx, createApplyClusterTemplateInput( + specName, + withFlavor("apiserver-ilb"), + withNamespace(namespace.Name), + withClusterName(clusterName), + withControlPlaneMachineCount(3), + withWorkerMachineCount(2), + withControlPlaneInterval(specName, "wait-control-plane-ha"), + withControlPlaneWaiters(clusterctl.ControlPlaneWaiters{ + WaitForControlPlaneInitialized: EnsureControlPlaneInitialized, + }), + withPostMachinesProvisioned(func() { + EnsureDaemonsets(ctx, func() DaemonsetsSpecInput { + return DaemonsetsSpecInput{ + BootstrapClusterProxy: bootstrapClusterProxy, + Namespace: namespace, + ClusterName: clusterName, + } + }) + }), + ), result) + + By("Verifying load balancer zones are configured correctly in Azure", func() { + expectedZones := []string{"1", "2", "3"} + + subscriptionID := getSubscriptionID(Default) + cred, err := azidentity.NewDefaultAzureCredential(nil) + Expect(err).NotTo(HaveOccurred()) + + mgmtClient := bootstrapClusterProxy.GetClient() + Expect(mgmtClient).NotTo(BeNil()) + + azureCluster := &infrav1.AzureCluster{} + err = mgmtClient.Get(ctx, client.ObjectKey{ + Namespace: namespace.Name, + Name: clusterName, + }, azureCluster) + Expect(err).NotTo(HaveOccurred()) + + resourceGroupName := azureCluster.Spec.ResourceGroup + Expect(resourceGroupName).NotTo(BeEmpty()) + + lbClient, err := armnetwork.NewLoadBalancersClient(subscriptionID, cred, nil) + Expect(err).NotTo(HaveOccurred()) + + // Verify API Server Load Balancer zones + if azureCluster.Spec.NetworkSpec.APIServerLB != nil { + Expect(azureCluster.Spec.NetworkSpec.APIServerLB.AvailabilityZones).To(Equal(expectedZones), + "APIServerLB should have zones configured in AzureCluster spec") + + lbName := azureCluster.Spec.NetworkSpec.APIServerLB.Name + Eventually(func(g Gomega) { + lb, err := lbClient.Get(ctx, resourceGroupName, lbName, nil) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(lb.Properties).NotTo(BeNil()) + g.Expect(lb.Properties.FrontendIPConfigurations).NotTo(BeEmpty()) + + for _, frontendIP := range lb.Properties.FrontendIPConfigurations { + g.Expect(frontendIP.Zones).NotTo(BeNil(), "Frontend IP should have zones configured") + g.Expect(frontendIP.Zones).To(HaveLen(3), "Frontend IP should have 3 zones") + + zonesMap := make(map[string]bool) + for _, zone := range frontendIP.Zones { + if zone != nil { + zonesMap[*zone] = true + } + } + for _, expectedZone := range 
expectedZones { + g.Expect(zonesMap[expectedZone]).To(BeTrue(), "Zone %s should be configured", expectedZone) + } + } + }, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) + } + + // Verify Node Outbound Load Balancer zones + if azureCluster.Spec.NetworkSpec.NodeOutboundLB != nil { + Expect(azureCluster.Spec.NetworkSpec.NodeOutboundLB.AvailabilityZones).To(Equal(expectedZones), + "NodeOutboundLB should have zones configured in AzureCluster spec") + + lbName := azureCluster.Spec.NetworkSpec.NodeOutboundLB.Name + Eventually(func(g Gomega) { + lb, err := lbClient.Get(ctx, resourceGroupName, lbName, nil) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(lb.Properties).NotTo(BeNil()) + g.Expect(lb.Properties.FrontendIPConfigurations).NotTo(BeEmpty()) + + for _, frontendIP := range lb.Properties.FrontendIPConfigurations { + g.Expect(frontendIP.Zones).NotTo(BeNil(), "Frontend IP should have zones configured") + g.Expect(frontendIP.Zones).To(HaveLen(3), "Frontend IP should have 3 zones") + + zonesMap := make(map[string]bool) + for _, zone := range frontendIP.Zones { + if zone != nil { + zonesMap[*zone] = true + } + } + for _, expectedZone := range expectedZones { + g.Expect(zonesMap[expectedZone]).To(BeTrue(), "Zone %s should be configured", expectedZone) + } + } + }, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) + } + + // Verify Control Plane Outbound Load Balancer zones + if azureCluster.Spec.NetworkSpec.ControlPlaneOutboundLB != nil { + Expect(azureCluster.Spec.NetworkSpec.ControlPlaneOutboundLB.AvailabilityZones).To(Equal(expectedZones), + "ControlPlaneOutboundLB should have zones configured in AzureCluster spec") + + lbName := azureCluster.Spec.NetworkSpec.ControlPlaneOutboundLB.Name + Eventually(func(g Gomega) { + lb, err := lbClient.Get(ctx, resourceGroupName, lbName, nil) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(lb.Properties).NotTo(BeNil()) + g.Expect(lb.Properties.FrontendIPConfigurations).NotTo(BeEmpty()) + + for _, frontendIP := range lb.Properties.FrontendIPConfigurations { + g.Expect(frontendIP.Zones).NotTo(BeNil(), "Frontend IP should have zones configured") + g.Expect(frontendIP.Zones).To(HaveLen(3), "Frontend IP should have 3 zones") + + zonesMap := make(map[string]bool) + for _, zone := range frontendIP.Zones { + if zone != nil { + zonesMap[*zone] = true + } + } + for _, expectedZone := range expectedZones { + g.Expect(zonesMap[expectedZone]).To(BeTrue(), "Zone %s should be configured", expectedZone) + } + } + }, retryableOperationTimeout, retryableOperationSleepBetweenRetries).Should(Succeed()) + } + }) + + By("PASSED!") + }) + }) + // TODO: add a same test as above for a windows cluster }) From 9ca4c6c977eb840fd20b417bbc654e31efdad2a4 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Tue, 28 Oct 2025 15:09:49 -0400 Subject: [PATCH 8/9] Add zone redundancy to private cluster API server load balancer Updates the private cluster flavor to include availability zones (1, 2, 3) on the API server internal load balancer for improved high availability and resilience. The private cluster E2E test is marked [OPTIONAL] so it will be skipped in regions that don't support 3 availability zones. 
Updates #5709 --- templates/cluster-template-private.yaml | 4 ++++ templates/flavors/private/patches/private-lb.yaml | 4 ++++ templates/test/ci/cluster-template-prow-private.yaml | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/templates/cluster-template-private.yaml b/templates/cluster-template-private.yaml index 786b6d52fc2..759fe70d84f 100644 --- a/templates/cluster-template-private.yaml +++ b/templates/cluster-template-private.yaml @@ -32,6 +32,10 @@ spec: location: ${AZURE_LOCATION} networkSpec: apiServerLB: + availabilityZones: + - "1" + - "2" + - "3" name: ${CLUSTER_NAME}-internal-lb type: Internal controlPlaneOutboundLB: diff --git a/templates/flavors/private/patches/private-lb.yaml b/templates/flavors/private/patches/private-lb.yaml index 76e1539df2a..a2933e29963 100644 --- a/templates/flavors/private/patches/private-lb.yaml +++ b/templates/flavors/private/patches/private-lb.yaml @@ -7,6 +7,10 @@ spec: apiServerLB: name: ${CLUSTER_NAME}-internal-lb type: Internal + availabilityZones: + - "1" + - "2" + - "3" nodeOutboundLB: frontendIPsCount: 1 controlPlaneOutboundLB: diff --git a/templates/test/ci/cluster-template-prow-private.yaml b/templates/test/ci/cluster-template-prow-private.yaml index 26910fdcb43..47d6e99cc7a 100644 --- a/templates/test/ci/cluster-template-prow-private.yaml +++ b/templates/test/ci/cluster-template-prow-private.yaml @@ -49,6 +49,10 @@ spec: location: ${AZURE_LOCATION} networkSpec: apiServerLB: + availabilityZones: + - "1" + - "2" + - "3" frontendIPs: - name: ${CLUSTER_NAME}-internal-lb-frontend privateIP: ${AZURE_INTERNAL_LB_IP} From b5bbbad98baaf228e8f28ad061fa19fa4af854c3 Mon Sep 17 00:00:00 2001 From: Bryan Cox Date: Wed, 26 Nov 2025 14:09:09 -0500 Subject: [PATCH 9/9] test: Add apiserver-ilb-zones flavor for zone-redundant LB E2E test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a new cluster template flavor that extends apiserver-ilb and adds availabilityZones configuration to all load balancers (APIServerLB, NodeOutboundLB, ControlPlaneOutboundLB). This fixes the optional E2E test for zone-redundant load balancers by using a dedicated template with zones pre-configured instead of relying on an unused environment variable. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../cluster-template-apiserver-ilb-zones.yaml | 235 ++++++++++ .../apiserver-ilb-zones/kustomization.yaml | 11 + .../apiserver-ilb-zones/patches/lb-zones.yaml | 21 + ...ter-template-prow-apiserver-ilb-zones.yaml | 420 ++++++++++++++++++ .../kustomization.yaml | 11 + .../patches/lb-zones.yaml | 21 + test/e2e/azure_test.go | 4 +- test/e2e/config/azure-dev.yaml | 2 + 8 files changed, 722 insertions(+), 3 deletions(-) create mode 100644 templates/cluster-template-apiserver-ilb-zones.yaml create mode 100644 templates/flavors/apiserver-ilb-zones/kustomization.yaml create mode 100644 templates/flavors/apiserver-ilb-zones/patches/lb-zones.yaml create mode 100644 templates/test/ci/cluster-template-prow-apiserver-ilb-zones.yaml create mode 100644 templates/test/ci/prow-apiserver-ilb-zones/kustomization.yaml create mode 100644 templates/test/ci/prow-apiserver-ilb-zones/patches/lb-zones.yaml diff --git a/templates/cluster-template-apiserver-ilb-zones.yaml b/templates/cluster-template-apiserver-ilb-zones.yaml new file mode 100644 index 00000000000..fa487e06fe6 --- /dev/null +++ b/templates/cluster-template-apiserver-ilb-zones.yaml @@ -0,0 +1,235 @@ +apiVersion: cluster.x-k8s.io/v1beta1 +kind: Cluster +metadata: + name: ${CLUSTER_NAME} + namespace: default +spec: + clusterNetwork: + pods: + cidrBlocks: + - 192.168.0.0/16 + controlPlaneRef: + apiVersion: controlplane.cluster.x-k8s.io/v1beta1 + kind: KubeadmControlPlane + name: ${CLUSTER_NAME}-control-plane + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: AzureCluster + name: ${CLUSTER_NAME} +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureCluster +metadata: + name: ${CLUSTER_NAME} + namespace: default +spec: + identityRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: AzureClusterIdentity + name: ${CLUSTER_IDENTITY_NAME} + location: ${AZURE_LOCATION} + networkSpec: + apiServerLB: + availabilityZones: + - "1" + - "2" + - "3" + frontendIPs: + - name: ${CLUSTER_NAME}-api-lb + publicIP: + dnsName: ${CLUSTER_NAME}-${APISERVER_LB_DNS_SUFFIX}.${AZURE_LOCATION}.cloudapp.azure.com + name: ${CLUSTER_NAME}-api-lb + - name: ${CLUSTER_NAME}-internal-lb-private-ip + privateIP: ${AZURE_INTERNAL_LB_PRIVATE_IP:-30.0.0.100} + controlPlaneOutboundLB: + availabilityZones: + - "1" + - "2" + - "3" + nodeOutboundLB: + availabilityZones: + - "1" + - "2" + - "3" + subnets: + - cidrBlocks: + - 30.0.0.0/16 + name: control-plane-subnet + role: control-plane + - cidrBlocks: + - 30.1.0.0/16 + name: node-subnet + role: node + vnet: + cidrBlocks: + - 30.0.0.0/8 + name: ${AZURE_VNET_NAME:=${CLUSTER_NAME}-vnet} + resourceGroup: ${AZURE_RESOURCE_GROUP:=${CLUSTER_NAME}} + subscriptionID: ${AZURE_SUBSCRIPTION_ID} +--- +apiVersion: controlplane.cluster.x-k8s.io/v1beta1 +kind: KubeadmControlPlane +metadata: + name: ${CLUSTER_NAME}-control-plane + namespace: default +spec: + kubeadmConfigSpec: + clusterConfiguration: + apiServer: + extraArgs: {} + timeoutForControlPlane: 20m + controllerManager: + extraArgs: + allocate-node-cidrs: "false" + cloud-provider: external + cluster-name: ${CLUSTER_NAME} + etcd: + local: + dataDir: /var/lib/etcddisk/etcd + extraArgs: + quota-backend-bytes: "8589934592" + diskSetup: + filesystems: + - device: /dev/disk/azure/scsi1/lun0 + extraOpts: + - -E + - lazy_itable_init=1,lazy_journal_init=1 + filesystem: ext4 + label: etcd_disk + - device: ephemeral0.1 + filesystem: ext4 + label: ephemeral0 + replaceFS: 
ntfs + partitions: + - device: /dev/disk/azure/scsi1/lun0 + layout: true + overwrite: false + tableType: gpt + files: + - contentFrom: + secret: + key: control-plane-azure.json + name: ${CLUSTER_NAME}-control-plane-azure-json + owner: root:root + path: /etc/kubernetes/azure.json + permissions: "0644" + initConfiguration: + nodeRegistration: + kubeletExtraArgs: + cloud-provider: external + name: '{{ ds.meta_data["local_hostname"] }}' + joinConfiguration: + nodeRegistration: + kubeletExtraArgs: + cloud-provider: external + name: '{{ ds.meta_data["local_hostname"] }}' + mounts: + - - LABEL=etcd_disk + - /var/lib/etcddisk + postKubeadmCommands: [] + preKubeadmCommands: [] + verbosity: 10 + machineTemplate: + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: AzureMachineTemplate + name: ${CLUSTER_NAME}-control-plane + replicas: ${CONTROL_PLANE_MACHINE_COUNT:=1} + version: ${KUBERNETES_VERSION} +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureMachineTemplate +metadata: + name: ${CLUSTER_NAME}-control-plane + namespace: default +spec: + template: + spec: + dataDisks: + - diskSizeGB: 256 + lun: 0 + nameSuffix: etcddisk + identity: UserAssigned + osDisk: + diskSizeGB: 128 + osType: Linux + sshPublicKey: ${AZURE_SSH_PUBLIC_KEY_B64:=""} + userAssignedIdentities: + - providerID: azure:///subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/${CI_RG:=capz-ci}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/${USER_IDENTITY:=cloud-provider-user-identity} + vmSize: ${AZURE_CONTROL_PLANE_MACHINE_TYPE} +--- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachineDeployment +metadata: + name: ${CLUSTER_NAME}-md-0 + namespace: default +spec: + clusterName: ${CLUSTER_NAME} + replicas: ${WORKER_MACHINE_COUNT:=2} + selector: + matchLabels: null + template: + spec: + bootstrap: + configRef: + apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 + kind: KubeadmConfigTemplate + name: ${CLUSTER_NAME}-md-0 + clusterName: ${CLUSTER_NAME} + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: AzureMachineTemplate + name: ${CLUSTER_NAME}-md-0 + version: ${KUBERNETES_VERSION} +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureMachineTemplate +metadata: + name: ${CLUSTER_NAME}-md-0 + namespace: default +spec: + template: + spec: + osDisk: + diskSizeGB: 128 + osType: Linux + sshPublicKey: ${AZURE_SSH_PUBLIC_KEY_B64:=""} + vmSize: ${AZURE_NODE_MACHINE_TYPE} +--- +apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 +kind: KubeadmConfigTemplate +metadata: + name: ${CLUSTER_NAME}-md-0 + namespace: default +spec: + template: + spec: + files: + - contentFrom: + secret: + key: worker-node-azure.json + name: ${CLUSTER_NAME}-md-0-azure-json + owner: root:root + path: /etc/kubernetes/azure.json + permissions: "0644" + joinConfiguration: + nodeRegistration: + kubeletExtraArgs: + cloud-provider: external + name: '{{ ds.meta_data["local_hostname"] }}' + preKubeadmCommands: + - echo '${AZURE_INTERNAL_LB_PRIVATE_IP:-30.0.0.100} ${CLUSTER_NAME}-${APISERVER_LB_DNS_SUFFIX}.${AZURE_LOCATION}.cloudapp.azure.com' + >> /etc/hosts +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureClusterIdentity +metadata: + labels: + clusterctl.cluster.x-k8s.io/move-hierarchy: "true" + name: ${CLUSTER_IDENTITY_NAME} + namespace: default +spec: + allowedNamespaces: {} + clientID: ${AZURE_CLIENT_ID_USER_ASSIGNED_IDENTITY} + tenantID: ${AZURE_TENANT_ID} + type: ${CLUSTER_IDENTITY_TYPE:=WorkloadIdentity} diff --git 
a/templates/flavors/apiserver-ilb-zones/kustomization.yaml b/templates/flavors/apiserver-ilb-zones/kustomization.yaml new file mode 100644 index 00000000000..f3a814b11ad --- /dev/null +++ b/templates/flavors/apiserver-ilb-zones/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: default +resources: +- ../apiserver-ilb + +patches: +- path: patches/lb-zones.yaml + +sortOptions: + order: fifo diff --git a/templates/flavors/apiserver-ilb-zones/patches/lb-zones.yaml b/templates/flavors/apiserver-ilb-zones/patches/lb-zones.yaml new file mode 100644 index 00000000000..bc02d3e3667 --- /dev/null +++ b/templates/flavors/apiserver-ilb-zones/patches/lb-zones.yaml @@ -0,0 +1,21 @@ +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureCluster +metadata: + name: ${CLUSTER_NAME} +spec: + networkSpec: + apiServerLB: + availabilityZones: + - "1" + - "2" + - "3" + nodeOutboundLB: + availabilityZones: + - "1" + - "2" + - "3" + controlPlaneOutboundLB: + availabilityZones: + - "1" + - "2" + - "3" diff --git a/templates/test/ci/cluster-template-prow-apiserver-ilb-zones.yaml b/templates/test/ci/cluster-template-prow-apiserver-ilb-zones.yaml new file mode 100644 index 00000000000..07b52eb72b9 --- /dev/null +++ b/templates/test/ci/cluster-template-prow-apiserver-ilb-zones.yaml @@ -0,0 +1,420 @@ +apiVersion: cluster.x-k8s.io/v1beta1 +kind: Cluster +metadata: + labels: + cloud-provider: ${CLOUD_PROVIDER_AZURE_LABEL:=azure} + cni: calico + name: ${CLUSTER_NAME} + namespace: default +spec: + clusterNetwork: + pods: + cidrBlocks: + - 192.168.0.0/16 + controlPlaneRef: + apiVersion: controlplane.cluster.x-k8s.io/v1beta1 + kind: KubeadmControlPlane + name: ${CLUSTER_NAME}-control-plane + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: AzureCluster + name: ${CLUSTER_NAME} +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureCluster +metadata: + name: ${CLUSTER_NAME} + namespace: default +spec: + additionalTags: + buildProvenance: ${BUILD_PROVENANCE} + creationTimestamp: ${TIMESTAMP} + jobName: ${JOB_NAME} + identityRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: AzureClusterIdentity + name: ${CLUSTER_IDENTITY_NAME} + location: ${AZURE_LOCATION} + networkSpec: + apiServerLB: + availabilityZones: + - "1" + - "2" + - "3" + frontendIPs: + - name: ${CLUSTER_NAME}-api-lb + publicIP: + dnsName: ${CLUSTER_NAME}-${APISERVER_LB_DNS_SUFFIX}.${AZURE_LOCATION}.cloudapp.azure.com + name: ${CLUSTER_NAME}-api-lb + - name: ${CLUSTER_NAME}-internal-lb-private-ip + privateIP: ${AZURE_INTERNAL_LB_PRIVATE_IP} + controlPlaneOutboundLB: + availabilityZones: + - "1" + - "2" + - "3" + nodeOutboundLB: + availabilityZones: + - "1" + - "2" + - "3" + subnets: + - cidrBlocks: + - ${AZURE_CP_SUBNET_CIDR} + name: control-plane-subnet + role: control-plane + - cidrBlocks: + - ${AZURE_NODE_SUBNET_CIDR} + name: node-subnet + role: node + vnet: + cidrBlocks: + - ${AZURE_VNET_CIDR} + name: ${AZURE_VNET_NAME:=${CLUSTER_NAME}-vnet} + resourceGroup: ${AZURE_RESOURCE_GROUP:=${CLUSTER_NAME}} + subscriptionID: ${AZURE_SUBSCRIPTION_ID} +--- +apiVersion: controlplane.cluster.x-k8s.io/v1beta1 +kind: KubeadmControlPlane +metadata: + name: ${CLUSTER_NAME}-control-plane + namespace: default +spec: + kubeadmConfigSpec: + clusterConfiguration: + apiServer: + extraArgs: {} + timeoutForControlPlane: 20m + controllerManager: + extraArgs: + allocate-node-cidrs: "false" + cloud-provider: external + cluster-name: ${CLUSTER_NAME} + v: "4" + 
etcd: + local: + dataDir: /var/lib/etcddisk/etcd + extraArgs: + quota-backend-bytes: "8589934592" + diskSetup: + filesystems: + - device: /dev/disk/azure/scsi1/lun0 + extraOpts: + - -E + - lazy_itable_init=1,lazy_journal_init=1 + filesystem: ext4 + label: etcd_disk + - device: ephemeral0.1 + filesystem: ext4 + label: ephemeral0 + replaceFS: ntfs + partitions: + - device: /dev/disk/azure/scsi1/lun0 + layout: true + overwrite: false + tableType: gpt + files: + - contentFrom: + secret: + key: control-plane-azure.json + name: ${CLUSTER_NAME}-control-plane-azure-json + owner: root:root + path: /etc/kubernetes/azure.json + permissions: "0644" + initConfiguration: + nodeRegistration: + kubeletExtraArgs: + cloud-provider: external + name: '{{ ds.meta_data["local_hostname"] }}' + joinConfiguration: + nodeRegistration: + kubeletExtraArgs: + cloud-provider: external + name: '{{ ds.meta_data["local_hostname"] }}' + mounts: + - - LABEL=etcd_disk + - /var/lib/etcddisk + postKubeadmCommands: [] + preKubeadmCommands: [] + verbosity: 10 + machineTemplate: + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: AzureMachineTemplate + name: ${CLUSTER_NAME}-control-plane + replicas: ${CONTROL_PLANE_MACHINE_COUNT:=1} + version: ${KUBERNETES_VERSION} +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureMachineTemplate +metadata: + name: ${CLUSTER_NAME}-control-plane + namespace: default +spec: + template: + spec: + dataDisks: + - diskSizeGB: 256 + lun: 0 + nameSuffix: etcddisk + identity: UserAssigned + osDisk: + diskSizeGB: 128 + osType: Linux + sshPublicKey: ${AZURE_SSH_PUBLIC_KEY_B64:=""} + userAssignedIdentities: + - providerID: azure:///subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/${CI_RG:=capz-ci}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/${USER_IDENTITY:=cloud-provider-user-identity} + vmSize: ${AZURE_CONTROL_PLANE_MACHINE_TYPE} +--- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachineDeployment +metadata: + name: ${CLUSTER_NAME}-md-0 + namespace: default +spec: + clusterName: ${CLUSTER_NAME} + replicas: ${WORKER_MACHINE_COUNT:=2} + selector: + matchLabels: null + template: + spec: + bootstrap: + configRef: + apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 + kind: KubeadmConfigTemplate + name: ${CLUSTER_NAME}-md-0 + clusterName: ${CLUSTER_NAME} + infrastructureRef: + apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 + kind: AzureMachineTemplate + name: ${CLUSTER_NAME}-md-0 + version: ${KUBERNETES_VERSION} +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureMachineTemplate +metadata: + name: ${CLUSTER_NAME}-md-0 + namespace: default +spec: + template: + spec: + identity: UserAssigned + osDisk: + diskSizeGB: 128 + osType: Linux + sshPublicKey: ${AZURE_SSH_PUBLIC_KEY_B64:=""} + userAssignedIdentities: + - providerID: azure:///subscriptions/${AZURE_SUBSCRIPTION_ID}/resourceGroups/${CI_RG:=capz-ci}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/${USER_IDENTITY:=cloud-provider-user-identity} + vmSize: ${AZURE_NODE_MACHINE_TYPE} +--- +apiVersion: bootstrap.cluster.x-k8s.io/v1beta1 +kind: KubeadmConfigTemplate +metadata: + name: ${CLUSTER_NAME}-md-0 + namespace: default +spec: + template: + spec: + files: + - contentFrom: + secret: + key: worker-node-azure.json + name: ${CLUSTER_NAME}-md-0-azure-json + owner: root:root + path: /etc/kubernetes/azure.json + permissions: "0644" + joinConfiguration: + nodeRegistration: + kubeletExtraArgs: + cloud-provider: external + name: '{{ ds.meta_data["local_hostname"] }}' + 
preKubeadmCommands: + - echo '${AZURE_INTERNAL_LB_PRIVATE_IP} ${CLUSTER_NAME}-${APISERVER_LB_DNS_SUFFIX}.${AZURE_LOCATION}.cloudapp.azure.com' + >> /etc/hosts +--- +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureClusterIdentity +metadata: + labels: + clusterctl.cluster.x-k8s.io/move-hierarchy: "true" + name: ${CLUSTER_IDENTITY_NAME} + namespace: default +spec: + allowedNamespaces: {} + clientID: ${AZURE_CLIENT_ID_USER_ASSIGNED_IDENTITY} + tenantID: ${AZURE_TENANT_ID} + type: ${CLUSTER_IDENTITY_TYPE:=WorkloadIdentity} +--- +apiVersion: cluster.x-k8s.io/v1beta1 +kind: MachineHealthCheck +metadata: + name: ${CLUSTER_NAME}-control-plane + namespace: default +spec: + clusterName: ${CLUSTER_NAME} + maxUnhealthy: 100% + selector: + matchLabels: + cluster.x-k8s.io/control-plane: "" + unhealthyConditions: + - status: Unknown + timeout: 300s + type: Ready + - status: "False" + timeout: 300s + type: Ready +--- +apiVersion: addons.cluster.x-k8s.io/v1alpha1 +kind: HelmChartProxy +metadata: + name: calico + namespace: default +spec: + chartName: tigera-operator + clusterSelector: + matchLabels: + cni: calico + namespace: tigera-operator + releaseName: projectcalico + repoURL: https://docs.tigera.io/calico/charts + valuesTemplate: | + installation: + cni: + type: Calico + ipam: + type: Calico + calicoNetwork: + bgp: Disabled + windowsDataplane: HNS + mtu: 1350 + ipPools:{{range $i, $cidr := .Cluster.spec.clusterNetwork.pods.cidrBlocks }} + - cidr: {{ $cidr }} + encapsulation: VXLAN{{end}} + typhaDeployment: + spec: + template: + spec: + # By default, typha tolerates all NoSchedule taints. This breaks + # scale-ins when it continuously gets scheduled onto an + # out-of-date Node that is being deleted. Tolerate only the + # NoSchedule taints that are expected. + tolerations: + - effect: NoExecute + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Exists + - effect: NoSchedule + key: node.kubernetes.io/not-ready + operator: Exists + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - weight: 50 + preference: + matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + registry: capzcicommunity.azurecr.io + serviceCIDRs: + - 10.96.0.0/12 # must match cluster service CIDR (this is the default) + # Image and registry configuration for the tigera/operator pod + tigeraOperator: + image: tigera/operator + registry: capzcicommunity.azurecr.io + calicoctl: + image: capzcicommunity.azurecr.io/calico/ctl + # when kubernetesServiceEndpoint (required for windows) is added + # DNS configuration is needed to look up the api server name properly + # https://github.com/projectcalico/calico/issues/9536 + dnsConfig: + nameservers: + - 127.0.0.53 + options: + - name: edns0 + - name: trust-ad + kubernetesServiceEndpoint: + host: "{{ .Cluster.spec.controlPlaneEndpoint.host }}" + port: "{{ .Cluster.spec.controlPlaneEndpoint.port }}" + # By default, tigera tolerates all NoSchedule taints. This breaks upgrades + # when it continuously gets scheduled onto an out-of-date Node that is being + # deleted. Tolerate only the NoSchedule taints that are expected. 
+ tolerations: + - effect: NoExecute + operator: Exists + - effect: NoSchedule + key: node-role.kubernetes.io/control-plane + operator: Exists + - effect: NoSchedule + key: node.kubernetes.io/not-ready + operator: Exists + version: ${CALICO_VERSION} +--- +apiVersion: addons.cluster.x-k8s.io/v1alpha1 +kind: HelmChartProxy +metadata: + name: azuredisk-csi-driver-chart + namespace: default +spec: + chartName: azuredisk-csi-driver + clusterSelector: + matchLabels: + azuredisk-csi: "true" + namespace: kube-system + releaseName: azuredisk-csi-driver-oot + repoURL: https://raw.githubusercontent.com/kubernetes-sigs/azuredisk-csi-driver/master/charts + valuesTemplate: |- + controller: + replicas: 1 + runOnControlPlane: true + windows: + useHostProcessContainers: {{ hasKey .Cluster.metadata.labels "cni-windows" }} +--- +apiVersion: addons.cluster.x-k8s.io/v1alpha1 +kind: HelmChartProxy +metadata: + name: cloud-provider-azure-chart + namespace: default +spec: + chartName: cloud-provider-azure + clusterSelector: + matchLabels: + cloud-provider: azure + releaseName: cloud-provider-azure-oot + repoURL: https://raw.githubusercontent.com/kubernetes-sigs/cloud-provider-azure/master/helm/repo + valuesTemplate: | + infra: + clusterName: {{ .Cluster.metadata.name }} + cloudControllerManager: + clusterCIDR: {{ .Cluster.spec.clusterNetwork.pods.cidrBlocks | join "," }} + logVerbosity: 4 +--- +apiVersion: addons.cluster.x-k8s.io/v1alpha1 +kind: HelmChartProxy +metadata: + name: cloud-provider-azure-chart-ci + namespace: default +spec: + chartName: cloud-provider-azure + clusterSelector: + matchLabels: + cloud-provider: azure-ci + releaseName: cloud-provider-azure-oot + repoURL: https://raw.githubusercontent.com/kubernetes-sigs/cloud-provider-azure/master/helm/repo + valuesTemplate: | + infra: + clusterName: {{ .Cluster.metadata.name }} + cloudControllerManager: + cloudConfig: ${CLOUD_CONFIG:-"/etc/kubernetes/azure.json"} + cloudConfigSecretName: ${CONFIG_SECRET_NAME:-""} + clusterCIDR: {{ .Cluster.spec.clusterNetwork.pods.cidrBlocks | join "," }} + imageName: "${CCM_IMAGE_NAME:-""}" + imageRepository: "${IMAGE_REGISTRY:-""}" + imageTag: "${IMAGE_TAG_CCM:-""}" + logVerbosity: ${CCM_LOG_VERBOSITY:-4} + replicas: ${CCM_COUNT:-1} + enableDynamicReloading: ${ENABLE_DYNAMIC_RELOADING:-false} + cloudNodeManager: + imageName: "${CNM_IMAGE_NAME:-""}" + imageRepository: "${IMAGE_REGISTRY:-""}" + imageTag: "${IMAGE_TAG_CNM:-""}" diff --git a/templates/test/ci/prow-apiserver-ilb-zones/kustomization.yaml b/templates/test/ci/prow-apiserver-ilb-zones/kustomization.yaml new file mode 100644 index 00000000000..5f9f50c11f0 --- /dev/null +++ b/templates/test/ci/prow-apiserver-ilb-zones/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: default +resources: + - ../prow-apiserver-ilb + +patches: + - path: patches/lb-zones.yaml + +sortOptions: + order: fifo diff --git a/templates/test/ci/prow-apiserver-ilb-zones/patches/lb-zones.yaml b/templates/test/ci/prow-apiserver-ilb-zones/patches/lb-zones.yaml new file mode 100644 index 00000000000..bc02d3e3667 --- /dev/null +++ b/templates/test/ci/prow-apiserver-ilb-zones/patches/lb-zones.yaml @@ -0,0 +1,21 @@ +apiVersion: infrastructure.cluster.x-k8s.io/v1beta1 +kind: AzureCluster +metadata: + name: ${CLUSTER_NAME} +spec: + networkSpec: + apiServerLB: + availabilityZones: + - "1" + - "2" + - "3" + nodeOutboundLB: + availabilityZones: + - "1" + - "2" + - "3" + controlPlaneOutboundLB: + availabilityZones: + - "1" + - "2" + - "3" diff 
--git a/test/e2e/azure_test.go b/test/e2e/azure_test.go index e895ef2f20d..b3268b090af 100644 --- a/test/e2e/azure_test.go +++ b/test/e2e/azure_test.go @@ -1441,11 +1441,9 @@ var _ = Describe("Workload cluster creation", func() { Expect(os.Setenv("AZURE_VNET_CIDR", "40.0.0.0/8")).To(Succeed()) Expect(os.Setenv("AZURE_CP_SUBNET_CIDR", "40.0.0.0/16")).To(Succeed()) Expect(os.Setenv("AZURE_NODE_SUBNET_CIDR", "40.1.0.0/16")).To(Succeed()) - Expect(os.Setenv("AZURE_LB_ZONES", "1,2,3")).To(Succeed()) - clusterctl.ApplyClusterTemplateAndWait(ctx, createApplyClusterTemplateInput( specName, - withFlavor("apiserver-ilb"), + withFlavor("apiserver-ilb-zones"), withNamespace(namespace.Name), withClusterName(clusterName), withControlPlaneMachineCount(3), diff --git a/test/e2e/config/azure-dev.yaml b/test/e2e/config/azure-dev.yaml index b89ed3fb755..0d39ecc0e44 100644 --- a/test/e2e/config/azure-dev.yaml +++ b/test/e2e/config/azure-dev.yaml @@ -188,6 +188,8 @@ providers: targetName: "cluster-template-apiserver-ilb.yaml" - sourcePath: "${PWD}/templates/test/ci/cluster-template-prow-apiserver-ilb-custom-images.yaml" targetName: "cluster-template-apiserver-ilb-custom-images.yaml" + - sourcePath: "${PWD}/templates/test/ci/cluster-template-prow-apiserver-ilb-zones.yaml" + targetName: "cluster-template-apiserver-ilb-zones.yaml" - sourcePath: "${PWD}/templates/test/ci/cluster-template-prow-dalec-custom-builds.yaml" targetName: "cluster-template-dalec-custom-builds.yaml" - sourcePath: "${PWD}/templates/test/ci/cluster-template-prow-azl3.yaml"