Skip to content

Commit f7f44c3

Browse files
authored
Update cluster-api provider to use machineTemplate.status.nodeInfo for architecture-aware autoscale from zero (#8345)
* Update cluster-api provider to use machineTemplate.status.nodeInfo for architecture-aware autoscale from zero

  kubernetes-sigs/cluster-api#11962 introduced the nodeInfo field for MachineTemplates. Providers can reconcile this field in the status subresource to inform the autoscaler about the architecture and operating system that the MachineTemplate's nodes will run.

  Previously, this behavior was implemented in the cluster autoscaler by leveraging the labels capacity annotation and, as a fallback, default values set in environment variables at cluster-autoscaler deployment time.

  With this commit, the cluster autoscaler computes the future architecture of a node with the following priority order:

  - Labels set in existing nodes, for cases that are not autoscale-from-zero
  - Labels set in the labels capacity annotation of machine template, machine set, and machine deployment
  - Values in the status.nodeInfo of MachineTemplates
  - Generic/default labels set in the environment of the cluster autoscaler

  # Conflicts:
  # cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go

* [Tests] Update cluster-api provider to use machineTemplate.status.nodeInfo for architecture-aware autoscale from zero

  kubernetes-sigs/cluster-api#11962 introduced the nodeInfo field for MachineTemplates. Providers can reconcile this field in the status subresource to inform the autoscaler about the architecture and operating system that the MachineTemplate's nodes will run.

  Previously, this behavior was implemented in the cluster autoscaler by leveraging the labels capacity annotation and, as a fallback, default values set in environment variables at cluster-autoscaler deployment time.

  With this commit, the cluster autoscaler computes the future architecture of a node with the following priority order:

  - Labels set in existing nodes, for cases that are not autoscale-from-zero
  - Labels set in the labels capacity annotation of machine template, machine set, and machine deployment
  - Values in the status.nodeInfo of MachineTemplates
  - Generic/default labels set in the environment of the cluster autoscaler
1 parent a257623 commit f7f44c3

File tree

5 files changed

+217
-11
lines changed

5 files changed

+217
-11
lines changed

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_nodegroup.go

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -361,12 +361,17 @@ func (ng *nodegroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
361361
},
362362
}
363363

364+
nsi := ng.scalableResource.InstanceSystemInfo()
365+
if nsi != nil {
366+
node.Status.NodeInfo = *nsi
367+
}
368+
364369
node.Status.Capacity = capacity
365370
node.Status.Allocatable = capacity
366371
node.Status.Conditions = cloudprovider.BuildReadyConditions()
367372
node.Spec.Taints = ng.scalableResource.Taints()
368373

369-
node.Labels, err = ng.buildTemplateLabels(nodeName)
374+
node.Labels, err = ng.buildTemplateLabels(nodeName, nsi)
370375
if err != nil {
371376
return nil, err
372377
}
@@ -380,8 +385,19 @@ func (ng *nodegroup) TemplateNodeInfo() (*framework.NodeInfo, error) {
380385
return nodeInfo, nil
381386
}
382387

383-
func (ng *nodegroup) buildTemplateLabels(nodeName string) (map[string]string, error) {
384-
labels := cloudprovider.JoinStringMaps(buildGenericLabels(nodeName), ng.scalableResource.Labels())
388+
func (ng *nodegroup) buildTemplateLabels(nodeName string, nsi *corev1.NodeSystemInfo) (map[string]string, error) {
389+
nsiLabels := make(map[string]string)
390+
if nsi != nil {
391+
nsiLabels[corev1.LabelArchStable] = nsi.Architecture
392+
nsiLabels[corev1.LabelOSStable] = nsi.OperatingSystem
393+
}
394+
395+
// The order of priority is:
396+
// - Labels set in existing nodes for not-autoscale-from-zero cases
397+
// - Labels set in the labels capacity annotation of machine template, machine set, and machine deployment.
398+
// - Values in the status.nodeSystemInfo of MachineTemplates
399+
// - Generic/default labels set in the environment of the cluster autoscaler
400+
labels := cloudprovider.JoinStringMaps(buildGenericLabels(nodeName), nsiLabels, ng.scalableResource.Labels())
385401

386402
nodes, err := ng.Nodes()
387403
if err != nil {

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_test_framework.go

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ type testConfigBuilder struct {
6060
nodeCount int
6161
annotations map[string]string
6262
capacity map[string]string
63+
nodeInfo map[string]string
6364
}
6465

6566
// NewTestConfigBuilder returns a builder for dynamically constructing mock ClusterAPI resources for testing.
@@ -91,6 +92,7 @@ func (b *testConfigBuilder) Build() *TestConfig {
9192
isMachineDeployment,
9293
b.annotations,
9394
b.capacity,
95+
b.nodeInfo,
9496
)[0],
9597
)[0]
9698
}
@@ -111,6 +113,7 @@ func (b *testConfigBuilder) BuildMultiple(configCount int) []*TestConfig {
111113
isMachineDeployment,
112114
b.annotations,
113115
b.capacity,
116+
b.nodeInfo,
114117
)...,
115118
)
116119
}
@@ -171,6 +174,18 @@ func (b *testConfigBuilder) WithCapacity(c map[string]string) *testConfigBuilder
171174
return b
172175
}
173176

177+
func (b *testConfigBuilder) WithNodeInfo(n map[string]string) *testConfigBuilder {
178+
if n == nil {
179+
b.nodeInfo = nil
180+
} else {
181+
if b.nodeInfo == nil {
182+
b.nodeInfo = map[string]string{}
183+
}
184+
maps.Insert(b.nodeInfo, maps.All(n))
185+
}
186+
return b
187+
}
188+
174189
// TestConfig contains cluster-specific information about a single test configuration.
175190
type TestConfig struct {
176191
spec *TestSpec
@@ -290,8 +305,8 @@ func createTestConfigs(specs ...TestSpec) []*TestConfig {
290305
UID: config.machineSet.GetUID(),
291306
}
292307

293-
if spec.capacity != nil {
294-
klog.V(4).Infof("adding capacity to machine template")
308+
if spec.capacity != nil || spec.nodeInfo != nil {
309+
klog.V(4).Infof("creating machine template")
295310
config.machineTemplate = &unstructured.Unstructured{
296311
Object: map[string]interface{}{
297312
"apiVersion": "infrastructure.cluster.x-k8s.io/v1beta1",
@@ -303,13 +318,25 @@ func createTestConfigs(specs ...TestSpec) []*TestConfig {
303318
},
304319
},
305320
}
321+
}
322+
if spec.capacity != nil {
323+
klog.V(4).Infof("adding capacity to machine template")
306324
if err := unstructured.SetNestedStringMap(config.machineTemplate.Object, spec.capacity, "status", "capacity"); err != nil {
307325
panic(err)
308326
}
309327
} else {
310328
klog.V(4).Infof("not adding capacity")
311329
}
312330

331+
if spec.nodeInfo != nil {
332+
klog.V(4).Infof("adding node info")
333+
if err := unstructured.SetNestedStringMap(config.machineTemplate.Object, spec.nodeInfo, "status", "nodeInfo"); err != nil {
334+
panic(err)
335+
}
336+
} else {
337+
klog.V(4).Infof("not adding node info")
338+
}
339+
313340
for j := 0; j < spec.nodeCount; j++ {
314341
config.nodes[j], config.machines[j] = makeLinkedNodeAndMachine(j, spec.namespace, spec.clusterName, machineOwner, machineSetLabels)
315342
}
@@ -324,6 +351,7 @@ func createTestConfigs(specs ...TestSpec) []*TestConfig {
324351
type TestSpec struct {
325352
annotations map[string]string
326353
capacity map[string]string
354+
nodeInfo map[string]string
327355
machineDeploymentName string
328356
machineSetName string
329357
machinePoolName string
@@ -333,17 +361,17 @@ type TestSpec struct {
333361
rootIsMachineDeployment bool
334362
}
335363

336-
func createTestSpecs(namespace, clusterName, namePrefix string, scalableResourceCount, nodeCount int, isMachineDeployment bool, annotations map[string]string, capacity map[string]string) []TestSpec {
364+
func createTestSpecs(namespace, clusterName, namePrefix string, scalableResourceCount, nodeCount int, isMachineDeployment bool, annotations map[string]string, capacity map[string]string, nodeInfo map[string]string) []TestSpec {
337365
var specs []TestSpec
338366

339367
for i := 0; i < scalableResourceCount; i++ {
340-
specs = append(specs, createTestSpec(namespace, clusterName, fmt.Sprintf("%s-%d", namePrefix, i), nodeCount, isMachineDeployment, annotations, capacity))
368+
specs = append(specs, createTestSpec(namespace, clusterName, fmt.Sprintf("%s-%d", namePrefix, i), nodeCount, isMachineDeployment, annotations, capacity, nodeInfo))
341369
}
342370

343371
return specs
344372
}
345373

346-
func createTestSpec(namespace, clusterName, name string, nodeCount int, isMachineDeployment bool, annotations map[string]string, capacity map[string]string) TestSpec {
374+
func createTestSpec(namespace, clusterName, name string, nodeCount int, isMachineDeployment bool, annotations map[string]string, capacity map[string]string, nodeInfo map[string]string) TestSpec {
347375
return TestSpec{
348376
annotations: annotations,
349377
capacity: capacity,
@@ -353,6 +381,7 @@ func createTestSpec(namespace, clusterName, name string, nodeCount int, isMachin
353381
namespace: namespace,
354382
nodeCount: nodeCount,
355383
rootIsMachineDeployment: isMachineDeployment,
384+
nodeInfo: nodeInfo,
356385
}
357386
}
358387

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"path"
2424
"strconv"
2525
"strings"
26+
"sync"
2627
"time"
2728

2829
"github.com/pkg/errors"
@@ -42,6 +43,8 @@ import (
4243
type unstructuredScalableResource struct {
4344
controller *machineController
4445
unstructured *unstructured.Unstructured
46+
infraObj *unstructured.Unstructured
47+
infraMutex sync.RWMutex
4548
maxSize int
4649
minSize int
4750
autoscalingOptions map[string]string
@@ -321,6 +324,17 @@ func (r unstructuredScalableResource) InstanceCapacity() (map[corev1.ResourceNam
321324
return capacity, nil
322325
}
323326

327+
// InstanceSystemInfo returns the NodeSystemInfo read from the infrastructure reference resource.
328+
// If the infrastructure reference resource is not found, returns nil.
329+
func (r unstructuredScalableResource) InstanceSystemInfo() *apiv1.NodeSystemInfo {
330+
infraObj, err := r.readInfrastructureReferenceResource()
331+
if err != nil || infraObj == nil {
332+
return nil
333+
}
334+
nsiObj := systemInfoFromInfrastructureObject(infraObj)
335+
return &nsiObj
336+
}
337+
324338
func (r unstructuredScalableResource) InstanceResourceSlices(nodeName string) ([]*resourceapi.ResourceSlice, error) {
325339
var result []*resourceapi.ResourceSlice
326340
driver := r.InstanceDRADriver()
@@ -390,6 +404,17 @@ func (r unstructuredScalableResource) InstanceDRADriver() string {
390404
}
391405

392406
func (r unstructuredScalableResource) readInfrastructureReferenceResource() (*unstructured.Unstructured, error) {
407+
// Cache w/ lazy loading of the infrastructure reference resource.
408+
r.infraMutex.RLock()
409+
if r.infraObj != nil {
410+
defer r.infraMutex.RUnlock()
411+
return r.infraObj, nil
412+
}
413+
r.infraMutex.RUnlock()
414+
415+
r.infraMutex.Lock()
416+
defer r.infraMutex.Unlock()
417+
393418
obKind := r.unstructured.GetKind()
394419
obName := r.unstructured.GetName()
395420

@@ -440,6 +465,8 @@ func (r unstructuredScalableResource) readInfrastructureReferenceResource() (*un
440465
return nil, err
441466
}
442467

468+
r.infraObj = infra
469+
443470
return infra, nil
444471
}
445472

@@ -477,6 +504,25 @@ func resourceCapacityFromInfrastructureObject(infraobj *unstructured.Unstructure
477504
return capacity
478505
}
479506

507+
func systemInfoFromInfrastructureObject(infraobj *unstructured.Unstructured) apiv1.NodeSystemInfo {
508+
nsi := apiv1.NodeSystemInfo{}
509+
infransi, found, err := unstructured.NestedStringMap(infraobj.Object, "status", "nodeInfo")
510+
if !found || err != nil {
511+
return nsi
512+
}
513+
514+
for k, v := range infransi {
515+
switch k {
516+
case "architecture":
517+
nsi.Architecture = v
518+
case "operatingSystem":
519+
nsi.OperatingSystem = v
520+
}
521+
}
522+
523+
return nsi
524+
}
525+
480526
// adapted from https://github.com/kubernetes/kubernetes/blob/release-1.25/pkg/util/taints/taints.go#L39
481527
func parseTaint(st string) (apiv1.Taint, error) {
482528
var taint apiv1.Taint

cluster-autoscaler/cloudprovider/clusterapi/clusterapi_unstructured_test.go

Lines changed: 115 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,14 @@ import (
3333
)
3434

3535
const (
36-
cpuStatusKey = "cpu"
37-
memoryStatusKey = "memory"
38-
nvidiaGpuStatusKey = "nvidia.com/gpu"
36+
cpuStatusKey = "cpu"
37+
memoryStatusKey = "memory"
38+
nvidiaGpuStatusKey = "nvidia.com/gpu"
39+
architectureStatusKey = "architecture"
40+
operatingSystemStatusKey = "operatingSystem"
41+
42+
arm64 = "arm64"
43+
linux = "linux"
3944
)
4045

4146
func TestSetSize(t *testing.T) {
@@ -595,3 +600,110 @@ func TestCanScaleFromZero(t *testing.T) {
595600
})
596601
}
597602
}
603+
604+
func TestInstanceSystemInfo(t *testing.T) {
605+
// use a constant capacity as that's necessary for the business logic to consider the resource scalable
606+
capacity := map[string]string{
607+
cpuStatusKey: "1",
608+
memoryStatusKey: "4G",
609+
}
610+
testConfigs := []struct {
611+
name string
612+
nodeInfo map[string]string
613+
expectedArch string
614+
expectedOS string
615+
}{
616+
{
617+
"with no architecture or operating system in machine template's status' nodeInfo, the system info is empty",
618+
map[string]string{},
619+
"",
620+
"",
621+
},
622+
{
623+
"with architecture in machine template's status' nodeInfo, the system info is filled in the scalable resource",
624+
map[string]string{
625+
architectureStatusKey: arm64,
626+
},
627+
arm64,
628+
"",
629+
},
630+
{
631+
"with operating system in machine template's status' nodeInfo, the system info is filled in the scalable resource",
632+
map[string]string{
633+
operatingSystemStatusKey: linux,
634+
},
635+
"",
636+
linux,
637+
},
638+
{
639+
"with architecture and operating system in machine template's status' nodeInfo, the system info is filled in the scalable resource",
640+
map[string]string{
641+
architectureStatusKey: arm64,
642+
operatingSystemStatusKey: linux,
643+
},
644+
arm64,
645+
linux,
646+
},
647+
}
648+
649+
for _, tc := range testConfigs {
650+
testname := fmt.Sprintf("MachineSet %s", tc.name)
651+
t.Run(testname, func(t *testing.T) {
652+
mdTestConfig := NewTestConfigBuilder().
653+
ForMachineSet().
654+
WithNodeCount(1).
655+
WithCapacity(capacity).
656+
WithNodeInfo(tc.nodeInfo).
657+
Build()
658+
controller := NewTestMachineController(t)
659+
defer controller.Stop()
660+
controller.AddTestConfigs(mdTestConfig)
661+
662+
testResource := mdTestConfig.machineSet
663+
664+
sr, err := newUnstructuredScalableResource(controller.machineController, testResource)
665+
if err != nil {
666+
t.Fatal(err)
667+
}
668+
669+
sysInfo := sr.InstanceSystemInfo()
670+
if sysInfo.Architecture != tc.expectedArch {
671+
t.Errorf("expected architecture %s, got %s", tc.nodeInfo[architectureStatusKey], sysInfo.Architecture)
672+
}
673+
if sysInfo.OperatingSystem != tc.expectedOS {
674+
t.Errorf("expected operating system %s, got %s", tc.nodeInfo[operatingSystemStatusKey], sysInfo.OperatingSystem)
675+
}
676+
})
677+
}
678+
679+
for _, tc := range testConfigs {
680+
testname := fmt.Sprintf("MachineDeployment %s", tc.name)
681+
t.Run(testname, func(t *testing.T) {
682+
mdTestConfig := NewTestConfigBuilder().
683+
ForMachineDeployment().
684+
WithNodeCount(1).
685+
WithCapacity(capacity).
686+
WithNodeInfo(tc.nodeInfo).
687+
Build()
688+
controller := NewTestMachineController(t)
689+
defer controller.Stop()
690+
controller.AddTestConfigs(mdTestConfig)
691+
692+
testResource := mdTestConfig.machineDeployment
693+
694+
sr, err := newUnstructuredScalableResource(controller.machineController, testResource)
695+
if err != nil {
696+
t.Fatal(err)
697+
}
698+
699+
sysInfo := sr.InstanceSystemInfo()
700+
if sysInfo.Architecture != tc.expectedArch {
701+
t.Errorf("expected architecture %s, got %s", tc.nodeInfo[architectureStatusKey], sysInfo.Architecture)
702+
}
703+
704+
if sysInfo.OperatingSystem != tc.expectedOS {
705+
t.Errorf("expected operating system %s, got %s", tc.nodeInfo[operatingSystemStatusKey], sysInfo.OperatingSystem)
706+
}
707+
})
708+
}
709+
}

cluster-autoscaler/cloudprovider/util_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,12 @@ func TestBuildKubeProxy(t *testing.T) {
4444
}
4545

4646
func TestJoinStringMaps(t *testing.T) {
47+
emptyMapBeginning := make(map[string]string)
4748
map1 := map[string]string{"1": "a", "2": "b"}
4849
map2 := map[string]string{"3": "c", "2": "d"}
4950
map3 := map[string]string{"5": "e"}
5051
result := JoinStringMaps(map1, map2, map3)
52+
emptyMapEnd := make(map[string]string)
53+
result = JoinStringMaps(emptyMapBeginning, map1, map2, map3, emptyMapEnd)
5154
assert.Equal(t, map[string]string{"1": "a", "2": "d", "3": "c", "5": "e"}, result)
5255
}

0 commit comments

Comments
 (0)