Skip to content

Commit ecc3768

Browse files
[no-relnote] add helper internal pkg for tests/e2e
Signed-off-by: Carlos Eduardo Arango Gutierrez <eduardoa@nvidia.com>
1 parent 2876cd0 commit ecc3768

File tree

1 file changed

+288
-0
lines changed

1 file changed

+288
-0
lines changed

tests/e2e/internal/kube.go

Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*
5+
* Licensed under the Apache License, Version 2.0 (the "License");
6+
* you may not use this file except in compliance with the License.
7+
* You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package internal
19+
20+
import (
21+
"context"
22+
"fmt"
23+
"time"
24+
25+
. "github.com/onsi/gomega"
26+
appsv1 "k8s.io/api/apps/v1"
27+
corev1 "k8s.io/api/core/v1"
28+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29+
"k8s.io/apimachinery/pkg/labels"
30+
"k8s.io/client-go/kubernetes"
31+
)
32+
33+
const (
34+
// DefaultPollInterval for Eventually checks
35+
DefaultPollInterval = 2 * time.Second
36+
// DefaultTimeout for Eventually checks
37+
DefaultTimeout = 5 * time.Minute
38+
)
39+
40+
// WaitForDaemonSetRollout waits for a DaemonSet to complete its rollout
41+
func WaitForDaemonSetRollout(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
42+
EventuallyWithOffset(1, func(g Gomega) error {
43+
ds, err := client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{})
44+
if err != nil {
45+
return err
46+
}
47+
48+
// Check if rollout is complete
49+
if ds.Status.DesiredNumberScheduled == 0 {
50+
return fmt.Errorf("daemonset %s/%s has 0 desired pods", namespace, name)
51+
}
52+
53+
if ds.Status.NumberReady != ds.Status.DesiredNumberScheduled {
54+
return fmt.Errorf("daemonset %s/%s rollout incomplete: %d/%d pods ready",
55+
namespace, name, ds.Status.NumberReady, ds.Status.DesiredNumberScheduled)
56+
}
57+
58+
if ds.Status.UpdatedNumberScheduled != ds.Status.DesiredNumberScheduled {
59+
return fmt.Errorf("daemonset %s/%s update incomplete: %d/%d pods updated",
60+
namespace, name, ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled)
61+
}
62+
63+
// Check generation to ensure we're looking at the latest spec
64+
if ds.Generation != ds.Status.ObservedGeneration {
65+
return fmt.Errorf("daemonset %s/%s generation mismatch: %d != %d",
66+
namespace, name, ds.Generation, ds.Status.ObservedGeneration)
67+
}
68+
69+
return nil
70+
}).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed())
71+
return nil
72+
}
73+
74+
// WaitForAllDaemonSetsReady waits for all DaemonSets in a namespace to be ready
75+
func WaitForAllDaemonSetsReady(ctx context.Context, client kubernetes.Interface, namespace string) error {
76+
return WaitForDaemonSetsReady(ctx, client, namespace, "")
77+
}
78+
79+
// WaitForDaemonSetsReady waits for DaemonSets in a namespace to be ready, optionally filtered by label selector
80+
func WaitForDaemonSetsReady(ctx context.Context, client kubernetes.Interface, namespace, labelSelector string) error {
81+
EventuallyWithOffset(1, func(g Gomega) error {
82+
dsList, err := client.AppsV1().DaemonSets(namespace).List(ctx, metav1.ListOptions{
83+
LabelSelector: labelSelector,
84+
})
85+
if err != nil {
86+
return err
87+
}
88+
89+
if len(dsList.Items) == 0 {
90+
return fmt.Errorf("no daemonsets found in namespace %s with selector '%s'", namespace, labelSelector)
91+
}
92+
93+
for _, ds := range dsList.Items {
94+
// Skip if no pods are desired
95+
if ds.Status.DesiredNumberScheduled == 0 {
96+
continue
97+
}
98+
99+
if ds.Status.NumberReady != ds.Status.DesiredNumberScheduled {
100+
return fmt.Errorf("daemonset %s/%s rollout incomplete: %d/%d pods ready",
101+
namespace, ds.Name, ds.Status.NumberReady, ds.Status.DesiredNumberScheduled)
102+
}
103+
104+
if ds.Status.UpdatedNumberScheduled != ds.Status.DesiredNumberScheduled {
105+
return fmt.Errorf("daemonset %s/%s update incomplete: %d/%d pods updated",
106+
namespace, ds.Name, ds.Status.UpdatedNumberScheduled, ds.Status.DesiredNumberScheduled)
107+
}
108+
109+
// Check generation to ensure we're looking at the latest spec
110+
if ds.Generation != ds.Status.ObservedGeneration {
111+
return fmt.Errorf("daemonset %s/%s generation mismatch: %d != %d",
112+
namespace, ds.Name, ds.Generation, ds.Status.ObservedGeneration)
113+
}
114+
}
115+
116+
return nil
117+
}).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed())
118+
return nil
119+
}
120+
121+
// WaitForDaemonSetPodsReady waits for all pods of a DaemonSet to be ready
122+
func WaitForDaemonSetPodsReady(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
123+
EventuallyWithOffset(1, func(g Gomega) error {
124+
ds, err := client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{})
125+
if err != nil {
126+
return err
127+
}
128+
129+
selector, err := metav1.LabelSelectorAsSelector(ds.Spec.Selector)
130+
if err != nil {
131+
return fmt.Errorf("invalid selector: %v", err)
132+
}
133+
134+
pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
135+
LabelSelector: selector.String(),
136+
})
137+
if err != nil {
138+
return err
139+
}
140+
141+
if len(pods.Items) == 0 {
142+
return fmt.Errorf("no pods found for daemonset %s/%s", namespace, name)
143+
}
144+
145+
for _, pod := range pods.Items {
146+
if !isPodReady(&pod) {
147+
return fmt.Errorf("pod %s/%s is not ready", pod.Namespace, pod.Name)
148+
}
149+
}
150+
151+
return nil
152+
}).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed())
153+
return nil
154+
}
155+
156+
// WaitForNodeLabels waits for specific labels to appear on nodes
157+
func WaitForNodeLabels(ctx context.Context, client kubernetes.Interface, labelSelector string, expectedLabels map[string]string) error {
158+
EventuallyWithOffset(1, func(g Gomega) error {
159+
nodes, err := client.CoreV1().Nodes().List(ctx, metav1.ListOptions{
160+
LabelSelector: labelSelector,
161+
})
162+
if err != nil {
163+
return err
164+
}
165+
166+
if len(nodes.Items) == 0 {
167+
return fmt.Errorf("no nodes found with selector: %s", labelSelector)
168+
}
169+
170+
// Check each node has the expected labels
171+
for _, node := range nodes.Items {
172+
for key, expectedValue := range expectedLabels {
173+
actualValue, exists := node.Labels[key]
174+
if !exists {
175+
return fmt.Errorf("node %s missing label: %s", node.Name, key)
176+
}
177+
if expectedValue != "" && actualValue != expectedValue {
178+
return fmt.Errorf("node %s label %s=%s, expected %s",
179+
node.Name, key, actualValue, expectedValue)
180+
}
181+
}
182+
}
183+
184+
return nil
185+
}).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed())
186+
return nil
187+
}
188+
189+
// WaitForGFDLabels waits for GPU Feature Discovery labels on nodes
190+
func WaitForGFDLabels(ctx context.Context, client kubernetes.Interface, nodeName string) error {
191+
gfdLabels := []string{
192+
"nvidia.com/gfd.timestamp",
193+
"nvidia.com/cuda.driver.major",
194+
"nvidia.com/cuda.driver.minor",
195+
"nvidia.com/gpu.family",
196+
"nvidia.com/gpu.machine",
197+
"nvidia.com/gpu.memory",
198+
"nvidia.com/gpu.product",
199+
}
200+
201+
EventuallyWithOffset(1, func(g Gomega) error {
202+
node, err := client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
203+
if err != nil {
204+
return err
205+
}
206+
207+
for _, label := range gfdLabels {
208+
if _, exists := node.Labels[label]; !exists {
209+
return fmt.Errorf("node %s missing GFD label: %s", nodeName, label)
210+
}
211+
}
212+
213+
return nil
214+
}).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed())
215+
return nil
216+
}
217+
218+
// WaitForPodsRunning waits for pods matching a selector to be running
219+
func WaitForPodsRunning(ctx context.Context, client kubernetes.Interface, namespace string, selector labels.Selector) error {
220+
EventuallyWithOffset(1, func(g Gomega) error {
221+
pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
222+
LabelSelector: selector.String(),
223+
})
224+
if err != nil {
225+
return err
226+
}
227+
228+
if len(pods.Items) == 0 {
229+
return fmt.Errorf("no pods found matching selector: %s", selector.String())
230+
}
231+
232+
for _, pod := range pods.Items {
233+
if pod.Status.Phase != corev1.PodRunning {
234+
return fmt.Errorf("pod %s/%s is %s, not Running", pod.Namespace, pod.Name, pod.Status.Phase)
235+
}
236+
}
237+
238+
return nil
239+
}).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed())
240+
return nil
241+
}
242+
243+
// WaitForDeploymentRollout waits for a deployment to complete its rollout
244+
func WaitForDeploymentRollout(ctx context.Context, client kubernetes.Interface, namespace, name string) error {
245+
EventuallyWithOffset(1, func(g Gomega) error {
246+
deployment, err := client.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{})
247+
if err != nil {
248+
return err
249+
}
250+
251+
// Check if the deployment is complete
252+
for _, condition := range deployment.Status.Conditions {
253+
if condition.Type == appsv1.DeploymentProgressing {
254+
if condition.Status != corev1.ConditionTrue {
255+
return fmt.Errorf("deployment %s/%s is not progressing: %s", namespace, name, condition.Message)
256+
}
257+
}
258+
if condition.Type == appsv1.DeploymentAvailable {
259+
if condition.Status != corev1.ConditionTrue {
260+
return fmt.Errorf("deployment %s/%s is not available: %s", namespace, name, condition.Message)
261+
}
262+
}
263+
}
264+
265+
if deployment.Status.UpdatedReplicas != *deployment.Spec.Replicas {
266+
return fmt.Errorf("deployment %s/%s update incomplete: %d/%d replicas updated",
267+
namespace, name, deployment.Status.UpdatedReplicas, *deployment.Spec.Replicas)
268+
}
269+
270+
if deployment.Status.ReadyReplicas != *deployment.Spec.Replicas {
271+
return fmt.Errorf("deployment %s/%s not ready: %d/%d replicas ready",
272+
namespace, name, deployment.Status.ReadyReplicas, *deployment.Spec.Replicas)
273+
}
274+
275+
return nil
276+
}).WithContext(ctx).WithPolling(DefaultPollInterval).WithTimeout(DefaultTimeout).Should(Succeed())
277+
return nil
278+
}
279+
280+
// isPodReady checks if a pod is ready
281+
func isPodReady(pod *corev1.Pod) bool {
282+
for _, condition := range pod.Status.Conditions {
283+
if condition.Type == corev1.PodReady {
284+
return condition.Status == corev1.ConditionTrue
285+
}
286+
}
287+
return false
288+
}

0 commit comments

Comments
 (0)