# Run this workflow for pull requests that target the branches listed below.
pull_request:
  branches:
    - main
    - "release-*"
    - ray-jobs-feature
    - kueue-integration
  # Pull requests that change only these paths do not trigger the workflow.
  # The glob patterns are quoted because a plain scalar may not start with `*`.
  paths-ignore:
    - "docs/**"
    - "**.adoc"
    - "**.md"
    - "LICENSE"
1516
# Cancel a still-running execution of this workflow when a newer one starts
# for the same pull request branch.
# github.head_ref is only populated for pull request events, so fall back to
# the unique github.run_id; otherwise every run triggered by another event
# would land in the same group and cancel its predecessor.
concurrency:
  group: ${{ github.head_ref || github.run_id }}-${{ github.workflow }}
  cancel-in-progress: true
1920
# Workflow-level environment variables, exported to every job and step.
env:
  # Operator image handed to `make deploy` as IMG by the deploy step.
  CODEFLARE_OPERATOR_IMG: "quay.io/project-codeflare/codeflare-operator:dev"
  # Pinned Kueue release.
  # NOTE(review): no step visible here reads this value; confirm it is consumed.
  KUEUE_VERSION: "v0.13.4"
2224
2325jobs :
2426 kubernetes :
4345 repository : project-codeflare/codeflare-operator
4446 path : codeflare-operator
4547
46- - name : Set Go
47- uses : actions/setup-go@v5
48- with :
49- go-version-file : ' ./codeflare-operator/go.mod'
50- cache-dependency-path : " ./codeflare-operator/go.sum"
51-
5248 - name : Set up gotestfmt
5349 uses : gotesttools/gotestfmt-action@v2
5450 with :
# Install the Python interpreter used by the later test steps.
# The version is quoted so it stays a string and is never read as a float.
- name: Set up specific Python version
  uses: actions/setup-python@v5
  with:
    python-version: "3.12"
    cache: "pip"  # caching pip dependencies
6258
6359 - name : Setup NVidia GPU environment for KinD
6460 uses : ./common/github-actions/nvidia-gpu-setup
@@ -71,16 +67,38 @@ jobs:
# Delegates to the repository-local composite action under common/github-actions.
- name: Install NVidia GPU operator for KinD
  uses: ./common/github-actions/nvidia-gpu-operator
7369
# Block until every node reports Ready, then double-check by polling the
# Ready condition of each node. The step fails if the nodes never settle.
- name: Wait for nodes to be ready
  run: |
    echo "Waiting for all nodes to be ready..."
    kubectl wait --for=condition=Ready nodes --all --timeout=300s

    echo "Checking node status..."
    kubectl get nodes -o wide

    echo "Checking for CNI readiness..."
    nodes_ready=false
    for i in {1..30}; do
      # Capture the status table first: a failed or empty kubectl call must be
      # retried, not mistaken for "no node is un-ready".
      # grep -v prints (and succeeds on) any node whose Ready status is not True.
      if node_status=$(kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}') \
          && [ -n "${node_status}" ] \
          && ! grep -v "True" <<<"${node_status}"; then
        echo "All nodes are ready!"
        nodes_ready=true
        break
      fi
      echo "Waiting for CNI to initialize (attempt $i/30)..."
      sleep 10
    done

    # Do not fall through silently when the retries are exhausted.
    if [ "${nodes_ready}" != "true" ]; then
      echo "Nodes did not become ready after 30 attempts" >&2
      kubectl get nodes -o wide || true
      exit 1
    fi

    # Final verification
    kubectl describe nodes | grep -E "Ready|NetworkReady|RuntimeReady|PodCIDR"

# Build the e2e prerequisites and deploy the operator from the
# codeflare-operator checkout, then wait for its deployment to become Available.
# The step id is referenced by the `steps.deploy.outcome` conditions of later steps.
- name: Deploy CodeFlare stack
  id: deploy
  run: |
    cd codeflare-operator
    echo Setting up CodeFlare stack
    make setup-e2e
    echo Deploying CodeFlare operator
    make deploy -e IMG="${CODEFLARE_OPERATOR_IMG}" -e ENV="e2e"
    kubectl wait --timeout=120s --for=condition=Available=true deployment -n openshift-operators codeflare-operator-manager
    cd ..
84102
85103 - name : Add user to KinD
86104 uses : ./common/github-actions/kind-add-user
@@ -93,16 +111,18 @@ jobs:
93111 kubectl create clusterrolebinding sdk-user-list-ingresses --clusterrole=list-ingresses --user=sdk-user
94112 kubectl create clusterrole namespace-creator --verb=get,list,create,delete,patch --resource=namespaces
95113 kubectl create clusterrolebinding sdk-user-namespace-creator --clusterrole=namespace-creator --user=sdk-user
96- kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch --resource=rayclusters
114+ kubectl create clusterrole raycluster-creator --verb=get,list,create,delete,patch,watch --resource=rayclusters
97115 kubectl create clusterrolebinding sdk-user-raycluster-creator --clusterrole=raycluster-creator --user=sdk-user
98- kubectl create clusterrole appwrapper -creator --verb=get,list,create,delete,patch --resource=appwrappers
99- kubectl create clusterrolebinding sdk-user-appwrapper -creator --clusterrole=appwrapper -creator --user=sdk-user
116+ kubectl create clusterrole rayjob -creator --verb=get,list,create,delete,patch,watch,update --resource=rayjobs,rayjobs/status
117+ kubectl create clusterrolebinding sdk-user-rayjob -creator --clusterrole=rayjob -creator --user=sdk-user
100118 kubectl create clusterrole resourceflavor-creator --verb=get,list,create,delete --resource=resourceflavors
101119 kubectl create clusterrolebinding sdk-user-resourceflavor-creator --clusterrole=resourceflavor-creator --user=sdk-user
102120 kubectl create clusterrole clusterqueue-creator --verb=get,list,create,delete,patch --resource=clusterqueues
103121 kubectl create clusterrolebinding sdk-user-clusterqueue-creator --clusterrole=clusterqueue-creator --user=sdk-user
104122 kubectl create clusterrole localqueue-creator --verb=get,list,create,delete,patch --resource=localqueues
105123 kubectl create clusterrolebinding sdk-user-localqueue-creator --clusterrole=localqueue-creator --user=sdk-user
124+ kubectl create clusterrole workload-creator --verb=get,list,watch --resource=workloads
125+ kubectl create clusterrolebinding sdk-user-workload-creator --clusterrole=workload-creator --user=sdk-user
106126 kubectl create clusterrole list-secrets --verb=get,list --resource=secrets
107127 kubectl create clusterrolebinding sdk-user-list-secrets --clusterrole=list-secrets --user=sdk-user
108128 kubectl create clusterrole pod-creator --verb=get,list,watch --resource=pods
@@ -111,8 +131,31 @@ jobs:
111131 kubectl create clusterrolebinding sdk-user-service-reader --clusterrole=service-reader --user=sdk-user
112132 kubectl create clusterrole port-forward-pods --verb=create --resource=pods/portforward
113133 kubectl create clusterrolebinding sdk-user-port-forward-pods-binding --clusterrole=port-forward-pods --user=sdk-user
134+ kubectl create clusterrole node-reader --verb=get,list --resource=nodes
135+ kubectl create clusterrolebinding sdk-user-node-reader --clusterrole=node-reader --user=sdk-user
114136 kubectl config use-context sdk-user
115137
# Diagnostic snapshot of the cluster taken right before the tests start.
# The pod, Kueue and CRD listings are best-effort (each ends in `||`), while
# the context and node queries are unguarded and fail the step on error.
- name: Verify cluster readiness before tests
  run: |
    # Print a blank line followed by a section title.
    heading() { echo -e "\n$1"; }

    echo "=== Pre-test cluster verification ==="
    echo "Current context:"
    kubectl config current-context

    heading "Node status:"
    kubectl get nodes -o wide

    heading "System pods status:"
    kubectl get pods -A | grep -E "(kube-system|ray-system|kueue-system|openshift-operators)" || true

    # Lists every pod whose phase is neither Running nor Succeeded.
    heading "Checking for any pods in error state:"
    kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded | grep -v "NAMESPACE" || echo "No pods in error state"

    heading "Kueue resources:"
    kubectl get resourceflavors,clusterqueues,localqueues -A || true

    heading "Ray CRDs:"
    kubectl get crd | grep ray || true
158+
116159 - name : Run e2e tests
117160 run : |
118161 export CODEFLARE_TEST_OUTPUT_DIR=${{ env.TEMP_DIR }}
@@ -122,19 +165,37 @@ jobs:
122165 pip install poetry
123166 poetry install --with test,docs
124167 echo "Running e2e tests..."
125- poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/pytest_output.log 2>&1
168+ poetry run pytest -v -s ./tests/e2e -m 'kind and nvidia_gpu' > ${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log 2>&1
169+ env :
170+ GRPC_DNS_RESOLVER : " native"
171+
# Run the RayJob end-to-end suite and capture its output in a log file that a
# later step prints.
# NOTE(review): relies on CODEFLARE_TEST_OUTPUT_DIR already being present in
# the step environment; with `set -u` the step aborts if it is not.
- name: Run RayJob e2e tests
  run: |
    set -euo pipefail
    echo "Running RayJob e2e tests..."
    # Exported for the pytest process. Per the original note, this prevents
    # default queue assignment for non-Kueue tests.
    export DISABLE_DEFAULT_KUEUE_QUEUE=true

    # Runs every test under tests/e2e/rayjob/ (no marker filter); -x stops at
    # the first failure. The redirect target is quoted so an output directory
    # containing spaces cannot break the redirection.
    poetry run pytest -v -s ./tests/e2e/rayjob/ -x > "${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log" 2>&1
  env:
    GRPC_DNS_RESOLVER: "native"
128183
# Point kubectl at the kind-cluster context for the log-printing steps that
# follow. Runs even after an earlier failure, provided the deploy step succeeded.
- name: Switch to kind-cluster context to print logs
  if: always() && steps.deploy.outcome == 'success'
  run: |
    kubectl config use-context kind-cluster
132187
# Dump the RayJob pytest log into the job output.
# The log does not exist when the RayJob test step was skipped (for example
# after an earlier test step failed), so report that instead of failing on `cat`.
- name: Print RayJob E2E Pytest output log
  if: always() && steps.deploy.outcome == 'success'
  run: |
    echo "Printing Pytest output logs"
    log_file="${CODEFLARE_TEST_OUTPUT_DIR}/rayjob_pytest_output.log"
    if [ -f "${log_file}" ]; then
      cat "${log_file}"
    else
      echo "No log found at ${log_file}; the RayJob e2e tests may not have run"
    fi

# Dump the main e2e pytest log into the job output.
# The file name must match the redirect target used by the "Run e2e tests" step.
- name: Print E2E Pytest output log
  if: always() && steps.deploy.outcome == 'success'
  run: |
    echo "Printing Pytest output logs"
    log_file="${CODEFLARE_TEST_OUTPUT_DIR}/e2e-pytest_output.log"
    if [ -f "${log_file}" ]; then
      cat "${log_file}"
    else
      echo "No log found at ${log_file}; the e2e tests may not have run"
    fi
138199
139200 - name : Print CodeFlare operator logs
140201 if : always() && steps.deploy.outcome == 'success'
@@ -162,3 +223,4 @@ jobs:
162223 retention-days : 10
163224 path : |
164225 ${{ env.CODEFLARE_TEST_OUTPUT_DIR }}/**/*.log
226+ if-no-files-found : warn
0 commit comments