@@ -136,55 +136,31 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
136136
137137 if changed {
138138 log .V (int (zapcore .DebugLevel )).Info ("created/updated job, stopping request processing" )
139- return nil
139+ // Return a NotReadyErr rather than nil here to prevent the controller
140+ // from clearing any previously set Status fields.
141+ return NotReadyErr {errors .New ("job changed" )}
140142 }
141143
142144 log .V (int (zapcore .DebugLevel )).Info ("version checker" , "job" , jobName )
143145 key := kubetypes.NamespacedName {
144146 Namespace : cluster .Namespace (),
145147 Name : jobName ,
146148 }
147- job := & kbatch.Job {}
148149
150+ job := & kbatch.Job {}
149151 if err := v .client .Get (ctx , key , job ); err != nil {
150- err := WaitUntilJobPodIsRunning (ctx , v .clientset , job , log )
151- if err != nil {
152- log .Error (err , "job pod is not running; deleting job" )
153- if dErr := deleteJob (ctx , cluster , v .clientset , job ); dErr != nil {
154- // Log the job deletion error, but return the underlying error that prompted deletion.
155- log .Error (dErr , "failed to delete the job" )
156- }
157- return err
158- }
159- }
160-
161- // We have hit an edge case were sometimes the job selector is nil, the following block is extra code
162- // that tries to list the jobs and then get the job again, which probably will reconcile
163- // the API. We also removed setting the job selector as well.
164- if job .Spec .Selector == nil {
165- log .V (int (zapcore .DebugLevel )).Info ("Job or Job Selector returned as nil, attempting to get it again." )
166-
167- // The job is nil or the selector is nil, we are doing a list, which
168- // should reconcile the API and we we do another get the job should have
169- // the selector.
170- jobs , err := v .clientset .BatchV1 ().Jobs (job .Namespace ).List (ctx , metav1.ListOptions {})
171- if err != nil {
172- log .Error (err , "unable to list jobs" )
173- return err
174- }
175-
176- if jobs == nil || len (jobs .Items ) == 0 {
177- err := errors .New ("unable to find any jobs" )
178- log .Error (err , err .Error ())
179- return err
180- }
181-
182- if err := v .client .Get (ctx , key , job ); err != nil {
183- log .Error (err , "unable to get job" )
184- return err
185- }
152+ log .Error (err , "failed getting Job '%s'" , jobName )
153+ return err
186154 }
187155
156+ // Leftover sanity check just in case there's a missed edge case.
157+ // WaitUntilJobPodIsRunning will panic with a nil dereference if passed an
158+ // empty Job. There was previously an incorrect error check which would
159+ // always panic if the above .Get failed, leading to some strange flakiness
160+ // in tests. An extremely defensive block (see #607) was added, presumably
161+ // as an attempt to mitigate this panic. It has been removed, but this final
162+ // check is left in place just in case this after-the-fact correction was
163+ // misinformed.
188164 if job .Spec .Selector == nil {
189165 err := errors .New ("job selector is nil" )
190166 log .Error (err , err .Error ())
@@ -198,7 +174,7 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
198174 // We need to stop requeueing until further changes on the CR
199175 image := cluster .GetCockroachDBImageName ()
200176 if errBackoff := IsContainerStatusImagePullBackoff (ctx , v .clientset , job , log , image ); errBackoff != nil {
201- err := InvalidContainerVersionError {Err : errBackoff }
177+ err := PermanentErr {Err : errBackoff }
202178 return LogError ("job image incorrect" , err , log )
203179 } else if dErr := deleteJob (ctx , cluster , v .clientset , job ); dErr != nil {
204180 // Log the job deletion error, but return the underlying error that prompted deletion.
@@ -232,6 +208,7 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
232208 }
233209 }
234210 }
211+
235212 podName := tmpPod .Name
236213
237214 req := v .clientset .CoreV1 ().Pods (job .Namespace ).GetLogs (podName , & podLogOpts )
0 commit comments