actor: correctly detect failed version checker Pods (#913)

chrisseto · web-flow · commit dbefb9b08629 · 2022-06-13T09:44:48.000-04:00
Previously, the version checker action could silently fail if
    given a non-cockroach container. It would incorrectly set the
    cluster version to the error output of the container, resulting in
    errors such as:
    > failed to reconcile crdb: Service \"crdb\" is invalid:
    > [metadata.labels: Invalid value: \"/bin/bash: line 1:
    > /cockroach/cockroach.sh: No such file or directory\"

    This commit ensures that the Pod that logs get pulled from has
    exited with a zero exit code before attempting to parse the version.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 ## Fixed
 
 * Delete the CancelLoop function, fixing a cluster status update bug
+* Correctly detect failed version checker Pods
 
 # [v2.7.0](https://github.com/cockroachdb/cockroach-operator/compare/v2.6.0...v2.7.0)
 
diff --git a/pkg/actor/actor.go b/pkg/actor/actor.go
@@ -52,15 +52,6 @@ func (e PermanentErr) Error() string {
 	return e.Err.Error()
 }
 
-//InvalidContainerVersionError error used to stop requeue the request on failure
-type InvalidContainerVersionError struct {
-	Err error
-}
-
-func (e InvalidContainerVersionError) Error() string {
-	return e.Err.Error()
-}
-
 //ValidationError error used to stop requeue the request on failure
 type ValidationError struct {
 	Err error
diff --git a/pkg/actor/validate_version.go b/pkg/actor/validate_version.go
@@ -136,55 +136,31 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
 
 	if changed {
 		log.V(int(zapcore.DebugLevel)).Info("created/updated job, stopping request processing")
-		return nil
+		// Return a non error error here to prevent the controller from
+		// clearing any previously set Status fields.
+		return NotReadyErr{errors.New("job changed")}
 	}
 
 	log.V(int(zapcore.DebugLevel)).Info("version checker", "job", jobName)
 	key := kubetypes.NamespacedName{
 		Namespace: cluster.Namespace(),
 		Name:      jobName,
 	}
-	job := &kbatch.Job{}
 
+	job := &kbatch.Job{}
 	if err := v.client.Get(ctx, key, job); err != nil {
-		err := WaitUntilJobPodIsRunning(ctx, v.clientset, job, log)
-		if err != nil {
-			log.Error(err, "job pod is not running; deleting job")
-			if dErr := deleteJob(ctx, cluster, v.clientset, job); dErr != nil {
-				// Log the job deletion error, but return the underlying error that prompted deletion.
-				log.Error(dErr, "failed to delete the job")
-			}
-			return err
-		}
-	}
-
-	// We have hit an edge case were sometimes the job selector is nil, the following block is extra code
-	// that tries to list the jobs and then get the job again, which probably will reconcile
-	// the API.  We also removed setting the job selector as well.
-	if job.Spec.Selector == nil {
-		log.V(int(zapcore.DebugLevel)).Info("Job or Job Selector returned as nil, attempting to get it again.")
-
-		// The job is nil or the selector is nil, we are doing a list, which
-		// should reconcile the API and we we do another get the job should have
-		// the selector.
-		jobs, err := v.clientset.BatchV1().Jobs(job.Namespace).List(ctx, metav1.ListOptions{})
-		if err != nil {
-			log.Error(err, "unable to list jobs")
-			return err
-		}
-
-		if jobs == nil || len(jobs.Items) == 0 {
-			err := errors.New("unable to find any jobs")
-			log.Error(err, err.Error())
-			return err
-		}
-
-		if err := v.client.Get(ctx, key, job); err != nil {
-			log.Error(err, "unable to get job")
-			return err
-		}
+		log.Error(err, "failed getting Job '%s'", jobName)
+		return err
 	}
 
+	// Left over insanity check just in case there's a missed edge case.
+	// WaitUntilJobPodIsRunning will panic with a nil dereference if passed an
+	// empty Job. There was previously an incorrect error check which would
+	// always panic if the above .Get failed leading to some strange flakiness
+	// in test. An extremely defensive block (See #607) was added as an attempt
+	// to mitigate this panic (assumedly). It's been removed but this final
+	// check is leftover just in case this after the fact correction was
+	// misinformed.
 	if job.Spec.Selector == nil {
 		err := errors.New("job selector is nil")
 		log.Error(err, err.Error())
@@ -198,7 +174,7 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
 			// We need to stop requeueing until further changes on the CR
 			image := cluster.GetCockroachDBImageName()
 			if errBackoff := IsContainerStatusImagePullBackoff(ctx, v.clientset, job, log, image); errBackoff != nil {
-				err := InvalidContainerVersionError{Err: errBackoff}
+				err := PermanentErr{Err: errBackoff}
 				return LogError("job image incorrect", err, log)
 			} else if dErr := deleteJob(ctx, cluster, v.clientset, job); dErr != nil {
 				// Log the job deletion error, but return the underlying error that prompted deletion.
@@ -232,6 +208,7 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
 				}
 			}
 		}
+
 		podName := tmpPod.Name
 
 		req := v.clientset.CoreV1().Pods(job.Namespace).GetLogs(podName, &podLogOpts)
diff --git a/pkg/controller/BUILD.bazel b/pkg/controller/BUILD.bazel
@@ -13,6 +13,7 @@ go_library(
         "//pkg/actor:go_default_library",
         "//pkg/resource:go_default_library",
         "//pkg/util:go_default_library",
+        "@com_github_cockroachdb_errors//:go_default_library",
         "@com_github_go_logr_logr//:go_default_library",
         "@com_github_lithammer_shortuuid_v3//:go_default_library",
         "@io_k8s_api//apps/v1:go_default_library",
@@ -39,6 +40,7 @@ go_test(
         "//pkg/actor:go_default_library",
         "//pkg/resource:go_default_library",
         "//pkg/testutil:go_default_library",
+        "@com_github_cockroachdb_errors//:go_default_library",
         "@com_github_go_logr_logr//:go_default_library",
         "@com_github_go_logr_zapr//:go_default_library",
         "@com_github_stretchr_testify//assert:go_default_library",
diff --git a/pkg/controller/cluster_controller.go b/pkg/controller/cluster_controller.go
@@ -25,6 +25,7 @@ import (
 	"github.com/cockroachdb/cockroach-operator/pkg/actor"
 	"github.com/cockroachdb/cockroach-operator/pkg/resource"
 	"github.com/cockroachdb/cockroach-operator/pkg/util"
+	"github.com/cockroachdb/errors"
 	"github.com/go-logr/logr"
 	"github.com/lithammer/shortuuid/v3"
 	"go.uber.org/zap/zapcore"
@@ -154,32 +155,38 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request
 		// Save the error on the Status for each action
 		log.Info("Error on action", "Action", actorToExecute.GetActionType(), "err", err.Error())
 		cluster.SetActionFailed(actorToExecute.GetActionType(), err.Error())
+
 		defer func(ctx context.Context, cluster *resource.Cluster) {
 			if err := r.Client.Status().Update(ctx, cluster.Unwrap()); err != nil {
 				log.Error(err, "failed to update cluster status")
 			}
 		}(ctx, &cluster)
+
 		// Short pause
-		if notReadyErr, ok := err.(actor.NotReadyErr); ok {
+		var notReadyErr actor.NotReadyErr
+		if errors.As(err, &notReadyErr) {
 			log.V(int(zapcore.DebugLevel)).Info("requeueing", "reason", notReadyErr.Error(), "Action", actorToExecute.GetActionType())
 			return requeueAfter(5*time.Second, nil)
 		}
 
 		// Long pause
-		if cantRecoverErr, ok := err.(actor.PermanentErr); ok {
+		var cantRecoverErr actor.PermanentErr
+		if errors.As(err, &cantRecoverErr) {
 			log.Error(cantRecoverErr, "can't proceed with reconcile", "Action", actorToExecute.GetActionType())
 			return noRequeue()
 		}
 
 		// No requeue until the user makes changes
-		if validationError, ok := err.(actor.ValidationError); ok {
-			log.Error(validationError, "can't proceed with reconcile")
+		var validationErr actor.ValidationError
+		if errors.As(err, &validationErr) {
+			log.Error(validationErr, "can't proceed with reconcile")
 			return noRequeue()
 		}
 
 		log.Error(err, "action failed")
 		return requeueIfError(err)
 	}
+
 	// reset errors on each run  if there was an error,
 	// this is to cover the not ready case
 	if cluster.Failed(actorToExecute.GetActionType()) {
diff --git a/pkg/controller/cluster_controller_test.go b/pkg/controller/cluster_controller_test.go
@@ -18,17 +18,16 @@ package controller_test
 
 import (
 	"context"
-	"errors"
 	"testing"
 	"time"
 
-	"github.com/go-logr/logr"
-
 	api "github.com/cockroachdb/cockroach-operator/apis/v1alpha1"
 	"github.com/cockroachdb/cockroach-operator/pkg/actor"
 	"github.com/cockroachdb/cockroach-operator/pkg/controller"
 	"github.com/cockroachdb/cockroach-operator/pkg/resource"
 	"github.com/cockroachdb/cockroach-operator/pkg/testutil"
+	"github.com/cockroachdb/errors"
+	"github.com/go-logr/logr"
 	"github.com/go-logr/zapr"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -90,26 +89,36 @@ func TestReconcile(t *testing.T) {
 		want    ctrl.Result
 		wantErr string
 	}{
-		// {
-		// 	name: "reconcile action fails",
-		// 	action: fakeActor{
-		// 		err: errors.New("failed to reconcile resource"),
-		// 	},
-		// 	want:    ctrl.Result{Requeue: false},
-		// 	wantErr: "failed to reconcile resource",
-		// },
-		// {
-		// 	name:    "reconcile action updates owned resource successfully",
-		// 	action:  fakeActor{},
-		// 	want:    ctrl.Result{Requeue: false},
-		// 	wantErr: "",
-		// },
 		{
 			name:    "on first reconcile we update and requeue",
 			action:  fakeActor{},
 			want:    ctrl.Result{Requeue: true},
 			wantErr: "",
 		},
+		{
+			name: "reconcile action fails genericly",
+			action: fakeActor{
+				err: errors.New("failed to reconcile resource"),
+			},
+			want:    ctrl.Result{},
+			wantErr: "failed to reconcile resource",
+		},
+		{
+			name: "reconcile action permanently fails",
+			action: fakeActor{
+				err: errors.Wrap(actor.PermanentErr{Err: errors.New("foo")}, "bar"),
+			},
+			want:    ctrl.Result{Requeue: false},
+			wantErr: "",
+		},
+		{
+			name: "reconcile action validation fails",
+			action: fakeActor{
+				err: actor.ValidationError{Err: errors.New("bar")},
+			},
+			want:    ctrl.Result{Requeue: false},
+			wantErr: "",
+		},
 		{
 			name: "reconcile action fails to probe expected condition",
 			action: fakeActor{
diff --git a/pkg/resource/BUILD.bazel b/pkg/resource/BUILD.bazel
@@ -57,8 +57,8 @@ go_library(
 go_test(
     name = "go_default_test",
     srcs = [
-        "cluster_test.go",
         "certificate_test.go",
+        "cluster_test.go",
         "discovery_service_test.go",
         "pod_distruption_budget_test.go",
         "public_service_test.go",
diff --git a/pkg/resource/job.go b/pkg/resource/job.go
@@ -63,11 +63,13 @@ func (b JobBuilder) Build(obj client.Object) error {
 
 	// we recreate spec from ground only if we do not find the container job
 	if dbContainer, err := kube.FindContainer(JobContainerName, &job.Spec.Template.Spec); err != nil {
+		backoffLimit := int32(2)
 		job.Spec = kbatch.JobSpec{
 			// This field is alpha-level and is only honored by servers that enable the TTLAfterFinished feature.
 			// see https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#job-v1-batch
 			TTLSecondsAfterFinished: ptr.Int32(300),
 			Template:                b.buildPodTemplate(),
+			BackoffLimit:            &backoffLimit,
 		}
 	} else {
 		//if job with the container already exists we update the image only
@@ -150,7 +152,7 @@ func (b JobBuilder) MakeContainers() []corev1.Container {
 				},
 			},
 			Command: []string{"/bin/bash"},
-			Args:    []string{"-c", fmt.Sprintf("%s; sleep 150", GetTagVersionCommand)},
+			Args:    []string{"-c", fmt.Sprintf("set -eo pipefail; %s; sleep 150", GetTagVersionCommand)},
 		},
 	}
 }
diff --git a/pkg/testutil/require.go b/pkg/testutil/require.go
@@ -621,7 +621,9 @@ func RequireClusterInImagePullBackoff(t *testing.T, sb testenv.DiffingSandbox, b
 		"app.kubernetes.io/instance": clusterName,
 	}
 
-	wErr := wait.Poll(10*time.Second, 500*time.Second, func() (bool, error) {
+	// Timeout must be greater than 2 minutes, the max backoff time for the
+	// version checker job.
+	wErr := wait.Poll(10*time.Second, 3 * time.Minute, func() (bool, error) {
 		if err := sb.List(jobList, jobLabel); err != nil {
 			return false, err
 		}
@@ -658,7 +660,7 @@ func RequireClusterInFailedState(t *testing.T, sb testenv.DiffingSandbox, b Clus
 		},
 	}
 
-	wErr := wait.Poll(10*time.Second, 500*time.Second, func() (bool, error) {
+	wErr := wait.Poll(10*time.Second, 2 * time.Minute, func() (bool, error) {
 		if err := sb.Get(&crdbCluster); err != nil {
 			return false, err
 		}

Original file line number	Diff line number	Diff line change
`@@ -621,7 +621,9 @@ func RequireClusterInImagePullBackoff(t *testing.T, sb testenv.DiffingSandbox, b`
`621`	`621`	`"app.kubernetes.io/instance": clusterName,`
`622`	`622`	`}`
`623`	`623`
`624`		`- wErr := wait.Poll(10*time.Second, 500*time.Second, func() (bool, error) {`
	`624`	`+ // Timeout must be greater than 2 minutes, the max backoff time for the`
	`625`	`+ // version checker job.`
	`626`	`+ wErr := wait.Poll(10*time.Second, 3 * time.Minute, func() (bool, error) {`
`625`	`627`	`if err := sb.List(jobList, jobLabel); err != nil {`
`626`	`628`	`return false, err`
`627`	`629`	`}`
`@@ -658,7 +660,7 @@ func RequireClusterInFailedState(t *testing.T, sb testenv.DiffingSandbox, b Clus`
`658`	`660`	`},`
`659`	`661`	`}`
`660`	`662`
`661`		`- wErr := wait.Poll(10*time.Second, 500*time.Second, func() (bool, error) {`
	`663`	`+ wErr := wait.Poll(10*time.Second, 2 * time.Minute, func() (bool, error) {`
`662`	`664`	`if err := sb.Get(&crdbCluster); err != nil {`
`663`	`665`	`return false, err`
`664`	`666`	`}`