Skip to content

Commit dbefb9b

Browse files
authored
actor: correctly detect failed version checker Pods (#913)
Previously, the version checker action could silently fail by if given a non-cockroach container. It would incorrectly set the cluster version to the error output of the container resulting in error such as: > failed to reconcile crdb: Service \"crdb\" is invalid: > [metadata.labels: Invalid value: \"/bin/bash: line 1: > /cockroach/cockroach.sh: No such file or directory\" This commit ensures that we the Pod that logs get pulled from has exited with a zero code before attempting to parse the version.
1 parent c6172d4 commit dbefb9b

File tree

9 files changed

+64
-73
lines changed

9 files changed

+64
-73
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
1010
## Fixed
1111

1212
* Delete the CancelLoop function, fixing a cluster status update bug
13+
* Correctly detect failed version checker Pods
1314

1415
# [v2.7.0](https://github.com/cockroachdb/cockroach-operator/compare/v2.6.0...v2.7.0)
1516

pkg/actor/actor.go

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -52,15 +52,6 @@ func (e PermanentErr) Error() string {
5252
return e.Err.Error()
5353
}
5454

55-
//InvalidContainerVersionError error used to stop requeue the request on failure
56-
type InvalidContainerVersionError struct {
57-
Err error
58-
}
59-
60-
func (e InvalidContainerVersionError) Error() string {
61-
return e.Err.Error()
62-
}
63-
6455
//ValidationError error used to stop requeue the request on failure
6556
type ValidationError struct {
6657
Err error

pkg/actor/validate_version.go

Lines changed: 16 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -136,55 +136,31 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
136136

137137
if changed {
138138
log.V(int(zapcore.DebugLevel)).Info("created/updated job, stopping request processing")
139-
return nil
139+
// Return a non error error here to prevent the controller from
140+
// clearing any previously set Status fields.
141+
return NotReadyErr{errors.New("job changed")}
140142
}
141143

142144
log.V(int(zapcore.DebugLevel)).Info("version checker", "job", jobName)
143145
key := kubetypes.NamespacedName{
144146
Namespace: cluster.Namespace(),
145147
Name: jobName,
146148
}
147-
job := &kbatch.Job{}
148149

150+
job := &kbatch.Job{}
149151
if err := v.client.Get(ctx, key, job); err != nil {
150-
err := WaitUntilJobPodIsRunning(ctx, v.clientset, job, log)
151-
if err != nil {
152-
log.Error(err, "job pod is not running; deleting job")
153-
if dErr := deleteJob(ctx, cluster, v.clientset, job); dErr != nil {
154-
// Log the job deletion error, but return the underlying error that prompted deletion.
155-
log.Error(dErr, "failed to delete the job")
156-
}
157-
return err
158-
}
159-
}
160-
161-
// We have hit an edge case were sometimes the job selector is nil, the following block is extra code
162-
// that tries to list the jobs and then get the job again, which probably will reconcile
163-
// the API. We also removed setting the job selector as well.
164-
if job.Spec.Selector == nil {
165-
log.V(int(zapcore.DebugLevel)).Info("Job or Job Selector returned as nil, attempting to get it again.")
166-
167-
// The job is nil or the selector is nil, we are doing a list, which
168-
// should reconcile the API and we we do another get the job should have
169-
// the selector.
170-
jobs, err := v.clientset.BatchV1().Jobs(job.Namespace).List(ctx, metav1.ListOptions{})
171-
if err != nil {
172-
log.Error(err, "unable to list jobs")
173-
return err
174-
}
175-
176-
if jobs == nil || len(jobs.Items) == 0 {
177-
err := errors.New("unable to find any jobs")
178-
log.Error(err, err.Error())
179-
return err
180-
}
181-
182-
if err := v.client.Get(ctx, key, job); err != nil {
183-
log.Error(err, "unable to get job")
184-
return err
185-
}
152+
log.Error(err, "failed getting Job '%s'", jobName)
153+
return err
186154
}
187155

156+
// Left over insanity check just in case there's a missed edge case.
157+
// WaitUntilJobPodIsRunning will panic with a nil dereference if passed an
158+
// empty Job. There was previously an incorrect error check which would
159+
// always panic if the above .Get failed leading to some strange flakiness
160+
// in test. An extremely defensive block (See #607) was added as an attempt
161+
// to mitigate this panic (assumedly). It's been removed but this final
162+
// check is leftover just in case this after the fact correction was
163+
// misinformed.
188164
if job.Spec.Selector == nil {
189165
err := errors.New("job selector is nil")
190166
log.Error(err, err.Error())
@@ -198,7 +174,7 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
198174
// We need to stop requeueing until further changes on the CR
199175
image := cluster.GetCockroachDBImageName()
200176
if errBackoff := IsContainerStatusImagePullBackoff(ctx, v.clientset, job, log, image); errBackoff != nil {
201-
err := InvalidContainerVersionError{Err: errBackoff}
177+
err := PermanentErr{Err: errBackoff}
202178
return LogError("job image incorrect", err, log)
203179
} else if dErr := deleteJob(ctx, cluster, v.clientset, job); dErr != nil {
204180
// Log the job deletion error, but return the underlying error that prompted deletion.
@@ -232,6 +208,7 @@ func (v *versionChecker) Act(ctx context.Context, cluster *resource.Cluster, log
232208
}
233209
}
234210
}
211+
235212
podName := tmpPod.Name
236213

237214
req := v.clientset.CoreV1().Pods(job.Namespace).GetLogs(podName, &podLogOpts)

pkg/controller/BUILD.bazel

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ go_library(
1313
"//pkg/actor:go_default_library",
1414
"//pkg/resource:go_default_library",
1515
"//pkg/util:go_default_library",
16+
"@com_github_cockroachdb_errors//:go_default_library",
1617
"@com_github_go_logr_logr//:go_default_library",
1718
"@com_github_lithammer_shortuuid_v3//:go_default_library",
1819
"@io_k8s_api//apps/v1:go_default_library",
@@ -39,6 +40,7 @@ go_test(
3940
"//pkg/actor:go_default_library",
4041
"//pkg/resource:go_default_library",
4142
"//pkg/testutil:go_default_library",
43+
"@com_github_cockroachdb_errors//:go_default_library",
4244
"@com_github_go_logr_logr//:go_default_library",
4345
"@com_github_go_logr_zapr//:go_default_library",
4446
"@com_github_stretchr_testify//assert:go_default_library",

pkg/controller/cluster_controller.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"github.com/cockroachdb/cockroach-operator/pkg/actor"
2626
"github.com/cockroachdb/cockroach-operator/pkg/resource"
2727
"github.com/cockroachdb/cockroach-operator/pkg/util"
28+
"github.com/cockroachdb/errors"
2829
"github.com/go-logr/logr"
2930
"github.com/lithammer/shortuuid/v3"
3031
"go.uber.org/zap/zapcore"
@@ -154,32 +155,38 @@ func (r *ClusterReconciler) Reconcile(ctx context.Context, req reconcile.Request
154155
// Save the error on the Status for each action
155156
log.Info("Error on action", "Action", actorToExecute.GetActionType(), "err", err.Error())
156157
cluster.SetActionFailed(actorToExecute.GetActionType(), err.Error())
158+
157159
defer func(ctx context.Context, cluster *resource.Cluster) {
158160
if err := r.Client.Status().Update(ctx, cluster.Unwrap()); err != nil {
159161
log.Error(err, "failed to update cluster status")
160162
}
161163
}(ctx, &cluster)
164+
162165
// Short pause
163-
if notReadyErr, ok := err.(actor.NotReadyErr); ok {
166+
var notReadyErr actor.NotReadyErr
167+
if errors.As(err, &notReadyErr) {
164168
log.V(int(zapcore.DebugLevel)).Info("requeueing", "reason", notReadyErr.Error(), "Action", actorToExecute.GetActionType())
165169
return requeueAfter(5*time.Second, nil)
166170
}
167171

168172
// Long pause
169-
if cantRecoverErr, ok := err.(actor.PermanentErr); ok {
173+
var cantRecoverErr actor.PermanentErr
174+
if errors.As(err, &cantRecoverErr) {
170175
log.Error(cantRecoverErr, "can't proceed with reconcile", "Action", actorToExecute.GetActionType())
171176
return noRequeue()
172177
}
173178

174179
// No requeue until the user makes changes
175-
if validationError, ok := err.(actor.ValidationError); ok {
176-
log.Error(validationError, "can't proceed with reconcile")
180+
var validationErr actor.ValidationError
181+
if errors.As(err, &validationErr) {
182+
log.Error(validationErr, "can't proceed with reconcile")
177183
return noRequeue()
178184
}
179185

180186
log.Error(err, "action failed")
181187
return requeueIfError(err)
182188
}
189+
183190
// reset errors on each run if there was an error,
184191
// this is to cover the not ready case
185192
if cluster.Failed(actorToExecute.GetActionType()) {

pkg/controller/cluster_controller_test.go

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,16 @@ package controller_test
1818

1919
import (
2020
"context"
21-
"errors"
2221
"testing"
2322
"time"
2423

25-
"github.com/go-logr/logr"
26-
2724
api "github.com/cockroachdb/cockroach-operator/apis/v1alpha1"
2825
"github.com/cockroachdb/cockroach-operator/pkg/actor"
2926
"github.com/cockroachdb/cockroach-operator/pkg/controller"
3027
"github.com/cockroachdb/cockroach-operator/pkg/resource"
3128
"github.com/cockroachdb/cockroach-operator/pkg/testutil"
29+
"github.com/cockroachdb/errors"
30+
"github.com/go-logr/logr"
3231
"github.com/go-logr/zapr"
3332
"github.com/stretchr/testify/assert"
3433
"github.com/stretchr/testify/require"
@@ -90,26 +89,36 @@ func TestReconcile(t *testing.T) {
9089
want ctrl.Result
9190
wantErr string
9291
}{
93-
// {
94-
// name: "reconcile action fails",
95-
// action: fakeActor{
96-
// err: errors.New("failed to reconcile resource"),
97-
// },
98-
// want: ctrl.Result{Requeue: false},
99-
// wantErr: "failed to reconcile resource",
100-
// },
101-
// {
102-
// name: "reconcile action updates owned resource successfully",
103-
// action: fakeActor{},
104-
// want: ctrl.Result{Requeue: false},
105-
// wantErr: "",
106-
// },
10792
{
10893
name: "on first reconcile we update and requeue",
10994
action: fakeActor{},
11095
want: ctrl.Result{Requeue: true},
11196
wantErr: "",
11297
},
98+
{
99+
name: "reconcile action fails genericly",
100+
action: fakeActor{
101+
err: errors.New("failed to reconcile resource"),
102+
},
103+
want: ctrl.Result{},
104+
wantErr: "failed to reconcile resource",
105+
},
106+
{
107+
name: "reconcile action permanently fails",
108+
action: fakeActor{
109+
err: errors.Wrap(actor.PermanentErr{Err: errors.New("foo")}, "bar"),
110+
},
111+
want: ctrl.Result{Requeue: false},
112+
wantErr: "",
113+
},
114+
{
115+
name: "reconcile action validation fails",
116+
action: fakeActor{
117+
err: actor.ValidationError{Err: errors.New("bar")},
118+
},
119+
want: ctrl.Result{Requeue: false},
120+
wantErr: "",
121+
},
113122
{
114123
name: "reconcile action fails to probe expected condition",
115124
action: fakeActor{

pkg/resource/BUILD.bazel

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,8 @@ go_library(
5757
go_test(
5858
name = "go_default_test",
5959
srcs = [
60-
"cluster_test.go",
6160
"certificate_test.go",
61+
"cluster_test.go",
6262
"discovery_service_test.go",
6363
"pod_distruption_budget_test.go",
6464
"public_service_test.go",

pkg/resource/job.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,11 +63,13 @@ func (b JobBuilder) Build(obj client.Object) error {
6363

6464
// we recreate spec from ground only if we do not find the container job
6565
if dbContainer, err := kube.FindContainer(JobContainerName, &job.Spec.Template.Spec); err != nil {
66+
backoffLimit := int32(2)
6667
job.Spec = kbatch.JobSpec{
6768
// This field is alpha-level and is only honored by servers that enable the TTLAfterFinished feature.
6869
// see https://v1-18.docs.kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#job-v1-batch
6970
TTLSecondsAfterFinished: ptr.Int32(300),
7071
Template: b.buildPodTemplate(),
72+
BackoffLimit: &backoffLimit,
7173
}
7274
} else {
7375
//if job with the container already exists we update the image only
@@ -150,7 +152,7 @@ func (b JobBuilder) MakeContainers() []corev1.Container {
150152
},
151153
},
152154
Command: []string{"/bin/bash"},
153-
Args: []string{"-c", fmt.Sprintf("%s; sleep 150", GetTagVersionCommand)},
155+
Args: []string{"-c", fmt.Sprintf("set -eo pipefail; %s; sleep 150", GetTagVersionCommand)},
154156
},
155157
}
156158
}

pkg/testutil/require.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -621,7 +621,9 @@ func RequireClusterInImagePullBackoff(t *testing.T, sb testenv.DiffingSandbox, b
621621
"app.kubernetes.io/instance": clusterName,
622622
}
623623

624-
wErr := wait.Poll(10*time.Second, 500*time.Second, func() (bool, error) {
624+
// Timeout must be greater than 2 minutes, the max backoff time for the
625+
// version checker job.
626+
wErr := wait.Poll(10*time.Second, 3 * time.Minute, func() (bool, error) {
625627
if err := sb.List(jobList, jobLabel); err != nil {
626628
return false, err
627629
}
@@ -658,7 +660,7 @@ func RequireClusterInFailedState(t *testing.T, sb testenv.DiffingSandbox, b Clus
658660
},
659661
}
660662

661-
wErr := wait.Poll(10*time.Second, 500*time.Second, func() (bool, error) {
663+
wErr := wait.Poll(10*time.Second, 2 * time.Minute, func() (bool, error) {
662664
if err := sb.Get(&crdbCluster); err != nil {
663665
return false, err
664666
}

0 commit comments

Comments
 (0)