From f0ece305269d9ec55be3826db84fae8a569b811f Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Tue, 22 Apr 2025 18:27:16 +0200 Subject: [PATCH 01/12] Feat reattach to manual destroy run --- .../workspace_controller_deletion_policy.go | 24 +++++ ...rkspace_controller_deletion_policy_test.go | 91 +++++++++++++++++++ .../workspace_controller_outputs_test.go | 43 +++++++++ 3 files changed, 158 insertions(+) diff --git a/internal/controller/workspace_controller_deletion_policy.go b/internal/controller/workspace_controller_deletion_policy.go index 47e3764a..badf222c 100644 --- a/internal/controller/workspace_controller_deletion_policy.go +++ b/internal/controller/workspace_controller_deletion_policy.go @@ -101,6 +101,30 @@ func (r *WorkspaceReconciler) deleteWorkspace(ctx context.Context, w *workspaceI if _, ok := runStatusUnsuccessful[run.Status]; ok { w.log.Info("Destroy Run", "msg", fmt.Sprintf("destroy run %s is unsuccessful: %s", run.ID, run.Status)) + + workspace, err := w.tfClient.Client.Workspaces.ReadByID(ctx, w.instance.Status.WorkspaceID) + if err != nil { + return r.handleWorkspaceErrorNotFound(ctx, w, err) + } + + w.log.Info("Destroy Run", "msg", fmt.Sprintf("CurrentRun: %s %s %v", workspace.CurrentRun.ID, workspace.CurrentRun.Status, workspace.CurrentRun.IsDestroy)) + + if workspace.CurrentRun != nil && workspace.CurrentRun.ID != w.instance.Status.DestroyRunID { + + run, err := w.tfClient.Client.Runs.Read(ctx, w.instance.Status.DestroyRunID) + if err != nil { + // ignore this run id, and let the next reconcile loop handle the error + return nil + } + if run.IsDestroy { + w.log.Info("Destroy Run", "msg", fmt.Sprintf("found more recent destroy run %s, updating DestroyRunID", workspace.CurrentRun.ID)) + + w.instance.Status.DestroyRunID = workspace.CurrentRun.ID + w.updateWorkspaceStatusRun(run) + return r.Status().Update(ctx, &w.instance) + } + } + return nil } w.log.Info("Destroy Run", "msg", fmt.Sprintf("destroy run %s is not finished", run.ID)) diff --git a/internal/controller/workspace_controller_deletion_policy_test.go b/internal/controller/workspace_controller_deletion_policy_test.go index 68e54dfe..a3e13e36 100644 --- a/internal/controller/workspace_controller_deletion_policy_test.go +++ b/internal/controller/workspace_controller_deletion_policy_test.go @@ -195,6 +195,97 @@ var _ = Describe("Workspace controller", Ordered, func() { return err == tfc.ErrResourceNotFound }).Should(BeTrue()) }) + It("can destroy delete a workspace when the destroy was retried manually after failing", func() { + if cloudEndpoint != tfcDefaultAddress { + Skip("Does not run against TFC, skip this test") + } + instance.Spec.AllowDestroyPlan = true + instance.Spec.DeletionPolicy = appv1alpha2.DeletionPolicyDestroy + createWorkspace(instance) + workspaceID := instance.Status.WorkspaceID + + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + Eventually(func() bool { + listOpts := tfc.ListOptions{ + PageNumber: 1, + PageSize: maxPageSize, + } + for listOpts.PageNumber != 0 { + runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{ + ListOptions: listOpts, + }) + Expect(err).To(Succeed()) + for _, r := range runs.Items { + if r.ConfigurationVersion.ID == cv.ID { + return r.Status == tfc.RunApplied + } + } + listOpts.PageNumber = runs.NextPage + } + return false + }).Should(BeTrue()) + + // create an errored ConfigurationVersion for the delete to fail + cv = createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, false) + + 
Expect(k8sClient.Delete(ctx, instance)).To(Succeed()) + + var destroyRunID string + Eventually(func() bool { + ws, err := tfClient.Workspaces.ReadByID(ctx, workspaceID) + Expect(err).To(Succeed()) + Expect(ws).ToNot(BeNil()) + Expect(ws.CurrentRun).ToNot(BeNil()) + run, err := tfClient.Runs.Read(ctx, ws.CurrentRun.ID) + Expect(err).To(Succeed()) + Expect(run).ToNot(BeNil()) + destroyRunID = run.ID + + return run.IsDestroy + }).Should(BeTrue()) + + Eventually(func() bool { + run, _ := tfClient.Runs.Read(ctx, destroyRunID) + if run.Status == tfc.RunErrored { + return true + } + + return false + }).Should(BeTrue()) + + // put back a working configuration + cv = createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + + // start a new destroy run manually + run, err := tfClient.Runs.Create(ctx, tfc.RunCreateOptions{ + IsDestroy: tfc.Bool(true), + Message: tfc.String(runMessage), + Workspace: &tfc.Workspace{ + ID: workspaceID, + }, + }) + Expect(err).To(Succeed()) + Expect(run).ToNot(BeNil()) + + var newDestroyRunID string + Eventually(func() bool { + ws, err := tfClient.Workspaces.ReadByID(ctx, workspaceID) + Expect(err).To(Succeed()) + Expect(ws).ToNot(BeNil()) + Expect(ws.CurrentRun).ToNot(BeNil()) + run, err := tfClient.Runs.Read(ctx, ws.CurrentRun.ID) + Expect(err).To(Succeed()) + Expect(run).ToNot(BeNil()) + newDestroyRunID = run.ID + + return run.IsDestroy && newDestroyRunID != destroyRunID + }).Should(BeTrue()) + + Eventually(func() bool { + _, err := tfClient.Workspaces.ReadByID(ctx, workspaceID) + return err == tfc.ErrResourceNotFound + }).Should(BeTrue()) + }) It("can force delete a workspace", func() { instance.Spec.DeletionPolicy = appv1alpha2.DeletionPolicyForce createWorkspace(instance) diff --git a/internal/controller/workspace_controller_outputs_test.go b/internal/controller/workspace_controller_outputs_test.go index e9a3be34..f8a611b2 100644 --- a/internal/controller/workspace_controller_outputs_test.go +++ b/internal/controller/workspace_controller_outputs_test.go @@ -180,3 +180,46 @@ func createAndUploadConfigurationVersion(workspaceID string, outputValue string) return cv } + +func createAndUploadErroredConfigurationVersion(workspaceID string, autoQueueRuns bool) *tfc.ConfigurationVersion { + GinkgoHelper() + // Create a temporary dir in the current one + cd, err := os.Getwd() + Expect(err).Should(Succeed()) + td, err := os.MkdirTemp(cd, "tf-*") + Expect(err).Should(Succeed()) + defer os.RemoveAll(td) + // Create a temporary file in the temporary dir + f, err := os.CreateTemp(td, "*.tf") + Expect(err).Should(Succeed()) + defer os.Remove(f.Name()) + // Terraform code to upload + tf := fmt.Sprint(` + resource "test_non_existent_resource" "this" {} + `) + // Save the Terraform code to the temporary file + _, err = f.WriteString(tf) + Expect(err).Should(Succeed()) + + cv, err := tfClient.ConfigurationVersions.Create(ctx, workspaceID, tfc.ConfigurationVersionCreateOptions{ + AutoQueueRuns: tfc.Bool(autoQueueRuns), + Speculative: tfc.Bool(false), + }) + Expect(err).Should(Succeed()) + Expect(cv).ShouldNot(BeNil()) + + Expect(tfClient.ConfigurationVersions.Upload(ctx, cv.UploadURL, td)).Should(Succeed()) + + Eventually(func() bool { + c, err := tfClient.ConfigurationVersions.Read(ctx, cv.ID) + if err != nil { + return false + } + if c.Status == tfc.ConfigurationUploaded { + return true + } + return false + }).Should(BeTrue()) + + return cv +} From 956f0dd4c75e7a403e7c543d172272ae05f3c67b Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Thu, 24 Apr 2025 
16:46:53 +0200 Subject: [PATCH 02/12] docs: add changelog --- .changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml diff --git a/.changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml b/.changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml new file mode 100644 index 00000000..d217e400 --- /dev/null +++ b/.changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml @@ -0,0 +1,5 @@ +kind: ENHANCEMENTS +body: Relink to more recent destroy runs when destroying a workspace +time: 2025-04-24T16:46:45.710081284+02:00 +custom: + PR: "594" From 6702906407ad6d041c915dda8305eb578f917136 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Thu, 24 Apr 2025 16:00:34 +0200 Subject: [PATCH 03/12] test: add option to not trigger run when uploading a new config --- .../controller/agentpool_controller_autoscaling_test.go | 4 ++-- .../workspace_controller_deletion_policy_test.go | 8 ++++---- internal/controller/workspace_controller_outputs_test.go | 8 ++++---- internal/controller/workspace_controller_runs_test.go | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/internal/controller/agentpool_controller_autoscaling_test.go b/internal/controller/agentpool_controller_autoscaling_test.go index 87d8666d..314d8798 100644 --- a/internal/controller/agentpool_controller_autoscaling_test.go +++ b/internal/controller/agentpool_controller_autoscaling_test.go @@ -92,7 +92,7 @@ var _ = Describe("Agent Pool controller", Ordered, func() { Expect(err).Should(Succeed()) Expect(ws).ShouldNot(BeNil()) // Create a new Run and execute it - _ = createAndUploadConfigurationVersion(ws.ID, "hoi") + _ = createAndUploadConfigurationVersion(ws.ID, "hoi", true) Eventually(func() bool { ws, err = tfClient.Workspaces.ReadByID(ctx, ws.ID) Expect(err).Should(Succeed()) @@ -147,7 +147,7 @@ var _ = Describe("Agent Pool controller", Ordered, func() { Expect(err).Should(Succeed()) Expect(ws).ShouldNot(BeNil()) // New Run - _ = createAndUploadConfigurationVersion(ws.ID, "hoi") + _ = createAndUploadConfigurationVersion(ws.ID, "hoi", true) Eventually(func() bool { ws, err = tfClient.Workspaces.ReadByID(ctx, ws.ID) Expect(err).Should(Succeed()) diff --git a/internal/controller/workspace_controller_deletion_policy_test.go b/internal/controller/workspace_controller_deletion_policy_test.go index a3e13e36..c7d5c8b2 100644 --- a/internal/controller/workspace_controller_deletion_policy_test.go +++ b/internal/controller/workspace_controller_deletion_policy_test.go @@ -97,7 +97,7 @@ var _ = Describe("Workspace controller", Ordered, func() { createWorkspace(instance) workspaceID := instance.Status.WorkspaceID - cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) Eventually(func() bool { listOpts := tfc.ListOptions{ PageNumber: 1, @@ -144,7 +144,7 @@ var _ = Describe("Workspace controller", Ordered, func() { createWorkspace(instance) workspaceID := instance.Status.WorkspaceID - cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) Eventually(func() bool { listOpts := tfc.ListOptions{ PageNumber: 1, @@ -204,7 +204,7 @@ var _ = Describe("Workspace controller", Ordered, func() { createWorkspace(instance) workspaceID := instance.Status.WorkspaceID - cv := 
createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) Eventually(func() bool { listOpts := tfc.ListOptions{ PageNumber: 1, @@ -254,7 +254,7 @@ var _ = Describe("Workspace controller", Ordered, func() { }).Should(BeTrue()) // put back a working configuration - cv = createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + cv = createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) // start a new destroy run manually run, err := tfClient.Runs.Create(ctx, tfc.RunCreateOptions{ diff --git a/internal/controller/workspace_controller_outputs_test.go b/internal/controller/workspace_controller_outputs_test.go index f8a611b2..e2b1a897 100644 --- a/internal/controller/workspace_controller_outputs_test.go +++ b/internal/controller/workspace_controller_outputs_test.go @@ -76,7 +76,7 @@ var _ = Describe("Workspace controller", Ordered, func() { createWorkspace(instance) outputValue := "hoi" - cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, outputValue) + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, outputValue, true) By("Validating configuration version and workspace run") Eventually(func() bool { @@ -132,7 +132,7 @@ var _ = Describe("Workspace controller", Ordered, func() { }) }) -func createAndUploadConfigurationVersion(workspaceID string, outputValue string) *tfc.ConfigurationVersion { +func createAndUploadConfigurationVersion(workspaceID string, outputValue string, autoQueueRuns bool) *tfc.ConfigurationVersion { GinkgoHelper() // Create a temporary dir in the current one cd, err := os.Getwd() @@ -140,7 +140,7 @@ func createAndUploadConfigurationVersion(workspaceID string, outputValue string) td, err := os.MkdirTemp(cd, "tf-*") Expect(err).Should(Succeed()) defer os.RemoveAll(td) - // Create a temporary file in the temporary dir + // Create a te AutoQueueRuns: tfc.Bool(autoQueueRuns), dir f, err := os.CreateTemp(td, "*.tf") Expect(err).Should(Succeed()) defer os.Remove(f.Name()) @@ -159,7 +159,7 @@ func createAndUploadConfigurationVersion(workspaceID string, outputValue string) Expect(err).Should(Succeed()) cv, err := tfClient.ConfigurationVersions.Create(ctx, workspaceID, tfc.ConfigurationVersionCreateOptions{ - AutoQueueRuns: tfc.Bool(true), + AutoQueueRuns: tfc.Bool(autoQueueRuns), Speculative: tfc.Bool(false), }) Expect(err).Should(Succeed()) diff --git a/internal/controller/workspace_controller_runs_test.go b/internal/controller/workspace_controller_runs_test.go index f2bb5421..6d9b0f41 100644 --- a/internal/controller/workspace_controller_runs_test.go +++ b/internal/controller/workspace_controller_runs_test.go @@ -73,7 +73,7 @@ var _ = Describe("Workspace controller", Ordered, func() { namespacedName := getNamespacedName(instance) // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation createWorkspace(instance) - createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) Eventually(func() bool { Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) if instance.Status.Run == nil { @@ -87,7 +87,7 @@ var _ = Describe("Workspace controller", Ordered, func() { // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation createWorkspace(instance) - createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") 
+ createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) Eventually(func() bool { Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) if instance.Status.Run == nil { From f8928bb54464c658decd706ca5c44b52c628f17d Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Thu, 24 Apr 2025 16:30:50 +0200 Subject: [PATCH 04/12] test: add a test for the retry logic that fails --- .../workspace_controller_retry_test.go | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 internal/controller/workspace_controller_retry_test.go diff --git a/internal/controller/workspace_controller_retry_test.go b/internal/controller/workspace_controller_retry_test.go new file mode 100644 index 00000000..9d12e2bf --- /dev/null +++ b/internal/controller/workspace_controller_retry_test.go @@ -0,0 +1,122 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package controller + +import ( + "fmt" + "time" + + tfc "github.com/hashicorp/go-tfe" + appv1alpha2 "github.com/hashicorp/hcp-terraform-operator/api/v1alpha2" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +var _ = Describe("Workspace controller", Ordered, func() { + var ( + instance *appv1alpha2.Workspace + namespacedName types.NamespacedName + workspace string + ) + + BeforeAll(func() { + // Set default Eventually timers + SetDefaultEventuallyTimeout(syncPeriod * 4) + SetDefaultEventuallyPollingInterval(2 * time.Second) + }) + + BeforeEach(func() { + if cloudEndpoint != tfcDefaultAddress { + Skip("Does not run against TFC, skip this test") + } + namespacedName = newNamespacedName() + workspace = fmt.Sprintf("kubernetes-operator-%v", randomNumber()) + // Create a new workspace object for each test + instance = &appv1alpha2.Workspace{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "app.terraform.io/v1alpha2", + Kind: "Workspace", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: namespacedName.Name, + Namespace: namespacedName.Namespace, + DeletionTimestamp: nil, + Finalizers: []string{}, + }, + Spec: appv1alpha2.WorkspaceSpec{ + Organization: organization, + Token: appv1alpha2.Token{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: secretNamespacedName.Name, + }, + Key: secretKey, + }, + }, + Name: workspace, + ApplyMethod: "auto", + ApplyRunTrigger: "auto", + }, + Status: appv1alpha2.WorkspaceStatus{}, + } + }) + + AfterEach(func() { + deleteWorkspace(instance) + }) + + Context("Retry", func() { + It("can retry failed runs", func() { + namespacedName := getNamespacedName(instance) + // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation + createWorkspace(instance) + // TODO: add the retry config in the CRD to actually do a retry + + // start a run that will fail + createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) + + var runID string + + Eventually(func() bool { + Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) + if instance.Status.Run == nil { + return false + } + + runID = instance.Status.Run.ID + + _, ok := runStatusUnsuccessful[tfc.RunStatus(instance.Status.Run.Status)] + + return ok + }).Should(BeTrue()) + + // Fix the code but no not start a run manually + createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", false) + + // a new run should be started automatically + Eventually(func() 
bool { + Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) + if instance.Status.Run == nil { + return false + } + // TODO: check that the retry count of the status decreased + + return runID != instance.Status.Run.ID + }).Should(BeTrue()) + + // Since the code is fixed at some point a run will succeed + Eventually(func() bool { + Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) + if instance.Status.Run == nil { + return false + } + + return runID != instance.Status.Run.ID && instance.Status.Run.RunCompleted() + }).Should(BeTrue()) + }) + }) +}) From e8c06dadd708269edd49342980e7d18ce33681a4 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Thu, 24 Apr 2025 16:44:18 +0200 Subject: [PATCH 05/12] feat(api): add retry policy and status --- api/v1alpha2/workspace_types.go | 41 +++++++++++++++++++ api/v1alpha2/zz_generated.deepcopy.go | 41 +++++++++++++++++++ .../crds/app.terraform.io_workspaces.yaml | 35 ++++++++++++++++ .../bases/app.terraform.io_workspaces.yaml | 35 ++++++++++++++++ docs/api-reference.md | 33 +++++++++++++++ 5 files changed, 185 insertions(+) diff --git a/api/v1alpha2/workspace_types.go b/api/v1alpha2/workspace_types.go index d98d568c..26cc142a 100644 --- a/api/v1alpha2/workspace_types.go +++ b/api/v1alpha2/workspace_types.go @@ -65,6 +65,25 @@ type RemoteStateSharing struct { Workspaces []*ConsumerWorkspace `json:"workspaces,omitempty"` } +// RetryPolicy allows you to configure retry behavior for failed runs on the workspace. +// It will apply for the latest current run of the operator. +type RetryPolicy struct { + // Limit is the maximum number of retries for failed runs. + // Default: `0`. + // + // +kubebuilder:default:=0 + // +optional + Limit int64 `json:"limit,omitempty"` + // Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. + // More information: + // - https://pkg.go.dev/time#ParseDuration + // Default: `""`. + // + // +kubebuilder:default:="" + // +optional + Backoff string `json:"backoff,omitempty"` +} + // Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle. // Only one of the fields `ID` or `Name` is allowed. // At least one of the fields `ID` or `Name` is mandatory. @@ -592,6 +611,10 @@ type WorkspaceSpec struct { // //+optional RemoteStateSharing *RemoteStateSharing `json:"remoteStateSharing,omitempty"` + // Retry Policy allows you to specify how the operator should retry failed runs automatically. + // + //+optional + RetryPolicy *RetryPolicy `json:"retryPolicy,omitempty"` // Run triggers allow you to connect this workspace to one or more source workspaces. // These connections allow runs to queue automatically in this workspace on successful apply of runs in any of the source workspaces. // More information: @@ -742,6 +765,11 @@ type WorkspaceStatus struct { // //+optional VariableSets []VariableSetStatus `json:"variableSet,omitempty"` + + // Retry status of the latest run on the workspace. + // + //+optional + Retry *RetryStatus `json:"retry,omitempty"` } type VariableSetStatus struct { @@ -749,6 +777,19 @@ type VariableSetStatus struct { Name string `json:"name,omitempty"` } +// RetryStatus contains the status of the retry of the latest run on the workspace. How many attempts are left and +// possibly a time to wait for the next attempt. +type RetryStatus struct { + // RetriesLeft is the number of retries left for the latest run on the workspace. 
+ RetriesLeft int64 `json:"retriesLeft,omitempty"` + + // NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry + // if the Backoff option was added. It is represented in RFC3339 form and is in UTC. + // + // +optional + NextRetryTimestamp metav1.Time `json:"nextRetryTimestamp,omitempty"` +} + //+kubebuilder:object:root=true //+kubebuilder:subresource:status //+kubebuilder:printcolumn:name="Workspace ID",type=string,JSONPath=`.status.workspaceID` diff --git a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go index d8fccce7..f054c31a 100644 --- a/api/v1alpha2/zz_generated.deepcopy.go +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -749,6 +749,37 @@ func (in *RemoteStateSharing) DeepCopy() *RemoteStateSharing { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RetryPolicy) DeepCopyInto(out *RetryPolicy) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RetryPolicy. +func (in *RetryPolicy) DeepCopy() *RetryPolicy { + if in == nil { + return nil + } + out := new(RetryPolicy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RetryStatus) DeepCopyInto(out *RetryStatus) { + *out = *in + in.NextRetryTimestamp.DeepCopyInto(&out.NextRetryTimestamp) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RetryStatus. +func (in *RetryStatus) DeepCopy() *RetryStatus { + if in == nil { + return nil + } + out := new(RetryStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RunStatus) DeepCopyInto(out *RunStatus) { *out = *in @@ -1114,6 +1145,11 @@ func (in *WorkspaceSpec) DeepCopyInto(out *WorkspaceSpec) { *out = new(RemoteStateSharing) (*in).DeepCopyInto(*out) } + if in.RetryPolicy != nil { + in, out := &in.RetryPolicy, &out.RetryPolicy + *out = new(RetryPolicy) + **out = **in + } if in.RunTriggers != nil { in, out := &in.RunTriggers, &out.RunTriggers *out = make([]RunTrigger, len(*in)) @@ -1181,6 +1217,11 @@ func (in *WorkspaceStatus) DeepCopyInto(out *WorkspaceStatus) { *out = make([]VariableSetStatus, len(*in)) copy(*out, *in) } + if in.Retry != nil { + in, out := &in.Retry, &out.Retry + *out = new(RetryStatus) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkspaceStatus. diff --git a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml index a553d237..8f1021b0 100644 --- a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml +++ b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml @@ -374,6 +374,26 @@ spec: minItems: 1 type: array type: object + retryPolicy: + description: Retry Policy allows you to specify how the operator should + retry failed runs automatically. + properties: + backoff: + default: "" + description: |- + Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. + More information: + - https://pkg.go.dev/time#ParseDuration + Default: `""`. 
+ type: string + limit: + default: 0 + description: |- + Limit is the maximum number of retries for failed runs. + Default: `0`. + format: int64 + type: integer + type: object runTasks: description: |- Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle. @@ -836,6 +856,21 @@ spec: pattern: ^\d{1}\.\d{1,2}\.\d{1,2}$ type: string type: object + retry: + description: Retry status of the latest run on the workspace. + properties: + nextRetryTimestamp: + description: |- + NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry + if the Backoff option was added. It is represented in RFC3339 form and is in UTC. + format: date-time + type: string + retriesLeft: + description: RetriesLeft is the number of retries left for the + latest run on the workspace. + format: int64 + type: integer + type: object runStatus: description: Workspace Runs status. properties: diff --git a/config/crd/bases/app.terraform.io_workspaces.yaml b/config/crd/bases/app.terraform.io_workspaces.yaml index 1a8a6774..55fc993c 100644 --- a/config/crd/bases/app.terraform.io_workspaces.yaml +++ b/config/crd/bases/app.terraform.io_workspaces.yaml @@ -371,6 +371,26 @@ spec: minItems: 1 type: array type: object + retryPolicy: + description: Retry Policy allows you to specify how the operator should + retry failed runs automatically. + properties: + backoff: + default: "" + description: |- + Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. + More information: + - https://pkg.go.dev/time#ParseDuration + Default: `""`. + type: string + limit: + default: 0 + description: |- + Limit is the maximum number of retries for failed runs. + Default: `0`. + format: int64 + type: integer + type: object runTasks: description: |- Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle. @@ -833,6 +853,21 @@ spec: pattern: ^\d{1}\.\d{1,2}\.\d{1,2}$ type: string type: object + retry: + description: Retry status of the latest run on the workspace. + properties: + nextRetryTimestamp: + description: |- + NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry + if the Backoff option was added. It is represented in RFC3339 form and is in UTC. + format: date-time + type: string + retriesLeft: + description: RetriesLeft is the number of retries left for the + latest run on the workspace. + format: int64 + type: integer + type: object runStatus: description: Workspace Runs status. properties: diff --git a/docs/api-reference.md b/docs/api-reference.md index b9d6bec4..540371f0 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -549,6 +549,38 @@ _Appears in:_ | `workspaces` _[ConsumerWorkspace](#consumerworkspace) array_ | Allow access to the state for specific workspaces within the same organization. | +#### RetryPolicy + + + +RetryPolicy allows you to configure retry behavior for failed runs on the workspace. +It will apply for the latest current run of the operator. + +_Appears in:_ +- [WorkspaceSpec](#workspacespec) + +| Field | Description | +| --- | --- | +| `limit` _integer_ | Limit is the maximum number of retries for failed runs.
Default: `0`. | +| `backoff` _string_ | Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string.
More information:
- https://pkg.go.dev/time#ParseDuration
Default: `""`. | + + +#### RetryStatus + + + +RetryStatus contains the status of the retry of the latest run on the workspace. How many attempts are left and +possibly a time to wait for the next attempt. + +_Appears in:_ +- [WorkspaceStatus](#workspacestatus) + +| Field | Description | +| --- | --- | +| `retriesLeft` _integer_ | RetriesLeft is the number of retries left for the latest run on the workspace. | +| `nextRetryTimestamp` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#time-v1-meta)_ | NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry
if the Backoff option was added. It is represented in RFC3339 form and is in UTC. | + + #### RunStatus @@ -894,6 +926,7 @@ _Appears in:_ | `environmentVariables` _[Variable](#variable) array_ | Terraform Environment variables for all plans and applies in this workspace.
Variables defined within a workspace always overwrite variables from variable sets that have the same type and the same key.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables#environment-variables | | `terraformVariables` _[Variable](#variable) array_ | Terraform variables for all plans and applies in this workspace.
Variables defined within a workspace always overwrite variables from variable sets that have the same type and the same key.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables#terraform-variables | | `remoteStateSharing` _[RemoteStateSharing](#remotestatesharing)_ | Remote state access between workspaces.
By default, new workspaces in HCP Terraform do not allow other workspaces to access their state.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/state#accessing-state-from-other-workspaces | +| `retryPolicy` _[RetryPolicy](#retrypolicy)_ | Retry Policy allows you to specify how the operator should retry failed runs automatically. | | `runTriggers` _[RunTrigger](#runtrigger) array_ | Run triggers allow you to connect this workspace to one or more source workspaces.
These connections allow runs to queue automatically in this workspace on successful apply of runs in any of the source workspaces.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/settings/run-triggers | | `versionControl` _[VersionControl](#versioncontrol)_ | Settings for the workspace's VCS repository, enabling the UI/VCS-driven run workflow.
Omit this argument to utilize the CLI-driven and API-driven workflows, where runs are not driven by webhooks on your VCS provider.
More information:
- https://www.terraform.io/cloud-docs/run/ui
- https://www.terraform.io/cloud-docs/vcs | | `sshKey` _[SSHKey](#sshkey)_ | SSH key used to clone Terraform modules.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/settings/ssh-keys | From 9f03b5124989ef5f50f0e8ce59efeec1b1aaf251 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Thu, 24 Apr 2025 18:16:11 +0200 Subject: [PATCH 06/12] feat(api): rename to strategy and modify tests --- api/v1alpha2/workspace_types.go | 28 +++++-------------- api/v1alpha2/zz_generated.deepcopy.go | 3 +- .../crds/app.terraform.io_workspaces.yaml | 25 ++++------------- .../bases/app.terraform.io_workspaces.yaml | 25 ++++------------- docs/api-reference.md | 6 ++-- .../workspace_controller_retry_test.go | 25 ++++++++++++++++- 6 files changed, 44 insertions(+), 68 deletions(-) diff --git a/api/v1alpha2/workspace_types.go b/api/v1alpha2/workspace_types.go index 26cc142a..80a572eb 100644 --- a/api/v1alpha2/workspace_types.go +++ b/api/v1alpha2/workspace_types.go @@ -68,20 +68,12 @@ type RemoteStateSharing struct { // RetryPolicy allows you to configure retry behavior for failed runs on the workspace. // It will apply for the latest current run of the operator. type RetryPolicy struct { - // Limit is the maximum number of retries for failed runs. + // Limit is the maximum number of retries for failed runs. If set to a negative number, no limit will be applied. // Default: `0`. // - // +kubebuilder:default:=0 - // +optional - Limit int64 `json:"limit,omitempty"` - // Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. - // More information: - // - https://pkg.go.dev/time#ParseDuration - // Default: `""`. - // - // +kubebuilder:default:="" - // +optional - Backoff string `json:"backoff,omitempty"` + //+kubebuilder:default:=0 + //+optional + BackoffLimit int64 `json:"backoffLimit,omitempty"` } // Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle. @@ -620,7 +612,7 @@ type WorkspaceSpec struct { // More information: // - https://developer.hashicorp.com/terraform/cloud-docs/workspaces/settings/run-triggers // - //+kubebuilder:validation:MinItems:=1 + //+kubebuilder:validation:MinItems:=2 //+optional RunTriggers []RunTrigger `json:"runTriggers,omitempty"` // Settings for the workspace's VCS repository, enabling the UI/VCS-driven run workflow. @@ -780,14 +772,8 @@ type VariableSetStatus struct { // RetryStatus contains the status of the retry of the latest run on the workspace. How many attempts are left and // possibly a time to wait for the next attempt. type RetryStatus struct { - // RetriesLeft is the number of retries left for the latest run on the workspace. - RetriesLeft int64 `json:"retriesLeft,omitempty"` - - // NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry - // if the Backoff option was added. It is represented in RFC3339 form and is in UTC. - // - // +optional - NextRetryTimestamp metav1.Time `json:"nextRetryTimestamp,omitempty"` + // Failed is the number of failed attempts. + Failed int64 `json:"failed,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go index f054c31a..706eec2f 100644 --- a/api/v1alpha2/zz_generated.deepcopy.go +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -767,7 +767,6 @@ func (in *RetryPolicy) DeepCopy() *RetryPolicy { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *RetryStatus) DeepCopyInto(out *RetryStatus) { *out = *in - in.NextRetryTimestamp.DeepCopyInto(&out.NextRetryTimestamp) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RetryStatus. @@ -1220,7 +1219,7 @@ func (in *WorkspaceStatus) DeepCopyInto(out *WorkspaceStatus) { if in.Retry != nil { in, out := &in.Retry, &out.Retry *out = new(RetryStatus) - (*in).DeepCopyInto(*out) + **out = **in } } diff --git a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml index 8f1021b0..5a2c6937 100644 --- a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml +++ b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml @@ -378,18 +378,10 @@ spec: description: Retry Policy allows you to specify how the operator should retry failed runs automatically. properties: - backoff: - default: "" - description: |- - Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. - More information: - - https://pkg.go.dev/time#ParseDuration - Default: `""`. - type: string - limit: + backoffLimit: default: 0 description: |- - Limit is the maximum number of retries for failed runs. + Limit is the maximum number of retries for failed runs. If set to a negative number, no limit will be applied. Default: `0`. format: int64 type: integer @@ -462,7 +454,7 @@ spec: minLength: 1 type: string type: object - minItems: 1 + minItems: 2 type: array sshKey: description: |- @@ -859,15 +851,8 @@ spec: retry: description: Retry status of the latest run on the workspace. properties: - nextRetryTimestamp: - description: |- - NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry - if the Backoff option was added. It is represented in RFC3339 form and is in UTC. - format: date-time - type: string - retriesLeft: - description: RetriesLeft is the number of retries left for the - latest run on the workspace. + failed: + description: Failed is the number of failed attempts. format: int64 type: integer type: object diff --git a/config/crd/bases/app.terraform.io_workspaces.yaml b/config/crd/bases/app.terraform.io_workspaces.yaml index 55fc993c..20afc37d 100644 --- a/config/crd/bases/app.terraform.io_workspaces.yaml +++ b/config/crd/bases/app.terraform.io_workspaces.yaml @@ -375,18 +375,10 @@ spec: description: Retry Policy allows you to specify how the operator should retry failed runs automatically. properties: - backoff: - default: "" - description: |- - Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. - More information: - - https://pkg.go.dev/time#ParseDuration - Default: `""`. - type: string - limit: + backoffLimit: default: 0 description: |- - Limit is the maximum number of retries for failed runs. + Limit is the maximum number of retries for failed runs. If set to a negative number, no limit will be applied. Default: `0`. format: int64 type: integer @@ -459,7 +451,7 @@ spec: minLength: 1 type: string type: object - minItems: 1 + minItems: 2 type: array sshKey: description: |- @@ -856,15 +848,8 @@ spec: retry: description: Retry status of the latest run on the workspace. properties: - nextRetryTimestamp: - description: |- - NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry - if the Backoff option was added. It is represented in RFC3339 form and is in UTC. 
- format: date-time - type: string - retriesLeft: - description: RetriesLeft is the number of retries left for the - latest run on the workspace. + failed: + description: Failed is the number of failed attempts. format: int64 type: integer type: object diff --git a/docs/api-reference.md b/docs/api-reference.md index 540371f0..e4628e30 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -561,8 +561,7 @@ _Appears in:_ | Field | Description | | --- | --- | -| `limit` _integer_ | Limit is the maximum number of retries for failed runs.
Default: `0`. | -| `backoff` _string_ | Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string.
More information:
- https://pkg.go.dev/time#ParseDuration
Default: `""`. | +| `backoffLimit` _integer_ | Limit is the maximum number of retries for failed runs. If set to a negative number, no limit will be applied.
Default: `0`. | #### RetryStatus @@ -577,8 +576,7 @@ _Appears in:_ | Field | Description | | --- | --- | -| `retriesLeft` _integer_ | RetriesLeft is the number of retries left for the latest run on the workspace. | -| `nextRetryTimestamp` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#time-v1-meta)_ | NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry
if the Backoff option was added. It is represented in RFC3339 form and is in UTC. | +| `failed` _integer_ | Failed is the number of failed attempts. | #### RunStatus diff --git a/internal/controller/workspace_controller_retry_test.go b/internal/controller/workspace_controller_retry_test.go index 9d12e2bf..2403a351 100644 --- a/internal/controller/workspace_controller_retry_test.go +++ b/internal/controller/workspace_controller_retry_test.go @@ -73,8 +73,10 @@ var _ = Describe("Workspace controller", Ordered, func() { It("can retry failed runs", func() { namespacedName := getNamespacedName(instance) // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation + instance.Spec.RetryStrategy = &appv1alpha2.RetryPolicy{ + Enabled: true, + } createWorkspace(instance) - // TODO: add the retry config in the CRD to actually do a retry // start a run that will fail createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) @@ -118,5 +120,26 @@ var _ = Describe("Workspace controller", Ordered, func() { return runID != instance.Status.Run.ID && instance.Status.Run.RunCompleted() }).Should(BeTrue()) }) + It("can retry until the limit of retries is reached", func() { + namespacedName := getNamespacedName(instance) + // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation + instance.Spec.RetryStrategy = &appv1alpha2.RetryPolicy{ + Enabled: true, + Limit: 1, + } + + createWorkspace(instance) + // start a run that will fail + createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) + + Eventually(func() bool { + Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) + if instance.Status.Retry == nil { + return false + } + + return instance.Status.Retry.RetriesLeft == 0 + }).Should(BeTrue()) + }) }) }) From d88eded1d56e06fec30d7f5c10625a8750bff5a7 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Mon, 28 Apr 2025 17:58:34 +0200 Subject: [PATCH 07/12] feat(api): working retry in controller --- api/v1alpha2/workspace_types.go | 2 +- .../crds/app.terraform.io_workspaces.yaml | 3 +- .../bases/app.terraform.io_workspaces.yaml | 3 +- docs/api-reference.md | 2 +- .../controller/workspace_controller_retry.go | 20 +++++ .../workspace_controller_retry_test.go | 75 ++++++++++++++----- .../controller/workspace_controller_runs.go | 47 ++++++++++++ 7 files changed, 128 insertions(+), 24 deletions(-) create mode 100644 internal/controller/workspace_controller_retry.go diff --git a/api/v1alpha2/workspace_types.go b/api/v1alpha2/workspace_types.go index 80a572eb..a8ab10d6 100644 --- a/api/v1alpha2/workspace_types.go +++ b/api/v1alpha2/workspace_types.go @@ -772,7 +772,7 @@ type VariableSetStatus struct { // RetryStatus contains the status of the retry of the latest run on the workspace. How many attempts are left and // possibly a time to wait for the next attempt. type RetryStatus struct { - // Failed is the number of failed attempts. + // Failed is the number of failed attempts, counting the initial one. Failed int64 `json:"failed,omitempty"` } diff --git a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml index 5a2c6937..8738aeac 100644 --- a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml +++ b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml @@ -852,7 +852,8 @@ spec: description: Retry status of the latest run on the workspace. 
properties: failed: - description: Failed is the number of failed attempts. + description: Failed is the number of failed attempts, counting + the initial one. format: int64 type: integer type: object diff --git a/config/crd/bases/app.terraform.io_workspaces.yaml b/config/crd/bases/app.terraform.io_workspaces.yaml index 20afc37d..c39d7cf5 100644 --- a/config/crd/bases/app.terraform.io_workspaces.yaml +++ b/config/crd/bases/app.terraform.io_workspaces.yaml @@ -849,7 +849,8 @@ spec: description: Retry status of the latest run on the workspace. properties: failed: - description: Failed is the number of failed attempts. + description: Failed is the number of failed attempts, counting + the initial one. format: int64 type: integer type: object diff --git a/docs/api-reference.md b/docs/api-reference.md index e4628e30..090a36e0 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -576,7 +576,7 @@ _Appears in:_ | Field | Description | | --- | --- | -| `failed` _integer_ | Failed is the number of failed attempts. | +| `failed` _integer_ | Failed is the number of failed attempts, counting the initial one. | #### RunStatus diff --git a/internal/controller/workspace_controller_retry.go b/internal/controller/workspace_controller_retry.go new file mode 100644 index 00000000..675d9d2b --- /dev/null +++ b/internal/controller/workspace_controller_retry.go @@ -0,0 +1,20 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package controller + +import ( + "context" + + appv1alpha2 "github.com/hashicorp/hcp-terraform-operator/api/v1alpha2" +) + +func (r *WorkspaceReconciler) resetRetryStatus(ctx context.Context, w *workspaceInstance) error { + if w.instance.Spec.RetryPolicy == nil || w.instance.Spec.RetryPolicy.BackoffLimit == 0 { + return nil + } + w.instance.Status.Retry = &appv1alpha2.RetryStatus{ + Failed: 0, + } + return nil +} diff --git a/internal/controller/workspace_controller_retry_test.go b/internal/controller/workspace_controller_retry_test.go index 2403a351..8e8c693e 100644 --- a/internal/controller/workspace_controller_retry_test.go +++ b/internal/controller/workspace_controller_retry_test.go @@ -72,28 +72,37 @@ var _ = Describe("Workspace controller", Ordered, func() { Context("Retry", func() { It("can retry failed runs", func() { namespacedName := getNamespacedName(instance) - // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation - instance.Spec.RetryStrategy = &appv1alpha2.RetryPolicy{ - Enabled: true, + instance.Spec.RetryPolicy = &appv1alpha2.RetryPolicy{ + BackoffLimit: -1, } + // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation createWorkspace(instance) + workspaceID := instance.Status.WorkspaceID // start a run that will fail - createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) + cv := createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) var runID string Eventually(func() bool { - Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) - if instance.Status.Run == nil { - return false + listOpts := tfc.ListOptions{ + PageNumber: 1, + PageSize: maxPageSize, } - - runID = instance.Status.Run.ID - - _, ok := runStatusUnsuccessful[tfc.RunStatus(instance.Status.Run.Status)] - - return ok + for listOpts.PageNumber != 0 { + runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{ + ListOptions: listOpts, + }) + Expect(err).To(Succeed()) + for _, r := range runs.Items { + if 
r.ConfigurationVersion.ID == cv.ID { + runID = r.ID + return r.Status == tfc.RunErrored + } + } + listOpts.PageNumber = runs.NextPage + } + return false }).Should(BeTrue()) // Fix the code but no not start a run manually @@ -105,11 +114,18 @@ var _ = Describe("Workspace controller", Ordered, func() { if instance.Status.Run == nil { return false } - // TODO: check that the retry count of the status decreased - return runID != instance.Status.Run.ID }).Should(BeTrue()) + // the number of failed attemps should be reset to 0 + Eventually(func() bool { + Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) + if instance.Status.Retry == nil { + return false + } + return instance.Status.Retry.Failed == 0 + }).Should(BeTrue()) + // Since the code is fixed at some point a run will succeed Eventually(func() bool { Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) @@ -122,13 +138,15 @@ var _ = Describe("Workspace controller", Ordered, func() { }) It("can retry until the limit of retries is reached", func() { namespacedName := getNamespacedName(instance) - // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation - instance.Spec.RetryStrategy = &appv1alpha2.RetryPolicy{ - Enabled: true, - Limit: 1, + + instance.Spec.RetryPolicy = &appv1alpha2.RetryPolicy{ + BackoffLimit: 2, } + // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation createWorkspace(instance) + workspaceID := instance.Status.WorkspaceID + // start a run that will fail createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) @@ -138,7 +156,24 @@ var _ = Describe("Workspace controller", Ordered, func() { return false } - return instance.Status.Retry.RetriesLeft == 0 + return instance.Status.Retry.Failed == 3 + }).Should(BeTrue()) + + Eventually(func() bool { + listOpts := tfc.ListOptions{ + PageNumber: 1, + PageSize: maxPageSize, + } + runCount := 0 + for listOpts.PageNumber != 0 { + runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{ + ListOptions: listOpts, + }) + Expect(err).To(Succeed()) + runCount += len(runs.Items) + listOpts.PageNumber = runs.NextPage + } + return runCount == 3 }).Should(BeTrue()) }) }) diff --git a/internal/controller/workspace_controller_runs.go b/internal/controller/workspace_controller_runs.go index 30cdc728..4c8e587a 100644 --- a/internal/controller/workspace_controller_runs.go +++ b/internal/controller/workspace_controller_runs.go @@ -84,6 +84,53 @@ func (r *WorkspaceReconciler) reconcileCurrentRun(ctx context.Context, w *worksp w.instance.Status.Run.Status = string(run.Status) w.instance.Status.Run.ConfigurationVersion = run.ConfigurationVersion.ID + if w.instance.Spec.RetryPolicy != nil && w.instance.Spec.RetryPolicy.BackoffLimit != 0 { + if _, ok := runStatusUnsuccessful[run.Status]; ok { + w.log.Info("Reconcile Runs", "msg", "ongoing non-speculative run is unsuccessful, retrying it") + + if w.instance.Status.Retry == nil { + w.instance.Status.Retry = &appv1alpha2.RetryStatus{ + Failed: 0, + } + } + w.instance.Status.Retry.Failed++ + + if w.instance.Spec.RetryPolicy.BackoffLimit < 0 || w.instance.Status.Retry.Failed <= w.instance.Spec.RetryPolicy.BackoffLimit { + + options := tfc.RunCreateOptions{ + Message: tfc.String(runMessage), + Workspace: workspace, + IsDestroy: tfc.Bool(run.IsDestroy), + RefreshOnly: tfc.Bool(run.RefreshOnly), + } + retriedRun, err := w.tfClient.Client.Runs.Create(ctx, options) + if err != nil { + w.log.Error(err, 
"Reconcile Runs", "msg", "failed to create a new apply run for retry") + return err + } + w.log.Info("Reconcile Runs", "msg", fmt.Sprintf("successfully created a new apply run %s for retry", retriedRun.ID)) + + // Update status + if w.instance.Status.Run == nil { + w.instance.Status.Run = &appv1alpha2.RunStatus{} + } + w.instance.Status.Run.ID = retriedRun.ID + w.instance.Status.Run.Status = string(retriedRun.Status) + w.instance.Status.Run.ConfigurationVersion = retriedRun.ConfigurationVersion.ID + + // WARNING: there is a race limit here in case the run fails very fast and the initial status returned + // by the Runs.Create funtion is Errored. In this case the run is never retried. + } else { + w.log.Info("Reconcile Runs", "msg", "backoff limit was reached, skip retry") + } + } + } + + // when the current run succeeds, we reset the failed counter for a next retry + if _, ok := runStatusComplete[run.Status]; ok { + r.resetRetryStatus(ctx, w) + } + return nil } From 6d112985cfe962ca0c262d3280129f7c9ba08715 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Mon, 28 Apr 2025 18:15:18 +0200 Subject: [PATCH 08/12] refacto: move retry logic to own function --- .../controller/workspace_controller_retry.go | 50 ++++++++++++++++++- .../controller/workspace_controller_runs.go | 43 +++------------- 2 files changed, 57 insertions(+), 36 deletions(-) diff --git a/internal/controller/workspace_controller_retry.go b/internal/controller/workspace_controller_retry.go index 675d9d2b..09fa4506 100644 --- a/internal/controller/workspace_controller_retry.go +++ b/internal/controller/workspace_controller_retry.go @@ -5,12 +5,14 @@ package controller import ( "context" + "fmt" + tfc "github.com/hashicorp/go-tfe" appv1alpha2 "github.com/hashicorp/hcp-terraform-operator/api/v1alpha2" ) func (r *WorkspaceReconciler) resetRetryStatus(ctx context.Context, w *workspaceInstance) error { - if w.instance.Spec.RetryPolicy == nil || w.instance.Spec.RetryPolicy.BackoffLimit == 0 { + if !isRetryEnabled(w) { return nil } w.instance.Status.Retry = &appv1alpha2.RetryStatus{ @@ -18,3 +20,49 @@ func (r *WorkspaceReconciler) resetRetryStatus(ctx context.Context, w *workspace } return nil } + +func isRetryEnabled(w *workspaceInstance) bool { + return w.instance.Spec.RetryPolicy != nil && w.instance.Spec.RetryPolicy.BackoffLimit != 0 +} + +func (r *WorkspaceReconciler) retryFailedRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { + if w.instance.Status.Retry == nil { + w.instance.Status.Retry = &appv1alpha2.RetryStatus{ + Failed: 0, + } + } + w.instance.Status.Retry.Failed++ + + if w.instance.Spec.RetryPolicy.BackoffLimit < 0 || w.instance.Status.Retry.Failed <= w.instance.Spec.RetryPolicy.BackoffLimit { + + options := tfc.RunCreateOptions{ + Message: tfc.String(runMessage), + Workspace: workspace, + IsDestroy: tfc.Bool(failedRun.IsDestroy), + RefreshOnly: tfc.Bool(failedRun.RefreshOnly), + } + retriedRun, err := w.tfClient.Client.Runs.Create(ctx, options) + if err != nil { + w.log.Error(err, "Retry Runs", "msg", "failed to create a new apply run for retry") + return err + } + w.log.Info("Retry Runs", "msg", fmt.Sprintf("successfully created a new apply run %s for to retry failed %s", retriedRun.ID, failedRun.ID)) + + // Update status + if w.instance.Status.Run == nil { + w.instance.Status.Run = &appv1alpha2.RunStatus{} + } + w.instance.Status.Run.ID = retriedRun.ID + w.instance.Status.Run.Status = string(retriedRun.Status) + w.instance.Status.Run.ConfigurationVersion = 
retriedRun.ConfigurationVersion.ID + + // WARNING: there is a race limit here in case the run fails very fast and the initial status returned + // by the Runs.Create funtion is Errored. In this case the run is never retried. + // TODO: loop back ? I don't like loops so maybe the best would be to change the reconcile runs function to + // make sure we didn't miss a retry + } else { + w.log.Info("Retry Runs", "msg", "backoff limit was reached, skip retry") + return nil + } + return nil +} diff --git a/internal/controller/workspace_controller_runs.go b/internal/controller/workspace_controller_runs.go index 4c8e587a..98e67178 100644 --- a/internal/controller/workspace_controller_runs.go +++ b/internal/controller/workspace_controller_runs.go @@ -84,44 +84,12 @@ func (r *WorkspaceReconciler) reconcileCurrentRun(ctx context.Context, w *worksp w.instance.Status.Run.Status = string(run.Status) w.instance.Status.Run.ConfigurationVersion = run.ConfigurationVersion.ID - if w.instance.Spec.RetryPolicy != nil && w.instance.Spec.RetryPolicy.BackoffLimit != 0 { + if isRetryEnabled(w) { if _, ok := runStatusUnsuccessful[run.Status]; ok { w.log.Info("Reconcile Runs", "msg", "ongoing non-speculative run is unsuccessful, retrying it") - if w.instance.Status.Retry == nil { - w.instance.Status.Retry = &appv1alpha2.RetryStatus{ - Failed: 0, - } - } - w.instance.Status.Retry.Failed++ - - if w.instance.Spec.RetryPolicy.BackoffLimit < 0 || w.instance.Status.Retry.Failed <= w.instance.Spec.RetryPolicy.BackoffLimit { - - options := tfc.RunCreateOptions{ - Message: tfc.String(runMessage), - Workspace: workspace, - IsDestroy: tfc.Bool(run.IsDestroy), - RefreshOnly: tfc.Bool(run.RefreshOnly), - } - retriedRun, err := w.tfClient.Client.Runs.Create(ctx, options) - if err != nil { - w.log.Error(err, "Reconcile Runs", "msg", "failed to create a new apply run for retry") - return err - } - w.log.Info("Reconcile Runs", "msg", fmt.Sprintf("successfully created a new apply run %s for retry", retriedRun.ID)) - - // Update status - if w.instance.Status.Run == nil { - w.instance.Status.Run = &appv1alpha2.RunStatus{} - } - w.instance.Status.Run.ID = retriedRun.ID - w.instance.Status.Run.Status = string(retriedRun.Status) - w.instance.Status.Run.ConfigurationVersion = retriedRun.ConfigurationVersion.ID - - // WARNING: there is a race limit here in case the run fails very fast and the initial status returned - // by the Runs.Create funtion is Errored. In this case the run is never retried. - } else { - w.log.Info("Reconcile Runs", "msg", "backoff limit was reached, skip retry") + if err = r.retryFailedRun(ctx, w, workspace, run); err != nil { + return err } } } @@ -181,6 +149,11 @@ func (r *WorkspaceReconciler) triggerApplyRun(ctx context.Context, w *workspaceI w.instance.Status.Run.Status = string(run.Status) w.instance.Status.Run.ConfigurationVersion = run.ConfigurationVersion.ID + // WARNING: there is a race limit here in case the run fails very fast and the initial status returned + // by the Runs.Create funtion is Errored. In this case the run is never retried. + // TODO: loop back ? 
I don't like loops so maybe the best would be to change the reconcile runs function to + // make sure we didn't miss a retry + return nil } From ef09280443d7b428e9b6a0423a874675cf5ae540 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Tue, 29 Apr 2025 10:59:35 +0200 Subject: [PATCH 09/12] refacto(retry): split logic to update the status --- .../controller/workspace_controller_retry.go | 63 ++++++++++++++----- .../controller/workspace_controller_runs.go | 2 +- 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/internal/controller/workspace_controller_retry.go b/internal/controller/workspace_controller_retry.go index 09fa4506..fd502767 100644 --- a/internal/controller/workspace_controller_retry.go +++ b/internal/controller/workspace_controller_retry.go @@ -25,7 +25,50 @@ func isRetryEnabled(w *workspaceInstance) bool { return w.instance.Spec.RetryPolicy != nil && w.instance.Spec.RetryPolicy.BackoffLimit != 0 } -func (r *WorkspaceReconciler) retryFailedRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { +func (r *WorkspaceReconciler) retryFailedApplyRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { + retriedRun, err := r.retryFailedRun(ctx, w, workspace, failedRun) + if err != nil { + return err + } + + // when no run is returned, it means the backoff limit was reached + if retriedRun == nil { + return nil + } + + // Update status + if w.instance.Status.Run == nil { + w.instance.Status.Run = &appv1alpha2.RunStatus{} + } + w.instance.Status.Run.ID = retriedRun.ID + w.instance.Status.Run.Status = string(retriedRun.Status) + w.instance.Status.Run.ConfigurationVersion = retriedRun.ConfigurationVersion.ID + + // WARNING: there is a race limit here in case the run fails very fast and the initial status returned + // by the Runs.Create funtion is Errored. In this case the run is never retried. + // TODO: loop back ? 
I don't like loops so maybe the best would be to change the reconcile runs function to + // make sure we didn't miss a retry + + return nil +} + +func (r *WorkspaceReconciler) retryFaileDestroyRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { + retriedRun, err := r.retryFailedRun(ctx, w, workspace, failedRun) + if err != nil { + return err + } + + // when no run is returned, it means the backoff limit was reached + if retriedRun == nil { + return nil + } + + w.instance.Status.DestroyRunID = retriedRun.ID + + return nil +} + +func (r *WorkspaceReconciler) retryFailedRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) (*tfc.Run, error) { if w.instance.Status.Retry == nil { w.instance.Status.Retry = &appv1alpha2.RetryStatus{ Failed: 0, @@ -44,25 +87,13 @@ func (r *WorkspaceReconciler) retryFailedRun(ctx context.Context, w *workspaceIn retriedRun, err := w.tfClient.Client.Runs.Create(ctx, options) if err != nil { w.log.Error(err, "Retry Runs", "msg", "failed to create a new apply run for retry") - return err + return nil, err } w.log.Info("Retry Runs", "msg", fmt.Sprintf("successfully created a new apply run %s for to retry failed %s", retriedRun.ID, failedRun.ID)) - // Update status - if w.instance.Status.Run == nil { - w.instance.Status.Run = &appv1alpha2.RunStatus{} - } - w.instance.Status.Run.ID = retriedRun.ID - w.instance.Status.Run.Status = string(retriedRun.Status) - w.instance.Status.Run.ConfigurationVersion = retriedRun.ConfigurationVersion.ID - - // WARNING: there is a race limit here in case the run fails very fast and the initial status returned - // by the Runs.Create funtion is Errored. In this case the run is never retried. - // TODO: loop back ? 
I don't like loops so maybe the best would be to change the reconcile runs function to
-	// make sure we didn't miss a retry
+	return retriedRun, nil
 	} else {
 		w.log.Info("Retry Runs", "msg", "backoff limit was reached, skip retry")
-		return nil
+		return nil, nil
 	}
-	return nil
 }
diff --git a/internal/controller/workspace_controller_runs.go b/internal/controller/workspace_controller_runs.go
index 98e67178..9c498241 100644
--- a/internal/controller/workspace_controller_runs.go
+++ b/internal/controller/workspace_controller_runs.go
@@ -88,7 +88,7 @@ func (r *WorkspaceReconciler) reconcileCurrentRun(ctx context.Context, w *worksp
 		if _, ok := runStatusUnsuccessful[run.Status]; ok {
 			w.log.Info("Reconcile Runs", "msg", "ongoing non-speculative run is unsuccessful, retrying it")
 
-			if err = r.retryFailedRun(ctx, w, workspace, run); err != nil {
+			if err = r.retryFailedApplyRun(ctx, w, workspace, run); err != nil {
 				return err
 			}
 		}

From 3e9bc7c841d17522dc56d3dad60adc7b6c315908 Mon Sep 17 00:00:00 2001
From: Baptiste Bourdet
Date: Tue, 29 Apr 2025 11:13:02 +0200
Subject: [PATCH 10/12] feat(retry): add retry for deletion policy

---
 .../workspace_controller_deletion_policy.go       |  9 +++++++++
 internal/controller/workspace_controller_retry.go | 12 +++---------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/internal/controller/workspace_controller_deletion_policy.go b/internal/controller/workspace_controller_deletion_policy.go
index badf222c..27ffc4ee 100644
--- a/internal/controller/workspace_controller_deletion_policy.go
+++ b/internal/controller/workspace_controller_deletion_policy.go
@@ -124,6 +124,15 @@ func (r *WorkspaceReconciler) deleteWorkspace(ctx context.Context, w *workspaceI
 					return r.Status().Update(ctx, &w.instance)
 				}
 			}
+			if isRetryEnabled(w) {
+				w.log.Info("Destroy Run", "msg", fmt.Sprintf("ongoing destroy run %s is unsuccessful, retrying it", run.ID))
+				err := r.retryFailedDestroyRun(ctx, w, workspace, run)
+				if err != nil {
+					w.log.Error(err, "Destroy Run", "msg", "failed to create a new destroy run for retry")
+					return err
+				}
+				return r.Status().Update(ctx, &w.instance)
+			}
 			return nil
 		}
 
diff --git a/internal/controller/workspace_controller_retry.go b/internal/controller/workspace_controller_retry.go
index fd502767..cfc924f3 100644
--- a/internal/controller/workspace_controller_retry.go
+++ b/internal/controller/workspace_controller_retry.go
@@ -36,14 +36,7 @@ func (r *WorkspaceReconciler) retryFailedApplyRun(ctx context.Context, w *worksp
 		return nil
 	}
 
-	// Update status
-	if w.instance.Status.Run == nil {
-		w.instance.Status.Run = &appv1alpha2.RunStatus{}
-	}
-	w.instance.Status.Run.ID = retriedRun.ID
-	w.instance.Status.Run.Status = string(retriedRun.Status)
-	w.instance.Status.Run.ConfigurationVersion = retriedRun.ConfigurationVersion.ID
-
+	w.updateWorkspaceStatusRun(retriedRun)
 	// WARNING: there is a race limit here in case the run fails very fast and the initial status returned
 	// by the Runs.Create funtion is Errored. In this case the run is never retried.
 	// TODO: loop back ? 
I don't like loops so maybe the best would be to change the reconcile runs function to @@ -52,7 +45,7 @@ func (r *WorkspaceReconciler) retryFailedApplyRun(ctx context.Context, w *worksp return nil } -func (r *WorkspaceReconciler) retryFaileDestroyRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { +func (r *WorkspaceReconciler) retryFailedDestroyRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { retriedRun, err := r.retryFailedRun(ctx, w, workspace, failedRun) if err != nil { return err @@ -64,6 +57,7 @@ func (r *WorkspaceReconciler) retryFaileDestroyRun(ctx context.Context, w *works } w.instance.Status.DestroyRunID = retriedRun.ID + w.updateWorkspaceStatusRun(retriedRun) return nil } From d77aa861eb3aa00850f0accb7b66ac8e4d3a6cfc Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Tue, 29 Apr 2025 11:14:11 +0200 Subject: [PATCH 11/12] test: add tests for the deletion --- .../workspace_controller_retry_test.go | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/internal/controller/workspace_controller_retry_test.go b/internal/controller/workspace_controller_retry_test.go index 8e8c693e..902ae6a1 100644 --- a/internal/controller/workspace_controller_retry_test.go +++ b/internal/controller/workspace_controller_retry_test.go @@ -176,5 +176,70 @@ var _ = Describe("Workspace controller", Ordered, func() { return runCount == 3 }).Should(BeTrue()) }) + It("can retry failed destroy runs when deleting the workspace", func() { + instance.Spec.RetryPolicy = &appv1alpha2.RetryPolicy{ + BackoffLimit: -1, + } + instance.Spec.AllowDestroyPlan = true + instance.Spec.DeletionPolicy = appv1alpha2.DeletionPolicyDestroy + // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation + createWorkspace(instance) + workspaceID := instance.Status.WorkspaceID + + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) + Eventually(func() bool { + listOpts := tfc.ListOptions{ + PageNumber: 1, + PageSize: maxPageSize, + } + for listOpts.PageNumber != 0 { + runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{ + ListOptions: listOpts, + }) + Expect(err).To(Succeed()) + for _, r := range runs.Items { + if r.ConfigurationVersion.ID == cv.ID { + return r.Status == tfc.RunApplied + } + } + listOpts.PageNumber = runs.NextPage + } + return false + }).Should(BeTrue()) + + // create an errored ConfigurationVersion for the delete to fail + cv = createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, false) + + Expect(k8sClient.Delete(ctx, instance)).To(Succeed()) + + Eventually(func() bool { + listOpts := tfc.ListOptions{ + PageNumber: 1, + PageSize: maxPageSize, + } + for listOpts.PageNumber != 0 { + runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{ + ListOptions: listOpts, + }) + Expect(err).To(Succeed()) + for _, r := range runs.Items { + if r.ConfigurationVersion.ID == cv.ID { + return r.Status == tfc.RunErrored + } + } + listOpts.PageNumber = runs.NextPage + } + return false + }).Should(BeTrue()) + + // Fix the code but no not start a run manually + createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", false) + + // The retry should eventually delete the workspace + Eventually(func() bool { + _, err := tfClient.Workspaces.ReadByID(ctx, workspaceID) + return err == tfc.ErrResourceNotFound + }).Should(BeTrue()) + }) }) }) From d786c9b1b8b15da10266c92118f21dbf227f4a9a 
Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Tue, 29 Apr 2025 15:45:08 +0200 Subject: [PATCH 12/12] docs: add changelog --- .changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml diff --git a/.changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml b/.changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml new file mode 100644 index 00000000..d5202d68 --- /dev/null +++ b/.changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml @@ -0,0 +1,5 @@ +kind: ENHANCEMENTS +body: Workspace CRD option for the operator to retry failed runs +time: 2025-04-29T15:45:03.031933484+02:00 +custom: + PR: "602"
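
The retry semantics introduced by this series reduce to two checks: retries are treated as enabled whenever Spec.RetryPolicy is set with a non-zero BackoffLimit (isRetryEnabled), and a failed run is re-created only while the limit is negative (unlimited) or the failure counter has not yet exceeded it (retryFailedRun). The standalone Go sketch below illustrates just that decision rule; RetryPolicy and RetryStatus here are hypothetical stand-ins for the operator's appv1alpha2 types, not the real CRD definitions.

package main

import "fmt"

// Hypothetical stand-ins for the operator's RetryPolicy / RetryStatus types;
// only the fields the retry logic reads are modelled.
type RetryPolicy struct {
	BackoffLimit int // 0: retries disabled, <0: unlimited, >0: cap on retried runs
}

type RetryStatus struct {
	Failed int // failed runs counted so far (incremented before the check)
}

// retryEnabled mirrors isRetryEnabled: a policy counts as active when it is
// set and its backoff limit is non-zero.
func retryEnabled(p *RetryPolicy) bool {
	return p != nil && p.BackoffLimit != 0
}

// shouldRetry mirrors the decision in retryFailedRun: retry while the limit
// is negative (no cap) or the failure count is still within the limit.
func shouldRetry(p *RetryPolicy, s *RetryStatus) bool {
	if !retryEnabled(p) {
		return false
	}
	return p.BackoffLimit < 0 || s.Failed <= p.BackoffLimit
}

func main() {
	policy := &RetryPolicy{BackoffLimit: -1} // unlimited, as in the deletion retry test
	status := &RetryStatus{Failed: 3}
	fmt.Println(shouldRetry(policy, status)) // prints: true
}

Under this reading, BackoffLimit: -1 retries indefinitely (the value the deletion retry test uses), 0 disables retries entirely, and a positive value stops retrying once Status.Retry.Failed passes the limit.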