From f0ece305269d9ec55be3826db84fae8a569b811f Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Tue, 22 Apr 2025 18:27:16 +0200 Subject: [PATCH 01/12] Feat reattach to manual destroy run --- .../workspace_controller_deletion_policy.go | 24 +++++ ...rkspace_controller_deletion_policy_test.go | 91 +++++++++++++++++++ .../workspace_controller_outputs_test.go | 43 +++++++++ 3 files changed, 158 insertions(+) diff --git a/internal/controller/workspace_controller_deletion_policy.go b/internal/controller/workspace_controller_deletion_policy.go index 47e3764a..badf222c 100644 --- a/internal/controller/workspace_controller_deletion_policy.go +++ b/internal/controller/workspace_controller_deletion_policy.go @@ -101,6 +101,30 @@ func (r *WorkspaceReconciler) deleteWorkspace(ctx context.Context, w *workspaceI if _, ok := runStatusUnsuccessful[run.Status]; ok { w.log.Info("Destroy Run", "msg", fmt.Sprintf("destroy run %s is unsuccessful: %s", run.ID, run.Status)) + + workspace, err := w.tfClient.Client.Workspaces.ReadByID(ctx, w.instance.Status.WorkspaceID) + if err != nil { + return r.handleWorkspaceErrorNotFound(ctx, w, err) + } + + w.log.Info("Destroy Run", "msg", fmt.Sprintf("CurrentRun: %s %s %v", workspace.CurrentRun.ID, workspace.CurrentRun.Status, workspace.CurrentRun.IsDestroy)) + + if workspace.CurrentRun != nil && workspace.CurrentRun.ID != w.instance.Status.DestroyRunID { + + run, err := w.tfClient.Client.Runs.Read(ctx, w.instance.Status.DestroyRunID) + if err != nil { + // ignore this run id, and let the next reconcile loop handle the error + return nil + } + if run.IsDestroy { + w.log.Info("Destroy Run", "msg", fmt.Sprintf("found more recent destroy run %s, updating DestroyRunID", workspace.CurrentRun.ID)) + + w.instance.Status.DestroyRunID = workspace.CurrentRun.ID + w.updateWorkspaceStatusRun(run) + return r.Status().Update(ctx, &w.instance) + } + } + return nil } w.log.Info("Destroy Run", "msg", fmt.Sprintf("destroy run %s is not finished", run.ID)) diff --git a/internal/controller/workspace_controller_deletion_policy_test.go b/internal/controller/workspace_controller_deletion_policy_test.go index 68e54dfe..a3e13e36 100644 --- a/internal/controller/workspace_controller_deletion_policy_test.go +++ b/internal/controller/workspace_controller_deletion_policy_test.go @@ -195,6 +195,97 @@ var _ = Describe("Workspace controller", Ordered, func() { return err == tfc.ErrResourceNotFound }).Should(BeTrue()) }) + It("can destroy delete a workspace when the destroy was retried manually after failing", func() { + if cloudEndpoint != tfcDefaultAddress { + Skip("Does not run against TFC, skip this test") + } + instance.Spec.AllowDestroyPlan = true + instance.Spec.DeletionPolicy = appv1alpha2.DeletionPolicyDestroy + createWorkspace(instance) + workspaceID := instance.Status.WorkspaceID + + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + Eventually(func() bool { + listOpts := tfc.ListOptions{ + PageNumber: 1, + PageSize: maxPageSize, + } + for listOpts.PageNumber != 0 { + runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{ + ListOptions: listOpts, + }) + Expect(err).To(Succeed()) + for _, r := range runs.Items { + if r.ConfigurationVersion.ID == cv.ID { + return r.Status == tfc.RunApplied + } + } + listOpts.PageNumber = runs.NextPage + } + return false + }).Should(BeTrue()) + + // create an errored ConfigurationVersion for the delete to fail + cv = createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, false) + + 
Expect(k8sClient.Delete(ctx, instance)).To(Succeed()) + + var destroyRunID string + Eventually(func() bool { + ws, err := tfClient.Workspaces.ReadByID(ctx, workspaceID) + Expect(err).To(Succeed()) + Expect(ws).ToNot(BeNil()) + Expect(ws.CurrentRun).ToNot(BeNil()) + run, err := tfClient.Runs.Read(ctx, ws.CurrentRun.ID) + Expect(err).To(Succeed()) + Expect(run).ToNot(BeNil()) + destroyRunID = run.ID + + return run.IsDestroy + }).Should(BeTrue()) + + Eventually(func() bool { + run, _ := tfClient.Runs.Read(ctx, destroyRunID) + if run.Status == tfc.RunErrored { + return true + } + + return false + }).Should(BeTrue()) + + // put back a working configuration + cv = createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + + // start a new destroy run manually + run, err := tfClient.Runs.Create(ctx, tfc.RunCreateOptions{ + IsDestroy: tfc.Bool(true), + Message: tfc.String(runMessage), + Workspace: &tfc.Workspace{ + ID: workspaceID, + }, + }) + Expect(err).To(Succeed()) + Expect(run).ToNot(BeNil()) + + var newDestroyRunID string + Eventually(func() bool { + ws, err := tfClient.Workspaces.ReadByID(ctx, workspaceID) + Expect(err).To(Succeed()) + Expect(ws).ToNot(BeNil()) + Expect(ws.CurrentRun).ToNot(BeNil()) + run, err := tfClient.Runs.Read(ctx, ws.CurrentRun.ID) + Expect(err).To(Succeed()) + Expect(run).ToNot(BeNil()) + newDestroyRunID = run.ID + + return run.IsDestroy && newDestroyRunID != destroyRunID + }).Should(BeTrue()) + + Eventually(func() bool { + _, err := tfClient.Workspaces.ReadByID(ctx, workspaceID) + return err == tfc.ErrResourceNotFound + }).Should(BeTrue()) + }) It("can force delete a workspace", func() { instance.Spec.DeletionPolicy = appv1alpha2.DeletionPolicyForce createWorkspace(instance) diff --git a/internal/controller/workspace_controller_outputs_test.go b/internal/controller/workspace_controller_outputs_test.go index e9a3be34..f8a611b2 100644 --- a/internal/controller/workspace_controller_outputs_test.go +++ b/internal/controller/workspace_controller_outputs_test.go @@ -180,3 +180,46 @@ func createAndUploadConfigurationVersion(workspaceID string, outputValue string) return cv } + +func createAndUploadErroredConfigurationVersion(workspaceID string, autoQueueRuns bool) *tfc.ConfigurationVersion { + GinkgoHelper() + // Create a temporary dir in the current one + cd, err := os.Getwd() + Expect(err).Should(Succeed()) + td, err := os.MkdirTemp(cd, "tf-*") + Expect(err).Should(Succeed()) + defer os.RemoveAll(td) + // Create a temporary file in the temporary dir + f, err := os.CreateTemp(td, "*.tf") + Expect(err).Should(Succeed()) + defer os.Remove(f.Name()) + // Terraform code to upload + tf := fmt.Sprint(` + resource "test_non_existent_resource" "this" {} + `) + // Save the Terraform code to the temporary file + _, err = f.WriteString(tf) + Expect(err).Should(Succeed()) + + cv, err := tfClient.ConfigurationVersions.Create(ctx, workspaceID, tfc.ConfigurationVersionCreateOptions{ + AutoQueueRuns: tfc.Bool(autoQueueRuns), + Speculative: tfc.Bool(false), + }) + Expect(err).Should(Succeed()) + Expect(cv).ShouldNot(BeNil()) + + Expect(tfClient.ConfigurationVersions.Upload(ctx, cv.UploadURL, td)).Should(Succeed()) + + Eventually(func() bool { + c, err := tfClient.ConfigurationVersions.Read(ctx, cv.ID) + if err != nil { + return false + } + if c.Status == tfc.ConfigurationUploaded { + return true + } + return false + }).Should(BeTrue()) + + return cv +} From 956f0dd4c75e7a403e7c543d172272ae05f3c67b Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Thu, 24 Apr 2025 
16:46:53 +0200 Subject: [PATCH 02/12] docs: add changelog --- .changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml diff --git a/.changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml b/.changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml new file mode 100644 index 00000000..d217e400 --- /dev/null +++ b/.changes/unreleased/ENHANCEMENTS-594-20250424-164645.yaml @@ -0,0 +1,5 @@ +kind: ENHANCEMENTS +body: Relink to more recent destroy runs when destroying a workspace +time: 2025-04-24T16:46:45.710081284+02:00 +custom: + PR: "594" From 6702906407ad6d041c915dda8305eb578f917136 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Thu, 24 Apr 2025 16:00:34 +0200 Subject: [PATCH 03/12] test: add option to not trigger run when uploading a new config --- .../controller/agentpool_controller_autoscaling_test.go | 4 ++-- .../workspace_controller_deletion_policy_test.go | 8 ++++---- internal/controller/workspace_controller_outputs_test.go | 8 ++++---- internal/controller/workspace_controller_runs_test.go | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/internal/controller/agentpool_controller_autoscaling_test.go b/internal/controller/agentpool_controller_autoscaling_test.go index 87d8666d..314d8798 100644 --- a/internal/controller/agentpool_controller_autoscaling_test.go +++ b/internal/controller/agentpool_controller_autoscaling_test.go @@ -92,7 +92,7 @@ var _ = Describe("Agent Pool controller", Ordered, func() { Expect(err).Should(Succeed()) Expect(ws).ShouldNot(BeNil()) // Create a new Run and execute it - _ = createAndUploadConfigurationVersion(ws.ID, "hoi") + _ = createAndUploadConfigurationVersion(ws.ID, "hoi", true) Eventually(func() bool { ws, err = tfClient.Workspaces.ReadByID(ctx, ws.ID) Expect(err).Should(Succeed()) @@ -147,7 +147,7 @@ var _ = Describe("Agent Pool controller", Ordered, func() { Expect(err).Should(Succeed()) Expect(ws).ShouldNot(BeNil()) // New Run - _ = createAndUploadConfigurationVersion(ws.ID, "hoi") + _ = createAndUploadConfigurationVersion(ws.ID, "hoi", true) Eventually(func() bool { ws, err = tfClient.Workspaces.ReadByID(ctx, ws.ID) Expect(err).Should(Succeed()) diff --git a/internal/controller/workspace_controller_deletion_policy_test.go b/internal/controller/workspace_controller_deletion_policy_test.go index a3e13e36..c7d5c8b2 100644 --- a/internal/controller/workspace_controller_deletion_policy_test.go +++ b/internal/controller/workspace_controller_deletion_policy_test.go @@ -97,7 +97,7 @@ var _ = Describe("Workspace controller", Ordered, func() { createWorkspace(instance) workspaceID := instance.Status.WorkspaceID - cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) Eventually(func() bool { listOpts := tfc.ListOptions{ PageNumber: 1, @@ -144,7 +144,7 @@ var _ = Describe("Workspace controller", Ordered, func() { createWorkspace(instance) workspaceID := instance.Status.WorkspaceID - cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) Eventually(func() bool { listOpts := tfc.ListOptions{ PageNumber: 1, @@ -204,7 +204,7 @@ var _ = Describe("Workspace controller", Ordered, func() { createWorkspace(instance) workspaceID := instance.Status.WorkspaceID - cv := 
createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) Eventually(func() bool { listOpts := tfc.ListOptions{ PageNumber: 1, @@ -254,7 +254,7 @@ var _ = Describe("Workspace controller", Ordered, func() { }).Should(BeTrue()) // put back a working configuration - cv = createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + cv = createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) // start a new destroy run manually run, err := tfClient.Runs.Create(ctx, tfc.RunCreateOptions{ diff --git a/internal/controller/workspace_controller_outputs_test.go b/internal/controller/workspace_controller_outputs_test.go index f8a611b2..e2b1a897 100644 --- a/internal/controller/workspace_controller_outputs_test.go +++ b/internal/controller/workspace_controller_outputs_test.go @@ -76,7 +76,7 @@ var _ = Describe("Workspace controller", Ordered, func() { createWorkspace(instance) outputValue := "hoi" - cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, outputValue) + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, outputValue, true) By("Validating configuration version and workspace run") Eventually(func() bool { @@ -132,7 +132,7 @@ var _ = Describe("Workspace controller", Ordered, func() { }) }) -func createAndUploadConfigurationVersion(workspaceID string, outputValue string) *tfc.ConfigurationVersion { +func createAndUploadConfigurationVersion(workspaceID string, outputValue string, autoQueueRuns bool) *tfc.ConfigurationVersion { GinkgoHelper() // Create a temporary dir in the current one cd, err := os.Getwd() @@ -140,7 +140,7 @@ func createAndUploadConfigurationVersion(workspaceID string, outputValue string) td, err := os.MkdirTemp(cd, "tf-*") Expect(err).Should(Succeed()) defer os.RemoveAll(td) - // Create a temporary file in the temporary dir + // Create a te AutoQueueRuns: tfc.Bool(autoQueueRuns), dir f, err := os.CreateTemp(td, "*.tf") Expect(err).Should(Succeed()) defer os.Remove(f.Name()) @@ -159,7 +159,7 @@ func createAndUploadConfigurationVersion(workspaceID string, outputValue string) Expect(err).Should(Succeed()) cv, err := tfClient.ConfigurationVersions.Create(ctx, workspaceID, tfc.ConfigurationVersionCreateOptions{ - AutoQueueRuns: tfc.Bool(true), + AutoQueueRuns: tfc.Bool(autoQueueRuns), Speculative: tfc.Bool(false), }) Expect(err).Should(Succeed()) diff --git a/internal/controller/workspace_controller_runs_test.go b/internal/controller/workspace_controller_runs_test.go index f2bb5421..6d9b0f41 100644 --- a/internal/controller/workspace_controller_runs_test.go +++ b/internal/controller/workspace_controller_runs_test.go @@ -73,7 +73,7 @@ var _ = Describe("Workspace controller", Ordered, func() { namespacedName := getNamespacedName(instance) // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation createWorkspace(instance) - createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") + createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) Eventually(func() bool { Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) if instance.Status.Run == nil { @@ -87,7 +87,7 @@ var _ = Describe("Workspace controller", Ordered, func() { // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation createWorkspace(instance) - createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi") 
+ createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) Eventually(func() bool { Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) if instance.Status.Run == nil { From f8928bb54464c658decd706ca5c44b52c628f17d Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Thu, 24 Apr 2025 16:30:50 +0200 Subject: [PATCH 04/12] test: add a test for the retry logic that fails --- .../workspace_controller_retry_test.go | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 internal/controller/workspace_controller_retry_test.go diff --git a/internal/controller/workspace_controller_retry_test.go b/internal/controller/workspace_controller_retry_test.go new file mode 100644 index 00000000..9d12e2bf --- /dev/null +++ b/internal/controller/workspace_controller_retry_test.go @@ -0,0 +1,122 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package controller + +import ( + "fmt" + "time" + + tfc "github.com/hashicorp/go-tfe" + appv1alpha2 "github.com/hashicorp/hcp-terraform-operator/api/v1alpha2" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +var _ = Describe("Workspace controller", Ordered, func() { + var ( + instance *appv1alpha2.Workspace + namespacedName types.NamespacedName + workspace string + ) + + BeforeAll(func() { + // Set default Eventually timers + SetDefaultEventuallyTimeout(syncPeriod * 4) + SetDefaultEventuallyPollingInterval(2 * time.Second) + }) + + BeforeEach(func() { + if cloudEndpoint != tfcDefaultAddress { + Skip("Does not run against TFC, skip this test") + } + namespacedName = newNamespacedName() + workspace = fmt.Sprintf("kubernetes-operator-%v", randomNumber()) + // Create a new workspace object for each test + instance = &appv1alpha2.Workspace{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "app.terraform.io/v1alpha2", + Kind: "Workspace", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: namespacedName.Name, + Namespace: namespacedName.Namespace, + DeletionTimestamp: nil, + Finalizers: []string{}, + }, + Spec: appv1alpha2.WorkspaceSpec{ + Organization: organization, + Token: appv1alpha2.Token{ + SecretKeyRef: &corev1.SecretKeySelector{ + LocalObjectReference: corev1.LocalObjectReference{ + Name: secretNamespacedName.Name, + }, + Key: secretKey, + }, + }, + Name: workspace, + ApplyMethod: "auto", + ApplyRunTrigger: "auto", + }, + Status: appv1alpha2.WorkspaceStatus{}, + } + }) + + AfterEach(func() { + deleteWorkspace(instance) + }) + + Context("Retry", func() { + It("can retry failed runs", func() { + namespacedName := getNamespacedName(instance) + // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation + createWorkspace(instance) + // TODO: add the retry config in the CRD to actually do a retry + + // start a run that will fail + createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) + + var runID string + + Eventually(func() bool { + Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) + if instance.Status.Run == nil { + return false + } + + runID = instance.Status.Run.ID + + _, ok := runStatusUnsuccessful[tfc.RunStatus(instance.Status.Run.Status)] + + return ok + }).Should(BeTrue()) + + // Fix the code but no not start a run manually + createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", false) + + // a new run should be started automatically + Eventually(func() 
bool { + Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) + if instance.Status.Run == nil { + return false + } + // TODO: check that the retry count of the status decreased + + return runID != instance.Status.Run.ID + }).Should(BeTrue()) + + // Since the code is fixed at some point a run will succeed + Eventually(func() bool { + Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) + if instance.Status.Run == nil { + return false + } + + return runID != instance.Status.Run.ID && instance.Status.Run.RunCompleted() + }).Should(BeTrue()) + }) + }) +}) From e8c06dadd708269edd49342980e7d18ce33681a4 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Thu, 24 Apr 2025 16:44:18 +0200 Subject: [PATCH 05/12] feat(api): add retry policy and status --- api/v1alpha2/workspace_types.go | 41 +++++++++++++++++++ api/v1alpha2/zz_generated.deepcopy.go | 41 +++++++++++++++++++ .../crds/app.terraform.io_workspaces.yaml | 35 ++++++++++++++++ .../bases/app.terraform.io_workspaces.yaml | 35 ++++++++++++++++ docs/api-reference.md | 33 +++++++++++++++ 5 files changed, 185 insertions(+) diff --git a/api/v1alpha2/workspace_types.go b/api/v1alpha2/workspace_types.go index d98d568c..26cc142a 100644 --- a/api/v1alpha2/workspace_types.go +++ b/api/v1alpha2/workspace_types.go @@ -65,6 +65,25 @@ type RemoteStateSharing struct { Workspaces []*ConsumerWorkspace `json:"workspaces,omitempty"` } +// RetryPolicy allows you to configure retry behavior for failed runs on the workspace. +// It will apply for the latest current run of the operator. +type RetryPolicy struct { + // Limit is the maximum number of retries for failed runs. + // Default: `0`. + // + // +kubebuilder:default:=0 + // +optional + Limit int64 `json:"limit,omitempty"` + // Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. + // More information: + // - https://pkg.go.dev/time#ParseDuration + // Default: `""`. + // + // +kubebuilder:default:="" + // +optional + Backoff string `json:"backoff,omitempty"` +} + // Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle. // Only one of the fields `ID` or `Name` is allowed. // At least one of the fields `ID` or `Name` is mandatory. @@ -592,6 +611,10 @@ type WorkspaceSpec struct { // //+optional RemoteStateSharing *RemoteStateSharing `json:"remoteStateSharing,omitempty"` + // Retry Policy allows you to specify how the operator should retry failed runs automatically. + // + //+optional + RetryPolicy *RetryPolicy `json:"retryPolicy,omitempty"` // Run triggers allow you to connect this workspace to one or more source workspaces. // These connections allow runs to queue automatically in this workspace on successful apply of runs in any of the source workspaces. // More information: @@ -742,6 +765,11 @@ type WorkspaceStatus struct { // //+optional VariableSets []VariableSetStatus `json:"variableSet,omitempty"` + + // Retry status of the latest run on the workspace. + // + //+optional + Retry *RetryStatus `json:"retry,omitempty"` } type VariableSetStatus struct { @@ -749,6 +777,19 @@ type VariableSetStatus struct { Name string `json:"name,omitempty"` } +// RetryStatus contains the status of the retry of the latest run on the workspace. How many attempts are left and +// possibly a time to wait for the next attempt. +type RetryStatus struct { + // RetriesLeft is the number of retries left for the latest run on the workspace. 
+ RetriesLeft int64 `json:"retriesLeft,omitempty"` + + // NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry + // if the Backoff option was added. It is represented in RFC3339 form and is in UTC. + // + // +optional + NextRetryTimestamp metav1.Time `json:"nextRetryTimestamp,omitempty"` +} + //+kubebuilder:object:root=true //+kubebuilder:subresource:status //+kubebuilder:printcolumn:name="Workspace ID",type=string,JSONPath=`.status.workspaceID` diff --git a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go index d8fccce7..f054c31a 100644 --- a/api/v1alpha2/zz_generated.deepcopy.go +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -749,6 +749,37 @@ func (in *RemoteStateSharing) DeepCopy() *RemoteStateSharing { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RetryPolicy) DeepCopyInto(out *RetryPolicy) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RetryPolicy. +func (in *RetryPolicy) DeepCopy() *RetryPolicy { + if in == nil { + return nil + } + out := new(RetryPolicy) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RetryStatus) DeepCopyInto(out *RetryStatus) { + *out = *in + in.NextRetryTimestamp.DeepCopyInto(&out.NextRetryTimestamp) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RetryStatus. +func (in *RetryStatus) DeepCopy() *RetryStatus { + if in == nil { + return nil + } + out := new(RetryStatus) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *RunStatus) DeepCopyInto(out *RunStatus) { *out = *in @@ -1114,6 +1145,11 @@ func (in *WorkspaceSpec) DeepCopyInto(out *WorkspaceSpec) { *out = new(RemoteStateSharing) (*in).DeepCopyInto(*out) } + if in.RetryPolicy != nil { + in, out := &in.RetryPolicy, &out.RetryPolicy + *out = new(RetryPolicy) + **out = **in + } if in.RunTriggers != nil { in, out := &in.RunTriggers, &out.RunTriggers *out = make([]RunTrigger, len(*in)) @@ -1181,6 +1217,11 @@ func (in *WorkspaceStatus) DeepCopyInto(out *WorkspaceStatus) { *out = make([]VariableSetStatus, len(*in)) copy(*out, *in) } + if in.Retry != nil { + in, out := &in.Retry, &out.Retry + *out = new(RetryStatus) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkspaceStatus. diff --git a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml index a553d237..8f1021b0 100644 --- a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml +++ b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml @@ -374,6 +374,26 @@ spec: minItems: 1 type: array type: object + retryPolicy: + description: Retry Policy allows you to specify how the operator should + retry failed runs automatically. + properties: + backoff: + default: "" + description: |- + Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. + More information: + - https://pkg.go.dev/time#ParseDuration + Default: `""`. 
+ type: string + limit: + default: 0 + description: |- + Limit is the maximum number of retries for failed runs. + Default: `0`. + format: int64 + type: integer + type: object runTasks: description: |- Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle. @@ -836,6 +856,21 @@ spec: pattern: ^\d{1}\.\d{1,2}\.\d{1,2}$ type: string type: object + retry: + description: Retry status of the latest run on the workspace. + properties: + nextRetryTimestamp: + description: |- + NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry + if the Backoff option was added. It is represented in RFC3339 form and is in UTC. + format: date-time + type: string + retriesLeft: + description: RetriesLeft is the number of retries left for the + latest run on the workspace. + format: int64 + type: integer + type: object runStatus: description: Workspace Runs status. properties: diff --git a/config/crd/bases/app.terraform.io_workspaces.yaml b/config/crd/bases/app.terraform.io_workspaces.yaml index 1a8a6774..55fc993c 100644 --- a/config/crd/bases/app.terraform.io_workspaces.yaml +++ b/config/crd/bases/app.terraform.io_workspaces.yaml @@ -371,6 +371,26 @@ spec: minItems: 1 type: array type: object + retryPolicy: + description: Retry Policy allows you to specify how the operator should + retry failed runs automatically. + properties: + backoff: + default: "" + description: |- + Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. + More information: + - https://pkg.go.dev/time#ParseDuration + Default: `""`. + type: string + limit: + default: 0 + description: |- + Limit is the maximum number of retries for failed runs. + Default: `0`. + format: int64 + type: integer + type: object runTasks: description: |- Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle. @@ -833,6 +853,21 @@ spec: pattern: ^\d{1}\.\d{1,2}\.\d{1,2}$ type: string type: object + retry: + description: Retry status of the latest run on the workspace. + properties: + nextRetryTimestamp: + description: |- + NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry + if the Backoff option was added. It is represented in RFC3339 form and is in UTC. + format: date-time + type: string + retriesLeft: + description: RetriesLeft is the number of retries left for the + latest run on the workspace. + format: int64 + type: integer + type: object runStatus: description: Workspace Runs status. properties: diff --git a/docs/api-reference.md b/docs/api-reference.md index b9d6bec4..540371f0 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -549,6 +549,38 @@ _Appears in:_ | `workspaces` _[ConsumerWorkspace](#consumerworkspace) array_ | Allow access to the state for specific workspaces within the same organization. | +#### RetryPolicy + + + +RetryPolicy allows you to configure retry behavior for failed runs on the workspace. +It will apply for the latest current run of the operator. + +_Appears in:_ +- [WorkspaceSpec](#workspacespec) + +| Field | Description | +| --- | --- | +| `limit` _integer_ | Limit is the maximum number of retries for failed runs.
Default: `0`. | +| `backoff` _string_ | Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string.
More information:
- https://pkg.go.dev/time#ParseDuration
Default: `""`. | + + +#### RetryStatus + + + +RetryStatus contains the status of the retry of the latest run on the workspace. How many attempts are left and +possibly a time to wait for the next attempt. + +_Appears in:_ +- [WorkspaceStatus](#workspacestatus) + +| Field | Description | +| --- | --- | +| `retriesLeft` _integer_ | RetriesLeft is the number of retries left for the latest run on the workspace. | +| `nextRetryTimestamp` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#time-v1-meta)_ | NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry
if the Backoff option was added. It is represented in RFC3339 form and is in UTC. | + + #### RunStatus @@ -894,6 +926,7 @@ _Appears in:_ | `environmentVariables` _[Variable](#variable) array_ | Terraform Environment variables for all plans and applies in this workspace.
Variables defined within a workspace always overwrite variables from variable sets that have the same type and the same key.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables#environment-variables | | `terraformVariables` _[Variable](#variable) array_ | Terraform variables for all plans and applies in this workspace.
Variables defined within a workspace always overwrite variables from variable sets that have the same type and the same key.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/variables#terraform-variables | | `remoteStateSharing` _[RemoteStateSharing](#remotestatesharing)_ | Remote state access between workspaces.
By default, new workspaces in HCP Terraform do not allow other workspaces to access their state.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/state#accessing-state-from-other-workspaces | +| `retryPolicy` _[RetryPolicy](#retrypolicy)_ | Retry Policy allows you to specify how the operator should retry failed runs automatically. | | `runTriggers` _[RunTrigger](#runtrigger) array_ | Run triggers allow you to connect this workspace to one or more source workspaces.
These connections allow runs to queue automatically in this workspace on successful apply of runs in any of the source workspaces.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/settings/run-triggers | | `versionControl` _[VersionControl](#versioncontrol)_ | Settings for the workspace's VCS repository, enabling the UI/VCS-driven run workflow.
Omit this argument to utilize the CLI-driven and API-driven workflows, where runs are not driven by webhooks on your VCS provider.
More information:
- https://www.terraform.io/cloud-docs/run/ui
- https://www.terraform.io/cloud-docs/vcs | | `sshKey` _[SSHKey](#sshkey)_ | SSH key used to clone Terraform modules.
More information:
- https://developer.hashicorp.com/terraform/cloud-docs/workspaces/settings/ssh-keys | From 9f03b5124989ef5f50f0e8ce59efeec1b1aaf251 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Thu, 24 Apr 2025 18:16:11 +0200 Subject: [PATCH 06/12] feat(api): rename to strategy and modify tests --- api/v1alpha2/workspace_types.go | 28 +++++-------------- api/v1alpha2/zz_generated.deepcopy.go | 3 +- .../crds/app.terraform.io_workspaces.yaml | 25 ++++------------- .../bases/app.terraform.io_workspaces.yaml | 25 ++++------------- docs/api-reference.md | 6 ++-- .../workspace_controller_retry_test.go | 25 ++++++++++++++++- 6 files changed, 44 insertions(+), 68 deletions(-) diff --git a/api/v1alpha2/workspace_types.go b/api/v1alpha2/workspace_types.go index 26cc142a..80a572eb 100644 --- a/api/v1alpha2/workspace_types.go +++ b/api/v1alpha2/workspace_types.go @@ -68,20 +68,12 @@ type RemoteStateSharing struct { // RetryPolicy allows you to configure retry behavior for failed runs on the workspace. // It will apply for the latest current run of the operator. type RetryPolicy struct { - // Limit is the maximum number of retries for failed runs. + // Limit is the maximum number of retries for failed runs. If set to a negative number, no limit will be applied. // Default: `0`. // - // +kubebuilder:default:=0 - // +optional - Limit int64 `json:"limit,omitempty"` - // Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. - // More information: - // - https://pkg.go.dev/time#ParseDuration - // Default: `""`. - // - // +kubebuilder:default:="" - // +optional - Backoff string `json:"backoff,omitempty"` + //+kubebuilder:default:=0 + //+optional + BackoffLimit int64 `json:"backoffLimit,omitempty"` } // Run tasks allow HCP Terraform to interact with external systems at specific points in the HCP Terraform run lifecycle. @@ -620,7 +612,7 @@ type WorkspaceSpec struct { // More information: // - https://developer.hashicorp.com/terraform/cloud-docs/workspaces/settings/run-triggers // - //+kubebuilder:validation:MinItems:=1 + //+kubebuilder:validation:MinItems:=2 //+optional RunTriggers []RunTrigger `json:"runTriggers,omitempty"` // Settings for the workspace's VCS repository, enabling the UI/VCS-driven run workflow. @@ -780,14 +772,8 @@ type VariableSetStatus struct { // RetryStatus contains the status of the retry of the latest run on the workspace. How many attempts are left and // possibly a time to wait for the next attempt. type RetryStatus struct { - // RetriesLeft is the number of retries left for the latest run on the workspace. - RetriesLeft int64 `json:"retriesLeft,omitempty"` - - // NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry - // if the Backoff option was added. It is represented in RFC3339 form and is in UTC. - // - // +optional - NextRetryTimestamp metav1.Time `json:"nextRetryTimestamp,omitempty"` + // Failed is the number of failed attempts. + Failed int64 `json:"failed,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha2/zz_generated.deepcopy.go b/api/v1alpha2/zz_generated.deepcopy.go index f054c31a..706eec2f 100644 --- a/api/v1alpha2/zz_generated.deepcopy.go +++ b/api/v1alpha2/zz_generated.deepcopy.go @@ -767,7 +767,6 @@ func (in *RetryPolicy) DeepCopy() *RetryPolicy { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *RetryStatus) DeepCopyInto(out *RetryStatus) { *out = *in - in.NextRetryTimestamp.DeepCopyInto(&out.NextRetryTimestamp) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RetryStatus. @@ -1220,7 +1219,7 @@ func (in *WorkspaceStatus) DeepCopyInto(out *WorkspaceStatus) { if in.Retry != nil { in, out := &in.Retry, &out.Retry *out = new(RetryStatus) - (*in).DeepCopyInto(*out) + **out = **in } } diff --git a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml index 8f1021b0..5a2c6937 100644 --- a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml +++ b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml @@ -378,18 +378,10 @@ spec: description: Retry Policy allows you to specify how the operator should retry failed runs automatically. properties: - backoff: - default: "" - description: |- - Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. - More information: - - https://pkg.go.dev/time#ParseDuration - Default: `""`. - type: string - limit: + backoffLimit: default: 0 description: |- - Limit is the maximum number of retries for failed runs. + Limit is the maximum number of retries for failed runs. If set to a negative number, no limit will be applied. Default: `0`. format: int64 type: integer @@ -462,7 +454,7 @@ spec: minLength: 1 type: string type: object - minItems: 1 + minItems: 2 type: array sshKey: description: |- @@ -859,15 +851,8 @@ spec: retry: description: Retry status of the latest run on the workspace. properties: - nextRetryTimestamp: - description: |- - NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry - if the Backoff option was added. It is represented in RFC3339 form and is in UTC. - format: date-time - type: string - retriesLeft: - description: RetriesLeft is the number of retries left for the - latest run on the workspace. + failed: + description: Failed is the number of failed attempts. format: int64 type: integer type: object diff --git a/config/crd/bases/app.terraform.io_workspaces.yaml b/config/crd/bases/app.terraform.io_workspaces.yaml index 55fc993c..20afc37d 100644 --- a/config/crd/bases/app.terraform.io_workspaces.yaml +++ b/config/crd/bases/app.terraform.io_workspaces.yaml @@ -375,18 +375,10 @@ spec: description: Retry Policy allows you to specify how the operator should retry failed runs automatically. properties: - backoff: - default: "" - description: |- - Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string. - More information: - - https://pkg.go.dev/time#ParseDuration - Default: `""`. - type: string - limit: + backoffLimit: default: 0 description: |- - Limit is the maximum number of retries for failed runs. + Limit is the maximum number of retries for failed runs. If set to a negative number, no limit will be applied. Default: `0`. format: int64 type: integer @@ -459,7 +451,7 @@ spec: minLength: 1 type: string type: object - minItems: 1 + minItems: 2 type: array sshKey: description: |- @@ -856,15 +848,8 @@ spec: retry: description: Retry status of the latest run on the workspace. properties: - nextRetryTimestamp: - description: |- - NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry - if the Backoff option was added. It is represented in RFC3339 form and is in UTC. 
- format: date-time - type: string - retriesLeft: - description: RetriesLeft is the number of retries left for the - latest run on the workspace. + failed: + description: Failed is the number of failed attempts. format: int64 type: integer type: object diff --git a/docs/api-reference.md b/docs/api-reference.md index 540371f0..e4628e30 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -561,8 +561,7 @@ _Appears in:_ | Field | Description | | --- | --- | -| `limit` _integer_ | Limit is the maximum number of retries for failed runs.
Default: `0`. | -| `backoff` _string_ | Backoff is the time to wait before retrying a failed run. It is specified as a golang duration string.
More information:
- https://pkg.go.dev/time#ParseDuration
Default: `""`. | +| `backoffLimit` _integer_ | Limit is the maximum number of retries for failed runs. If set to a negative number, no limit will be applied.
Default: `0`. | #### RetryStatus @@ -577,8 +576,7 @@ _Appears in:_ | Field | Description | | --- | --- | -| `retriesLeft` _integer_ | RetriesLeft is the number of retries left for the latest run on the workspace. | -| `nextRetryTimestamp` _[Time](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.27/#time-v1-meta)_ | NextRetryTimestamp is a timestamp representing the server time after which the operator whould start a new retry
if the Backoff option was added. It is represented in RFC3339 form and is in UTC. | +| `failed` _integer_ | Failed is the number of failed attempts. | #### RunStatus diff --git a/internal/controller/workspace_controller_retry_test.go b/internal/controller/workspace_controller_retry_test.go index 9d12e2bf..2403a351 100644 --- a/internal/controller/workspace_controller_retry_test.go +++ b/internal/controller/workspace_controller_retry_test.go @@ -73,8 +73,10 @@ var _ = Describe("Workspace controller", Ordered, func() { It("can retry failed runs", func() { namespacedName := getNamespacedName(instance) // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation + instance.Spec.RetryStrategy = &appv1alpha2.RetryPolicy{ + Enabled: true, + } createWorkspace(instance) - // TODO: add the retry config in the CRD to actually do a retry // start a run that will fail createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) @@ -118,5 +120,26 @@ var _ = Describe("Workspace controller", Ordered, func() { return runID != instance.Status.Run.ID && instance.Status.Run.RunCompleted() }).Should(BeTrue()) }) + It("can retry until the limit of retries is reached", func() { + namespacedName := getNamespacedName(instance) + // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation + instance.Spec.RetryStrategy = &appv1alpha2.RetryPolicy{ + Enabled: true, + Limit: 1, + } + + createWorkspace(instance) + // start a run that will fail + createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) + + Eventually(func() bool { + Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) + if instance.Status.Retry == nil { + return false + } + + return instance.Status.Retry.RetriesLeft == 0 + }).Should(BeTrue()) + }) }) }) From d88eded1d56e06fec30d7f5c10625a8750bff5a7 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Mon, 28 Apr 2025 17:58:34 +0200 Subject: [PATCH 07/12] feat(api): working retry in controller --- api/v1alpha2/workspace_types.go | 2 +- .../crds/app.terraform.io_workspaces.yaml | 3 +- .../bases/app.terraform.io_workspaces.yaml | 3 +- docs/api-reference.md | 2 +- .../controller/workspace_controller_retry.go | 20 +++++ .../workspace_controller_retry_test.go | 75 ++++++++++++++----- .../controller/workspace_controller_runs.go | 47 ++++++++++++ 7 files changed, 128 insertions(+), 24 deletions(-) create mode 100644 internal/controller/workspace_controller_retry.go diff --git a/api/v1alpha2/workspace_types.go b/api/v1alpha2/workspace_types.go index 80a572eb..a8ab10d6 100644 --- a/api/v1alpha2/workspace_types.go +++ b/api/v1alpha2/workspace_types.go @@ -772,7 +772,7 @@ type VariableSetStatus struct { // RetryStatus contains the status of the retry of the latest run on the workspace. How many attempts are left and // possibly a time to wait for the next attempt. type RetryStatus struct { - // Failed is the number of failed attempts. + // Failed is the number of failed attempts, counting the initial one. Failed int64 `json:"failed,omitempty"` } diff --git a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml index 5a2c6937..8738aeac 100644 --- a/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml +++ b/charts/hcp-terraform-operator/crds/app.terraform.io_workspaces.yaml @@ -852,7 +852,8 @@ spec: description: Retry status of the latest run on the workspace. 
properties: failed: - description: Failed is the number of failed attempts. + description: Failed is the number of failed attempts, counting + the initial one. format: int64 type: integer type: object diff --git a/config/crd/bases/app.terraform.io_workspaces.yaml b/config/crd/bases/app.terraform.io_workspaces.yaml index 20afc37d..c39d7cf5 100644 --- a/config/crd/bases/app.terraform.io_workspaces.yaml +++ b/config/crd/bases/app.terraform.io_workspaces.yaml @@ -849,7 +849,8 @@ spec: description: Retry status of the latest run on the workspace. properties: failed: - description: Failed is the number of failed attempts. + description: Failed is the number of failed attempts, counting + the initial one. format: int64 type: integer type: object diff --git a/docs/api-reference.md b/docs/api-reference.md index e4628e30..090a36e0 100644 --- a/docs/api-reference.md +++ b/docs/api-reference.md @@ -576,7 +576,7 @@ _Appears in:_ | Field | Description | | --- | --- | -| `failed` _integer_ | Failed is the number of failed attempts. | +| `failed` _integer_ | Failed is the number of failed attempts, counting the initial one. | #### RunStatus diff --git a/internal/controller/workspace_controller_retry.go b/internal/controller/workspace_controller_retry.go new file mode 100644 index 00000000..675d9d2b --- /dev/null +++ b/internal/controller/workspace_controller_retry.go @@ -0,0 +1,20 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: MPL-2.0 + +package controller + +import ( + "context" + + appv1alpha2 "github.com/hashicorp/hcp-terraform-operator/api/v1alpha2" +) + +func (r *WorkspaceReconciler) resetRetryStatus(ctx context.Context, w *workspaceInstance) error { + if w.instance.Spec.RetryPolicy == nil || w.instance.Spec.RetryPolicy.BackoffLimit == 0 { + return nil + } + w.instance.Status.Retry = &appv1alpha2.RetryStatus{ + Failed: 0, + } + return nil +} diff --git a/internal/controller/workspace_controller_retry_test.go b/internal/controller/workspace_controller_retry_test.go index 2403a351..8e8c693e 100644 --- a/internal/controller/workspace_controller_retry_test.go +++ b/internal/controller/workspace_controller_retry_test.go @@ -72,28 +72,37 @@ var _ = Describe("Workspace controller", Ordered, func() { Context("Retry", func() { It("can retry failed runs", func() { namespacedName := getNamespacedName(instance) - // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation - instance.Spec.RetryStrategy = &appv1alpha2.RetryPolicy{ - Enabled: true, + instance.Spec.RetryPolicy = &appv1alpha2.RetryPolicy{ + BackoffLimit: -1, } + // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation createWorkspace(instance) + workspaceID := instance.Status.WorkspaceID // start a run that will fail - createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) + cv := createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) var runID string Eventually(func() bool { - Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) - if instance.Status.Run == nil { - return false + listOpts := tfc.ListOptions{ + PageNumber: 1, + PageSize: maxPageSize, } - - runID = instance.Status.Run.ID - - _, ok := runStatusUnsuccessful[tfc.RunStatus(instance.Status.Run.Status)] - - return ok + for listOpts.PageNumber != 0 { + runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{ + ListOptions: listOpts, + }) + Expect(err).To(Succeed()) + for _, r := range runs.Items { + if 
r.ConfigurationVersion.ID == cv.ID { + runID = r.ID + return r.Status == tfc.RunErrored + } + } + listOpts.PageNumber = runs.NextPage + } + return false }).Should(BeTrue()) // Fix the code but no not start a run manually @@ -105,11 +114,18 @@ var _ = Describe("Workspace controller", Ordered, func() { if instance.Status.Run == nil { return false } - // TODO: check that the retry count of the status decreased - return runID != instance.Status.Run.ID }).Should(BeTrue()) + // the number of failed attemps should be reset to 0 + Eventually(func() bool { + Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) + if instance.Status.Retry == nil { + return false + } + return instance.Status.Retry.Failed == 0 + }).Should(BeTrue()) + // Since the code is fixed at some point a run will succeed Eventually(func() bool { Expect(k8sClient.Get(ctx, namespacedName, instance)).Should(Succeed()) @@ -122,13 +138,15 @@ var _ = Describe("Workspace controller", Ordered, func() { }) It("can retry until the limit of retries is reached", func() { namespacedName := getNamespacedName(instance) - // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation - instance.Spec.RetryStrategy = &appv1alpha2.RetryPolicy{ - Enabled: true, - Limit: 1, + + instance.Spec.RetryPolicy = &appv1alpha2.RetryPolicy{ + BackoffLimit: 2, } + // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation createWorkspace(instance) + workspaceID := instance.Status.WorkspaceID + // start a run that will fail createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, true) @@ -138,7 +156,24 @@ var _ = Describe("Workspace controller", Ordered, func() { return false } - return instance.Status.Retry.RetriesLeft == 0 + return instance.Status.Retry.Failed == 3 + }).Should(BeTrue()) + + Eventually(func() bool { + listOpts := tfc.ListOptions{ + PageNumber: 1, + PageSize: maxPageSize, + } + runCount := 0 + for listOpts.PageNumber != 0 { + runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{ + ListOptions: listOpts, + }) + Expect(err).To(Succeed()) + runCount += len(runs.Items) + listOpts.PageNumber = runs.NextPage + } + return runCount == 3 }).Should(BeTrue()) }) }) diff --git a/internal/controller/workspace_controller_runs.go b/internal/controller/workspace_controller_runs.go index 30cdc728..4c8e587a 100644 --- a/internal/controller/workspace_controller_runs.go +++ b/internal/controller/workspace_controller_runs.go @@ -84,6 +84,53 @@ func (r *WorkspaceReconciler) reconcileCurrentRun(ctx context.Context, w *worksp w.instance.Status.Run.Status = string(run.Status) w.instance.Status.Run.ConfigurationVersion = run.ConfigurationVersion.ID + if w.instance.Spec.RetryPolicy != nil && w.instance.Spec.RetryPolicy.BackoffLimit != 0 { + if _, ok := runStatusUnsuccessful[run.Status]; ok { + w.log.Info("Reconcile Runs", "msg", "ongoing non-speculative run is unsuccessful, retrying it") + + if w.instance.Status.Retry == nil { + w.instance.Status.Retry = &appv1alpha2.RetryStatus{ + Failed: 0, + } + } + w.instance.Status.Retry.Failed++ + + if w.instance.Spec.RetryPolicy.BackoffLimit < 0 || w.instance.Status.Retry.Failed <= w.instance.Spec.RetryPolicy.BackoffLimit { + + options := tfc.RunCreateOptions{ + Message: tfc.String(runMessage), + Workspace: workspace, + IsDestroy: tfc.Bool(run.IsDestroy), + RefreshOnly: tfc.Bool(run.RefreshOnly), + } + retriedRun, err := w.tfClient.Client.Runs.Create(ctx, options) + if err != nil { + w.log.Error(err, 
"Reconcile Runs", "msg", "failed to create a new apply run for retry") + return err + } + w.log.Info("Reconcile Runs", "msg", fmt.Sprintf("successfully created a new apply run %s for retry", retriedRun.ID)) + + // Update status + if w.instance.Status.Run == nil { + w.instance.Status.Run = &appv1alpha2.RunStatus{} + } + w.instance.Status.Run.ID = retriedRun.ID + w.instance.Status.Run.Status = string(retriedRun.Status) + w.instance.Status.Run.ConfigurationVersion = retriedRun.ConfigurationVersion.ID + + // WARNING: there is a race limit here in case the run fails very fast and the initial status returned + // by the Runs.Create funtion is Errored. In this case the run is never retried. + } else { + w.log.Info("Reconcile Runs", "msg", "backoff limit was reached, skip retry") + } + } + } + + // when the current run succeeds, we reset the failed counter for a next retry + if _, ok := runStatusComplete[run.Status]; ok { + r.resetRetryStatus(ctx, w) + } + return nil } From 6d112985cfe962ca0c262d3280129f7c9ba08715 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Mon, 28 Apr 2025 18:15:18 +0200 Subject: [PATCH 08/12] refacto: move retry logic to own function --- .../controller/workspace_controller_retry.go | 50 ++++++++++++++++++- .../controller/workspace_controller_runs.go | 43 +++------------- 2 files changed, 57 insertions(+), 36 deletions(-) diff --git a/internal/controller/workspace_controller_retry.go b/internal/controller/workspace_controller_retry.go index 675d9d2b..09fa4506 100644 --- a/internal/controller/workspace_controller_retry.go +++ b/internal/controller/workspace_controller_retry.go @@ -5,12 +5,14 @@ package controller import ( "context" + "fmt" + tfc "github.com/hashicorp/go-tfe" appv1alpha2 "github.com/hashicorp/hcp-terraform-operator/api/v1alpha2" ) func (r *WorkspaceReconciler) resetRetryStatus(ctx context.Context, w *workspaceInstance) error { - if w.instance.Spec.RetryPolicy == nil || w.instance.Spec.RetryPolicy.BackoffLimit == 0 { + if !isRetryEnabled(w) { return nil } w.instance.Status.Retry = &appv1alpha2.RetryStatus{ @@ -18,3 +20,49 @@ func (r *WorkspaceReconciler) resetRetryStatus(ctx context.Context, w *workspace } return nil } + +func isRetryEnabled(w *workspaceInstance) bool { + return w.instance.Spec.RetryPolicy != nil && w.instance.Spec.RetryPolicy.BackoffLimit != 0 +} + +func (r *WorkspaceReconciler) retryFailedRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { + if w.instance.Status.Retry == nil { + w.instance.Status.Retry = &appv1alpha2.RetryStatus{ + Failed: 0, + } + } + w.instance.Status.Retry.Failed++ + + if w.instance.Spec.RetryPolicy.BackoffLimit < 0 || w.instance.Status.Retry.Failed <= w.instance.Spec.RetryPolicy.BackoffLimit { + + options := tfc.RunCreateOptions{ + Message: tfc.String(runMessage), + Workspace: workspace, + IsDestroy: tfc.Bool(failedRun.IsDestroy), + RefreshOnly: tfc.Bool(failedRun.RefreshOnly), + } + retriedRun, err := w.tfClient.Client.Runs.Create(ctx, options) + if err != nil { + w.log.Error(err, "Retry Runs", "msg", "failed to create a new apply run for retry") + return err + } + w.log.Info("Retry Runs", "msg", fmt.Sprintf("successfully created a new apply run %s for to retry failed %s", retriedRun.ID, failedRun.ID)) + + // Update status + if w.instance.Status.Run == nil { + w.instance.Status.Run = &appv1alpha2.RunStatus{} + } + w.instance.Status.Run.ID = retriedRun.ID + w.instance.Status.Run.Status = string(retriedRun.Status) + w.instance.Status.Run.ConfigurationVersion = 
retriedRun.ConfigurationVersion.ID + + // WARNING: there is a race limit here in case the run fails very fast and the initial status returned + // by the Runs.Create funtion is Errored. In this case the run is never retried. + // TODO: loop back ? I don't like loops so maybe the best would be to change the reconcile runs function to + // make sure we didn't miss a retry + } else { + w.log.Info("Retry Runs", "msg", "backoff limit was reached, skip retry") + return nil + } + return nil +} diff --git a/internal/controller/workspace_controller_runs.go b/internal/controller/workspace_controller_runs.go index 4c8e587a..98e67178 100644 --- a/internal/controller/workspace_controller_runs.go +++ b/internal/controller/workspace_controller_runs.go @@ -84,44 +84,12 @@ func (r *WorkspaceReconciler) reconcileCurrentRun(ctx context.Context, w *worksp w.instance.Status.Run.Status = string(run.Status) w.instance.Status.Run.ConfigurationVersion = run.ConfigurationVersion.ID - if w.instance.Spec.RetryPolicy != nil && w.instance.Spec.RetryPolicy.BackoffLimit != 0 { + if isRetryEnabled(w) { if _, ok := runStatusUnsuccessful[run.Status]; ok { w.log.Info("Reconcile Runs", "msg", "ongoing non-speculative run is unsuccessful, retrying it") - if w.instance.Status.Retry == nil { - w.instance.Status.Retry = &appv1alpha2.RetryStatus{ - Failed: 0, - } - } - w.instance.Status.Retry.Failed++ - - if w.instance.Spec.RetryPolicy.BackoffLimit < 0 || w.instance.Status.Retry.Failed <= w.instance.Spec.RetryPolicy.BackoffLimit { - - options := tfc.RunCreateOptions{ - Message: tfc.String(runMessage), - Workspace: workspace, - IsDestroy: tfc.Bool(run.IsDestroy), - RefreshOnly: tfc.Bool(run.RefreshOnly), - } - retriedRun, err := w.tfClient.Client.Runs.Create(ctx, options) - if err != nil { - w.log.Error(err, "Reconcile Runs", "msg", "failed to create a new apply run for retry") - return err - } - w.log.Info("Reconcile Runs", "msg", fmt.Sprintf("successfully created a new apply run %s for retry", retriedRun.ID)) - - // Update status - if w.instance.Status.Run == nil { - w.instance.Status.Run = &appv1alpha2.RunStatus{} - } - w.instance.Status.Run.ID = retriedRun.ID - w.instance.Status.Run.Status = string(retriedRun.Status) - w.instance.Status.Run.ConfigurationVersion = retriedRun.ConfigurationVersion.ID - - // WARNING: there is a race limit here in case the run fails very fast and the initial status returned - // by the Runs.Create funtion is Errored. In this case the run is never retried. - } else { - w.log.Info("Reconcile Runs", "msg", "backoff limit was reached, skip retry") + if err = r.retryFailedRun(ctx, w, workspace, run); err != nil { + return err } } } @@ -181,6 +149,11 @@ func (r *WorkspaceReconciler) triggerApplyRun(ctx context.Context, w *workspaceI w.instance.Status.Run.Status = string(run.Status) w.instance.Status.Run.ConfigurationVersion = run.ConfigurationVersion.ID + // WARNING: there is a race limit here in case the run fails very fast and the initial status returned + // by the Runs.Create funtion is Errored. In this case the run is never retried. + // TODO: loop back ? 
I don't like loops so maybe the best would be to change the reconcile runs function to + // make sure we didn't miss a retry + return nil } From ef09280443d7b428e9b6a0423a874675cf5ae540 Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Tue, 29 Apr 2025 10:59:35 +0200 Subject: [PATCH 09/12] refacto(retry): split logic to update the status --- .../controller/workspace_controller_retry.go | 63 ++++++++++++++----- .../controller/workspace_controller_runs.go | 2 +- 2 files changed, 48 insertions(+), 17 deletions(-) diff --git a/internal/controller/workspace_controller_retry.go b/internal/controller/workspace_controller_retry.go index 09fa4506..fd502767 100644 --- a/internal/controller/workspace_controller_retry.go +++ b/internal/controller/workspace_controller_retry.go @@ -25,7 +25,50 @@ func isRetryEnabled(w *workspaceInstance) bool { return w.instance.Spec.RetryPolicy != nil && w.instance.Spec.RetryPolicy.BackoffLimit != 0 } -func (r *WorkspaceReconciler) retryFailedRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { +func (r *WorkspaceReconciler) retryFailedApplyRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { + retriedRun, err := r.retryFailedRun(ctx, w, workspace, failedRun) + if err != nil { + return err + } + + // when no run is returned, it means the backoff limit was reached + if retriedRun == nil { + return nil + } + + // Update status + if w.instance.Status.Run == nil { + w.instance.Status.Run = &appv1alpha2.RunStatus{} + } + w.instance.Status.Run.ID = retriedRun.ID + w.instance.Status.Run.Status = string(retriedRun.Status) + w.instance.Status.Run.ConfigurationVersion = retriedRun.ConfigurationVersion.ID + + // WARNING: there is a race limit here in case the run fails very fast and the initial status returned + // by the Runs.Create funtion is Errored. In this case the run is never retried. + // TODO: loop back ? 
I don't like loops so maybe the best would be to change the reconcile runs function to + // make sure we didn't miss a retry + + return nil +} + +func (r *WorkspaceReconciler) retryFaileDestroyRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { + retriedRun, err := r.retryFailedRun(ctx, w, workspace, failedRun) + if err != nil { + return err + } + + // when no run is returned, it means the backoff limit was reached + if retriedRun == nil { + return nil + } + + w.instance.Status.DestroyRunID = retriedRun.ID + + return nil +} + +func (r *WorkspaceReconciler) retryFailedRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) (*tfc.Run, error) { if w.instance.Status.Retry == nil { w.instance.Status.Retry = &appv1alpha2.RetryStatus{ Failed: 0, @@ -44,25 +87,13 @@ func (r *WorkspaceReconciler) retryFailedRun(ctx context.Context, w *workspaceIn retriedRun, err := w.tfClient.Client.Runs.Create(ctx, options) if err != nil { w.log.Error(err, "Retry Runs", "msg", "failed to create a new apply run for retry") - return err + return nil, err } w.log.Info("Retry Runs", "msg", fmt.Sprintf("successfully created a new apply run %s for to retry failed %s", retriedRun.ID, failedRun.ID)) - // Update status - if w.instance.Status.Run == nil { - w.instance.Status.Run = &appv1alpha2.RunStatus{} - } - w.instance.Status.Run.ID = retriedRun.ID - w.instance.Status.Run.Status = string(retriedRun.Status) - w.instance.Status.Run.ConfigurationVersion = retriedRun.ConfigurationVersion.ID - - // WARNING: there is a race limit here in case the run fails very fast and the initial status returned - // by the Runs.Create funtion is Errored. In this case the run is never retried. - // TODO: loop back ? 
I don't like loops so maybe the best would be to change the reconcile runs function to
-	// make sure we didn't miss a retry
+	return retriedRun, nil
 	} else {
 		w.log.Info("Retry Runs", "msg", "backoff limit was reached, skip retry")
-		return nil
+		return nil, nil
 	}
-	return nil
 }
diff --git a/internal/controller/workspace_controller_runs.go b/internal/controller/workspace_controller_runs.go
index 98e67178..9c498241 100644
--- a/internal/controller/workspace_controller_runs.go
+++ b/internal/controller/workspace_controller_runs.go
@@ -88,7 +88,7 @@ func (r *WorkspaceReconciler) reconcileCurrentRun(ctx context.Context, w *worksp
 		if _, ok := runStatusUnsuccessful[run.Status]; ok {
 			w.log.Info("Reconcile Runs", "msg", "ongoing non-speculative run is unsuccessful, retrying it")
 
-			if err = r.retryFailedRun(ctx, w, workspace, run); err != nil {
+			if err = r.retryFailedApplyRun(ctx, w, workspace, run); err != nil {
 				return err
 			}
 		}

From 3e9bc7c841d17522dc56d3dad60adc7b6c315908 Mon Sep 17 00:00:00 2001
From: Baptiste Bourdet
Date: Tue, 29 Apr 2025 11:13:02 +0200
Subject: [PATCH 10/12] feat(retry): add retry for deletion policy

---
 .../workspace_controller_deletion_policy.go       |  9 +++++++++
 internal/controller/workspace_controller_retry.go | 12 +++---------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/internal/controller/workspace_controller_deletion_policy.go b/internal/controller/workspace_controller_deletion_policy.go
index badf222c..27ffc4ee 100644
--- a/internal/controller/workspace_controller_deletion_policy.go
+++ b/internal/controller/workspace_controller_deletion_policy.go
@@ -124,6 +124,15 @@ func (r *WorkspaceReconciler) deleteWorkspace(ctx context.Context, w *workspaceI
 					return r.Status().Update(ctx, &w.instance)
 				}
 			}
+			if isRetryEnabled(w) {
+				w.log.Info("Destroy Run", "msg", fmt.Sprintf("ongoing destroy run %s is unsuccessful, retrying it", run.ID))
+				err := r.retryFailedDestroyRun(ctx, w, workspace, run)
+				if err != nil {
+					w.log.Error(err, "Destroy Run", "msg", "failed to create a new destroy run for retry")
+					return err
+				}
+				return r.Status().Update(ctx, &w.instance)
+			}
 			return nil
 		}
 
diff --git a/internal/controller/workspace_controller_retry.go b/internal/controller/workspace_controller_retry.go
index fd502767..cfc924f3 100644
--- a/internal/controller/workspace_controller_retry.go
+++ b/internal/controller/workspace_controller_retry.go
@@ -36,14 +36,7 @@ func (r *WorkspaceReconciler) retryFailedApplyRun(ctx context.Context, w *worksp
 		return nil
 	}
 
-	// Update status
-	if w.instance.Status.Run == nil {
-		w.instance.Status.Run = &appv1alpha2.RunStatus{}
-	}
-	w.instance.Status.Run.ID = retriedRun.ID
-	w.instance.Status.Run.Status = string(retriedRun.Status)
-	w.instance.Status.Run.ConfigurationVersion = retriedRun.ConfigurationVersion.ID
-
+	w.updateWorkspaceStatusRun(retriedRun)
 	// WARNING: there is a race limit here in case the run fails very fast and the initial status returned
 	// by the Runs.Create funtion is Errored. In this case the run is never retried.
 	// TODO: loop back ? 
I don't like loops so maybe the best would be to change the reconcile runs function to @@ -52,7 +45,7 @@ func (r *WorkspaceReconciler) retryFailedApplyRun(ctx context.Context, w *worksp return nil } -func (r *WorkspaceReconciler) retryFaileDestroyRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { +func (r *WorkspaceReconciler) retryFailedDestroyRun(ctx context.Context, w *workspaceInstance, workspace *tfc.Workspace, failedRun *tfc.Run) error { retriedRun, err := r.retryFailedRun(ctx, w, workspace, failedRun) if err != nil { return err @@ -64,6 +57,7 @@ func (r *WorkspaceReconciler) retryFaileDestroyRun(ctx context.Context, w *works } w.instance.Status.DestroyRunID = retriedRun.ID + w.updateWorkspaceStatusRun(retriedRun) return nil } From d77aa861eb3aa00850f0accb7b66ac8e4d3a6cfc Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Tue, 29 Apr 2025 11:14:11 +0200 Subject: [PATCH 11/12] test: add tests for the deletion --- .../workspace_controller_retry_test.go | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/internal/controller/workspace_controller_retry_test.go b/internal/controller/workspace_controller_retry_test.go index 8e8c693e..902ae6a1 100644 --- a/internal/controller/workspace_controller_retry_test.go +++ b/internal/controller/workspace_controller_retry_test.go @@ -176,5 +176,70 @@ var _ = Describe("Workspace controller", Ordered, func() { return runCount == 3 }).Should(BeTrue()) }) + It("can retry failed destroy runs when deleting the workspace", func() { + instance.Spec.RetryPolicy = &appv1alpha2.RetryPolicy{ + BackoffLimit: -1, + } + instance.Spec.AllowDestroyPlan = true + instance.Spec.DeletionPolicy = appv1alpha2.DeletionPolicyDestroy + // Create a new Kubernetes workspace object and wait until the controller finishes the reconciliation + createWorkspace(instance) + workspaceID := instance.Status.WorkspaceID + + cv := createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", true) + Eventually(func() bool { + listOpts := tfc.ListOptions{ + PageNumber: 1, + PageSize: maxPageSize, + } + for listOpts.PageNumber != 0 { + runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{ + ListOptions: listOpts, + }) + Expect(err).To(Succeed()) + for _, r := range runs.Items { + if r.ConfigurationVersion.ID == cv.ID { + return r.Status == tfc.RunApplied + } + } + listOpts.PageNumber = runs.NextPage + } + return false + }).Should(BeTrue()) + + // create an errored ConfigurationVersion for the delete to fail + cv = createAndUploadErroredConfigurationVersion(instance.Status.WorkspaceID, false) + + Expect(k8sClient.Delete(ctx, instance)).To(Succeed()) + + Eventually(func() bool { + listOpts := tfc.ListOptions{ + PageNumber: 1, + PageSize: maxPageSize, + } + for listOpts.PageNumber != 0 { + runs, err := tfClient.Runs.List(ctx, workspaceID, &tfc.RunListOptions{ + ListOptions: listOpts, + }) + Expect(err).To(Succeed()) + for _, r := range runs.Items { + if r.ConfigurationVersion.ID == cv.ID { + return r.Status == tfc.RunErrored + } + } + listOpts.PageNumber = runs.NextPage + } + return false + }).Should(BeTrue()) + + // Fix the code but no not start a run manually + createAndUploadConfigurationVersion(instance.Status.WorkspaceID, "hoi", false) + + // The retry should eventually delete the workspace + Eventually(func() bool { + _, err := tfClient.Workspaces.ReadByID(ctx, workspaceID) + return err == tfc.ErrResourceNotFound + }).Should(BeTrue()) + }) }) }) From d786c9b1b8b15da10266c92118f21dbf227f4a9a 
Mon Sep 17 00:00:00 2001 From: Baptiste Bourdet Date: Tue, 29 Apr 2025 15:45:08 +0200 Subject: [PATCH 12/12] docs: add changelog --- .changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml diff --git a/.changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml b/.changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml new file mode 100644 index 00000000..d5202d68 --- /dev/null +++ b/.changes/unreleased/ENHANCEMENTS-602-20250429-154503.yaml @@ -0,0 +1,5 @@ +kind: ENHANCEMENTS +body: Workspace CRD option for the operator to retry failed runs +time: 2025-04-29T15:45:03.031933484+02:00 +custom: + PR: "602"
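
The retry semantics introduced by this series reduce to two checks: retries are treated as enabled whenever Spec.RetryPolicy is set with a non-zero BackoffLimit (isRetryEnabled), and a failed run is re-created only while the limit is negative (unlimited) or the failure counter has not yet exceeded it (retryFailedRun). The standalone Go sketch below illustrates just that decision rule; RetryPolicy and RetryStatus here are hypothetical stand-ins for the operator's appv1alpha2 types, not the real CRD definitions.

package main

import "fmt"

// Hypothetical stand-ins for the operator's RetryPolicy / RetryStatus types;
// only the fields the retry logic reads are modelled.
type RetryPolicy struct {
	BackoffLimit int // 0: retries disabled, <0: unlimited, >0: cap on retried runs
}

type RetryStatus struct {
	Failed int // failed runs counted so far (incremented before the check)
}

// retryEnabled mirrors isRetryEnabled: a policy counts as active when it is
// set and its backoff limit is non-zero.
func retryEnabled(p *RetryPolicy) bool {
	return p != nil && p.BackoffLimit != 0
}

// shouldRetry mirrors the decision in retryFailedRun: retry while the limit
// is negative (no cap) or the failure count is still within the limit.
func shouldRetry(p *RetryPolicy, s *RetryStatus) bool {
	if !retryEnabled(p) {
		return false
	}
	return p.BackoffLimit < 0 || s.Failed <= p.BackoffLimit
}

func main() {
	policy := &RetryPolicy{BackoffLimit: -1} // unlimited, as in the deletion retry test
	status := &RetryStatus{Failed: 3}
	fmt.Println(shouldRetry(policy, status)) // prints: true
}

Under this reading, BackoffLimit: -1 retries indefinitely (the value the deletion retry test uses), 0 disables retries entirely, and a positive value stops retrying once Status.Retry.Failed passes the limit.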