Persist impacted projects (#2418)

ZIJ · web-flow · commit 2eb4264a87be · 2025-11-07T21:35:11.000Z
* Agent task for persisting drift runs

* Refactor into helper

* Fix table name
diff --git a/agent-tasks/persist-detection-runs.md b/agent-tasks/persist-detection-runs.md
@@ -0,0 +1,216 @@
+# Persist Detection Runs (Append‑Only)
+
+This document outlines a minimal, append‑only design to persist every time we compute “impacted projects” for a PR. The goal is auditability and simple, reliable retrieval of the latest detection run, without extra counters or complex coordination.
+
+## Summary
+- Add a single append‑only table `digger_detection_runs`.
+- Insert one row per detection run (PR and Issue Comment flows) with denormalized JSON payloads.
+- Use timestamps (`created_at`) to identify the latest run for a given PR.
+- No updates, no deletes.
+
+## Scope
+- Persist detection runs for:
+  - GitHub Pull Request events (`handlePullRequestEvent`).
+  - GitHub Issue Comment events (`handleIssueCommentEvent`).
+- Denormalized JSON for impacted projects and source mappings.
+- Minimal model + writer method; errors are logged but do not break main flow.
+
+Out of scope:
+- EE and OpenTaco.
+- Additional VCS (GitLab/Bitbucket) wiring (can be added later similarly).
+- Lock/PR inconsistency detection (future step once data is persisted).
+
+## Schema (Postgres)
+Create a single table for append‑only detection runs.
+
+```sql
+-- backend/migrations/20251107000100.sql
+CREATE TABLE "public"."digger_detection_runs" (
+  "id"               bigserial PRIMARY KEY,
+  "created_at"       timestamptz NOT NULL DEFAULT now(),
+  "updated_at"       timestamptz,
+  "deleted_at"       timestamptz,
+
+  "organisation_id"  bigint       NOT NULL,
+  "repo_full_name"   text         NOT NULL,
+  "pr_number"        integer      NOT NULL,
+
+  -- What triggered this detection
+  "trigger_type"     text         NOT NULL, -- 'pull_request' | 'issue_comment'
+  "trigger_action"   text         NOT NULL, -- e.g. opened | synchronize | reopened | comment | closed | converted_to_draft
+
+  -- Context
+  "commit_sha"       text,
+  "default_branch"   text,
+  "target_branch"    text,
+
+  -- Denormalized JSON payloads
+  "labels_json"              jsonb,
+  "changed_files_json"       jsonb,
+  "impacted_projects_json"   jsonb NOT NULL, -- array of projects
+  "source_mapping_json"      jsonb           -- project -> impacting_locations[]
+);
+
+-- Helpful indexes for lookups and listing latest runs per PR
+CREATE INDEX IF NOT EXISTS idx_ddr_org_repo_pr_created_at
+  ON "public"."digger_detection_runs" ("organisation_id", "repo_full_name", "pr_number", "created_at" DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ddr_repo_pr
+  ON "public"."digger_detection_runs" ("repo_full_name", "pr_number");
+
+CREATE INDEX IF NOT EXISTS idx_ddr_deleted_at
+  ON "public"."digger_detection_runs" ("deleted_at");
+```
+
+Notes:
+- We reuse GORM’s soft‑delete columns via `gorm.Model` pattern (created_at/updated_at/deleted_at). We will not update or delete rows in code.
+- `impacted_projects_json` is required; empty array when zero impacted projects.
+
+## JSON Shapes
+- impacted_projects_json (array of objects) — subset of project fields we already have in memory:
+```json
+[
+  {
+    "name": "app-us-east-1",
+    "dir": "infra/app",
+    "workspace": "default",
+    "layer": 1,
+    "workflow": "default",
+    "terragrunt": false,
+    "opentofu": false,
+    "pulumi": false
+  }
+]
+```
+
+- source_mapping_json (object keyed by project name; each value is an object holding an `impacting_locations` array of paths):
+```json
+{
+  "app-us-east-1": { "impacting_locations": ["infra/app/modules/sg", "infra/app/main.tf"] }
+}
+```
+
+- labels_json / changed_files_json: arrays of strings. When unavailable (e.g., labels in comment flows), pass null or empty array.
+
+## Model (backend/models)
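+Add a new model and writer. Keep it simple and append‑only. The model must also define `TableName()` returning `digger_detection_runs`; otherwise GORM's default naming maps `DetectionRun` to `detection_runs`, which does not match the migration.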
+
+```go
+// backend/models/detection_runs.go
+package models
+
+import (
+  "encoding/json"
+  "gorm.io/datatypes"
+  "gorm.io/gorm"
+)
+
+type DetectionRun struct {
+  gorm.Model
+  OrganisationID     uint
+  RepoFullName       string
+  PrNumber           int
+  TriggerType        string
+  TriggerAction      string
+  CommitSHA          string
+  DefaultBranch      string
+  TargetBranch       string
+  LabelsJSON         datatypes.JSON
+  ChangedFilesJSON   datatypes.JSON
+  ImpactedProjectsJSON datatypes.JSON // required
+  SourceMappingJSON  datatypes.JSON
+}
+
+// CreateDetectionRun inserts an append‑only detection run row.
+func (db *Database) CreateDetectionRun(run *DetectionRun) error {
+  return db.GormDB.Create(run).Error
+}
+```
+
+Helper mappers (in the same file) to convert from:
+- `[]digger_config.Project` → lightweight `[]struct{...}` → `json.Marshal`.
+- `map[string]digger_config.ProjectToSourceMapping` → `map[string]struct{ ImpactingLocations []string }` → `json.Marshal`.
+
+## Controller Wiring
+We add writes at the moment we compute impacted projects successfully — before any early returns — so runs are recorded even if later steps decide to skip work (e.g., draft PRs).
+
+1) Pull Request events
+- File: `backend/controllers/github_pull_request.go`
+- After:
+  - `impactedProjects, impactedProjectsSourceMapping, _, err := github2.ProcessGitHubPullRequestEvent(...)`
+  - And after fetching `changedFiles` (already available)
+- Insert:
+  - Build the `DetectionRun` struct:
+    - orgId, repoFullName, prNumber
+    - trigger_type="pull_request", trigger_action=`*payload.Action`
+    - commit_sha=payload.PullRequest.Head.GetSHA()
+    - default_branch=`*payload.Repo.DefaultBranch`
+    - target_branch=payload.PullRequest.Base.GetRef()
+    - labels_json: PR label names (we already collect `labels` → `prLabelsStr`)
+    - changed_files_json: from `changedFiles`
+    - impacted_projects_json: from `impactedProjects`
+    - source_mapping_json: from `impactedProjectsSourceMapping`
+  - Call `models.DB.CreateDetectionRun(&run)`
+  - On error: `slog.Error` and continue (do not fail the PR handler).
+
+2) Issue Comment events
+- File: `backend/controllers/github_comment.go`
+- After:
+  - `processEventResult, err := generic.ProcessIssueCommentEvent(...)`
+  - Use `processEventResult.AllImpactedProjects` and `.ImpactedProjectsSourceMapping` (not the filtered subset)
+  - We have `changedFiles` captured earlier in the handler
+  - `prBranchName, _, targetBranch, _, err := ghService.GetBranchName(issueNumber)` supplies `targetBranch`; `defaultBranch` is taken separately from `*payload.Repo.DefaultBranch`
+  - `commitSha` available from earlier when loading config
+- Insert `CreateDetectionRun(...)` with:
+  - trigger_type="issue_comment", trigger_action="comment"
+  - Same fields as PR event with the appropriate sources.
+
+## Error Handling
+- Persistence must be best‑effort: log and continue on errors to avoid impacting main workflows.
+- Use concise log fields: orgId, repoFullName, prNumber, counts of impacted projects and changed files.
+
+## Queries (examples)
+- Latest detection run for a PR:
+```sql
+SELECT *
+FROM public.digger_detection_runs
+WHERE organisation_id = $1 AND repo_full_name = $2 AND pr_number = $3
+ORDER BY created_at DESC
+LIMIT 1;
+```
+
+- All runs for a PR:
+```sql
+SELECT *
+FROM public.digger_detection_runs
+WHERE organisation_id = $1 AND repo_full_name = $2 AND pr_number = $3
+ORDER BY created_at DESC;
+```
+
+## Testing
+- Unit tests:
+  - Model round‑trip: marshal minimal and full payloads (empty impacted projects; multiple projects; multiple source locations) and `CreateDetectionRun` succeeds.
+- Controller integration tests (lightweight):
+  - Simulate a PR event with no impacted projects → one row whose `impacted_projects_json` is an empty array (`[]`), never null (the column is NOT NULL).
+  - Simulate a PR event with 2 impacted projects → row with expected JSON arrays.
+  - Simulate an issue comment event → row with trigger_type="issue_comment".
+
+## Rollout
+- Add migration.
+- Add model + writer method.
+- Wire controllers (PR and Issue Comment) to create detection runs.
+- Deploy; no backfill required. Data accrues on subsequent events.
+
+## Risks / Considerations
+- Size of JSON fields: on very large PRs, `changed_files_json` can be big; acceptable for audit purposes, can be truncated later if needed.
+- Ordering by timestamp: adequate for our needs; if we ever need strict monotonic ordering under rare clock drifts, we could fall back to ID ordering as a tie‑breaker (`ORDER BY created_at DESC, id DESC`).
+- Privacy: Paths and labels are internal to the repo; acceptable within backend storage context.
+
+## Work Items
+1) Create migration file `backend/migrations/20251107000100.sql` with schema above.
+2) Add `backend/models/detection_runs.go` with `DetectionRun` and `CreateDetectionRun`.
+3) Add light mappers for JSON serialization of projects and source mapping.
+4) PR controller: write detection run after computing impacts.
+5) Comment controller: write detection run after computing impacts.
+6) Add basic unit tests for model creation; optional controller tests.
+
diff --git a/backend/controllers/github_comment.go b/backend/controllers/github_comment.go
@@ -266,6 +266,26 @@ func handleIssueCommentEvent(gh utils.GithubClientProvider, payload *github.Issu
 	impactedProjectsSourceMapping := processEventResult.ImpactedProjectsSourceMapping
 	allImpactedProjects := processEventResult.AllImpactedProjects
 
+	// Persist detection run (append-only) for issue comment events using full impacted set
+	var csha string
+	if commitSha != nil {
+		csha = *commitSha
+	}
+	recordDetectionRun(
+		orgId,
+		repoFullName,
+		issueNumber,
+		"issue_comment",
+		"comment",
+		csha,
+		defaultBranch,
+		targetBranch,
+		prLabelsStr,
+		changedFiles,
+		allImpactedProjects,
+		impactedProjectsSourceMapping,
+	)
+
 	impactedProjectsForComment, err := generic.FilterOutProjectsFromComment(allImpactedProjects, commentBody)
 	if err != nil {
 		slog.Error("Error filtering out projects from comment",
diff --git a/backend/controllers/github_helpers.go b/backend/controllers/github_helpers.go
@@ -244,6 +244,46 @@ func TriggerDiggerJobs(ciBackend ci_backends.CiBackend, repoFullName string, rep
 	return nil
 }
 
+// recordDetectionRun persists one append-only detection run for any trigger
+// (pull request or issue comment).
+//
+// It is best-effort: a failure to build or store the row is logged and
+// swallowed, and nothing is returned, so the calling handler keeps going.
+// number is the PR/issue number; triggerType is e.g. "pull_request" or
+// "issue_comment"; triggerAction is the PR action or "comment". All arguments
+// are forwarded unchanged to models.NewDetectionRun.
+func recordDetectionRun(
+	organisationId uint,
+	repoFullName string,
+	number int,
+	triggerType string,
+	triggerAction string,
+	commitSha string,
+	defaultBranch string,
+	targetBranch string,
+	labels []string,
+	changedFiles []string,
+	impactedProjects []digger_config.Project,
+	impactedProjectsSourceMapping map[string]digger_config.ProjectToSourceMapping,
+) {
+	// PR numbers repeat across repos, so every log line carries org and repo too.
+	logArgs := []any{"orgId", organisationId, "repoFullName", repoFullName, "number", number, "trigger", triggerType}
+	dr, derr := models.NewDetectionRun(
+		organisationId, repoFullName, number, triggerType, triggerAction,
+		commitSha, defaultBranch, targetBranch, labels, changedFiles,
+		impactedProjects, impactedProjectsSourceMapping,
+	)
+	if derr != nil {
+		slog.Error("Failed to build detection run payload", append(logArgs, "error", derr)...)
+		return
+	}
+	if err := models.DB.CreateDetectionRun(dr); err != nil {
+		slog.Error("Failed to persist detection run", append(logArgs, "error", err)...)
+		return
+	}
+	slog.Debug("Persisted detection run", append(logArgs, "projects", len(impactedProjects), "changedFiles", len(changedFiles))...)
+}
+
 func GenerateTerraformFromCode(payload *github.IssueCommentEvent, commentReporterManager utils.CommentReporterManager, config *digger_config.DiggerConfig, defaultBranch string, ghService *github2.GithubService, repoOwner string, repoName string, commitSha *string, issueNumber int, branch *string) error {
 	if !strings.HasPrefix(*payload.Comment.Body, "digger generate") {
 		return nil
@@ -934,4 +974,3 @@ generate_projects:
 	slog.Info("Created Digger repo", "repoId", repo.ID, "diggerRepoName", diggerRepoName)
 	return repo, org, nil
 }
-
diff --git a/backend/controllers/github_pull_request.go b/backend/controllers/github_pull_request.go
@@ -162,6 +162,22 @@ func handlePullRequestEvent(gh utils.GithubClientProvider, payload *github.PullR
 		return fmt.Errorf("error processing event")
 	}
 
+	// Persist detection run (append-only) right after impact calculation
+	recordDetectionRun(
+		organisationId,
+		repoFullName,
+		prNumber,
+		"pull_request",
+		action,
+		commitSha,
+		*payload.Repo.DefaultBranch,
+		payload.PullRequest.Base.GetRef(),
+		prLabelsStr,
+		changedFiles,
+		impactedProjects,
+		impactedProjectsSourceMapping,
+	)
+
 	jobsForImpactedProjects, coverAllImpactedProjects, err := github2.ConvertGithubPullRequestEventToJobs(payload, impactedProjects, nil, *config, false)
 	if err != nil {
 		slog.Error("Error converting event to jobs",
diff --git a/backend/go.mod b/backend/go.mod
@@ -329,6 +329,7 @@ require (
 	gopkg.in/urfave/cli.v1 v1.20.0 // indirect
 	gopkg.in/yaml.v2 v2.4.0 // indirect
 	gopkg.in/yaml.v3 v3.0.1 // indirect
+	gorm.io/datatypes v1.2.7 // indirect
 	gorm.io/driver/mysql v1.6.0 // indirect
 	gorm.io/driver/sqlserver v1.6.1 // indirect
 	k8s.io/klog v1.0.0 // indirect
diff --git a/backend/go.sum b/backend/go.sum
@@ -2824,6 +2824,8 @@ gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gorm.io/datatypes v1.2.7 h1:ww9GAhF1aGXZY3EB3cJPJ7//JiuQo7DlQA7NNlVaTdk=
+gorm.io/datatypes v1.2.7/go.mod h1:M2iO+6S3hhi4nAyYe444Pcb0dcIiOMJ7QHaUXxyiNZY=
 gorm.io/driver/mysql v1.4.0/go.mod h1:sSIebwZAVPiT+27jK9HIwvsqOGKx3YMPmrA3mBJR10c=
 gorm.io/driver/mysql v1.6.0 h1:eNbLmNTpPpTOVZi8MMxCi2aaIm0ZpInbORNXDwyLGvg=
 gorm.io/driver/mysql v1.6.0/go.mod h1:D/oCC2GWK3M/dqoLxnOlaNKmXz8WNTfcS9y5ovaSqKo=
diff --git a/backend/migrations/20251107000100.sql b/backend/migrations/20251107000100.sql
@@ -0,0 +1,39 @@
+-- Create "digger_detection_runs" table (append-only)
+CREATE TABLE "public"."digger_detection_runs" (
+  "id" bigserial NOT NULL,
+  "created_at" timestamptz NOT NULL DEFAULT now(),
+  "updated_at" timestamptz NULL,
+  "deleted_at" timestamptz NULL,
+
+  "organisation_id"  bigint       NOT NULL,
+  "repo_full_name"   text         NOT NULL,
+  "pr_number"        integer      NOT NULL,
+
+  -- What triggered this detection
+  "trigger_type"     text         NOT NULL, -- 'pull_request' | 'issue_comment'
+  "trigger_action"   text         NOT NULL, -- e.g. opened | synchronize | reopened | comment | closed | converted_to_draft
+
+  -- Context
+  "commit_sha"       text,
+  "default_branch"   text,
+  "target_branch"    text,
+
+  -- Denormalized JSON payloads
+  "labels_json"              jsonb,
+  "changed_files_json"       jsonb,
+  "impacted_projects_json"   jsonb NOT NULL,
+  "source_mapping_json"      jsonb,
+
+  PRIMARY KEY ("id")
+);
+
+-- Helpful indexes for lookups and listing latest runs per PR
+CREATE INDEX IF NOT EXISTS idx_ddr_org_repo_pr_created_at
+  ON "public"."digger_detection_runs" ("organisation_id", "repo_full_name", "pr_number", "created_at" DESC);
+
+CREATE INDEX IF NOT EXISTS idx_ddr_repo_pr
+  ON "public"."digger_detection_runs" ("repo_full_name", "pr_number");
+
+CREATE INDEX IF NOT EXISTS idx_ddr_deleted_at
+  ON "public"."digger_detection_runs" ("deleted_at");
+
diff --git a/backend/migrations/atlas.sum b/backend/migrations/atlas.sum
@@ -1,4 +1,4 @@
-h1:jWoBs487iG65lDQFsl/k45k6w5yExxjs+1elkqh5xoI=
+h1:Bodw0wkkaLj7PUoyJBn2J/C1a0k3CEZzYZVP7A4930g=
 20231227132525.sql h1:43xn7XC0GoJsCnXIMczGXWis9d504FAWi4F1gViTIcw=
 20240115170600.sql h1:IW8fF/8vc40+eWqP/xDK+R4K9jHJ9QBSGO6rN9LtfSA=
 20240116123649.sql h1:R1JlUIgxxF6Cyob9HdtMqiKmx/BfnsctTl5rvOqssQw=
@@ -66,3 +66,4 @@ h1:jWoBs487iG65lDQFsl/k45k6w5yExxjs+1elkqh5xoI=
 20250907140955.sql h1:LHINhHgrPwM/Sy1UeIS4Z3iUVp6kv3/UtiGZZ5/SE8k=
 20250910102133.sql h1:jBW3PuoCWZPJA8ZaXDAyRuA9LnGDQGxvL+HtjCn33DI=
 20251006225238.sql h1:L581xAn5IsYt9Srf1RnJLleLIQVlgLzp7FaAChAlCJw=
+20251107000100.sql h1:b3USfhlLulZ+6iL9a66Ddpy6uDcYmmyDGZLYzbEjuRA=
diff --git a/backend/models/detection_runs.go b/backend/models/detection_runs.go