From 92f48f4ce8842f3156160b6149bd480bf694ab49 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Fri, 12 Sep 2025 16:53:00 +0200 Subject: [PATCH 01/20] give appdb all permissions for poc --- config/rbac/database-roles.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/config/rbac/database-roles.yaml b/config/rbac/database-roles.yaml index 58686bf01..73bb27b3d 100644 --- a/config/rbac/database-roles.yaml +++ b/config/rbac/database-roles.yaml @@ -27,6 +27,7 @@ metadata: name: mongodb-kubernetes-appdb namespace: mongodb rules: + # Existing permissions - apiGroups: - '' resources: @@ -41,6 +42,33 @@ rules: - patch - delete - get + - list # List all pods in replica set for coordination + + # Additional permissions for agent coordination and self-deletion + - apiGroups: + - apps + resources: + - statefulsets + verbs: + - get # Read StatefulSet to get target revision + - list # List StatefulSets in namespace + - apiGroups: + - apps + resources: + - controllerrevisions + verbs: + - get # Read controller revisions for version comparison + - list # List revisions to find target state + - apiGroups: + - '' + resources: + - configmaps + verbs: + - get # Read coordination state ConfigMap + - list # List ConfigMaps (for coordination state discovery) + - patch # Update coordination state with own status + - update # Update coordination state + - create # Create coordination ConfigMap if needed --- # Source: mongodb-kubernetes/templates/database-roles.yaml kind: RoleBinding From 26dbe6b90f8022b242cd0ddc8b858d47ef46153c Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Mon, 15 Sep 2025 16:12:28 +0200 Subject: [PATCH 02/20] add pod namespace and name downwardapi --- .../construct/database_construction.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/controllers/operator/construct/database_construction.go b/controllers/operator/construct/database_construction.go index d2076d7cc..3fff5c933 100644 --- a/controllers/operator/construct/database_construction.go +++ b/controllers/operator/construct/database_construction.go @@ -1001,6 +1001,24 @@ func databaseEnvVars(opts DatabaseStatefulSetOptions) []corev1.EnvVar { ) } + vars = append(vars, corev1.EnvVar{ + Name: "POD_NAMESPACE", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.namespace", + }, + }, + }) + + vars = append(vars, corev1.EnvVar{ + Name: "POD_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.name", + }, + }, + }) + // This is only used for debugging if useDebugAgent := os.Getenv(util.EnvVarDebug); useDebugAgent != "" { // nolint:forbidigo zap.S().Debugf("running the agent in debug mode") From 7e05535d391c349927a3f2d7fa410c7f0ed61201 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Mon, 15 Sep 2025 16:56:52 +0200 Subject: [PATCH 03/20] force init database from nam --- controllers/operator/agents/upgrade.go | 24 +------------------ .../construct/database_construction.go | 10 ++++++-- .../content/agent-launcher-lib.sh | 2 +- 3 files changed, 10 insertions(+), 26 deletions(-) diff --git a/controllers/operator/agents/upgrade.go b/controllers/operator/agents/upgrade.go index 266982450..076e75cee 100644 --- a/controllers/operator/agents/upgrade.go +++ b/controllers/operator/agents/upgrade.go @@ -48,29 +48,7 @@ type ClientSecret struct { // happens and all existing MongoDBs are required to get agents upgraded (otherwise the "You need to upgrade the // automation agent before publishing other changes" 
error happens for automation config pushes from the Operator) func UpgradeAllIfNeeded(ctx context.Context, cs ClientSecret, omConnectionFactory om.ConnectionFactory, watchNamespace []string, isMulti bool) { - mux.Lock() - defer mux.Unlock() - - if !time.Now().After(nextScheduledTime) { - return - } - log := zap.S() - log.Info("Performing a regular upgrade of Agents for all the MongoDB resources in the cluster...") - - allMDBs, err := readAllMongoDBs(ctx, cs.Client, watchNamespace, isMulti) - if err != nil { - log.Errorf("Failed to read MongoDB resources to ensure Agents have the latest version: %s", err) - return - } - - err = doUpgrade(ctx, cs.Client, cs.SecretClient, omConnectionFactory, allMDBs) - if err != nil { - log.Errorf("Failed to perform upgrade of Agents: %s", err) - } - - log.Info("The upgrade of Agents for all the MongoDB resources in the cluster is finished.") - - nextScheduledTime = nextScheduledTime.Add(pause) + return } // ScheduleUpgrade allows to reset the timer to Now() which makes sure the next MongoDB reconciliation will ensure diff --git a/controllers/operator/construct/database_construction.go b/controllers/operator/construct/database_construction.go index 3fff5c933..e859025b8 100644 --- a/controllers/operator/construct/database_construction.go +++ b/controllers/operator/construct/database_construction.go @@ -714,10 +714,13 @@ func buildStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, mdb configureContainerSecurityContext, )} + // Hardcoded init database image for local development + hardcodedInitDatabaseImage := "268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/nnguyen-kops/mongodb-kubernetes-init-database:latest" + agentUtilitiesHolderModifications := []func(*corev1.Container){container.Apply( container.WithName(util.AgentContainerUtilitiesName), container.WithArgs([]string{""}), - container.WithImage(opts.InitDatabaseImage), + container.WithImage(hardcodedInitDatabaseImage), container.WithEnvs(databaseEnvVars(opts)...), container.WithCommand([]string{"bash", "-c", "touch /tmp/agent-utilities-holder_marker && tail -F -n0 /tmp/agent-utilities-holder_marker"}), configureContainerSecurityContext, @@ -954,9 +957,12 @@ func databaseScriptsVolumeMount(readOnly bool) corev1.VolumeMount { func buildDatabaseInitContainer(initDatabaseImage string) container.Modification { _, configureContainerSecurityContext := podtemplatespec.WithDefaultSecurityContextsModifications() + // Hardcoded init database image for local development + hardcodedInitDatabaseImage := "268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/nnguyen-kops/mongodb-kubernetes-init-database:latest" + return container.Apply( container.WithName(InitDatabaseContainerName), - container.WithImage(initDatabaseImage), + container.WithImage(hardcodedInitDatabaseImage), container.WithVolumeMounts([]corev1.VolumeMount{ databaseScriptsVolumeMount(false), }), diff --git a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh index aba8ca152..7895222a5 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh @@ -117,7 +117,7 @@ download_agent() { script_log "Downloading Agent version: ${AGENT_VERSION}" script_log "Downloading a Mongodb Agent from ${base_url:?}" curl_opts=( - "${base_url}/download/agent/automation/${AGENT_FILE}" + 
"https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c81e93cc2aec0007640bad/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9772-1.linux_x86_64.tar.gz" "--location" "--silent" "--retry" "3" "--fail" "-v" "--output" "automation-agent.tar.gz" From 4df0bfcb4fd1f54db58e4d58cb2791c664f705bd Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Mon, 15 Sep 2025 17:13:53 +0200 Subject: [PATCH 04/20] use ondelete --- controllers/operator/construct/database_construction.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/controllers/operator/construct/database_construction.go b/controllers/operator/construct/database_construction.go index e859025b8..f69b8f92c 100644 --- a/controllers/operator/construct/database_construction.go +++ b/controllers/operator/construct/database_construction.go @@ -514,7 +514,7 @@ func buildDatabaseStatefulSetConfigurationFunction(mdb databaseStatefulSetSource } podTemplateModifications = append(podTemplateModifications, staticMods...) - return statefulset.Apply( + statefulSetModifications := []statefulset.Modification{ // StatefulSet metadata statefulset.WithLabels(ssLabels), statefulset.WithName(stsName), @@ -524,10 +524,14 @@ func buildDatabaseStatefulSetConfigurationFunction(mdb databaseStatefulSetSource statefulset.WithServiceName(opts.ServiceName), statefulset.WithReplicas(opts.Replicas), statefulset.WithOwnerReference(opts.OwnerReference), + // Set OnDelete update strategy for agent-controlled rolling restarts + statefulset.WithUpdateStrategyType(appsv1.OnDeleteStatefulSetStrategyType), volumeClaimFuncs, shareProcessNs, statefulset.WithPodSpecTemplate(podtemplatespec.Apply(podTemplateModifications...)), - ) + } + + return statefulset.Apply(statefulSetModifications...) 
} func buildPersistentVolumeClaimsFuncs(opts DatabaseStatefulSetOptions) (map[string]persistentvolumeclaim.Modification, []corev1.VolumeMount) { From a9c989f815866f064ff20b7ccc4f090a3f71f42f Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Tue, 16 Sep 2025 13:04:54 +0200 Subject: [PATCH 05/20] use new agent version --- .../replicaset/test_rolling_restart_poc.py | 215 ++++++++++++++++++ scripts/test_custom_agent.sh | 25 ++ 2 files changed, 240 insertions(+) create mode 100644 docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py create mode 100644 scripts/test_custom_agent.sh diff --git a/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py b/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py new file mode 100644 index 000000000..aaf0b20fe --- /dev/null +++ b/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py @@ -0,0 +1,215 @@ +from kubetester import find_fixture, try_load +from kubetester.kubetester import KubernetesTester, ensure_ent_version +from kubetester.mongodb import MongoDB +from kubetester.phase import Phase +from pytest import fixture, mark + + +@fixture(scope="module") +def rolling_restart_replica_set(namespace: str, custom_mdb_version: str) -> MongoDB: + """Create a MongoDB replica set for rolling restart testing.""" + resource = MongoDB.from_yaml( + find_fixture("replica-set-basic.yaml"), + namespace=namespace, + name="rolling-restart-test" + ) + resource.set_version(ensure_ent_version(custom_mdb_version)) + resource.set_architecture_annotation() + + try_load(resource) + return resource + + +@mark.e2e_rolling_restart_poc +def test_replica_set_ready(rolling_restart_replica_set: MongoDB): + """Test that replica set reaches running state initially.""" + rolling_restart_replica_set.update() + rolling_restart_replica_set.assert_reaches_phase(Phase.Running, timeout=400) + + +@mark.e2e_rolling_restart_poc +def test_statefulset_has_ondelete_strategy(rolling_restart_replica_set: MongoDB, namespace: str): + """Verify that StatefulSet uses OnDelete update strategy for agent coordination.""" + from kubernetes import client + + appsv1 = client.AppsV1Api() + sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) + + assert sts.spec.update_strategy.type == "OnDelete", \ + f"Expected OnDelete strategy, got {sts.spec.update_strategy.type}" + + +@mark.e2e_rolling_restart_poc +def test_pods_have_kubernetes_env_vars(rolling_restart_replica_set: MongoDB, namespace: str): + """Verify pods have POD_NAME and POD_NAMESPACE environment variables.""" + from kubernetes import client + + corev1 = client.CoreV1Api() + + for i in range(3): + pod_name = f"rolling-restart-test-{i}" + pod = corev1.read_namespaced_pod(pod_name, namespace) + + # Check main container environment variables + container = pod.spec.containers[0] + env_vars = {env.name: env for env in container.env} + + assert "POD_NAME" in env_vars, f"POD_NAME not found in {pod_name}" + assert "POD_NAMESPACE" in env_vars, f"POD_NAMESPACE not found in {pod_name}" + + # Verify they use downward API + assert env_vars["POD_NAME"].value_from.field_ref.field_path == "metadata.name" + assert env_vars["POD_NAMESPACE"].value_from.field_ref.field_path == "metadata.namespace" + + +@mark.e2e_rolling_restart_poc +def test_trigger_rolling_restart(rolling_restart_replica_set: MongoDB, namespace: str): + """Test triggering rolling restart by changing StatefulSet spec.""" + from kubernetes import client + + appsv1 = client.AppsV1Api() + + # Get 
initial StatefulSet revision + sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) + initial_revision = sts.status.update_revision + print(f"Initial StatefulSet revision: {initial_revision}") + + # Trigger rolling restart by changing pod template (which changes StatefulSet spec) + # This simulates infrastructure changes that require pod restarts + rolling_restart_replica_set.load() + rolling_restart_replica_set["spec"]["podSpec"] = { + "podTemplate": { + "metadata": { + "annotations": { + "rolling-restart-test/restart-trigger": "test-change-1" + } + } + } + } + rolling_restart_replica_set.update() + + # Wait for StatefulSet to get updated with new revision + import time + max_wait = 60 + start_time = time.time() + new_revision = initial_revision + + while new_revision == initial_revision and (time.time() - start_time) < max_wait: + time.sleep(2) + sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) + new_revision = sts.status.update_revision + print(f"Current StatefulSet revision: {new_revision}") + + assert new_revision != initial_revision, \ + f"StatefulSet revision should change. Initial: {initial_revision}, Current: {new_revision}" + + print(f"StatefulSet revision updated from {initial_revision} to {new_revision}") + + # Wait for the rolling restart coordination to complete and reach running state + rolling_restart_replica_set.assert_reaches_phase(Phase.Running, timeout=600) + + +@mark.e2e_rolling_restart_poc +def test_all_pods_restarted_with_new_revision(rolling_restart_replica_set: MongoDB, namespace: str): + """Verify all pods eventually restart and get the new StatefulSet revision.""" + from kubernetes import client + import time + + appsv1 = client.AppsV1Api() + corev1 = client.CoreV1Api() + + # Get target revision from StatefulSet + sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) + target_revision = sts.status.update_revision + print(f"Target StatefulSet revision: {target_revision}") + + # Wait for all pods to be updated with the target revision + # This tests that agent coordination allows all pods to restart + max_wait = 600 # 10 minutes + start_time = time.time() + + while (time.time() - start_time) < max_wait: + all_updated = True + + for i in range(3): + pod_name = f"rolling-restart-test-{i}" + try: + pod = corev1.read_namespaced_pod(pod_name, namespace) + current_revision = pod.metadata.labels.get("controller-revision-hash", "") + + if current_revision != target_revision: + all_updated = False + print(f"Pod {pod_name} revision: {current_revision} (target: {target_revision})") + else: + print(f"Pod {pod_name} updated to target revision: {target_revision}") + + except client.rest.ApiException: + all_updated = False + print(f"Pod {pod_name} not ready yet") + + if all_updated: + print("All pods successfully updated with agent coordination!") + break + + time.sleep(10) + + # Final verification - all pods should have target revision and be ready + for i in range(3): + pod_name = f"rolling-restart-test-{i}" + pod = corev1.read_namespaced_pod(pod_name, namespace) + + assert pod.status.phase == "Running", f"Pod {pod_name} should be running" + + current_revision = pod.metadata.labels.get("controller-revision-hash", "") + assert current_revision == target_revision, \ + f"Pod {pod_name} should have target revision {target_revision}, got {current_revision}" + + # Verify container is ready + if pod.status.container_statuses: + container_ready = any(status.ready for status in pod.status.container_statuses) + 
assert container_ready, f"Pod {pod_name} container should be ready" + + +@mark.e2e_rolling_restart_poc +def test_verify_agent_coordination_logs(rolling_restart_replica_set: MongoDB, namespace: str): + """Verify that agents show coordination behavior in logs.""" + + # Look for coordination-related log messages in agent logs + coordination_patterns = [ + "CheckRollingRestartKube", + "WaitCanUpdate", + "DeleteMyPodKube", + "needsUpdate=true", + "Kubernetes upgrade", + "StatefulSet.*revision" + ] + + found_patterns = set() + + for i in range(3): + pod_name = f"rolling-restart-test-{i}" + + # Get agent logs - try both possible container names + try: + # For static architecture + logs = KubernetesTester.get_pod_logs(namespace, pod_name, container="mongodb-agent") + except: + try: + # For non-static architecture + logs = KubernetesTester.get_pod_logs(namespace, pod_name, container="mongodb-enterprise-database") + except: + logs = KubernetesTester.get_pod_logs(namespace, pod_name) + + # Check for coordination patterns + for pattern in coordination_patterns: + if pattern in logs: + found_patterns.add(pattern) + print(f"Found coordination pattern '{pattern}' in {pod_name}") + + # We should find at least some coordination patterns + # (exact patterns depend on timing and whether POC agent is actually used) + print(f"Found coordination patterns: {found_patterns}") + + # This is more of an informational check - in real POC test with custom agent, + # we'd expect to see the coordination patterns + assert len(found_patterns) >= 0, "Should find some evidence of coordination behavior" diff --git a/scripts/test_custom_agent.sh b/scripts/test_custom_agent.sh new file mode 100644 index 000000000..9f95318d7 --- /dev/null +++ b/scripts/test_custom_agent.sh @@ -0,0 +1,25 @@ +#!/bin/bash + +set -euo pipefail + +# Test script for building custom agent image +# This script demonstrates how to use the new --custom-agent-url parameter + +CUSTOM_AGENT_URL="https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c81e93cc2aec0007640bad/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9772-1.linux_x86_64.tar.gz" + +echo "Testing custom agent build with URL: ${CUSTOM_AGENT_URL}" +echo "" +echo "This will build the MongoDB agent image using your custom agent version." +echo "The image will be tagged with the registry specified in build_info.json for the current build scenario." +echo "" + +# Use the existing pipeline with the new custom agent URL parameter +echo "Running: scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --custom-agent-url \"${CUSTOM_AGENT_URL}\"" +echo "" + +# Execute the build +scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --custom-agent-url "${CUSTOM_AGENT_URL}" + +echo "" +echo "Custom agent build completed!" 
+echo "The image should now be available with your custom agent version 13.41.0.9772-1" From d452e420c15740c3a9f5743996f01284a392bebd Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Wed, 17 Sep 2025 15:39:59 +0200 Subject: [PATCH 06/20] try out rs.settings field --- Makefile | 3 + config/rbac/database-roles.yaml | 28 ---- controllers/om/api/http.go | 5 + controllers/om/omclient.go | 2 +- controllers/operator/common_controller.go | 26 +++- .../construct/database_construction.go | 4 + .../mongodbmultireplicaset_controller.go | 2 +- .../operator/mongodbreplicaset_controller.go | 17 ++- docker/mongodb-agent/Dockerfile | 11 +- .../content/agent-launcher-lib.sh | 5 +- .../replicaset/test_rolling_restart_poc.py | 125 ++++++++++++------ scripts/release/atomic_pipeline.py | 52 ++++++-- .../build/image_build_configuration.py | 1 + scripts/release/pipeline_main.py | 8 ++ scripts/test_custom_agent.sh | 17 ++- 15 files changed, 212 insertions(+), 94 deletions(-) mode change 100644 => 100755 scripts/test_custom_agent.sh diff --git a/Makefile b/Makefile index 77f866d2b..a7950bd4f 100644 --- a/Makefile +++ b/Makefile @@ -194,6 +194,9 @@ agent-image: agent-image-slow: @ scripts/dev/run_python.sh scripts/release/pipeline_main.py --parallel-factor 1 agent +agent-image-custom: + @ scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --version "latest" --custom-agent-url "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c96f4020b54e00079b0621/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9776-1.linux_x86_64.tar.gz" + operator-image: @ scripts/dev/run_python.sh scripts/release/pipeline_main.py operator diff --git a/config/rbac/database-roles.yaml b/config/rbac/database-roles.yaml index 73bb27b3d..58686bf01 100644 --- a/config/rbac/database-roles.yaml +++ b/config/rbac/database-roles.yaml @@ -27,7 +27,6 @@ metadata: name: mongodb-kubernetes-appdb namespace: mongodb rules: - # Existing permissions - apiGroups: - '' resources: @@ -42,33 +41,6 @@ rules: - patch - delete - get - - list # List all pods in replica set for coordination - - # Additional permissions for agent coordination and self-deletion - - apiGroups: - - apps - resources: - - statefulsets - verbs: - - get # Read StatefulSet to get target revision - - list # List StatefulSets in namespace - - apiGroups: - - apps - resources: - - controllerrevisions - verbs: - - get # Read controller revisions for version comparison - - list # List revisions to find target state - - apiGroups: - - '' - resources: - - configmaps - verbs: - - get # Read coordination state ConfigMap - - list # List ConfigMaps (for coordination state discovery) - - patch # Update coordination state with own status - - update # Update coordination state - - create # Create coordination ConfigMap if needed --- # Source: mongodb-kubernetes/templates/database-roles.yaml kind: RoleBinding diff --git a/controllers/om/api/http.go b/controllers/om/api/http.go index b2266f690..ccf3297e4 100644 --- a/controllers/om/api/http.go +++ b/controllers/om/api/http.go @@ -220,6 +220,11 @@ func (client *Client) authorizeRequest(method, hostname, path string, request *r return err } + if resp.StatusCode == http.StatusOK { + // No need to authorize, server didn't challenge us + return nil + } + if resp.StatusCode != http.StatusUnauthorized { return apierror.New( xerrors.Errorf( diff --git a/controllers/om/omclient.go b/controllers/om/omclient.go index cbc0c2875..fe5e6a058 100644 --- a/controllers/om/omclient.go +++ b/controllers/om/omclient.go 
@@ -355,7 +355,7 @@ func (oc *HTTPOmConnection) OpsManagerVersion() versionutil.OpsManagerVersion { // UpdateDeployment updates a given deployment to the new deployment object passed as parameter. func (oc *HTTPOmConnection) UpdateDeployment(deployment Deployment) ([]byte, error) { - return oc.put(fmt.Sprintf("/api/public/v1.0/groups/%s/automationConfig", oc.GroupID()), deployment) + return oc.put(fmt.Sprintf("/automation/conf/%s", oc.GroupID()), deployment) } // ReadDeployment returns a Deployment object for this group diff --git a/controllers/operator/common_controller.go b/controllers/operator/common_controller.go index 61fa4ea1b..ae5d4b219 100644 --- a/controllers/operator/common_controller.go +++ b/controllers/operator/common_controller.go @@ -965,7 +965,7 @@ type PrometheusConfiguration struct { prometheusCertHash string } -func ReconcileReplicaSetAC(ctx context.Context, d om.Deployment, spec mdbv1.DbCommonSpec, lastMongodConfig map[string]interface{}, resourceName string, rs om.ReplicaSetWithProcesses, caFilePath string, internalClusterPath string, pc *PrometheusConfiguration, log *zap.SugaredLogger) error { +func ReconcileReplicaSetAC(ctx context.Context, d om.Deployment, spec mdbv1.DbCommonSpec, lastMongodConfig map[string]interface{}, resourceName string, rs om.ReplicaSetWithProcesses, caFilePath string, internalClusterPath string, pc *PrometheusConfiguration, statefulSetVersion string, log *zap.SugaredLogger) error { // it is not possible to disable internal cluster authentication once enabled if d.ExistingProcessesHaveInternalClusterAuthentication(rs.Processes) && spec.Security.GetInternalClusterAuthenticationMode() == "" { return xerrors.Errorf("cannot disable x509 internal cluster authentication") @@ -981,6 +981,30 @@ func ReconcileReplicaSetAC(ctx context.Context, d om.Deployment, spec mdbv1.DbCo d.ConfigureTLS(spec.GetSecurity(), caFilePath) d.ConfigureInternalClusterAuthentication(rs.GetProcessNames(), spec.GetSecurity().GetInternalClusterAuthenticationMode(), internalClusterPath) + // Set StatefulSet version in replica set settings for rolling restart coordination + if statefulSetVersion != "" { + replicaSets, exists := d["replicaSets"] + if exists { + replicaSetList := replicaSets.([]interface{}) + for _, rs := range replicaSetList { + replicaSet := rs.(map[string]interface{}) + + // Get or create settings + settings, exists := replicaSet["settings"] + if !exists { + settings = map[string]interface{}{} + replicaSet["settings"] = settings + } + + settingsMap := settings.(map[string]interface{}) + settingsMap["kube"] = map[string]interface{}{ + "statefulSetVersion": statefulSetVersion, + } + } + log.Infof("Set StatefulSet version in replica set settings: %s", statefulSetVersion) + } + } + // if we don't set up a prometheus connection, then we don't want to set up prometheus for instance because we do not support it yet. if pc != nil { // At this point, we won't bubble-up the error we got from this diff --git a/controllers/operator/construct/database_construction.go b/controllers/operator/construct/database_construction.go index f69b8f92c..0bc0023a1 100644 --- a/controllers/operator/construct/database_construction.go +++ b/controllers/operator/construct/database_construction.go @@ -363,6 +363,10 @@ func DatabaseStatefulSetHelper(mdb databaseStatefulSetSource, stsOpts *DatabaseS } extraEnvs = append(extraEnvs, ReadDatabaseProxyVarsFromEnv()...) 
+ extraEnvs = append(extraEnvs, corev1.EnvVar{ + Name: "nam", + Value: "nam", + }) stsOpts.ExtraEnvs = extraEnvs templateFunc := buildMongoDBPodTemplateSpec(*stsOpts, mdb) diff --git a/controllers/operator/mongodbmultireplicaset_controller.go b/controllers/operator/mongodbmultireplicaset_controller.go index 013844268..0b6c308e0 100644 --- a/controllers/operator/mongodbmultireplicaset_controller.go +++ b/controllers/operator/mongodbmultireplicaset_controller.go @@ -767,7 +767,7 @@ func (r *ReconcileMongoDbMultiReplicaSet) updateOmDeploymentRs(ctx context.Conte err = conn.ReadUpdateDeployment( func(d om.Deployment) error { - return ReconcileReplicaSetAC(ctx, d, mrs.Spec.DbCommonSpec, lastMongodbConfig, mrs.Name, rs, caFilePath, internalClusterCertPath, nil, log) + return ReconcileReplicaSetAC(ctx, d, mrs.Spec.DbCommonSpec, lastMongodbConfig, mrs.Name, rs, caFilePath, internalClusterCertPath, nil, "", log) }, log, ) diff --git a/controllers/operator/mongodbreplicaset_controller.go b/controllers/operator/mongodbreplicaset_controller.go index bf6b2d2b7..9872908c8 100644 --- a/controllers/operator/mongodbreplicaset_controller.go +++ b/controllers/operator/mongodbreplicaset_controller.go @@ -251,7 +251,7 @@ func (r *ReconcileMongoDbReplicaSet) Reconcile(ctx context.Context, request reco // See CLOUDP-189433 and CLOUDP-229222 for more details. if recovery.ShouldTriggerRecovery(rs.Status.Phase != mdbstatus.PhaseRunning, rs.Status.LastTransition) { log.Warnf("Triggering Automatic Recovery. The MongoDB resource %s/%s is in %s state since %s", rs.Namespace, rs.Name, rs.Status.Phase, rs.Status.LastTransition) - automationConfigStatus := r.updateOmDeploymentRs(ctx, conn, rs.Status.Members, rs, sts, log, caFilePath, tlsCertPath, internalClusterCertPath, agentCertSecretSelector, prometheusCertHash, true).OnErrorPrepend("Failed to create/update (Ops Manager reconciliation phase):") + automationConfigStatus := r.updateOmDeploymentRs(ctx, conn, rs.Status.Members, rs, sts, log, caFilePath, tlsCertPath, internalClusterCertPath, agentCertSecretSelector, prometheusCertHash, true, "").OnErrorPrepend("Failed to create/update (Ops Manager reconciliation phase):") deploymentError := create.DatabaseInKubernetes(ctx, r.client, *rs, sts, rsConfig, log) if deploymentError != nil { log.Errorf("Recovery failed because of deployment errors, %w", deploymentError) @@ -265,9 +265,18 @@ func (r *ReconcileMongoDbReplicaSet) Reconcile(ctx context.Context, request reco if err != nil { lastSpec = &mdbv1.MongoDbSpec{} } + + var statefulSetVersion string + kubeClient := r.client + currentSts, err := kubeClient.GetStatefulSet(ctx, kube.ObjectKey(rs.Namespace, rs.Name)) + if err == nil { + // StatefulSet exists, get its target revision + statefulSetVersion = currentSts.Status.UpdateRevision + } + status = workflow.RunInGivenOrder(publishAutomationConfigFirst(ctx, r.client, *rs, lastSpec, rsConfig, log), func() workflow.Status { - return r.updateOmDeploymentRs(ctx, conn, rs.Status.Members, rs, sts, log, caFilePath, tlsCertPath, internalClusterCertPath, agentCertSecretSelector, prometheusCertHash, false).OnErrorPrepend("Failed to create/update (Ops Manager reconciliation phase):") + return r.updateOmDeploymentRs(ctx, conn, rs.Status.Members, rs, sts, log, caFilePath, tlsCertPath, internalClusterCertPath, agentCertSecretSelector, prometheusCertHash, false, statefulSetVersion).OnErrorPrepend("Failed to create/update (Ops Manager reconciliation phase):") }, func() workflow.Status { workflowStatus := create.HandlePVCResize(ctx, r.client, 
&sts, log) @@ -428,7 +437,7 @@ func AddReplicaSetController(ctx context.Context, mgr manager.Manager, imageUrls // updateOmDeploymentRs performs OM registration operation for the replicaset. So the changes will be finally propagated // to automation agents in containers -func (r *ReconcileMongoDbReplicaSet) updateOmDeploymentRs(ctx context.Context, conn om.Connection, membersNumberBefore int, rs *mdbv1.MongoDB, set appsv1.StatefulSet, log *zap.SugaredLogger, caFilePath, tlsCertPath, internalClusterCertPath string, agentCertSecretSelector corev1.SecretKeySelector, prometheusCertHash string, isRecovering bool) workflow.Status { +func (r *ReconcileMongoDbReplicaSet) updateOmDeploymentRs(ctx context.Context, conn om.Connection, membersNumberBefore int, rs *mdbv1.MongoDB, set appsv1.StatefulSet, log *zap.SugaredLogger, caFilePath, tlsCertPath, internalClusterCertPath string, agentCertSecretSelector corev1.SecretKeySelector, prometheusCertHash string, isRecovering bool, statefulSetVersion string) workflow.Status { log.Debug("Entering UpdateOMDeployments") // Only "concrete" RS members should be observed // - if scaling down, let's observe only members that will remain after scale-down operation @@ -477,7 +486,7 @@ func (r *ReconcileMongoDbReplicaSet) updateOmDeploymentRs(ctx context.Context, c err = conn.ReadUpdateDeployment( func(d om.Deployment) error { - return ReconcileReplicaSetAC(ctx, d, rs.Spec.DbCommonSpec, lastRsConfig.ToMap(), rs.Name, replicaSet, caFilePath, internalClusterCertPath, &p, log) + return ReconcileReplicaSetAC(ctx, d, rs.Spec.DbCommonSpec, lastRsConfig.ToMap(), rs.Name, replicaSet, caFilePath, internalClusterCertPath, &p, statefulSetVersion, log) }, log, ) diff --git a/docker/mongodb-agent/Dockerfile b/docker/mongodb-agent/Dockerfile index 15458c4e4..6c62661e6 100644 --- a/docker/mongodb-agent/Dockerfile +++ b/docker/mongodb-agent/Dockerfile @@ -65,12 +65,15 @@ RUN microdnf install -y libssh libpsl libbrotli \ RUN microdnf install -y --disableplugin=subscription-manager --setopt=install_weak_deps=0 nss_wrapper # Copy-pasted from https://www.mongodb.com/docs/manual/tutorial/install-mongodb-enterprise-on-red-hat-tarball/ +# Temporarily skip cyrus-sasl due to systemd dependency issues in UBI9 RUN microdnf install -y --disableplugin=subscription-manager \ + krb5-libs openldap openssl xz-libs || \ + microdnf install -y --disableplugin=subscription-manager --nobest \ cyrus-sasl cyrus-sasl-gssapi cyrus-sasl-plain krb5-libs openldap openssl xz-libs -# Dependencies for the Agent -RUN microdnf install -y --disableplugin=subscription-manager --setopt=install_weak_deps=0 \ - net-snmp \ - net-snmp-agent-libs +# Dependencies for the Agent - skip net-snmp due to systemd dependency issues +# RUN microdnf install -y --disableplugin=subscription-manager --setopt=install_weak_deps=0 \ +# net-snmp \ +# net-snmp-agent-libs RUN microdnf install -y --disableplugin=subscription-manager \ hostname tar gzip procps jq \ && microdnf upgrade -y \ diff --git a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh index 7895222a5..9f2d60b58 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh @@ -115,13 +115,14 @@ download_agent() { esac script_log "Downloading Agent version: ${AGENT_VERSION}" - script_log "Downloading a Mongodb Agent from ${base_url:?}" curl_opts=( - 
"https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c81e93cc2aec0007640bad/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9772-1.linux_x86_64.tar.gz" + "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c96f4020b54e00079b0621/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9776-1.linux_x86_64.tar.gz" "--location" "--silent" "--retry" "3" "--fail" "-v" "--output" "automation-agent.tar.gz" ); + script_log "Downloading a Mongodb Agent via ${curl_opts[0]:?}" + if [ "${SSL_REQUIRE_VALID_MMS_CERTIFICATES-}" = "false" ]; then # If we are not expecting valid certs, `curl` should be run with `--insecure` option. diff --git a/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py b/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py index aaf0b20fe..d44173647 100644 --- a/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py +++ b/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py @@ -9,9 +9,7 @@ def rolling_restart_replica_set(namespace: str, custom_mdb_version: str) -> MongoDB: """Create a MongoDB replica set for rolling restart testing.""" resource = MongoDB.from_yaml( - find_fixture("replica-set-basic.yaml"), - namespace=namespace, - name="rolling-restart-test" + find_fixture("replica-set-basic.yaml"), namespace=namespace, name="rolling-restart-test" ) resource.set_version(ensure_ent_version(custom_mdb_version)) resource.set_architecture_annotation() @@ -27,16 +25,17 @@ def test_replica_set_ready(rolling_restart_replica_set: MongoDB): rolling_restart_replica_set.assert_reaches_phase(Phase.Running, timeout=400) -@mark.e2e_rolling_restart_poc +@mark.e2e_rolling_restart_poc def test_statefulset_has_ondelete_strategy(rolling_restart_replica_set: MongoDB, namespace: str): """Verify that StatefulSet uses OnDelete update strategy for agent coordination.""" from kubernetes import client - + appsv1 = client.AppsV1Api() sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) - - assert sts.spec.update_strategy.type == "OnDelete", \ - f"Expected OnDelete strategy, got {sts.spec.update_strategy.type}" + + assert ( + sts.spec.update_strategy.type == "OnDelete" + ), f"Expected OnDelete strategy, got {sts.spec.update_strategy.type}" @mark.e2e_rolling_restart_poc @@ -62,47 +61,97 @@ def test_pods_have_kubernetes_env_vars(rolling_restart_replica_set: MongoDB, nam assert env_vars["POD_NAMESPACE"].value_from.field_ref.field_path == "metadata.namespace" +@mark.e2e_rolling_restart_poc +def test_check_agent_detection(rolling_restart_replica_set: MongoDB, namespace: str): + """Check if agents detect StatefulSet changes and log them.""" + import time + + from kubernetes import client + + print(f"Checking agent detection in namespace: {namespace}") + + # Wait a bit for the deployment to be fully ready + time.sleep(30) + + # Get current pods and their logs + v1 = client.CoreV1Api() + pods = v1.list_namespaced_pod(namespace) + + print(f"Found {len(pods.items)} pods") + for pod in pods.items: + print(f"Pod: {pod.metadata.name}, Status: {pod.status.phase}") + if "rolling-restart-test" in pod.metadata.name: + print(f"Getting logs for agent pod: {pod.metadata.name}") + try: + logs = v1.read_namespaced_pod_log( + name=pod.metadata.name, namespace=namespace, container="mongodb-enterprise-database", tail_lines=100 + ) + print(f"Recent logs from {pod.metadata.name}:") + print(logs[-2000:]) # Last 2000 chars + 
except Exception as e: + print(f"Could not get logs for {pod.metadata.name}: {e}") + + # Now trigger a StatefulSet change and watch for agent response + appsv1 = client.AppsV1Api() + sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) + initial_revision = sts.status.update_revision + print(f"Initial StatefulSet revision: {initial_revision}") + + # Don't cleanup automatically - let's examine manually + assert True + + @mark.e2e_rolling_restart_poc def test_trigger_rolling_restart(rolling_restart_replica_set: MongoDB, namespace: str): """Test triggering rolling restart by changing StatefulSet spec.""" from kubernetes import client - + appsv1 = client.AppsV1Api() - + # Get initial StatefulSet revision sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) initial_revision = sts.status.update_revision print(f"Initial StatefulSet revision: {initial_revision}") - # Trigger rolling restart by changing pod template (which changes StatefulSet spec) - # This simulates infrastructure changes that require pod restarts - rolling_restart_replica_set.load() - rolling_restart_replica_set["spec"]["podSpec"] = { - "podTemplate": { - "metadata": { - "annotations": { - "rolling-restart-test/restart-trigger": "test-change-1" + # Trigger rolling restart by directly patching the StatefulSet with an environment variable + # This simulates infrastructure changes like image updates or security context changes + import time + + patch_body = { + "spec": { + "template": { + "spec": { + "containers": [ + { + "name": "mongodb-enterprise-database", + "env": [{"name": "ROLLING_RESTART_TRIGGER", "value": str(int(time.time()))}], + } + ] } } } } - rolling_restart_replica_set.update() + + print("Triggering rolling restart by patching StatefulSet...") + appsv1.patch_namespaced_stateful_set(name="rolling-restart-test", namespace=namespace, body=patch_body) # Wait for StatefulSet to get updated with new revision import time + max_wait = 60 start_time = time.time() new_revision = initial_revision - + while new_revision == initial_revision and (time.time() - start_time) < max_wait: time.sleep(2) sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) new_revision = sts.status.update_revision print(f"Current StatefulSet revision: {new_revision}") - assert new_revision != initial_revision, \ - f"StatefulSet revision should change. Initial: {initial_revision}, Current: {new_revision}" - + assert ( + new_revision != initial_revision + ), f"StatefulSet revision should change. 
Initial: {initial_revision}, Current: {new_revision}" + print(f"StatefulSet revision updated from {initial_revision} to {new_revision}") # Wait for the rolling restart coordination to complete and reach running state @@ -112,9 +161,10 @@ def test_trigger_rolling_restart(rolling_restart_replica_set: MongoDB, namespace @mark.e2e_rolling_restart_poc def test_all_pods_restarted_with_new_revision(rolling_restart_replica_set: MongoDB, namespace: str): """Verify all pods eventually restart and get the new StatefulSet revision.""" - from kubernetes import client import time - + + from kubernetes import client + appsv1 = client.AppsV1Api() corev1 = client.CoreV1Api() @@ -127,42 +177,43 @@ def test_all_pods_restarted_with_new_revision(rolling_restart_replica_set: Mongo # This tests that agent coordination allows all pods to restart max_wait = 600 # 10 minutes start_time = time.time() - + while (time.time() - start_time) < max_wait: all_updated = True - + for i in range(3): pod_name = f"rolling-restart-test-{i}" try: pod = corev1.read_namespaced_pod(pod_name, namespace) current_revision = pod.metadata.labels.get("controller-revision-hash", "") - + if current_revision != target_revision: all_updated = False print(f"Pod {pod_name} revision: {current_revision} (target: {target_revision})") else: print(f"Pod {pod_name} updated to target revision: {target_revision}") - + except client.rest.ApiException: all_updated = False print(f"Pod {pod_name} not ready yet") - + if all_updated: print("All pods successfully updated with agent coordination!") break - + time.sleep(10) - + # Final verification - all pods should have target revision and be ready for i in range(3): pod_name = f"rolling-restart-test-{i}" pod = corev1.read_namespaced_pod(pod_name, namespace) - + assert pod.status.phase == "Running", f"Pod {pod_name} should be running" - + current_revision = pod.metadata.labels.get("controller-revision-hash", "") - assert current_revision == target_revision, \ - f"Pod {pod_name} should have target revision {target_revision}, got {current_revision}" + assert ( + current_revision == target_revision + ), f"Pod {pod_name} should have target revision {target_revision}, got {current_revision}" # Verify container is ready if pod.status.container_statuses: @@ -181,7 +232,7 @@ def test_verify_agent_coordination_logs(rolling_restart_replica_set: MongoDB, na "DeleteMyPodKube", "needsUpdate=true", "Kubernetes upgrade", - "StatefulSet.*revision" + "StatefulSet.*revision", ] found_patterns = set() diff --git a/scripts/release/atomic_pipeline.py b/scripts/release/atomic_pipeline.py index b31486058..a9f0e5bb8 100755 --- a/scripts/release/atomic_pipeline.py +++ b/scripts/release/atomic_pipeline.py @@ -334,7 +334,14 @@ def build_agent(build_configuration: ImageBuildConfiguration): Build the agent only for the latest operator for patches and operator releases. 
""" - if build_configuration.all_agents: + if build_configuration.custom_agent_url: + if not build_configuration.version: + raise ValueError("When using custom_agent_url, version must be set (e.g., 'latest')") + agent_versions_to_build = [(build_configuration.version, "100.9.5")] + logger.info( + f"building custom agent from URL: {build_configuration.custom_agent_url} with version: {build_configuration.version}" + ) + elif build_configuration.all_agents: agent_versions_to_build = get_all_agents_for_rebuild() logger.info("building all agents") elif build_configuration.currently_used_agents: @@ -400,18 +407,45 @@ def build_agent_pipeline( f"======== Building agent pipeline for version {agent_version}, build configuration version: {build_configuration.version}" ) - platform_build_args = generate_agent_build_args( - platforms=available_platforms, agent_version=agent_version, tools_version=tools_version - ) + # Handle custom agent URL for testing + if build_configuration.custom_agent_url: + print(f"======== Using custom agent URL: {build_configuration.custom_agent_url}") + print(f"======== Using version from pipeline: {build_configuration_copy.version}") + + # Extract the base URL and filename from the custom URL + custom_agent_base_url = build_configuration.custom_agent_url.rsplit("/", 1)[0] + custom_agent_filename = build_configuration.custom_agent_url.rsplit("/", 1)[1] + + # Use the version provided by the pipeline (e.g., "latest") instead of extracting from URL + # This allows the pipeline_main.py to control the version tag used for the image + + # Generate tools build args first (we still need MongoDB tools) + tools_build_args = generate_tools_build_args(available_platforms, tools_version) + + # Use custom URL for agent, but keep standard tools + agent_build_args = { + "mongodb_agent_version_amd64": custom_agent_filename, + "mongodb_agent_version_arm64": custom_agent_filename, # Fallback to same file + "mongodb_agent_version_s390x": custom_agent_filename, # Fallback to same file + "mongodb_agent_version_ppc64le": custom_agent_filename, # Fallback to same file + } + + # Combine tools and agent build args + platform_build_args = {**tools_build_args, **agent_build_args} + agent_base_url = custom_agent_base_url + else: + platform_build_args = generate_agent_build_args( + platforms=available_platforms, agent_version=agent_version, tools_version=tools_version + ) + agent_base_url = ( + "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/automation-agent/prod" + ) - agent_base_url = ( - "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/automation-agent/prod" - ) tools_base_url = "https://fastdl.mongodb.org/tools/db" args = { - "version": agent_version, - "agent_version": agent_version, + "version": build_configuration_copy.version, + "agent_version": build_configuration_copy.version, "mongodb_agent_url": agent_base_url, "mongodb_tools_url": tools_base_url, **platform_build_args, # Add the platform-specific build args diff --git a/scripts/release/build/image_build_configuration.py b/scripts/release/build/image_build_configuration.py index b62718276..e547437f9 100644 --- a/scripts/release/build/image_build_configuration.py +++ b/scripts/release/build/image_build_configuration.py @@ -21,6 +21,7 @@ class ImageBuildConfiguration: sign: bool = False all_agents: bool = False currently_used_agents: bool = False + custom_agent_url: Optional[str] = None def is_release_scenario(self) -> bool: return self.scenario == BuildScenario.RELEASE diff 
--git a/scripts/release/pipeline_main.py b/scripts/release/pipeline_main.py index 010620284..eaee48064 100644 --- a/scripts/release/pipeline_main.py +++ b/scripts/release/pipeline_main.py @@ -137,6 +137,7 @@ def image_build_config_from_args(args) -> ImageBuildConfiguration: parallel_factor=args.parallel_factor, all_agents=args.all_agents, currently_used_agents=args.current_agents, + custom_agent_url=args.custom_agent_url, ) @@ -275,6 +276,13 @@ def main(): action="store_true", help="Build all currently used agent images.", ) + parser.add_argument( + "--custom-agent-url", + metavar="", + action="store", + type=str, + help="Custom agent URL for testing (e.g., https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c96f4020b54e00079b0621/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9776-1.linux_x86_64.tar.gz)", + ) args = parser.parse_args() diff --git a/scripts/test_custom_agent.sh b/scripts/test_custom_agent.sh old mode 100644 new mode 100755 index 9f95318d7..44f446645 --- a/scripts/test_custom_agent.sh +++ b/scripts/test_custom_agent.sh @@ -4,22 +4,25 @@ set -euo pipefail # Test script for building custom agent image # This script demonstrates how to use the new --custom-agent-url parameter +# It leverages the pipeline_main.py by supplying a version (latest) which is used by atomic_pipeline.py -CUSTOM_AGENT_URL="https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c81e93cc2aec0007640bad/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9772-1.linux_x86_64.tar.gz" +CUSTOM_AGENT_URL="https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c96f4020b54e00079b0621/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9776-1.linux_x86_64.tar.gz" +VERSION="latest" echo "Testing custom agent build with URL: ${CUSTOM_AGENT_URL}" +echo "Using version: ${VERSION}" echo "" echo "This will build the MongoDB agent image using your custom agent version." -echo "The image will be tagged with the registry specified in build_info.json for the current build scenario." +echo "The image will be tagged with version '${VERSION}' and pushed to the registry specified in build_info.json." echo "" -# Use the existing pipeline with the new custom agent URL parameter -echo "Running: scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --custom-agent-url \"${CUSTOM_AGENT_URL}\"" +# Use the existing pipeline with version and custom agent URL parameters +echo "Running: scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --version \"${VERSION}\" --custom-agent-url \"${CUSTOM_AGENT_URL}\"" echo "" -# Execute the build -scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --custom-agent-url "${CUSTOM_AGENT_URL}" +# Execute the build (add --load to build locally without pushing) +scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --version "${VERSION}" --custom-agent-url "${CUSTOM_AGENT_URL}" echo "" echo "Custom agent build completed!" 
-echo "The image should now be available with your custom agent version 13.41.0.9772-1" +echo "The image should now be available with tag '${VERSION}' containing your custom agent version 13.41.0.9772-1" From a2ac6d8251b0753e810c03ccb945b34ac80629cf Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Wed, 17 Sep 2025 19:41:02 +0200 Subject: [PATCH 07/20] fix logging --- Makefile | 2 +- controllers/om/automation_config_test.go | 6 ++-- controllers/om/deployment.go | 8 ++--- controllers/om/omclient.go | 2 +- controllers/operator/common_controller.go | 30 +++++++------------ .../content/agent-launcher-lib.sh | 2 +- scripts/release/pipeline_main.py | 2 +- scripts/test_custom_agent.sh | 4 +-- 8 files changed, 23 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index a7950bd4f..c9db0dfe2 100644 --- a/Makefile +++ b/Makefile @@ -195,7 +195,7 @@ agent-image-slow: @ scripts/dev/run_python.sh scripts/release/pipeline_main.py --parallel-factor 1 agent agent-image-custom: - @ scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --version "latest" --custom-agent-url "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c96f4020b54e00079b0621/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9776-1.linux_x86_64.tar.gz" + @ scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --version "latest" --custom-agent-url "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68caf1b06da1570007e898b4/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9783-1.linux_x86_64.tar.gz" operator-image: @ scripts/dev/run_python.sh scripts/release/pipeline_main.py operator diff --git a/controllers/om/automation_config_test.go b/controllers/om/automation_config_test.go index fdc0d8e18..4c3ceeb6d 100644 --- a/controllers/om/automation_config_test.go +++ b/controllers/om/automation_config_test.go @@ -599,10 +599,10 @@ func TestAssigningListsReassignsInDeployment(t *testing.T) { func TestAutomationConfigEquality(t *testing.T) { deployment1 := NewDeployment() - deployment1.setReplicaSets([]ReplicaSet{NewReplicaSet("1", "5.0.0")}) + deployment1.SetReplicaSets([]ReplicaSet{NewReplicaSet("1", "5.0.0")}) deployment2 := NewDeployment() - deployment2.setReplicaSets([]ReplicaSet{NewReplicaSet("2", "5.0.0")}) + deployment2.SetReplicaSets([]ReplicaSet{NewReplicaSet("2", "5.0.0")}) authConfig := Auth{ Users: []*MongoDBUser{ @@ -1060,7 +1060,7 @@ func TestApplyInto(t *testing.T) { func changeTypes(deployment Deployment) error { rs := deployment.GetReplicaSets() - deployment.setReplicaSets(rs) + deployment.SetReplicaSets(rs) return nil } diff --git a/controllers/om/deployment.go b/controllers/om/deployment.go index fd7f7f776..c1b568258 100644 --- a/controllers/om/deployment.go +++ b/controllers/om/deployment.go @@ -85,7 +85,7 @@ func BuildDeploymentFromBytes(jsonBytes []byte) (Deployment, error) { func NewDeployment() Deployment { ans := Deployment{} ans.setProcesses(make([]Process, 0)) - ans.setReplicaSets(make([]ReplicaSet, 0)) + ans.SetReplicaSets(make([]ReplicaSet, 0)) ans.setShardedClusters(make([]ShardedCluster, 0)) ans.setMonitoringVersions(make([]interface{}, 0)) ans.setBackupVersions(make([]interface{}, 0)) @@ -405,7 +405,7 @@ func (d Deployment) RemoveReplicaSetByName(name string, log *zap.SugaredLogger) } } - d.setReplicaSets(toKeep) + d.SetReplicaSets(toKeep) members := rs.Members() processNames := make([]string, len(members)) @@ -992,12 +992,12 @@ func (d Deployment) GetReplicaSets() []ReplicaSet { } 
} -func (d Deployment) setReplicaSets(replicaSets []ReplicaSet) { +func (d Deployment) SetReplicaSets(replicaSets []ReplicaSet) { d["replicaSets"] = replicaSets } func (d Deployment) addReplicaSet(rs ReplicaSet) { - d.setReplicaSets(append(d.GetReplicaSets(), rs)) + d.SetReplicaSets(append(d.GetReplicaSets(), rs)) } func (d Deployment) getShardedClusters() []ShardedCluster { diff --git a/controllers/om/omclient.go b/controllers/om/omclient.go index fe5e6a058..cbc0c2875 100644 --- a/controllers/om/omclient.go +++ b/controllers/om/omclient.go @@ -355,7 +355,7 @@ func (oc *HTTPOmConnection) OpsManagerVersion() versionutil.OpsManagerVersion { // UpdateDeployment updates a given deployment to the new deployment object passed as parameter. func (oc *HTTPOmConnection) UpdateDeployment(deployment Deployment) ([]byte, error) { - return oc.put(fmt.Sprintf("/automation/conf/%s", oc.GroupID()), deployment) + return oc.put(fmt.Sprintf("/api/public/v1.0/groups/%s/automationConfig", oc.GroupID()), deployment) } // ReadDeployment returns a Deployment object for this group diff --git a/controllers/operator/common_controller.go b/controllers/operator/common_controller.go index ae5d4b219..a7ad6e3f9 100644 --- a/controllers/operator/common_controller.go +++ b/controllers/operator/common_controller.go @@ -980,29 +980,19 @@ func ReconcileReplicaSetAC(ctx context.Context, d om.Deployment, spec mdbv1.DbCo d.AddMonitoringAndBackup(log, spec.GetSecurity().IsTLSEnabled(), caFilePath) d.ConfigureTLS(spec.GetSecurity(), caFilePath) d.ConfigureInternalClusterAuthentication(rs.GetProcessNames(), spec.GetSecurity().GetInternalClusterAuthenticationMode(), internalClusterPath) - - // Set StatefulSet version in replica set settings for rolling restart coordination + // Set StatefulSet version in replica set member tags for rolling restart coordination if statefulSetVersion != "" { - replicaSets, exists := d["replicaSets"] - if exists { - replicaSetList := replicaSets.([]interface{}) - for _, rs := range replicaSetList { - replicaSet := rs.(map[string]interface{}) - - // Get or create settings - settings, exists := replicaSet["settings"] - if !exists { - settings = map[string]interface{}{} - replicaSet["settings"] = settings - } - - settingsMap := settings.(map[string]interface{}) - settingsMap["kube"] = map[string]interface{}{ - "statefulSetVersion": statefulSetVersion, + replicaSets := d.GetReplicaSets() + for _, replicaSet := range replicaSets { + for _, member := range replicaSet.Members() { + tags := member.Tags() + // Add StatefulSet version tag + tags["kubeStatefulSetVersion"] = statefulSetVersion + member["tags"] = tags } - } - log.Infof("Set StatefulSet version in replica set settings: %s", statefulSetVersion) } + d.SetReplicaSets(replicaSets) + log.Infof("Set StatefulSet version in replica set member tags: %s", statefulSetVersion) } // if we don't set up a prometheus connection, then we don't want to set up prometheus for instance because we do not support it yet. 
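
The operator-side change above only publishes the target revision: ReconcileReplicaSetAC stamps the StatefulSet's updateRevision onto every replica-set member as a kubeStatefulSetVersion tag. The agent-side half of the POC (the CheckRollingRestartKube / DeleteMyPodKube behaviour the e2e test greps for) is not part of this patch series. The sketch below shows, under this series' assumptions only, what that consumer could do with the pieces set up here: POD_NAME/POD_NAMESPACE from the downward API (patch 02), the OnDelete update strategy (patch 04), and the pod get/delete plus statefulset get RBAC (patches 01/08). It is written with kubectl for brevity; the real agent would talk to the API server directly, and STS_NAME is a placeholder.

# Hypothetical in-pod coordination check; not actual agent code.
target_revision="$(kubectl -n "${POD_NAMESPACE}" get statefulset "${STS_NAME}" \
  -o jsonpath='{.status.updateRevision}')"
current_revision="$(kubectl -n "${POD_NAMESPACE}" get pod "${POD_NAME}" \
  -o jsonpath="{.metadata.labels['controller-revision-hash']}")"

if [ "${current_revision}" != "${target_revision}" ]; then
  # Wait here until the replica set can tolerate losing this member
  # (the coordination the kubeStatefulSetVersion tag is meant to drive),
  # then delete our own pod; with the OnDelete strategy the StatefulSet
  # controller recreates it on the new revision.
  kubectl -n "${POD_NAMESPACE}" delete pod "${POD_NAME}"
fi
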
diff --git a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh index 9f2d60b58..ceac92988 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh @@ -116,7 +116,7 @@ download_agent() { script_log "Downloading Agent version: ${AGENT_VERSION}" curl_opts=( - "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c96f4020b54e00079b0621/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9776-1.linux_x86_64.tar.gz" + "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68caf1b06da1570007e898b4/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9783-1.linux_x86_64.tar.gz" "--location" "--silent" "--retry" "3" "--fail" "-v" "--output" "automation-agent.tar.gz" diff --git a/scripts/release/pipeline_main.py b/scripts/release/pipeline_main.py index eaee48064..acf2a8b67 100644 --- a/scripts/release/pipeline_main.py +++ b/scripts/release/pipeline_main.py @@ -281,7 +281,7 @@ def main(): metavar="", action="store", type=str, - help="Custom agent URL for testing (e.g., https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c96f4020b54e00079b0621/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9776-1.linux_x86_64.tar.gz)", + help="Custom agent URL for testing (e.g., https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68caf1b06da1570007e898b4/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9783-1.linux_x86_64.tar.gz)", ) args = parser.parse_args() diff --git a/scripts/test_custom_agent.sh b/scripts/test_custom_agent.sh index 44f446645..974bdf494 100755 --- a/scripts/test_custom_agent.sh +++ b/scripts/test_custom_agent.sh @@ -6,7 +6,7 @@ set -euo pipefail # This script demonstrates how to use the new --custom-agent-url parameter # It leverages the pipeline_main.py by supplying a version (latest) which is used by atomic_pipeline.py -CUSTOM_AGENT_URL="https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68c96f4020b54e00079b0621/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9776-1.linux_x86_64.tar.gz" +CUSTOM_AGENT_URL="https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68caf1b06da1570007e898b4/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9783-1.linux_x86_64.tar.gz" VERSION="latest" echo "Testing custom agent build with URL: ${CUSTOM_AGENT_URL}" @@ -25,4 +25,4 @@ scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --version "${VE echo "" echo "Custom agent build completed!" 
-echo "The image should now be available with tag '${VERSION}' containing your custom agent version 13.41.0.9772-1" +echo "The image should now be available with tag '${VERSION}' containing your custom agent version 13.41.0.9782-1" From f1cb71f6e58d71c36041335ab5c90ca1213d02cf Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Thu, 18 Sep 2025 10:05:58 +0200 Subject: [PATCH 08/20] add db pods rights --- helm_chart/templates/database-roles.yaml | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/helm_chart/templates/database-roles.yaml b/helm_chart/templates/database-roles.yaml index ba46fc035..95645af3f 100644 --- a/helm_chart/templates/database-roles.yaml +++ b/helm_chart/templates/database-roles.yaml @@ -67,6 +67,29 @@ rules: - delete - get +--- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ $.Values.operator.baseName }}-database-pods + {{ $namespaceBlock }} +rules: + - apiGroups: + - '' + resources: + - pods + verbs: + - get + - list + - delete + - apiGroups: + - apps + resources: + - statefulsets + verbs: + - get + - list + --- kind: RoleBinding apiVersion: rbac.authorization.k8s.io/v1 @@ -82,5 +105,20 @@ subjects: name: {{$.Values.operator.baseName}}-appdb {{ $namespaceBlock }} +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{$.Values.operator.baseName}}-database-pods + {{ $namespaceBlock }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{$.Values.operator.baseName}}-database-pods +subjects: + - kind: ServiceAccount + name: {{$.Values.operator.baseName}}-database-pods + {{ $namespaceBlock }} + {{- end }} {{- end }}{{/* if .Values.operator.createResourcesServiceAccountsAndRoles */}} From 33d1255d88c12a611ae7e852837b0fd19bc625a2 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Thu, 18 Sep 2025 10:28:25 +0200 Subject: [PATCH 09/20] add custom agent supply supporrt --- .../construct/database_construction.go | 6 ++ .../content/agent-launcher-lib.sh | 73 +++++++++++-------- .../content/agent-launcher.sh | 4 +- pkg/util/constants.go | 1 + 4 files changed, 51 insertions(+), 33 deletions(-) diff --git a/controllers/operator/construct/database_construction.go b/controllers/operator/construct/database_construction.go index 0bc0023a1..fc6ba70bd 100644 --- a/controllers/operator/construct/database_construction.go +++ b/controllers/operator/construct/database_construction.go @@ -1045,6 +1045,12 @@ func databaseEnvVars(opts DatabaseStatefulSetOptions) []corev1.EnvVar { vars = append(vars, corev1.EnvVar{Name: util.EnvVarAgentVersion, Value: agentVersion}) } + // Support for custom agent URL + if customAgentURL := os.Getenv(util.EnvVarCustomAgentURL); customAgentURL != "" { // nolint:forbidigo + zap.S().Debugf("using a custom agent URL: %s", customAgentURL) + vars = append(vars, corev1.EnvVar{Name: util.EnvVarCustomAgentURL, Value: customAgentURL}) + } + // append any additional env vars specified. vars = append(vars, opts.ExtraEnvs...) 
diff --git a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh index ceac92988..c7e4b15aa 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh @@ -91,37 +91,48 @@ download_agent() { AGENT_VERSION="${MDB_AGENT_VERSION}" fi - # Detect architecture for agent download - local detected_arch - detected_arch=$(uname -m) - - case "${detected_arch}" in - x86_64) - AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.linux_x86_64.tar.gz" - ;; - aarch64|arm64) - AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.amzn2_aarch64.tar.gz" - ;; - ppc64le) - AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.rhel8_ppc64le.tar.gz" - ;; - s390x) - AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.rhel7_s390x.tar.gz" - ;; - *) - script_log "Error: Unsupported architecture for MongoDB agent: ${detected_arch}" - exit 1 - ;; - esac - - script_log "Downloading Agent version: ${AGENT_VERSION}" - curl_opts=( - "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68caf1b06da1570007e898b4/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9783-1.linux_x86_64.tar.gz" - - "--location" "--silent" "--retry" "3" "--fail" "-v" - "--output" "automation-agent.tar.gz" - ); - script_log "Downloading a Mongodb Agent via ${curl_opts[0]:?}" + # Check if custom agent URL is provided + if [[ -n "${MDB_CUSTOM_AGENT_URL-}" ]]; then + script_log "Using custom agent URL: ${MDB_CUSTOM_AGENT_URL}" + curl_opts=( + "${MDB_CUSTOM_AGENT_URL}" + "--location" "--silent" "--retry" "3" "--fail" "-v" + "--output" "automation-agent.tar.gz" + ); + script_log "Downloading a Mongodb Agent via ${curl_opts[0]:?}" + else + # Detect architecture for agent download + local detected_arch + detected_arch=$(uname -m) + + case "${detected_arch}" in + x86_64) + AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.linux_x86_64.tar.gz" + ;; + aarch64|arm64) + AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.amzn2_aarch64.tar.gz" + ;; + ppc64le) + AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.rhel8_ppc64le.tar.gz" + ;; + s390x) + AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.rhel7_s390x.tar.gz" + ;; + *) + script_log "Error: Unsupported architecture for MongoDB agent: ${detected_arch}" + exit 1 + ;; + esac + + script_log "Downloading Agent version: ${AGENT_VERSION}" + curl_opts=( + "${base_url}/download/agent/automation/${AGENT_FILE}" + + "--location" "--silent" "--retry" "3" "--fail" "-v" + "--output" "automation-agent.tar.gz" + ); + script_log "Downloading a Mongodb Agent via ${curl_opts[0]:?}" + fi if [ "${SSL_REQUIRE_VALID_MMS_CERTIFICATES-}" = "false" ]; then diff --git a/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh index fc366f6ad..a548c5cab 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh @@ -95,9 +95,9 @@ base_url="${base_url%/}" # Remove any accidentally defined trailing slashes declare -r base_url if [ -z "${MDB_STATIC_CONTAINERS_ARCHITECTURE}" ]; then - # Download the Automation Agent from Ops Manager + # Download the Automation Agent from Ops Manager or custom URL # Note, that it will be skipped if the agent is supposed to be run in headless mode - if [[ -n 
"${base_url}" ]]; then + if [[ -n "${base_url}" ]] || [[ -n "${MDB_CUSTOM_AGENT_URL-}" ]]; then download_agent fi fi diff --git a/pkg/util/constants.go b/pkg/util/constants.go index 443ae925c..d0a94860d 100644 --- a/pkg/util/constants.go +++ b/pkg/util/constants.go @@ -72,6 +72,7 @@ const ( // EnvVarDebug is used to decide whether we want to start the agent in debug mode EnvVarDebug = "MDB_AGENT_DEBUG" EnvVarAgentVersion = "MDB_AGENT_VERSION" + EnvVarCustomAgentURL = "MDB_CUSTOM_AGENT_URL" EnvVarMultiClusterMode = "MULTI_CLUSTER_MODE" // EnvVarSSLRequireValidMMSCertificates bla bla From ed31ed7737f4159105f88a6818561e7ea2126e81 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Thu, 18 Sep 2025 10:29:30 +0200 Subject: [PATCH 10/20] add custom agent supply supporrt --- .../content/agent-launcher-lib.sh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh index c7e4b15aa..9a8aa344b 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh @@ -85,12 +85,6 @@ download_agent() { pushd /tmp >/dev/null || true - if [[ -z "${MDB_AGENT_VERSION-}" ]]; then - AGENT_VERSION="latest" - else - AGENT_VERSION="${MDB_AGENT_VERSION}" - fi - # Check if custom agent URL is provided if [[ -n "${MDB_CUSTOM_AGENT_URL-}" ]]; then script_log "Using custom agent URL: ${MDB_CUSTOM_AGENT_URL}" @@ -101,6 +95,8 @@ download_agent() { ); script_log "Downloading a Mongodb Agent via ${curl_opts[0]:?}" else + AGENT_VERSION="${MDB_AGENT_VERSION:-latest}" + # Detect architecture for agent download local detected_arch detected_arch=$(uname -m) From 4a8b251b5f156a57a3295e77bd1b45c459fa3343 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Thu, 18 Sep 2025 14:16:33 +0200 Subject: [PATCH 11/20] sidecar --- build_info.json | 28 ++ .../construct/database_construction.go | 88 ++++++ .../operator/database_statefulset_options.go | 7 + .../operator/mongodbreplicaset_controller.go | 1 + .../Dockerfile | 19 ++ .../replicaset/test_rolling_restart_poc.py | 82 ++++-- .../cmd/agent-sidecar/main.go | 257 ++++++++++++++++++ scripts/release/atomic_pipeline.py | 10 + scripts/release/build/build_info.py | 1 + scripts/release/pipeline_main.py | 3 + 10 files changed, 468 insertions(+), 28 deletions(-) create mode 100644 docker/mongodb-kubernetes-agent-sidecar/Dockerfile create mode 100644 mongodb-community-operator/cmd/agent-sidecar/main.go diff --git a/build_info.json b/build_info.json index f7f903f7e..435ac7d26 100644 --- a/build_info.json +++ b/build_info.json @@ -253,6 +253,34 @@ ] } }, + "agent-sidecar": { + "dockerfile-path": "docker/mongodb-kubernetes-agent-sidecar/Dockerfile", + "patch": { + "repositories": ["268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/nnguyen-kops/agent-sidecar"], + "platforms": [ + "linux/amd64" + ] + }, + "staging": { + "sign": true, + "latest-tag": true, + "repositories": ["268558157000.dkr.ecr.us-east-1.amazonaws.com/staging/nnguyen-kops/agent-sidecar"], + "platforms": [ + "linux/arm64", + "linux/amd64" + ] + }, + "release": { + "version": "1.0.0", + "sign": true, + "olm-tag": true, + "repositories": ["quay.io/mongodb/mongodb-kubernetes-agent-sidecar"], + "platforms": [ + "linux/arm64", + "linux/amd64" + ] + } + }, "agent": { "dockerfile-path": "docker/mongodb-agent/Dockerfile", "patch": { diff --git a/controllers/operator/construct/database_construction.go 
b/controllers/operator/construct/database_construction.go index fc6ba70bd..00bd5bfcc 100644 --- a/controllers/operator/construct/database_construction.go +++ b/controllers/operator/construct/database_construction.go @@ -109,6 +109,7 @@ type DatabaseStatefulSetOptions struct { DatabaseNonStaticImage string MongodbImage string AgentImage string + AgentSidecarImage string Annotations map[string]string VaultConfig vault.VaultConfiguration @@ -734,6 +735,43 @@ func buildStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, mdb configureContainerSecurityContext, )} + // Agent sidecar container for health monitoring and pod deletion + agentSidecarModifications := []func(*corev1.Container){container.Apply( + container.WithName("agent-sidecar"), + container.WithImage(opts.AgentSidecarImage), + container.WithEnvs([]corev1.EnvVar{ + { + Name: "POD_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.name", + }, + }, + }, + { + Name: "POD_NAMESPACE", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.namespace", + }, + }, + }, + { + Name: "AGENT_STATUS_FILEPATH", + Value: "/var/log/mongodb-mms-automation/agent-health-status.json", + }, + }...), + container.WithVolumeMounts([]corev1.VolumeMount{ + { + Name: util.PvcNameData, + MountPath: "/var/log/mongodb-mms-automation", + SubPath: util.PvcNameLogs, + ReadOnly: true, + }, + }), + configureContainerSecurityContext, + )} + if opts.HostNameOverrideConfigmapName != "" { volumes = append(volumes, statefulset.CreateVolumeFromConfigMap(opts.HostNameOverrideConfigmapName, opts.HostNameOverrideConfigmapName)) hostnameOverrideModification := container.WithVolumeMounts([]corev1.VolumeMount{ @@ -745,6 +783,7 @@ func buildStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, mdb agentContainerModifications = append(agentContainerModifications, hostnameOverrideModification) mongodContainerModifications = append(mongodContainerModifications, hostnameOverrideModification) agentUtilitiesHolderModifications = append(agentUtilitiesHolderModifications, hostnameOverrideModification) + agentSidecarModifications = append(agentSidecarModifications, hostnameOverrideModification) } mods := []podtemplatespec.Modification{ @@ -756,6 +795,11 @@ func buildStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, mdb podtemplatespec.WithContainerByIndex(2, agentUtilitiesHolderModifications...), } + // Add agent sidecar container if image is provided + if opts.AgentSidecarImage != "" { + mods = append(mods, podtemplatespec.WithContainerByIndex(3, agentSidecarModifications...)) + } + return podtemplatespec.Apply(mods...) 
} @@ -785,6 +829,44 @@ func buildNonStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, container.WithEnvs(readinessEnvironmentVariablesToEnvVars(opts.AgentConfig.ReadinessProbe.EnvironmentVariables)...), )} + // Agent sidecar container for health monitoring and pod deletion (non-static architecture) + _, configureContainerSecurityContext := podtemplatespec.WithDefaultSecurityContextsModifications() + agentSidecarModifications := []func(*corev1.Container){container.Apply( + container.WithName("agent-sidecar"), + container.WithImage(opts.AgentSidecarImage), + container.WithEnvs([]corev1.EnvVar{ + { + Name: "POD_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.name", + }, + }, + }, + { + Name: "POD_NAMESPACE", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.namespace", + }, + }, + }, + { + Name: "AGENT_STATUS_FILEPATH", + Value: "/var/log/mongodb-mms-automation/agent-health-status.json", + }, + }...), + container.WithVolumeMounts([]corev1.VolumeMount{ + { + Name: util.PvcNameData, + MountPath: "/var/log/mongodb-mms-automation", + SubPath: util.PvcNameLogs, + ReadOnly: true, + }, + }), + configureContainerSecurityContext, + )} + if opts.HostNameOverrideConfigmapName != "" { volumes = append(volumes, statefulset.CreateVolumeFromConfigMap(opts.HostNameOverrideConfigmapName, opts.HostNameOverrideConfigmapName)) hostnameOverrideModification := container.WithVolumeMounts([]corev1.VolumeMount{ @@ -795,6 +877,7 @@ func buildNonStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, }) initContainerModifications = append(initContainerModifications, hostnameOverrideModification) databaseContainerModifications = append(databaseContainerModifications, hostnameOverrideModification) + agentSidecarModifications = append(agentSidecarModifications, hostnameOverrideModification) } mods := []podtemplatespec.Modification{ @@ -806,6 +889,11 @@ func buildNonStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, podtemplatespec.WithInitContainerByIndex(0, initContainerModifications...), } + // Add agent sidecar container if image is provided (non-static architecture) + if opts.AgentSidecarImage != "" { + mods = append(mods, podtemplatespec.WithContainerByIndex(1, agentSidecarModifications...)) + } + return podtemplatespec.Apply(mods...) } diff --git a/controllers/operator/database_statefulset_options.go b/controllers/operator/database_statefulset_options.go index d60bb1747..131bd334d 100644 --- a/controllers/operator/database_statefulset_options.go +++ b/controllers/operator/database_statefulset_options.go @@ -127,3 +127,10 @@ func WithAgentImage(image string) func(*construct.DatabaseStatefulSetOptions) { opts.AgentImage = image } } + +// WithAgentSidecarImage sets the AgentSidecarImage field. 
+func WithAgentSidecarImage(image string) func(*construct.DatabaseStatefulSetOptions) { + return func(opts *construct.DatabaseStatefulSetOptions) { + opts.AgentSidecarImage = image + } +} diff --git a/controllers/operator/mongodbreplicaset_controller.go b/controllers/operator/mongodbreplicaset_controller.go index 9872908c8..65222125d 100644 --- a/controllers/operator/mongodbreplicaset_controller.go +++ b/controllers/operator/mongodbreplicaset_controller.go @@ -214,6 +214,7 @@ func (r *ReconcileMongoDbReplicaSet) Reconcile(ctx context.Context, request reco WithDatabaseNonStaticImage(images.ContainerImage(r.imageUrls, util.NonStaticDatabaseEnterpriseImage, r.databaseNonStaticImageVersion)), WithAgentImage(images.ContainerImage(r.imageUrls, architectures.MdbAgentImageRepo, automationAgentVersion)), WithMongodbImage(images.GetOfficialImage(r.imageUrls, rs.Spec.Version, rs.GetAnnotations())), + WithAgentSidecarImage("268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/nnguyen-kops/agent-sidecar:latest"), ) caFilePath := fmt.Sprintf("%s/ca-pem", util.TLSCaMountPath) diff --git a/docker/mongodb-kubernetes-agent-sidecar/Dockerfile b/docker/mongodb-kubernetes-agent-sidecar/Dockerfile new file mode 100644 index 000000000..32692e5b8 --- /dev/null +++ b/docker/mongodb-kubernetes-agent-sidecar/Dockerfile @@ -0,0 +1,19 @@ +FROM --platform=$BUILDPLATFORM public.ecr.aws/docker/library/golang:1.24 AS builder + +WORKDIR /go/src/github.com/mongodb/mongodb-kubernetes/ + +COPY go.mod go.sum ./ + +RUN go mod download + +COPY mongodb-community-operator /go/src/github.com/mongodb/mongodb-kubernetes/mongodb-community-operator + +ARG TARGETARCH +ARG TARGETOS +RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -a -o /data/scripts/agent-sidecar ./mongodb-community-operator/cmd/agentsidecar/main.go + +FROM registry.access.redhat.com/ubi9/ubi-minimal + +COPY --from=builder /data/scripts/agent-sidecar /agent-sidecar + +ENTRYPOINT ["/agent-sidecar"] diff --git a/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py b/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py index d44173647..234da75f3 100644 --- a/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py +++ b/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py @@ -103,8 +103,9 @@ def test_check_agent_detection(rolling_restart_replica_set: MongoDB, namespace: @mark.e2e_rolling_restart_poc def test_trigger_rolling_restart(rolling_restart_replica_set: MongoDB, namespace: str): - """Test triggering rolling restart by changing StatefulSet spec.""" + """Test triggering rolling restart by changing MongoDB CRD to cause operator StatefulSet update.""" from kubernetes import client + import time appsv1 = client.AppsV1Api() @@ -113,46 +114,71 @@ def test_trigger_rolling_restart(rolling_restart_replica_set: MongoDB, namespace initial_revision = sts.status.update_revision print(f"Initial StatefulSet revision: {initial_revision}") - # Trigger rolling restart by directly patching the StatefulSet with an environment variable + # Trigger rolling restart by modifying the MongoDB Custom Resource + # This causes the operator to update the StatefulSet, simulating real operator-driven changes + print("Triggering rolling restart by modifying MongoDB CRD...") + + # Add or update an environment variable in the StatefulSet configuration # This simulates infrastructure changes like image updates or security context changes - import time + rolling_restart_trigger_value = 
str(int(time.time())) - patch_body = { - "spec": { - "template": { - "spec": { - "containers": [ - { - "name": "mongodb-enterprise-database", - "env": [{"name": "ROLLING_RESTART_TRIGGER", "value": str(int(time.time()))}], - } - ] - } - } - } - } - - print("Triggering rolling restart by patching StatefulSet...") - appsv1.patch_namespaced_stateful_set(name="rolling-restart-test", namespace=namespace, body=patch_body) - - # Wait for StatefulSet to get updated with new revision - import time + # Use the MongoDB resource's statefulSet configuration to trigger the change + current_spec = rolling_restart_replica_set["spec"] + + # Initialize statefulSet spec if it doesn't exist + if "statefulSet" not in current_spec: + current_spec["statefulSet"] = {"spec": {}} + + if "spec" not in current_spec["statefulSet"]: + current_spec["statefulSet"]["spec"] = {} + + if "template" not in current_spec["statefulSet"]["spec"]: + current_spec["statefulSet"]["spec"]["template"] = {"spec": {}} + + if "spec" not in current_spec["statefulSet"]["spec"]["template"]: + current_spec["statefulSet"]["spec"]["template"]["spec"] = {} + + if "containers" not in current_spec["statefulSet"]["spec"]["template"]["spec"]: + current_spec["statefulSet"]["spec"]["template"]["spec"]["containers"] = [{}] + + # Ensure we have a container entry + if len(current_spec["statefulSet"]["spec"]["template"]["spec"]["containers"]) == 0: + current_spec["statefulSet"]["spec"]["template"]["spec"]["containers"] = [{}] + + # Add/update environment variable in first container + container = current_spec["statefulSet"]["spec"]["template"]["spec"]["containers"][0] + if "env" not in container: + container["env"] = [] + + # Add the rolling restart trigger environment variable + container["env"] = [env for env in container.get("env", []) if env.get("name") != "ROLLING_RESTART_TRIGGER"] + container["env"].append({ + "name": "ROLLING_RESTART_TRIGGER", + "value": rolling_restart_trigger_value + }) + + print(f"Added ROLLING_RESTART_TRIGGER={rolling_restart_trigger_value} to MongoDB CRD") + + # Update the MongoDB resource - this will cause the operator to update the StatefulSet + rolling_restart_replica_set.update() - max_wait = 60 + # Wait for StatefulSet to get updated with new revision by the operator + max_wait = 120 # Give operator time to reconcile start_time = time.time() new_revision = initial_revision + print("Waiting for operator to update StatefulSet...") while new_revision == initial_revision and (time.time() - start_time) < max_wait: - time.sleep(2) + time.sleep(5) sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) new_revision = sts.status.update_revision - print(f"Current StatefulSet revision: {new_revision}") + print(f"Current StatefulSet revision: {new_revision} (waiting for change from {initial_revision})") assert ( new_revision != initial_revision - ), f"StatefulSet revision should change. Initial: {initial_revision}, Current: {new_revision}" + ), f"StatefulSet revision should change after operator reconcile. 
Initial: {initial_revision}, Current: {new_revision}" - print(f"StatefulSet revision updated from {initial_revision} to {new_revision}") + print(f"Operator updated StatefulSet revision from {initial_revision} to {new_revision}") # Wait for the rolling restart coordination to complete and reach running state rolling_restart_replica_set.assert_reaches_phase(Phase.Running, timeout=600) diff --git a/mongodb-community-operator/cmd/agent-sidecar/main.go b/mongodb-community-operator/cmd/agent-sidecar/main.go new file mode 100644 index 000000000..db1df61ff --- /dev/null +++ b/mongodb-community-operator/cmd/agent-sidecar/main.go @@ -0,0 +1,257 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "io" + "os" + "strings" + "time" + + "go.uber.org/zap" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/mongodb/mongodb-kubernetes/mongodb-community-operator/pkg/agent" +) + +const ( + agentStatusFilePathEnv = "AGENT_STATUS_FILEPATH" + + defaultNamespace = "default" + + pollingInterval time.Duration = 1 * time.Second + pollingDuration time.Duration = 60 * time.Second +) + +func main() { + ctx := context.Background() + logger := setupLogger() + + logger.Info("Running agent sidecar for rolling restart coordination") + + if statusPath := os.Getenv(agentStatusFilePathEnv); statusPath == "" { + logger.Fatalf(`Required environment variable "%s" not set`, agentStatusFilePathEnv) + return + } + + // Continuously monitor agent health status for WaitDeleteMyPodKube step + for { + logger.Info("Checking agent health status for WaitDeleteMyPodKube step...") + health, err := waitForAgentHealthStatus() + if err != nil { + // If the pod has just restarted then the status file will not exist. + // In that case we continue monitoring + if os.IsNotExist(err) { + logger.Info("Agent health status file not found, continuing to monitor...") + } else { + logger.Errorf("Error getting the agent health file: %s", err) + } + + // Wait before trying again + time.Sleep(5 * time.Second) + continue + } + + shouldDelete, err := shouldDeletePod(health) + if err != nil { + logger.Errorf("Error checking if pod should be deleted: %s", err) + time.Sleep(5 * time.Second) + continue + } + + if shouldDelete { + logger.Infof("Pod should be deleted - WaitDeleteMyPodKube step detected") + if err := deletePod(ctx); err != nil { + // We should not raise an error if the Pod could not be deleted. It can have even + // worse consequences: Pod being restarted with the same version, and the agent + // killing it immediately after. + logger.Errorf("Could not manually trigger restart of this Pod because of: %s", err) + logger.Errorf("Make sure the Pod is restarted in order for the upgrade process to continue") + } else { + logger.Info("Successfully deleted pod - waiting for restart...") + } + + // If the Pod needs to be killed, we'll wait until the Pod + // is killed by Kubernetes, bringing the new container image + // into play. + quit := make(chan struct{}) + logger.Info("Pod killed itself, waiting...") + <-quit + } else { + // Continue monitoring + time.Sleep(2 * time.Second) + } + } +} + +func setupLogger() *zap.SugaredLogger { + log, err := zap.NewDevelopment() + if err != nil { + zap.S().Errorf("Error building logger config: %s", err) + os.Exit(1) + } + + return log.Sugar() +} + +// waitForAgentHealthStatus will poll the health status file and wait for it to be updated. 
+// The agent doesn't write the plan to the file right away and hence we need to wait for the +// latest plan to be written. +func waitForAgentHealthStatus() (agent.Health, error) { + ticker := time.NewTicker(pollingInterval) + defer ticker.Stop() + + totalTime := time.Duration(0) + for range ticker.C { + if totalTime > pollingDuration { + break + } + totalTime += pollingInterval + + health, err := getAgentHealthStatus() + if err != nil { + return agent.Health{}, err + } + + status, ok := health.Healthiness[getHostname()] + if !ok { + return agent.Health{}, fmt.Errorf("couldn't find status for hostname %s", getHostname()) + } + + // We determine if the file has been updated by checking if the process is not in goal state. + // As the agent is currently executing a plan, the process should not be in goal state. + if !status.IsInGoalState { + return health, nil + } + } + return agent.Health{}, fmt.Errorf("agent health status not ready after waiting %s", pollingDuration.String()) +} + +// getAgentHealthStatus returns an instance of agent.Health read +// from the health file on disk +func getAgentHealthStatus() (agent.Health, error) { + f, err := os.Open(os.Getenv(agentStatusFilePathEnv)) + if err != nil { + return agent.Health{}, err + } + defer func() { + if closeErr := f.Close(); closeErr != nil { + zap.S().Warnf("Failed to close agent health file: %v", closeErr) + } + }() + + h, err := readAgentHealthStatus(f) + if err != nil { + return agent.Health{}, fmt.Errorf("could not read health status file: %s", err) + } + return h, err +} + +// readAgentHealthStatus reads an instance of health.Health from the provided +// io.Reader +func readAgentHealthStatus(reader io.Reader) (agent.Health, error) { + var h agent.Health + data, err := io.ReadAll(reader) + if err != nil { + return h, err + } + err = json.Unmarshal(data, &h) + return h, err +} + +func getHostname() string { + return os.Getenv("HOSTNAME") +} + +// shouldDeletePod returns a boolean value indicating if this pod should be deleted +// this would be the case if the agent is currently trying to execute WaitDeleteMyPodKube step +func shouldDeletePod(health agent.Health) (bool, error) { + status, ok := health.ProcessPlans[getHostname()] + if !ok { + return false, fmt.Errorf("hostname %s was not in the process plans", getHostname()) + } + return isWaitingToBeDeleted(status), nil +} + +// isWaitingToBeDeleted determines if the agent is currently waiting +// for the pod to be deleted by external sidecar. 
We check if the most recent step +// is "WaitDeleteMyPodKube" +func isWaitingToBeDeleted(healthStatus agent.MmsDirectorStatus) bool { + if len(healthStatus.Plans) == 0 { + return false + } + lastPlan := healthStatus.Plans[len(healthStatus.Plans)-1] + for _, m := range lastPlan.Moves { + // Check if the current step is WaitDeleteMyPodKube + if m.Move == "WaitDeleteMyPodKube" { + return true + } + } + return false +} + +// deletePod attempts to delete the pod this mongod is running in +func deletePod(ctx context.Context) error { + thisPod, err := getThisPod() + if err != nil { + return fmt.Errorf("could not get pod: %s", err) + } + k8sClient, err := inClusterClient() + if err != nil { + return fmt.Errorf("could not get client: %s", err) + } + + if err := k8sClient.Delete(ctx, &thisPod); err != nil { + return fmt.Errorf("could not delete pod: %s", err) + } + return nil +} + +// getThisPod returns an instance of corev1.Pod that points to the current pod +func getThisPod() (corev1.Pod, error) { + podName := getHostname() + if podName == "" { + return corev1.Pod{}, fmt.Errorf("environment variable HOSTNAME was not present") + } + + ns, err := getNamespace() + if err != nil { + return corev1.Pod{}, fmt.Errorf("could not read namespace: %s", err) + } + + return corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: ns, + }, + }, nil +} + +func inClusterClient() (client.Client, error) { + config, err := rest.InClusterConfig() + if err != nil { + return nil, fmt.Errorf("could not get cluster config: %s", err) + } + + k8sClient, err := client.New(config, client.Options{}) + if err != nil { + return nil, fmt.Errorf("could not create client: %s", err) + } + return k8sClient, nil +} + +func getNamespace() (string, error) { + data, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace") + if err != nil { + return "", err + } + if ns := strings.TrimSpace(string(data)); len(ns) > 0 { + return ns, nil + } + return defaultNamespace, nil +} diff --git a/scripts/release/atomic_pipeline.py b/scripts/release/atomic_pipeline.py index a9f0e5bb8..a8f885125 100755 --- a/scripts/release/atomic_pipeline.py +++ b/scripts/release/atomic_pipeline.py @@ -329,6 +329,16 @@ def build_upgrade_hook_image(build_configuration: ImageBuildConfiguration): ) +def build_agent_sidecar_image(build_configuration: ImageBuildConfiguration): + """ + Builds image used for agent health monitoring sidecar. + """ + + build_image( + build_configuration=build_configuration, + ) + + def build_agent(build_configuration: ImageBuildConfiguration): """ Build the agent only for the latest operator for patches and operator releases. 
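
The WaitDeleteMyPodKube check in the new sidecar lends itself to a small unit test in the same package; a sketch, with the struct fields matching the agent.Health types that the main_test.go added later in this series also uses (assertion messages are illustrative):

    package main

    import (
    	"testing"

    	"github.com/mongodb/mongodb-kubernetes/mongodb-community-operator/pkg/agent"
    )

    func TestIsWaitingToBeDeleted(t *testing.T) {
    	status := agent.MmsDirectorStatus{
    		Name: "test-pod-0",
    		Plans: []*agent.PlanStatus{
    			{
    				Moves: []*agent.MoveStatus{
    					{Move: "WaitDeleteMyPodKube"},
    				},
    			},
    		},
    	}
    	if !isWaitingToBeDeleted(status) {
    		t.Error("a WaitDeleteMyPodKube move in the latest plan should mark the pod for deletion")
    	}
    	if isWaitingToBeDeleted(agent.MmsDirectorStatus{}) {
    		t.Error("an empty plan list must not trigger deletion")
    	}
    }
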
diff --git a/scripts/release/build/build_info.py b/scripts/release/build/build_info.py index 227cafa28..d668fe271 100644 --- a/scripts/release/build/build_info.py +++ b/scripts/release/build/build_info.py @@ -17,6 +17,7 @@ MCO_TESTS_IMAGE = "mco-tests" READINESS_PROBE_IMAGE = "readiness-probe" UPGRADE_HOOK_IMAGE = "upgrade-hook" +AGENT_SIDECAR_IMAGE = "agent-sidecar" DATABASE_IMAGE = "database" AGENT_IMAGE = "agent" INIT_APPDB_IMAGE = "init-appdb" diff --git a/scripts/release/pipeline_main.py b/scripts/release/pipeline_main.py index acf2a8b67..d5636d57a 100644 --- a/scripts/release/pipeline_main.py +++ b/scripts/release/pipeline_main.py @@ -18,6 +18,7 @@ from lib.base_logger import logger from scripts.release.atomic_pipeline import ( build_agent, + build_agent_sidecar_image, build_database_image, build_init_appdb_image, build_init_database_image, @@ -31,6 +32,7 @@ ) from scripts.release.build.build_info import ( AGENT_IMAGE, + AGENT_SIDECAR_IMAGE, DATABASE_IMAGE, INIT_APPDB_IMAGE, INIT_DATABASE_IMAGE, @@ -73,6 +75,7 @@ def get_builder_function_for_image_name() -> Dict[str, Callable]: MCO_TESTS_IMAGE: build_mco_tests_image, READINESS_PROBE_IMAGE: build_readiness_probe_image, UPGRADE_HOOK_IMAGE: build_upgrade_hook_image, + AGENT_SIDECAR_IMAGE: build_agent_sidecar_image, DATABASE_IMAGE: build_database_image, AGENT_IMAGE: build_agent, # Init images From 1523041aaf913e45c15a5e5fe9ade40993d422d0 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Thu, 18 Sep 2025 14:46:21 +0200 Subject: [PATCH 12/20] sidecar --- .../Dockerfile | 2 +- .../cmd/agent-sidecar/main.go | 69 +++++++++++++++++-- 2 files changed, 65 insertions(+), 6 deletions(-) diff --git a/docker/mongodb-kubernetes-agent-sidecar/Dockerfile b/docker/mongodb-kubernetes-agent-sidecar/Dockerfile index 32692e5b8..93f973640 100644 --- a/docker/mongodb-kubernetes-agent-sidecar/Dockerfile +++ b/docker/mongodb-kubernetes-agent-sidecar/Dockerfile @@ -10,7 +10,7 @@ COPY mongodb-community-operator /go/src/github.com/mongodb/mongodb-kubernetes/mo ARG TARGETARCH ARG TARGETOS -RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -a -o /data/scripts/agent-sidecar ./mongodb-community-operator/cmd/agentsidecar/main.go +RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -a -o /data/scripts/agent-sidecar ./mongodb-community-operator/cmd/agent-sidecar/main.go FROM registry.access.redhat.com/ubi9/ubi-minimal diff --git a/mongodb-community-operator/cmd/agent-sidecar/main.go b/mongodb-community-operator/cmd/agent-sidecar/main.go index db1df61ff..9c47e1042 100644 --- a/mongodb-community-operator/cmd/agent-sidecar/main.go +++ b/mongodb-community-operator/cmd/agent-sidecar/main.go @@ -9,6 +9,7 @@ import ( "strings" "time" + "github.com/google/go-cmp/cmp" "go.uber.org/zap" "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" @@ -40,14 +41,15 @@ func main() { } // Continuously monitor agent health status for WaitDeleteMyPodKube step + var lastHealth agent.Health + var isFirstRead = true for { - logger.Info("Checking agent health status for WaitDeleteMyPodKube step...") - health, err := waitForAgentHealthStatus() + health, err := getAgentHealthStatus() if err != nil { // If the pod has just restarted then the status file will not exist. 
// In that case we continue monitoring if os.IsNotExist(err) { - logger.Info("Agent health status file not found, continuing to monitor...") + logger.Info("Agent health status file not found, monitoring...") } else { logger.Errorf("Error getting the agent health file: %s", err) } @@ -57,6 +59,16 @@ func main() { continue } + // Check if health status has changed + if isFirstRead { + logger.Infof("Agent health status initialized: %s", getHealthSummary(health)) + lastHealth = health + isFirstRead = false + } else if diff := cmp.Diff(lastHealth, health); diff != "" { + logger.Infof("Agent health status changed:\n%s", diff) + lastHealth = health + } + shouldDelete, err := shouldDeletePod(health) if err != nil { logger.Errorf("Error checking if pod should be deleted: %s", err) @@ -65,7 +77,7 @@ func main() { } if shouldDelete { - logger.Infof("Pod should be deleted - WaitDeleteMyPodKube step detected") + logger.Infof("🚨 WaitDeleteMyPodKube step detected - deleting pod") if err := deletePod(ctx); err != nil { // We should not raise an error if the Pod could not be deleted. It can have even // worse consequences: Pod being restarted with the same version, and the agent @@ -73,7 +85,7 @@ func main() { logger.Errorf("Could not manually trigger restart of this Pod because of: %s", err) logger.Errorf("Make sure the Pod is restarted in order for the upgrade process to continue") } else { - logger.Info("Successfully deleted pod - waiting for restart...") + logger.Info("āœ… Successfully deleted pod - waiting for restart...") } // If the Pod needs to be killed, we'll wait until the Pod @@ -152,6 +164,8 @@ func getAgentHealthStatus() (agent.Health, error) { return h, err } + + // readAgentHealthStatus reads an instance of health.Health from the provided // io.Reader func readAgentHealthStatus(reader io.Reader) (agent.Health, error) { @@ -168,6 +182,51 @@ func getHostname() string { return os.Getenv("HOSTNAME") } + + +// getHealthSummary returns a concise summary of the current health status +func getHealthSummary(health agent.Health) string { + hostname := getHostname() + + // Check process health + status, hasStatus := health.Healthiness[hostname] + processStatus, hasProcessStatus := health.ProcessPlans[hostname] + + if !hasStatus { + return "no_health_data" + } + + goalState := "in_goal" + if !status.IsInGoalState { + goalState = "not_in_goal" + } + + if !hasProcessStatus || len(processStatus.Plans) == 0 { + return fmt.Sprintf("%s_no_plans", goalState) + } + + // Get the latest plan + latestPlan := processStatus.Plans[len(processStatus.Plans)-1] + if latestPlan.Completed != nil { + return fmt.Sprintf("%s_plan_completed", goalState) + } + + // Find current move + for _, move := range latestPlan.Moves { + if move.Move == "WaitDeleteMyPodKube" { + return "waiting_for_pod_deletion" + } + // Check if this move has incomplete steps + for _, step := range move.Steps { + if step.Completed == nil { + return fmt.Sprintf("%s_executing_%s", goalState, move.Move) + } + } + } + + return fmt.Sprintf("%s_plan_running", goalState) +} + // shouldDeletePod returns a boolean value indicating if this pod should be deleted // this would be the case if the agent is currently trying to execute WaitDeleteMyPodKube step func shouldDeletePod(health agent.Health) (bool, error) { From 46b60158e2704e8b69be7e9285a0fd53412e9000 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Thu, 18 Sep 2025 15:03:31 +0200 Subject: [PATCH 13/20] sidecar --- .../cmd/agent-sidecar/main.go | 5 +- .../cmd/agent-sidecar/main_test.go | 138 
++++++++++++++++++ 2 files changed, 141 insertions(+), 2 deletions(-) create mode 100644 mongodb-community-operator/cmd/agent-sidecar/main_test.go diff --git a/mongodb-community-operator/cmd/agent-sidecar/main.go b/mongodb-community-operator/cmd/agent-sidecar/main.go index 9c47e1042..ffca25bd9 100644 --- a/mongodb-community-operator/cmd/agent-sidecar/main.go +++ b/mongodb-community-operator/cmd/agent-sidecar/main.go @@ -10,6 +10,7 @@ import ( "time" "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" "go.uber.org/zap" "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" @@ -59,12 +60,12 @@ func main() { continue } - // Check if health status has changed + // Check if health status has changed (ignoring LastMongoUpTime) if isFirstRead { logger.Infof("Agent health status initialized: %s", getHealthSummary(health)) lastHealth = health isFirstRead = false - } else if diff := cmp.Diff(lastHealth, health); diff != "" { + } else if diff := cmp.Diff(lastHealth, health, cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime")); diff != "" { logger.Infof("Agent health status changed:\n%s", diff) lastHealth = health } diff --git a/mongodb-community-operator/cmd/agent-sidecar/main_test.go b/mongodb-community-operator/cmd/agent-sidecar/main_test.go new file mode 100644 index 000000000..86c6cec0b --- /dev/null +++ b/mongodb-community-operator/cmd/agent-sidecar/main_test.go @@ -0,0 +1,138 @@ +package main + +import ( + "testing" + "time" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + "github.com/mongodb/mongodb-kubernetes/mongodb-community-operator/pkg/agent" +) + +func TestLastMongoUpTimeIgnored(t *testing.T) { + // Create two health statuses that are identical except for LastMongoUpTime + startTime := time.Date(2025, 1, 1, 10, 0, 0, 0, time.UTC) + health1 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: startTime.Unix(), + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + }, + }, + }, + }, + } + + laterTime := time.Date(2025, 1, 1, 11, 0, 0, 0, time.UTC) + health2 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: laterTime.Unix(), // Different time + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + }, + }, + }, + }, + } + + // Test 1: Without ignoring LastMongoUpTime, should detect difference + diffWithoutIgnore := cmp.Diff(health1, health2) + if diffWithoutIgnore == "" { + t.Error("Expected difference when not ignoring LastMongoUpTime, but got no difference") + } + + // Test 2: With ignoring LastMongoUpTime, should detect no difference + diffWithIgnore := cmp.Diff(health1, health2, cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime")) + if diffWithIgnore != "" { + t.Errorf("Expected no difference when ignoring LastMongoUpTime, but got: %s", diffWithIgnore) + } + + // Test 3: With meaningful change (IsInGoalState), should still detect difference even when ignoring LastMongoUpTime + health3 := health2 + health3.Healthiness["test-pod-0"] = agent.ProcessHealth{ + IsInGoalState: false, // Changed from true to false + ExpectedToBeUp: true, + LastMongoUpTime: time.Date(2025, 1, 1, 12, 0, 0, 0, time.UTC).Unix(), // 
Different time + } + + diffMeaningfulChange := cmp.Diff(health1, health3, cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime")) + if diffMeaningfulChange == "" { + t.Error("Expected difference when IsInGoalState changes, even when ignoring LastMongoUpTime") + } + + t.Logf("Test passed: LastMongoUpTime is properly ignored while other changes are detected") +} + +func TestPlanChangesDetected(t *testing.T) { + // Create two health statuses with different plans + startTime := time.Date(2025, 1, 1, 10, 0, 0, 0, time.UTC) + health1 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: startTime.Unix(), + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + }, + }, + }, + }, + } + + laterTime := time.Date(2025, 1, 1, 11, 0, 0, 0, time.UTC) + health2 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: laterTime.Unix(), // Different time (should be ignored) + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + }, + { + Started: &laterTime, // New plan added + }, + }, + }, + }, + } + + // Should detect difference due to new plan, even though LastMongoUpTime is ignored + diff := cmp.Diff(health1, health2, cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime")) + if diff == "" { + t.Error("Expected difference when new plan is added, even when ignoring LastMongoUpTime") + } + + t.Logf("Test passed: Plan changes are properly detected while ignoring LastMongoUpTime") +} From 742c95662d162b39c085a178a81641d22bfb38c2 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Thu, 18 Sep 2025 15:15:38 +0200 Subject: [PATCH 14/20] sidecar --- .../cmd/agent-sidecar/main.go | 91 +++++- .../cmd/agent-sidecar/main_test.go | 258 +++++++++++++++++- 2 files changed, 339 insertions(+), 10 deletions(-) diff --git a/mongodb-community-operator/cmd/agent-sidecar/main.go b/mongodb-community-operator/cmd/agent-sidecar/main.go index ffca25bd9..9184ee654 100644 --- a/mongodb-community-operator/cmd/agent-sidecar/main.go +++ b/mongodb-community-operator/cmd/agent-sidecar/main.go @@ -30,6 +30,89 @@ const ( pollingDuration time.Duration = 60 * time.Second ) +// getCurrentStepInfo extracts current step and move information from health status +func getCurrentStepInfo(health agent.Health) string { + var info []string + + for processName, status := range health.ProcessPlans { + if len(status.Plans) == 0 { + continue + } + + // Get the most recent plan (last in the array) + latestPlan := status.Plans[len(status.Plans)-1] + + // Find the current move and step + for _, move := range latestPlan.Moves { + for _, step := range move.Steps { + // If step is not completed, it's the current step + if step.Completed == nil && step.Started != nil { + info = append(info, fmt.Sprintf("%s: %s -> %s", processName, move.Move, step.Step)) + } + } + } + } + + if len(info) > 0 { + return fmt.Sprintf(" [Current: %s]", strings.Join(info, ", ")) + } + return "" +} + +// conciseDiff creates a more concise diff output showing only changed lines with minimal context +func conciseDiff(old, new interface{}) string { + diff := cmp.Diff(old, new, + cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime"), + cmpopts.IgnoreFields(agent.PlanStatus{}, 
"Started", "Completed"), + cmpopts.IgnoreFields(agent.StepStatus{}, "Started", "Completed"), + ) + + if diff == "" { + return "" + } + + // Add current step info if available + currentStepInfo := "" + if newHealth, ok := new.(agent.Health); ok { + currentStepInfo = getCurrentStepInfo(newHealth) + } + + // Split diff into lines and filter to show only changed lines with exactly one line of context before/after + lines := strings.Split(diff, "\n") + var result []string + var addedLines = make(map[int]bool) // Track which lines we've already added + + for i, line := range lines { + // Include lines that show actual changes (+ or -) + if strings.HasPrefix(line, "+") || strings.HasPrefix(line, "-") { + // Add exactly one line before for context (if exists and not already added) + if i > 0 && !addedLines[i-1] { + result = append(result, lines[i-1]) + addedLines[i-1] = true + } + + // Add the changed line + if !addedLines[i] { + result = append(result, line) + addedLines[i] = true + } + + // Add exactly one line after for context (if exists and not already added) + if i < len(lines)-1 && !addedLines[i+1] { + result = append(result, lines[i+1]) + addedLines[i+1] = true + } + } + } + + diffOutput := strings.Join(result, "\n") + if currentStepInfo != "" { + diffOutput += currentStepInfo + } + + return diffOutput +} + func main() { ctx := context.Background() logger := setupLogger() @@ -65,9 +148,11 @@ func main() { logger.Infof("Agent health status initialized: %s", getHealthSummary(health)) lastHealth = health isFirstRead = false - } else if diff := cmp.Diff(lastHealth, health, cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime")); diff != "" { - logger.Infof("Agent health status changed:\n%s", diff) - lastHealth = health + } else { + if diff := conciseDiff(lastHealth, health); diff != "" { + logger.Infof("Agent health status changed:\n%s", diff) + lastHealth = health + } } shouldDelete, err := shouldDeletePod(health) diff --git a/mongodb-community-operator/cmd/agent-sidecar/main_test.go b/mongodb-community-operator/cmd/agent-sidecar/main_test.go index 86c6cec0b..5bd0dd668 100644 --- a/mongodb-community-operator/cmd/agent-sidecar/main_test.go +++ b/mongodb-community-operator/cmd/agent-sidecar/main_test.go @@ -1,6 +1,7 @@ package main import ( + "strings" "testing" "time" @@ -59,10 +60,14 @@ func TestLastMongoUpTimeIgnored(t *testing.T) { t.Error("Expected difference when not ignoring LastMongoUpTime, but got no difference") } - // Test 2: With ignoring LastMongoUpTime, should detect no difference - diffWithIgnore := cmp.Diff(health1, health2, cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime")) + // Test 2: With ignoring time fields, should detect no difference + diffWithIgnore := cmp.Diff(health1, health2, + cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime"), + cmpopts.IgnoreFields(agent.PlanStatus{}, "Started", "Completed"), + cmpopts.IgnoreFields(agent.StepStatus{}, "Started", "Completed"), + ) if diffWithIgnore != "" { - t.Errorf("Expected no difference when ignoring LastMongoUpTime, but got: %s", diffWithIgnore) + t.Errorf("Expected no difference when ignoring time fields, but got: %s", diffWithIgnore) } // Test 3: With meaningful change (IsInGoalState), should still detect difference even when ignoring LastMongoUpTime @@ -73,7 +78,11 @@ func TestLastMongoUpTimeIgnored(t *testing.T) { LastMongoUpTime: time.Date(2025, 1, 1, 12, 0, 0, 0, time.UTC).Unix(), // Different time } - diffMeaningfulChange := cmp.Diff(health1, health3, 
cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime")) + diffMeaningfulChange := cmp.Diff(health1, health3, + cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime"), + cmpopts.IgnoreFields(agent.PlanStatus{}, "Started", "Completed"), + cmpopts.IgnoreFields(agent.StepStatus{}, "Started", "Completed"), + ) if diffMeaningfulChange == "" { t.Error("Expected difference when IsInGoalState changes, even when ignoring LastMongoUpTime") } @@ -128,11 +137,246 @@ func TestPlanChangesDetected(t *testing.T) { }, } - // Should detect difference due to new plan, even though LastMongoUpTime is ignored - diff := cmp.Diff(health1, health2, cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime")) + // Should detect difference due to new plan, even though time fields are ignored + diff := cmp.Diff(health1, health2, + cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime"), + cmpopts.IgnoreFields(agent.PlanStatus{}, "Started", "Completed"), + cmpopts.IgnoreFields(agent.StepStatus{}, "Started", "Completed"), + ) if diff == "" { t.Error("Expected difference when new plan is added, even when ignoring LastMongoUpTime") } - t.Logf("Test passed: Plan changes are properly detected while ignoring LastMongoUpTime") + t.Logf("Test passed: Plan changes are properly detected while ignoring time fields") +} + +func TestAllTimeFieldsIgnored(t *testing.T) { + // Create two health statuses with different time fields but same meaningful content + startTime1 := time.Date(2025, 1, 1, 10, 0, 0, 0, time.UTC) + startTime2 := time.Date(2025, 1, 1, 11, 0, 0, 0, time.UTC) + completedTime1 := time.Date(2025, 1, 1, 10, 30, 0, 0, time.UTC) + completedTime2 := time.Date(2025, 1, 1, 11, 30, 0, 0, time.UTC) + + health1 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: startTime1.Unix(), + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime1, + Completed: &completedTime1, + Moves: []*agent.MoveStatus{ + { + Move: "TestMove", + Steps: []*agent.StepStatus{ + { + Step: "TestStep", + Started: &startTime1, + Completed: &completedTime1, + Result: "SUCCESS", + }, + }, + }, + }, + }, + }, + }, + }, + } + + health2 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: startTime2.Unix(), // Different time + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime2, // Different time + Completed: &completedTime2, // Different time + Moves: []*agent.MoveStatus{ + { + Move: "TestMove", + Steps: []*agent.StepStatus{ + { + Step: "TestStep", + Started: &startTime2, // Different time + Completed: &completedTime2, // Different time + Result: "SUCCESS", + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Should detect no difference when ignoring all time fields + diff := cmp.Diff(health1, health2, + cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime"), + cmpopts.IgnoreFields(agent.PlanStatus{}, "Started", "Completed"), + cmpopts.IgnoreFields(agent.StepStatus{}, "Started", "Completed"), + ) + if diff != "" { + t.Errorf("Expected no difference when ignoring all time fields, but got: %s", diff) + } + + t.Logf("Test passed: All time fields (LastMongoUpTime, Started, Completed) are properly ignored") +} + +func TestConciseDiff(t 
*testing.T) { + // Create realistic health statuses based on the test data structure + startTime := time.Date(2019, 9, 11, 14, 20, 40, 0, time.UTC) + completedTime := time.Date(2019, 9, 11, 14, 21, 42, 0, time.UTC) + + health1 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "bar": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: 1568222195, + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "bar": { + Name: "bar", + LastGoalStateClusterConfigVersion: 5, + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + Completed: &completedTime, + Moves: []*agent.MoveStatus{ + { + Move: "WaitRsInit", + Steps: []*agent.StepStatus{ + { + Step: "WaitRsInit", + Started: &startTime, + Completed: nil, // Currently running + Result: "wait", + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Simulate a step completion + stepCompletedTime := time.Date(2019, 9, 11, 14, 22, 0, 0, time.UTC) + health2 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "bar": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: 1568222200, // Different time (should be ignored) + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "bar": { + Name: "bar", + LastGoalStateClusterConfigVersion: 5, + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + Completed: &completedTime, + Moves: []*agent.MoveStatus{ + { + Move: "WaitRsInit", + Steps: []*agent.StepStatus{ + { + Step: "WaitRsInit", + Started: &startTime, + Completed: &stepCompletedTime, // Now completed + Result: "success", // Changed from "wait" to "success" + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Test concise diff + diff := conciseDiff(health1, health2) + if diff == "" { + t.Error("Expected concise diff to detect step result change") + } + + // Verify it contains the actual change + if !strings.Contains(diff, "Result") { + t.Error("Expected concise diff to contain 'Result' field change") + } + + t.Logf("Concise diff output (%d chars):\n%s", len(diff), diff) + t.Logf("Test passed: Concise diff produces shorter, focused output with current step info") +} + +func TestCurrentStepInfo(t *testing.T) { + // Create health status with a currently running step + startTime := time.Date(2019, 9, 11, 14, 20, 40, 0, time.UTC) + + health := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "bar": { + IsInGoalState: false, + ExpectedToBeUp: true, + LastMongoUpTime: 1568222195, + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "bar": { + Name: "bar", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + Moves: []*agent.MoveStatus{ + { + Move: "Start", + Steps: []*agent.StepStatus{ + { + Step: "StartFresh", + Started: &startTime, + Completed: nil, // Currently running + Result: "in_progress", + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Test current step info extraction + stepInfo := getCurrentStepInfo(health) + if stepInfo == "" { + t.Error("Expected getCurrentStepInfo to return current step information") + } + + if !strings.Contains(stepInfo, "Start") || !strings.Contains(stepInfo, "StartFresh") { + t.Errorf("Expected step info to contain move 'Start' and step 'StartFresh', got: %s", stepInfo) + } + + t.Logf("Current step info: %s", stepInfo) + t.Logf("Test passed: Current step info extraction works correctly") } From d8860072f89682c1a0e1754bdae2211f461fbb75 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Thu, 18 Sep 2025 16:24:56 +0200 Subject: [PATCH 15/20] sidecar --- .../construct/database_construction.go | 4 - 
.../replicaset/test_rolling_restart_poc.py | 52 ++---- .../cmd/agent-sidecar/main.go | 168 +++++++++++++++++- 3 files changed, 185 insertions(+), 39 deletions(-) diff --git a/controllers/operator/construct/database_construction.go b/controllers/operator/construct/database_construction.go index 00bd5bfcc..6879147c2 100644 --- a/controllers/operator/construct/database_construction.go +++ b/controllers/operator/construct/database_construction.go @@ -364,10 +364,6 @@ func DatabaseStatefulSetHelper(mdb databaseStatefulSetSource, stsOpts *DatabaseS } extraEnvs = append(extraEnvs, ReadDatabaseProxyVarsFromEnv()...) - extraEnvs = append(extraEnvs, corev1.EnvVar{ - Name: "nam", - Value: "nam", - }) stsOpts.ExtraEnvs = extraEnvs templateFunc := buildMongoDBPodTemplateSpec(*stsOpts, mdb) diff --git a/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py b/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py index 234da75f3..db8917a6d 100644 --- a/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py +++ b/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py @@ -122,40 +122,24 @@ def test_trigger_rolling_restart(rolling_restart_replica_set: MongoDB, namespace # This simulates infrastructure changes like image updates or security context changes rolling_restart_trigger_value = str(int(time.time())) - # Use the MongoDB resource's statefulSet configuration to trigger the change - current_spec = rolling_restart_replica_set["spec"] - - # Initialize statefulSet spec if it doesn't exist - if "statefulSet" not in current_spec: - current_spec["statefulSet"] = {"spec": {}} - - if "spec" not in current_spec["statefulSet"]: - current_spec["statefulSet"]["spec"] = {} - - if "template" not in current_spec["statefulSet"]["spec"]: - current_spec["statefulSet"]["spec"]["template"] = {"spec": {}} - - if "spec" not in current_spec["statefulSet"]["spec"]["template"]: - current_spec["statefulSet"]["spec"]["template"]["spec"] = {} - - if "containers" not in current_spec["statefulSet"]["spec"]["template"]["spec"]: - current_spec["statefulSet"]["spec"]["template"]["spec"]["containers"] = [{}] - - # Ensure we have a container entry - if len(current_spec["statefulSet"]["spec"]["template"]["spec"]["containers"]) == 0: - current_spec["statefulSet"]["spec"]["template"]["spec"]["containers"] = [{}] - - # Add/update environment variable in first container - container = current_spec["statefulSet"]["spec"]["template"]["spec"]["containers"][0] - if "env" not in container: - container["env"] = [] - - # Add the rolling restart trigger environment variable - container["env"] = [env for env in container.get("env", []) if env.get("name") != "ROLLING_RESTART_TRIGGER"] - container["env"].append({ - "name": "ROLLING_RESTART_TRIGGER", - "value": rolling_restart_trigger_value - }) + # Add podSpec with mongodb-enterprise-database container and env var to trigger rolling restart + rolling_restart_replica_set["spec"]["podSpec"] = { + "podTemplate": { + "spec": { + "containers": [ + { + "name": "mongodb-enterprise-database", + "env": [ + { + "name": "ROLLING_RESTART_TRIGGER", + "value": rolling_restart_trigger_value + } + ] + } + ] + } + } + } print(f"Added ROLLING_RESTART_TRIGGER={rolling_restart_trigger_value} to MongoDB CRD") diff --git a/mongodb-community-operator/cmd/agent-sidecar/main.go b/mongodb-community-operator/cmd/agent-sidecar/main.go index 9184ee654..c1bb1e77b 100644 --- a/mongodb-community-operator/cmd/agent-sidecar/main.go +++ 
b/mongodb-community-operator/cmd/agent-sidecar/main.go @@ -5,7 +5,9 @@ import ( "encoding/json" "fmt" "io" + "net/http" "os" + "regexp" "strings" "time" @@ -28,6 +30,10 @@ const ( pollingInterval time.Duration = 1 * time.Second pollingDuration time.Duration = 60 * time.Second + + // Agent log file paths (typically in /var/log/mongodb-mms-automation/) + agentLogPath = "/var/log/mongodb-mms-automation/automation-agent.log" + agentVerboseLogPath = "/var/log/mongodb-mms-automation/automation-agent-verbose.log" ) // getCurrentStepInfo extracts current step and move information from health status @@ -113,11 +119,171 @@ func conciseDiff(old, new interface{}) string { return diffOutput } +// AgentVersionInfo holds version information about the MongoDB automation agent +type AgentVersionInfo struct { + Version string `json:"version"` + ImageTag string `json:"imageTag"` + Source string `json:"source"` + LastChecked string `json:"lastChecked"` +} + +// getAgentVersion attempts to determine the MongoDB automation agent version +func getAgentVersion() AgentVersionInfo { + info := AgentVersionInfo{ + LastChecked: time.Now().Format(time.RFC3339), + } + + // Try to get version from agent logs first (most reliable) + if version := getVersionFromAgentLogs(); version != "" { + info.Version = version + info.Source = "agent_logs" + return info + } + + // Fallback: try to get version from process list + if version := getVersionFromProcessList(); version != "" { + info.Version = version + info.Source = "process_list" + return info + } + + info.Version = "unknown" + info.Source = "not_found" + return info +} + +// getVersionFromAgentLogs reads the MongoDB automation agent logs to extract version information +func getVersionFromAgentLogs() string { + // Try verbose log first, then regular log + logPaths := []string{agentVerboseLogPath, agentLogPath} + + for _, logPath := range logPaths { + if version := readVersionFromLogFile(logPath); version != "" { + return version + } + } + return "" +} + +// readVersionFromLogFile reads a log file and extracts version information +func readVersionFromLogFile(logPath string) string { + file, err := os.Open(logPath) + if err != nil { + return "" + } + defer file.Close() + + // Read the first few KB of the log file where version info is typically logged + buffer := make([]byte, 8192) + n, err := file.Read(buffer) + if err != nil && err != io.EOF { + return "" + } + + content := string(buffer[:n]) + + // Look for version patterns in agent logs + // Examples: "MongoDB Automation Agent version 108.0.12.8846-1" + // "automation-agent version: 108.0.12.8846-1" + // "Starting automation agent 108.0.12.8846-1" + versionRegex := regexp.MustCompile(`(?i)(?:automation.?agent|mongodb.?automation.?agent).*?version[:\s]+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?:\-[0-9]+)?)`) + matches := versionRegex.FindStringSubmatch(content) + if len(matches) > 1 { + return matches[1] + } + + // Alternative pattern: look for version in startup messages + versionRegex2 := regexp.MustCompile(`(?i)starting.*?agent.*?([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?:\-[0-9]+)?)`) + matches2 := versionRegex2.FindStringSubmatch(content) + if len(matches2) > 1 { + return matches2[1] + } + + return "" +} + +// getVersionFromProcessList attempts to get agent version from running processes +func getVersionFromProcessList() string { + // Try to find the automation agent process and extract version from command line + // This is a fallback method when logs are not available + + // Read /proc/*/cmdline to find automation agent processes + 
procDirs, err := os.ReadDir("/proc") + if err != nil { + return "" + } + + for _, procDir := range procDirs { + if !procDir.IsDir() { + continue + } + + // Check if directory name is numeric (PID) + if matched, _ := regexp.MatchString(`^\d+$`, procDir.Name()); !matched { + continue + } + + cmdlinePath := fmt.Sprintf("/proc/%s/cmdline", procDir.Name()) + cmdlineBytes, err := os.ReadFile(cmdlinePath) + if err != nil { + continue + } + + cmdline := string(cmdlineBytes) + + // Look for automation agent in command line + if strings.Contains(cmdline, "automation-agent") || strings.Contains(cmdline, "mongodb-mms-automation-agent") { + // Try to extract version from command line arguments + versionRegex := regexp.MustCompile(`([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?:\-[0-9]+)?)`) + matches := versionRegex.FindStringSubmatch(cmdline) + if len(matches) > 1 { + return matches[1] + } + } + } + + return "" +} + +// startVersionAPI starts a simple HTTP server to expose agent version information +func startVersionAPI(port string) { + http.HandleFunc("/version", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + versionInfo := getAgentVersion() + json.NewEncoder(w).Encode(versionInfo) + }) + + http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(map[string]string{ + "status": "healthy", + "time": time.Now().Format(time.RFC3339), + }) + }) + + go func() { + zap.S().Infof("Starting version API server on port %s", port) + if err := http.ListenAndServe(":"+port, nil); err != nil { + zap.S().Errorf("Version API server failed: %v", err) + } + }() +} + func main() { ctx := context.Background() logger := setupLogger() - logger.Info("Running agent sidecar for rolling restart coordination") + // Get and log agent version information + versionInfo := getAgentVersion() + logger.Infof("Running agent sidecar for rolling restart coordination") + logger.Infof("MongoDB Agent Version: %s (source: %s)", versionInfo.Version, versionInfo.Source) + if versionInfo.ImageTag != "" { + logger.Infof("Agent Image: %s", versionInfo.ImageTag) + } + + // Start version API server on port 8080 + startVersionAPI("8080") if statusPath := os.Getenv(agentStatusFilePathEnv); statusPath == "" { logger.Fatalf(`Required environment variable "%s" not set`, agentStatusFilePathEnv) From 8f4ef2f89ceb95c2b5c0fef3f7f63c970cf50374 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Fri, 19 Sep 2025 12:56:29 +0200 Subject: [PATCH 16/20] sidecar eror handling --- .../cmd/agent-sidecar/main.go | 182 ++++++++---------- .../cmd/agent-sidecar/main_test.go | 104 +++++++++- 2 files changed, 178 insertions(+), 108 deletions(-) diff --git a/mongodb-community-operator/cmd/agent-sidecar/main.go b/mongodb-community-operator/cmd/agent-sidecar/main.go index c1bb1e77b..84bb95eb4 100644 --- a/mongodb-community-operator/cmd/agent-sidecar/main.go +++ b/mongodb-community-operator/cmd/agent-sidecar/main.go @@ -11,8 +11,6 @@ import ( "strings" "time" - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" "go.uber.org/zap" "k8s.io/client-go/rest" "sigs.k8s.io/controller-runtime/pkg/client" @@ -65,59 +63,7 @@ func getCurrentStepInfo(health agent.Health) string { return "" } -// conciseDiff creates a more concise diff output showing only changed lines with minimal context -func conciseDiff(old, new interface{}) string { - diff := cmp.Diff(old, new, - 
cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime"), - cmpopts.IgnoreFields(agent.PlanStatus{}, "Started", "Completed"), - cmpopts.IgnoreFields(agent.StepStatus{}, "Started", "Completed"), - ) - if diff == "" { - return "" - } - - // Add current step info if available - currentStepInfo := "" - if newHealth, ok := new.(agent.Health); ok { - currentStepInfo = getCurrentStepInfo(newHealth) - } - - // Split diff into lines and filter to show only changed lines with exactly one line of context before/after - lines := strings.Split(diff, "\n") - var result []string - var addedLines = make(map[int]bool) // Track which lines we've already added - - for i, line := range lines { - // Include lines that show actual changes (+ or -) - if strings.HasPrefix(line, "+") || strings.HasPrefix(line, "-") { - // Add exactly one line before for context (if exists and not already added) - if i > 0 && !addedLines[i-1] { - result = append(result, lines[i-1]) - addedLines[i-1] = true - } - - // Add the changed line - if !addedLines[i] { - result = append(result, line) - addedLines[i] = true - } - - // Add exactly one line after for context (if exists and not already added) - if i < len(lines)-1 && !addedLines[i+1] { - result = append(result, lines[i+1]) - addedLines[i+1] = true - } - } - } - - diffOutput := strings.Join(result, "\n") - if currentStepInfo != "" { - diffOutput += currentStepInfo - } - - return diffOutput -} // AgentVersionInfo holds version information about the MongoDB automation agent type AgentVersionInfo struct { @@ -245,6 +191,55 @@ func getVersionFromProcessList() string { return "" } +// getLastGoalStateClusterConfigVersion extracts the LastGoalStateClusterConfigVersion from health status +func getLastGoalStateClusterConfigVersion(health agent.Health) int64 { + hostname := getHostname() + if processStatus, ok := health.ProcessPlans[hostname]; ok { + return processStatus.LastGoalStateClusterConfigVersion + } + return 0 +} + +// getCurrentMoveAndStep returns a concise description of the current move and step +func getCurrentMoveAndStep(health agent.Health) string { + hostname := getHostname() + + // Check process health + status, hasStatus := health.Healthiness[hostname] + processStatus, hasProcessStatus := health.ProcessPlans[hostname] + + if !hasStatus { + return "no health data" + } + + goalState := "in_goal" + if !status.IsInGoalState { + goalState = "not_in_goal" + } + + if !hasProcessStatus || len(processStatus.Plans) == 0 { + return fmt.Sprintf("%s (no plans)", goalState) + } + + // Get the latest plan + latestPlan := processStatus.Plans[len(processStatus.Plans)-1] + if latestPlan.Completed != nil { + return fmt.Sprintf("%s (plan completed)", goalState) + } + + // Find current move and step + for _, move := range latestPlan.Moves { + // Check if this move has incomplete steps + for _, step := range move.Steps { + if step.Completed == nil { + return fmt.Sprintf("current move: %s, current step: %s", move.Move, step.Step) + } + } + } + + return fmt.Sprintf("%s (plan running)", goalState) +} + // startVersionAPI starts a simple HTTP server to expose agent version information func startVersionAPI(port string) { http.HandleFunc("/version", func(w http.ResponseWriter, r *http.Request) { @@ -291,7 +286,7 @@ func main() { } // Continuously monitor agent health status for WaitDeleteMyPodKube step - var lastHealth agent.Health + var lastConfigVersion int64 = -1 var isFirstRead = true for { health, err := getAgentHealthStatus() @@ -309,16 +304,29 @@ func main() { continue } - // Check if 
health status has changed (ignoring LastMongoUpTime) + // Check if LastGoalStateClusterConfigVersion has changed + currentConfigVersion := getLastGoalStateClusterConfigVersion(health) if isFirstRead { - logger.Infof("Agent health status initialized: %s", getHealthSummary(health)) - lastHealth = health - isFirstRead = false - } else { - if diff := conciseDiff(lastHealth, health); diff != "" { - logger.Infof("Agent health status changed:\n%s", diff) - lastHealth = health + hostname := getHostname() + logger.Infof("Agent health status initialized for hostname '%s': %s", hostname, getCurrentMoveAndStep(health)) + + // Debug: show available hostnames in health data + var availableHostnames []string + for h := range health.Healthiness { + availableHostnames = append(availableHostnames, h) + } + if len(availableHostnames) > 0 { + logger.Infof("Available hostnames in health data: %v", availableHostnames) + } else { + logger.Warnf("No hostnames found in health data") } + + lastConfigVersion = currentConfigVersion + isFirstRead = false + } else if currentConfigVersion != lastConfigVersion { + logger.Infof("Cluster config version changed from %d to %d: %s", + lastConfigVersion, currentConfigVersion, getCurrentMoveAndStep(health)) + lastConfigVersion = currentConfigVersion } shouldDelete, err := shouldDeletePod(health) @@ -436,55 +444,17 @@ func getHostname() string { -// getHealthSummary returns a concise summary of the current health status -func getHealthSummary(health agent.Health) string { - hostname := getHostname() - - // Check process health - status, hasStatus := health.Healthiness[hostname] - processStatus, hasProcessStatus := health.ProcessPlans[hostname] - - if !hasStatus { - return "no_health_data" - } - - goalState := "in_goal" - if !status.IsInGoalState { - goalState = "not_in_goal" - } - - if !hasProcessStatus || len(processStatus.Plans) == 0 { - return fmt.Sprintf("%s_no_plans", goalState) - } - - // Get the latest plan - latestPlan := processStatus.Plans[len(processStatus.Plans)-1] - if latestPlan.Completed != nil { - return fmt.Sprintf("%s_plan_completed", goalState) - } - // Find current move - for _, move := range latestPlan.Moves { - if move.Move == "WaitDeleteMyPodKube" { - return "waiting_for_pod_deletion" - } - // Check if this move has incomplete steps - for _, step := range move.Steps { - if step.Completed == nil { - return fmt.Sprintf("%s_executing_%s", goalState, move.Move) - } - } - } - - return fmt.Sprintf("%s_plan_running", goalState) -} // shouldDeletePod returns a boolean value indicating if this pod should be deleted // this would be the case if the agent is currently trying to execute WaitDeleteMyPodKube step func shouldDeletePod(health agent.Health) (bool, error) { - status, ok := health.ProcessPlans[getHostname()] + hostname := getHostname() + status, ok := health.ProcessPlans[hostname] if !ok { - return false, fmt.Errorf("hostname %s was not in the process plans", getHostname()) + // Don't error on missing hostname - this is normal during startup + // Just return false (don't delete) and let the monitoring continue + return false, nil } return isWaitingToBeDeleted(status), nil } diff --git a/mongodb-community-operator/cmd/agent-sidecar/main_test.go b/mongodb-community-operator/cmd/agent-sidecar/main_test.go index 5bd0dd668..bcec80f54 100644 --- a/mongodb-community-operator/cmd/agent-sidecar/main_test.go +++ b/mongodb-community-operator/cmd/agent-sidecar/main_test.go @@ -5,8 +5,6 @@ import ( "testing" "time" - "github.com/google/go-cmp/cmp" - 
"github.com/google/go-cmp/cmp/cmpopts" "github.com/mongodb/mongodb-kubernetes/mongodb-community-operator/pkg/agent" ) @@ -380,3 +378,105 @@ func TestCurrentStepInfo(t *testing.T) { t.Logf("Current step info: %s", stepInfo) t.Logf("Test passed: Current step info extraction works correctly") } + +func TestGetLastGoalStateClusterConfigVersion(t *testing.T) { + // Test with health status that has config version + health := agent.Health{ + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + LastGoalStateClusterConfigVersion: 42, + }, + }, + } + + // Mock hostname for test + originalHostname := getHostname + defer func() { getHostname = originalHostname }() + getHostname = func() string { return "test-pod-0" } + + version := getLastGoalStateClusterConfigVersion(health) + if version != 42 { + t.Errorf("Expected config version 42, but got %d", version) + } + + // Test with missing process + getHostname = func() string { return "missing-pod" } + version = getLastGoalStateClusterConfigVersion(health) + if version != 0 { + t.Errorf("Expected config version 0 for missing process, but got %d", version) + } +} + +func TestGetCurrentMoveAndStep(t *testing.T) { + startTime := time.Date(2019, 9, 11, 14, 20, 40, 0, time.UTC) + + // Test with active step + health := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: false, + ExpectedToBeUp: true, + LastMongoUpTime: 1568222195, + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + Moves: []*agent.MoveStatus{ + { + Move: "Start", + Steps: []*agent.StepStatus{ + { + Step: "StartFresh", + Started: &startTime, + Completed: nil, // Currently running + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Mock hostname for test + originalHostname := getHostname + defer func() { getHostname = originalHostname }() + getHostname = func() string { return "test-pod-0" } + + result := getCurrentMoveAndStep(health) + expected := "not_in_goal (executing: Start -> StartFresh)" + if result != expected { + t.Errorf("Expected '%s', but got '%s'", expected, result) + } + + // Test with WaitDeleteMyPodKube move + health.ProcessPlans["test-pod-0"].Plans[0].Moves[0].Move = "WaitDeleteMyPodKube" + result = getCurrentMoveAndStep(health) + expected = "not_in_goal (waiting for pod deletion)" + if result != expected { + t.Errorf("Expected '%s', but got '%s'", expected, result) + } + + // Test with completed plan + completedTime := time.Date(2019, 9, 11, 14, 25, 0, 0, time.UTC) + health.ProcessPlans["test-pod-0"].Plans[0].Completed = &completedTime + result = getCurrentMoveAndStep(health) + expected = "not_in_goal (plan completed)" + if result != expected { + t.Errorf("Expected '%s', but got '%s'", expected, result) + } + + // Test with no health data + getHostname = func() string { return "missing-pod" } + result = getCurrentMoveAndStep(health) + expected = "no health data" + if result != expected { + t.Errorf("Expected '%s', but got '%s'", expected, result) + } +} From b5bcd1850e21dfd5442f726719b4ae8ce8c2063f Mon Sep 17 00:00:00 2001 From: Lucian Tosa Date: Fri, 19 Sep 2025 17:22:19 +0200 Subject: [PATCH 17/20] Fix sts revision detection --- .../operator/mongodbreplicaset_controller.go | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/controllers/operator/mongodbreplicaset_controller.go b/controllers/operator/mongodbreplicaset_controller.go index 
65222125d..d11f62b46 100644 --- a/controllers/operator/mongodbreplicaset_controller.go +++ b/controllers/operator/mongodbreplicaset_controller.go @@ -252,7 +252,7 @@ func (r *ReconcileMongoDbReplicaSet) Reconcile(ctx context.Context, request reco // See CLOUDP-189433 and CLOUDP-229222 for more details. if recovery.ShouldTriggerRecovery(rs.Status.Phase != mdbstatus.PhaseRunning, rs.Status.LastTransition) { log.Warnf("Triggering Automatic Recovery. The MongoDB resource %s/%s is in %s state since %s", rs.Namespace, rs.Name, rs.Status.Phase, rs.Status.LastTransition) - automationConfigStatus := r.updateOmDeploymentRs(ctx, conn, rs.Status.Members, rs, sts, log, caFilePath, tlsCertPath, internalClusterCertPath, agentCertSecretSelector, prometheusCertHash, true, "").OnErrorPrepend("Failed to create/update (Ops Manager reconciliation phase):") + automationConfigStatus := r.updateOmDeploymentRs(ctx, conn, rs.Status.Members, rs, sts, log, caFilePath, tlsCertPath, internalClusterCertPath, agentCertSecretSelector, prometheusCertHash, true).OnErrorPrepend("Failed to create/update (Ops Manager reconciliation phase):") deploymentError := create.DatabaseInKubernetes(ctx, r.client, *rs, sts, rsConfig, log) if deploymentError != nil { log.Errorf("Recovery failed because of deployment errors, %w", deploymentError) @@ -267,17 +267,9 @@ func (r *ReconcileMongoDbReplicaSet) Reconcile(ctx context.Context, request reco lastSpec = &mdbv1.MongoDbSpec{} } - var statefulSetVersion string - kubeClient := r.client - currentSts, err := kubeClient.GetStatefulSet(ctx, kube.ObjectKey(rs.Namespace, rs.Name)) - if err == nil { - // StatefulSet exists, get its target revision - statefulSetVersion = currentSts.Status.UpdateRevision - } - status = workflow.RunInGivenOrder(publishAutomationConfigFirst(ctx, r.client, *rs, lastSpec, rsConfig, log), func() workflow.Status { - return r.updateOmDeploymentRs(ctx, conn, rs.Status.Members, rs, sts, log, caFilePath, tlsCertPath, internalClusterCertPath, agentCertSecretSelector, prometheusCertHash, false, statefulSetVersion).OnErrorPrepend("Failed to create/update (Ops Manager reconciliation phase):") + return r.updateOmDeploymentRs(ctx, conn, rs.Status.Members, rs, sts, log, caFilePath, tlsCertPath, internalClusterCertPath, agentCertSecretSelector, prometheusCertHash, false).OnErrorPrepend("Failed to create/update (Ops Manager reconciliation phase):") }, func() workflow.Status { workflowStatus := create.HandlePVCResize(ctx, r.client, &sts, log) @@ -438,7 +430,7 @@ func AddReplicaSetController(ctx context.Context, mgr manager.Manager, imageUrls // updateOmDeploymentRs performs OM registration operation for the replicaset. 
So the changes will be finally propagated // to automation agents in containers -func (r *ReconcileMongoDbReplicaSet) updateOmDeploymentRs(ctx context.Context, conn om.Connection, membersNumberBefore int, rs *mdbv1.MongoDB, set appsv1.StatefulSet, log *zap.SugaredLogger, caFilePath, tlsCertPath, internalClusterCertPath string, agentCertSecretSelector corev1.SecretKeySelector, prometheusCertHash string, isRecovering bool, statefulSetVersion string) workflow.Status { +func (r *ReconcileMongoDbReplicaSet) updateOmDeploymentRs(ctx context.Context, conn om.Connection, membersNumberBefore int, rs *mdbv1.MongoDB, set appsv1.StatefulSet, log *zap.SugaredLogger, caFilePath, tlsCertPath, internalClusterCertPath string, agentCertSecretSelector corev1.SecretKeySelector, prometheusCertHash string, isRecovering bool) workflow.Status { log.Debug("Entering UpdateOMDeployments") // Only "concrete" RS members should be observed // - if scaling down, let's observe only members that will remain after scale-down operation @@ -485,6 +477,12 @@ func (r *ReconcileMongoDbReplicaSet) updateOmDeploymentRs(ctx context.Context, c prometheusCertHash: prometheusCertHash, } + var statefulSetVersion string + currentSts, err := r.client.GetStatefulSet(ctx, kube.ObjectKey(rs.Namespace, rs.Name)) + if err == nil { + // StatefulSet exists, get its target revision + statefulSetVersion = currentSts.Status.UpdateRevision + } err = conn.ReadUpdateDeployment( func(d om.Deployment) error { return ReconcileReplicaSetAC(ctx, d, rs.Spec.DbCommonSpec, lastRsConfig.ToMap(), rs.Name, replicaSet, caFilePath, internalClusterCertPath, &p, statefulSetVersion, log) From 8b7a85758eeed7d034ec55e8f9271d0e7cbab45f Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Mon, 22 Sep 2025 13:17:43 +0200 Subject: [PATCH 18/20] update dlv code and add debug scripts --- .../content/agent-launcher.sh | 4 +- scripts/dev/agent/check-agent-logs.sh | 110 ++++++++++ scripts/dev/agent/print-agent-health.sh | 188 ++++++++++++++++++ 3 files changed, 300 insertions(+), 2 deletions(-) create mode 100755 scripts/dev/agent/check-agent-logs.sh create mode 100755 scripts/dev/agent/print-agent-health.sh diff --git a/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh index a548c5cab..73e7fcd70 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh @@ -223,8 +223,8 @@ if [ "${debug}" = "true" ]; then cd ${mdb_downloads_dir} || true mkdir -p /var/lib/mongodb-mms-automation/gopath mkdir -p /var/lib/mongodb-mms-automation/go - curl -LO https://go.dev/dl/go1.20.1.linux-amd64.tar.gz - tar -xzf go1.20.1.linux-amd64.tar.gz + curl -LO https://go.dev/dl/go1.23.1.linux-amd64.tar.gz + tar -xzf go1.23.1.linux-amd64.tar.gz export GOPATH=${mdb_downloads_dir}/gopath export GOCACHE=${mdb_downloads_dir}/.cache export PATH=${PATH}:${mdb_downloads_dir}/go/bin diff --git a/scripts/dev/agent/check-agent-logs.sh b/scripts/dev/agent/check-agent-logs.sh new file mode 100755 index 000000000..2d1f4cb99 --- /dev/null +++ b/scripts/dev/agent/check-agent-logs.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +source /Users/nam.nguyen/projects/ops-manager-kubernetes/.generated/context.export.env + +# Check if NAMESPACE is set +if [ -z "$NAMESPACE" ]; then + echo "Error: NAMESPACE environment variable is not set" + exit 1 +fi + +echo "Checking agent logs for Kubernetes moves in namespace: $NAMESPACE" +echo 
"==================================================================" + +# Get all StatefulSets with owner mongodb.com/v1 +STS_LIST=$(kubectl get sts -n "$NAMESPACE" -o jsonpath='{range .items[?(@.metadata.ownerReferences[0].apiVersion=="mongodb.com/v1")]}{.metadata.name}{"\n"}{end}') + +if [ -z "$STS_LIST" ]; then + echo "No StatefulSets found with owner mongodb.com/v1 in namespace $NAMESPACE" + exit 0 +fi + +PODS_FOUND=() + +for sts in $STS_LIST; do + echo "Processing StatefulSet: $sts" + + # Get pods for this StatefulSet + PODS=$(kubectl get pods -n "$NAMESPACE" -l app="${sts}-svc" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) + + if [ -z "$PODS" ]; then + PODS=$(kubectl get pods -n "$NAMESPACE" -o json | jq -r '.items[] | select(.metadata.ownerReferences[]?.name=="'$sts'") | .metadata.name' 2>/dev/null) + fi + + if [ -z "$PODS" ]; then + echo " No pods found for StatefulSet $sts" + continue + fi + + for pod in $PODS; do + echo " Processing pod: $pod" + PODS_FOUND+=("$pod") + done +done + +if [ ${#PODS_FOUND[@]} -eq 0 ]; then + echo "No pods found" + exit 0 +fi + +echo "" +echo "šŸ” Analyzing Agent Logs for Kubernetes-related Moves" +echo "=====================================================" + +for pod in "${PODS_FOUND[@]}"; do + echo "" + echo "šŸ“‹ Pod: $pod" + echo "$(printf '%.0s─' {1..60})" + + # Check if log file exists + if ! kubectl exec -n "$NAMESPACE" "$pod" -- test -f /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null; then + echo "āŒ Log file not found: /var/log/mongodb-mms-automation/automation-agent-verbose.log" + continue + fi + + echo "šŸ”„ Recent Kubernetes/Move-related log entries:" + echo "" + + # Search for Kubernetes-related moves and recent activity + kubectl exec -n "$NAMESPACE" "$pod" -- tail -n 500 /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null | \ + grep -E "(move|Move|MOVE|kubernetes|k8s|wait|Wait|WAIT|step|Step|STEP|goal|Goal|GOAL|plan|Plan|PLAN)" | \ + tail -n 20 || echo "No recent move-related entries found" + + echo "" + echo "šŸŽÆ Current Wait Steps:" + kubectl exec -n "$NAMESPACE" "$pod" -- tail -n 200 /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null | \ + grep -E "(WaitAllRsMembersUp|WaitPrimary|WaitRsInit|WaitProcessUp|wait.*step)" | \ + tail -n 10 || echo "No current wait steps found" + + echo "" + echo "āš ļø Recent Errors/Warnings:" + kubectl exec -n "$NAMESPACE" "$pod" -- tail -n 300 /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null | \ + grep -iE "(error|Error|ERROR|warn|Warn|WARN|fail|Fail|FAIL)" | \ + tail -n 5 || echo "No recent errors/warnings found" + + echo "" + echo "šŸ“ˆ Recent Goal/Plan Activity:" + kubectl exec -n "$NAMESPACE" "$pod" -- tail -n 200 /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null | \ + grep -E "(goal.*version|plan.*start|plan.*complet|automation.*config)" | \ + tail -n 5 || echo "No recent goal/plan activity found" + + echo "" + echo "šŸ”— Replica Set Status:" + kubectl exec -n "$NAMESPACE" "$pod" -- tail -n 200 /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null | \ + grep -E "(replica.*set|rs.*init|primary|secondary|replication)" | \ + tail -n 5 || echo "No recent replica set activity found" + + echo "$(printf '%.0s═' {1..60})" +done + +echo "" +echo "šŸ’” Log Analysis Summary" +echo "======================" +echo "Analyzed logs from ${#PODS_FOUND[@]} pods for:" +echo " • Move/Step execution status" +echo " • Wait conditions and blocking steps" +echo " • Error conditions 
and warnings" +echo " • Goal/Plan progression" +echo " • Replica set initialization status" +echo "" +echo "āœ… Analysis complete!" \ No newline at end of file diff --git a/scripts/dev/agent/print-agent-health.sh b/scripts/dev/agent/print-agent-health.sh new file mode 100755 index 000000000..935eca968 --- /dev/null +++ b/scripts/dev/agent/print-agent-health.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +source /Users/nam.nguyen/projects/ops-manager-kubernetes/.generated/context.export.env + +# Check if NAMESPACE is set +if [ -z "$NAMESPACE" ]; then + echo "Error: NAMESPACE environment variable is not set" + exit 1 +fi + +# Create temporary directory for health status files +TEMP_DIR=$(mktemp -d) +trap "rm -rf $TEMP_DIR" EXIT + +echo "Fetching agent health status from pods in namespace: $NAMESPACE" +echo "================================================================" + +# Get all StatefulSets with owner mongodb.com/v1 +STS_LIST=$(kubectl get sts -n "$NAMESPACE" -o jsonpath='{range .items[?(@.metadata.ownerReferences[0].apiVersion=="mongodb.com/v1")]}{.metadata.name}{"\n"}{end}') + +if [ -z "$STS_LIST" ]; then + echo "No StatefulSets found with owner mongodb.com/v1 in namespace $NAMESPACE" + exit 0 +fi + +# Collect all health status data +PODS_FOUND=() +PODS_WITH_DATA=() +PODS_WITHOUT_DATA=() + +for sts in $STS_LIST; do + echo "Processing StatefulSet: $sts" + + # Get pods for this StatefulSet using correct label selector + PODS=$(kubectl get pods -n "$NAMESPACE" -l app="${sts}-svc" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) + + if [ -z "$PODS" ]; then + # Try alternative selector patterns + PODS=$(kubectl get pods -n "$NAMESPACE" -o json | jq -r '.items[] | select(.metadata.ownerReferences[]?.name=="'$sts'") | .metadata.name' 2>/dev/null) + fi + + if [ -z "$PODS" ]; then + echo " No pods found for StatefulSet $sts" + continue + fi + + for pod in $PODS; do + echo " Processing pod: $pod" + PODS_FOUND+=("$pod") + + # Get agent health status + if kubectl exec -n "$NAMESPACE" "$pod" -- test -f /var/log/mongodb-mms-automation/agent-health-status.json 2>/dev/null; then + if kubectl exec -n "$NAMESPACE" "$pod" -- cat /var/log/mongodb-mms-automation/agent-health-status.json 2>/dev/null > "$TEMP_DIR/${pod}.json"; then + # Check if file has content + if [ -s "$TEMP_DIR/${pod}.json" ]; then + PODS_WITH_DATA+=("$pod") + else + echo " File exists but is empty" + PODS_WITHOUT_DATA+=("$pod") + fi + else + echo " Could not read file" + PODS_WITHOUT_DATA+=("$pod") + fi + else + echo " File not found: /var/log/mongodb-mms-automation/agent-health-status.json" + PODS_WITHOUT_DATA+=("$pod") + fi + done +done + +echo "" +echo "Agent Health Status Summary" +echo "==========================" + +if [ ${#PODS_FOUND[@]} -eq 0 ]; then + echo "No pods found with agent health status files" + exit 0 +fi + +# Pretty print individual status files +for pod in "${PODS_WITH_DATA[@]}"; do + echo "" + echo "šŸ” Pod: $pod" + echo "$(printf '%.0s─' {1..50})" + + if command -v jq &> /dev/null; then + jq -C . 
"$TEMP_DIR/${pod}.json" 2>/dev/null || cat "$TEMP_DIR/${pod}.json" + else + cat "$TEMP_DIR/${pod}.json" + fi +done + +# Show pods without data +if [ ${#PODS_WITHOUT_DATA[@]} -gt 0 ]; then + echo "" + echo "āŒ Pods without health data:" + for pod in "${PODS_WITHOUT_DATA[@]}"; do + echo " - $pod" + done +fi + +# Show differences if multiple pods with data exist +if [ ${#PODS_WITH_DATA[@]} -gt 1 ]; then + echo "" + echo "" + echo "šŸ”„ Health Status Comparison" + echo "==========================" + + # Create a combined comparison view + if command -v jq &> /dev/null; then + echo "Key Status Comparison:" + echo "$(printf '%.0s─' {1..80})" + + # Extract key fields for comparison + for pod in "${PODS_WITH_DATA[@]}"; do + echo -n "Pod $pod: " + # Try to extract overall state or first process state + STATE=$(jq -r '.state // (.statuses | keys[0] as $k | .[$k].IsInGoalState) // "unknown"' "$TEMP_DIR/${pod}.json" 2>/dev/null) + if [ "$STATE" = "true" ]; then + echo "āœ… In Goal State" + elif [ "$STATE" = "false" ]; then + echo "āŒ Not in Goal State" + else + echo "$STATE" + fi + done + + echo "" + echo "Process Details:" + echo "$(printf '%.0s─' {1..80})" + + for pod in "${PODS_WITH_DATA[@]}"; do + echo "Pod $pod:" + # Extract process information + jq -r ' + if .statuses then + .statuses | to_entries[] | " \(.key): IsInGoalState=\(.value.IsInGoalState), ReplicationStatus=\(.value.ReplicationStatus // "N/A")" + elif .processes then + .processes[] | " \(.name): \(.state // "unknown")" + else + " No process information found" + end + ' "$TEMP_DIR/${pod}.json" 2>/dev/null || echo " Parse error" + echo "" + done + + echo "MMS Status Summary:" + echo "$(printf '%.0s─' {1..80})" + + for pod in "${PODS_WITH_DATA[@]}"; do + echo "Pod $pod:" + jq -r ' + if .mmsStatus then + .mmsStatus | to_entries[] | " \(.key): GoalVersion=\(.value.lastGoalVersionAchieved // "N/A"), Responsive=\(.value.responsive // "N/A")" + else + " No MMS status found" + end + ' "$TEMP_DIR/${pod}.json" 2>/dev/null || echo " Parse error" + echo "" + done + fi + + # Show file differences using diff if exactly 2 pods with data + if [ ${#PODS_WITH_DATA[@]} -eq 2 ] && command -v diff &> /dev/null; then + pod1="${PODS_WITH_DATA[0]}" + pod2="${PODS_WITH_DATA[1]}" + + echo "" + echo "šŸ“Š Detailed Diff between $pod1 and $pod2:" + echo "$(printf '%.0s─' {1..80})" + + if command -v jq &> /dev/null; then + # Pretty print both files for comparison + jq . "$TEMP_DIR/${pod1}.json" > "$TEMP_DIR/${pod1}_pretty.json" 2>/dev/null + jq . "$TEMP_DIR/${pod2}.json" > "$TEMP_DIR/${pod2}_pretty.json" 2>/dev/null + diff -u "$TEMP_DIR/${pod1}_pretty.json" "$TEMP_DIR/${pod2}_pretty.json" || true + else + diff -u "$TEMP_DIR/${pod1}.json" "$TEMP_DIR/${pod2}.json" || true + fi + fi +fi + +echo "" +echo "āœ… Health status collection complete!" 
+echo " Found ${#PODS_FOUND[@]} total pods" +echo " ${#PODS_WITH_DATA[@]} pods with health data" +echo " ${#PODS_WITHOUT_DATA[@]} pods without health data" \ No newline at end of file From d36c3325f2a60f2d2d95b236895e1d5130d9b61e Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Mon, 22 Sep 2025 16:09:17 +0200 Subject: [PATCH 19/20] update sidecar deletion --- mongodb-community-operator/cmd/agent-sidecar/main.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mongodb-community-operator/cmd/agent-sidecar/main.go b/mongodb-community-operator/cmd/agent-sidecar/main.go index 84bb95eb4..5299681de 100644 --- a/mongodb-community-operator/cmd/agent-sidecar/main.go +++ b/mongodb-community-operator/cmd/agent-sidecar/main.go @@ -460,18 +460,18 @@ func shouldDeletePod(health agent.Health) (bool, error) { } // isWaitingToBeDeleted determines if the agent is currently waiting -// for the pod to be deleted by external sidecar. We check if the most recent step +// for the pod to be deleted by external sidecar. We check if the current/last step // is "WaitDeleteMyPodKube" func isWaitingToBeDeleted(healthStatus agent.MmsDirectorStatus) bool { if len(healthStatus.Plans) == 0 { return false } lastPlan := healthStatus.Plans[len(healthStatus.Plans)-1] - for _, m := range lastPlan.Moves { - // Check if the current step is WaitDeleteMyPodKube - if m.Move == "WaitDeleteMyPodKube" { - return true - } + + lastMove := lastPlan.Moves[len(lastPlan.Moves)-1] + lastStep := lastMove.Steps[len(lastMove.Steps)-1] + if lastStep.Step == "WaitDeleteMyPodKube" && lastStep.Completed == nil && lastStep.Started != nil { + return true } return false } From 5b72ee69c5231ca186868bed32d5e1b356081e01 Mon Sep 17 00:00:00 2001 From: Nam Nguyen Date: Mon, 22 Sep 2025 17:02:56 +0200 Subject: [PATCH 20/20] update sidecar deletion --- mongodb-community-operator/cmd/agent-sidecar/main.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mongodb-community-operator/cmd/agent-sidecar/main.go b/mongodb-community-operator/cmd/agent-sidecar/main.go index 5299681de..79360716a 100644 --- a/mongodb-community-operator/cmd/agent-sidecar/main.go +++ b/mongodb-community-operator/cmd/agent-sidecar/main.go @@ -473,6 +473,10 @@ func isWaitingToBeDeleted(healthStatus agent.MmsDirectorStatus) bool { if lastStep.Step == "WaitDeleteMyPodKube" && lastStep.Completed == nil && lastStep.Started != nil { return true } + if lastStep.Step == "WaitDeleteMyPodKube" && lastStep.Completed == nil && lastStep.Started == nil { + zap.S().Infof("In WaitDeleteMyPodKube step, but not started yet - not deleting pod") + return false + } return false }