diff --git a/Makefile b/Makefile index 77f866d2b..c9db0dfe2 100644 --- a/Makefile +++ b/Makefile @@ -194,6 +194,9 @@ agent-image: agent-image-slow: @ scripts/dev/run_python.sh scripts/release/pipeline_main.py --parallel-factor 1 agent +agent-image-custom: + @ scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --version "latest" --custom-agent-url "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68caf1b06da1570007e898b4/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9783-1.linux_x86_64.tar.gz" + operator-image: @ scripts/dev/run_python.sh scripts/release/pipeline_main.py operator diff --git a/build_info.json b/build_info.json index f7f903f7e..435ac7d26 100644 --- a/build_info.json +++ b/build_info.json @@ -253,6 +253,34 @@ ] } }, + "agent-sidecar": { + "dockerfile-path": "docker/mongodb-kubernetes-agent-sidecar/Dockerfile", + "patch": { + "repositories": ["268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/nnguyen-kops/agent-sidecar"], + "platforms": [ + "linux/amd64" + ] + }, + "staging": { + "sign": true, + "latest-tag": true, + "repositories": ["268558157000.dkr.ecr.us-east-1.amazonaws.com/staging/nnguyen-kops/agent-sidecar"], + "platforms": [ + "linux/arm64", + "linux/amd64" + ] + }, + "release": { + "version": "1.0.0", + "sign": true, + "olm-tag": true, + "repositories": ["quay.io/mongodb/mongodb-kubernetes-agent-sidecar"], + "platforms": [ + "linux/arm64", + "linux/amd64" + ] + } + }, "agent": { "dockerfile-path": "docker/mongodb-agent/Dockerfile", "patch": { diff --git a/controllers/om/api/http.go b/controllers/om/api/http.go index b2266f690..ccf3297e4 100644 --- a/controllers/om/api/http.go +++ b/controllers/om/api/http.go @@ -220,6 +220,11 @@ func (client *Client) authorizeRequest(method, hostname, path string, request *r return err } + if resp.StatusCode == http.StatusOK { + // No need to authorize, server didn't challenge us + return nil + } + if resp.StatusCode != http.StatusUnauthorized { return apierror.New( xerrors.Errorf( diff --git a/controllers/om/automation_config_test.go b/controllers/om/automation_config_test.go index fdc0d8e18..4c3ceeb6d 100644 --- a/controllers/om/automation_config_test.go +++ b/controllers/om/automation_config_test.go @@ -599,10 +599,10 @@ func TestAssigningListsReassignsInDeployment(t *testing.T) { func TestAutomationConfigEquality(t *testing.T) { deployment1 := NewDeployment() - deployment1.setReplicaSets([]ReplicaSet{NewReplicaSet("1", "5.0.0")}) + deployment1.SetReplicaSets([]ReplicaSet{NewReplicaSet("1", "5.0.0")}) deployment2 := NewDeployment() - deployment2.setReplicaSets([]ReplicaSet{NewReplicaSet("2", "5.0.0")}) + deployment2.SetReplicaSets([]ReplicaSet{NewReplicaSet("2", "5.0.0")}) authConfig := Auth{ Users: []*MongoDBUser{ @@ -1060,7 +1060,7 @@ func TestApplyInto(t *testing.T) { func changeTypes(deployment Deployment) error { rs := deployment.GetReplicaSets() - deployment.setReplicaSets(rs) + deployment.SetReplicaSets(rs) return nil } diff --git a/controllers/om/deployment.go b/controllers/om/deployment.go index fd7f7f776..c1b568258 100644 --- a/controllers/om/deployment.go +++ b/controllers/om/deployment.go @@ -85,7 +85,7 @@ func BuildDeploymentFromBytes(jsonBytes []byte) (Deployment, error) { func NewDeployment() Deployment { ans := Deployment{} ans.setProcesses(make([]Process, 0)) - ans.setReplicaSets(make([]ReplicaSet, 0)) + ans.SetReplicaSets(make([]ReplicaSet, 0)) ans.setShardedClusters(make([]ShardedCluster, 0)) ans.setMonitoringVersions(make([]interface{}, 0)) 
ans.setBackupVersions(make([]interface{}, 0)) @@ -405,7 +405,7 @@ func (d Deployment) RemoveReplicaSetByName(name string, log *zap.SugaredLogger) } } - d.setReplicaSets(toKeep) + d.SetReplicaSets(toKeep) members := rs.Members() processNames := make([]string, len(members)) @@ -992,12 +992,12 @@ func (d Deployment) GetReplicaSets() []ReplicaSet { } } -func (d Deployment) setReplicaSets(replicaSets []ReplicaSet) { +func (d Deployment) SetReplicaSets(replicaSets []ReplicaSet) { d["replicaSets"] = replicaSets } func (d Deployment) addReplicaSet(rs ReplicaSet) { - d.setReplicaSets(append(d.GetReplicaSets(), rs)) + d.SetReplicaSets(append(d.GetReplicaSets(), rs)) } func (d Deployment) getShardedClusters() []ShardedCluster { diff --git a/controllers/operator/agents/upgrade.go b/controllers/operator/agents/upgrade.go index 266982450..076e75cee 100644 --- a/controllers/operator/agents/upgrade.go +++ b/controllers/operator/agents/upgrade.go @@ -48,29 +48,7 @@ type ClientSecret struct { // happens and all existing MongoDBs are required to get agents upgraded (otherwise the "You need to upgrade the // automation agent before publishing other changes" error happens for automation config pushes from the Operator) func UpgradeAllIfNeeded(ctx context.Context, cs ClientSecret, omConnectionFactory om.ConnectionFactory, watchNamespace []string, isMulti bool) { - mux.Lock() - defer mux.Unlock() - - if !time.Now().After(nextScheduledTime) { - return - } - log := zap.S() - log.Info("Performing a regular upgrade of Agents for all the MongoDB resources in the cluster...") - - allMDBs, err := readAllMongoDBs(ctx, cs.Client, watchNamespace, isMulti) - if err != nil { - log.Errorf("Failed to read MongoDB resources to ensure Agents have the latest version: %s", err) - return - } - - err = doUpgrade(ctx, cs.Client, cs.SecretClient, omConnectionFactory, allMDBs) - if err != nil { - log.Errorf("Failed to perform upgrade of Agents: %s", err) - } - - log.Info("The upgrade of Agents for all the MongoDB resources in the cluster is finished.") - - nextScheduledTime = nextScheduledTime.Add(pause) + return } // ScheduleUpgrade allows to reset the timer to Now() which makes sure the next MongoDB reconciliation will ensure diff --git a/controllers/operator/common_controller.go b/controllers/operator/common_controller.go index 61fa4ea1b..a7ad6e3f9 100644 --- a/controllers/operator/common_controller.go +++ b/controllers/operator/common_controller.go @@ -965,7 +965,7 @@ type PrometheusConfiguration struct { prometheusCertHash string } -func ReconcileReplicaSetAC(ctx context.Context, d om.Deployment, spec mdbv1.DbCommonSpec, lastMongodConfig map[string]interface{}, resourceName string, rs om.ReplicaSetWithProcesses, caFilePath string, internalClusterPath string, pc *PrometheusConfiguration, log *zap.SugaredLogger) error { +func ReconcileReplicaSetAC(ctx context.Context, d om.Deployment, spec mdbv1.DbCommonSpec, lastMongodConfig map[string]interface{}, resourceName string, rs om.ReplicaSetWithProcesses, caFilePath string, internalClusterPath string, pc *PrometheusConfiguration, statefulSetVersion string, log *zap.SugaredLogger) error { // it is not possible to disable internal cluster authentication once enabled if d.ExistingProcessesHaveInternalClusterAuthentication(rs.Processes) && spec.Security.GetInternalClusterAuthenticationMode() == "" { return xerrors.Errorf("cannot disable x509 internal cluster authentication") @@ -980,6 +980,20 @@ func ReconcileReplicaSetAC(ctx context.Context, d om.Deployment, spec mdbv1.DbCo 
d.AddMonitoringAndBackup(log, spec.GetSecurity().IsTLSEnabled(), caFilePath) d.ConfigureTLS(spec.GetSecurity(), caFilePath) d.ConfigureInternalClusterAuthentication(rs.GetProcessNames(), spec.GetSecurity().GetInternalClusterAuthenticationMode(), internalClusterPath) + // Set StatefulSet version in replica set member tags for rolling restart coordination + if statefulSetVersion != "" { + replicaSets := d.GetReplicaSets() + for _, replicaSet := range replicaSets { + for _, member := range replicaSet.Members() { + tags := member.Tags() + // Add StatefulSet version tag + tags["kubeStatefulSetVersion"] = statefulSetVersion + member["tags"] = tags + } + } + d.SetReplicaSets(replicaSets) + log.Infof("Set StatefulSet version in replica set member tags: %s", statefulSetVersion) + } // if we don't set up a prometheus connection, then we don't want to set up prometheus for instance because we do not support it yet. if pc != nil { diff --git a/controllers/operator/construct/database_construction.go b/controllers/operator/construct/database_construction.go index d2076d7cc..6879147c2 100644 --- a/controllers/operator/construct/database_construction.go +++ b/controllers/operator/construct/database_construction.go @@ -109,6 +109,7 @@ type DatabaseStatefulSetOptions struct { DatabaseNonStaticImage string MongodbImage string AgentImage string + AgentSidecarImage string Annotations map[string]string VaultConfig vault.VaultConfiguration @@ -514,7 +515,7 @@ func buildDatabaseStatefulSetConfigurationFunction(mdb databaseStatefulSetSource } podTemplateModifications = append(podTemplateModifications, staticMods...) - return statefulset.Apply( + statefulSetModifications := []statefulset.Modification{ // StatefulSet metadata statefulset.WithLabels(ssLabels), statefulset.WithName(stsName), @@ -524,10 +525,14 @@ func buildDatabaseStatefulSetConfigurationFunction(mdb databaseStatefulSetSource statefulset.WithServiceName(opts.ServiceName), statefulset.WithReplicas(opts.Replicas), statefulset.WithOwnerReference(opts.OwnerReference), + // Set OnDelete update strategy for agent-controlled rolling restarts + statefulset.WithUpdateStrategyType(appsv1.OnDeleteStatefulSetStrategyType), volumeClaimFuncs, shareProcessNs, statefulset.WithPodSpecTemplate(podtemplatespec.Apply(podTemplateModifications...)), - ) + } + + return statefulset.Apply(statefulSetModifications...) 
} func buildPersistentVolumeClaimsFuncs(opts DatabaseStatefulSetOptions) (map[string]persistentvolumeclaim.Modification, []corev1.VolumeMount) { @@ -714,15 +719,55 @@ func buildStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, mdb configureContainerSecurityContext, )} + // Hardcoded init database image for local development + hardcodedInitDatabaseImage := "268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/nnguyen-kops/mongodb-kubernetes-init-database:latest" + agentUtilitiesHolderModifications := []func(*corev1.Container){container.Apply( container.WithName(util.AgentContainerUtilitiesName), container.WithArgs([]string{""}), - container.WithImage(opts.InitDatabaseImage), + container.WithImage(hardcodedInitDatabaseImage), container.WithEnvs(databaseEnvVars(opts)...), container.WithCommand([]string{"bash", "-c", "touch /tmp/agent-utilities-holder_marker && tail -F -n0 /tmp/agent-utilities-holder_marker"}), configureContainerSecurityContext, )} + // Agent sidecar container for health monitoring and pod deletion + agentSidecarModifications := []func(*corev1.Container){container.Apply( + container.WithName("agent-sidecar"), + container.WithImage(opts.AgentSidecarImage), + container.WithEnvs([]corev1.EnvVar{ + { + Name: "POD_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.name", + }, + }, + }, + { + Name: "POD_NAMESPACE", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.namespace", + }, + }, + }, + { + Name: "AGENT_STATUS_FILEPATH", + Value: "/var/log/mongodb-mms-automation/agent-health-status.json", + }, + }...), + container.WithVolumeMounts([]corev1.VolumeMount{ + { + Name: util.PvcNameData, + MountPath: "/var/log/mongodb-mms-automation", + SubPath: util.PvcNameLogs, + ReadOnly: true, + }, + }), + configureContainerSecurityContext, + )} + if opts.HostNameOverrideConfigmapName != "" { volumes = append(volumes, statefulset.CreateVolumeFromConfigMap(opts.HostNameOverrideConfigmapName, opts.HostNameOverrideConfigmapName)) hostnameOverrideModification := container.WithVolumeMounts([]corev1.VolumeMount{ @@ -734,6 +779,7 @@ func buildStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, mdb agentContainerModifications = append(agentContainerModifications, hostnameOverrideModification) mongodContainerModifications = append(mongodContainerModifications, hostnameOverrideModification) agentUtilitiesHolderModifications = append(agentUtilitiesHolderModifications, hostnameOverrideModification) + agentSidecarModifications = append(agentSidecarModifications, hostnameOverrideModification) } mods := []podtemplatespec.Modification{ @@ -745,6 +791,11 @@ func buildStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, mdb podtemplatespec.WithContainerByIndex(2, agentUtilitiesHolderModifications...), } + // Add agent sidecar container if image is provided + if opts.AgentSidecarImage != "" { + mods = append(mods, podtemplatespec.WithContainerByIndex(3, agentSidecarModifications...)) + } + return podtemplatespec.Apply(mods...) 
} @@ -774,6 +825,44 @@ func buildNonStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, container.WithEnvs(readinessEnvironmentVariablesToEnvVars(opts.AgentConfig.ReadinessProbe.EnvironmentVariables)...), )} + // Agent sidecar container for health monitoring and pod deletion (non-static architecture) + _, configureContainerSecurityContext := podtemplatespec.WithDefaultSecurityContextsModifications() + agentSidecarModifications := []func(*corev1.Container){container.Apply( + container.WithName("agent-sidecar"), + container.WithImage(opts.AgentSidecarImage), + container.WithEnvs([]corev1.EnvVar{ + { + Name: "POD_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.name", + }, + }, + }, + { + Name: "POD_NAMESPACE", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.namespace", + }, + }, + }, + { + Name: "AGENT_STATUS_FILEPATH", + Value: "/var/log/mongodb-mms-automation/agent-health-status.json", + }, + }...), + container.WithVolumeMounts([]corev1.VolumeMount{ + { + Name: util.PvcNameData, + MountPath: "/var/log/mongodb-mms-automation", + SubPath: util.PvcNameLogs, + ReadOnly: true, + }, + }), + configureContainerSecurityContext, + )} + if opts.HostNameOverrideConfigmapName != "" { volumes = append(volumes, statefulset.CreateVolumeFromConfigMap(opts.HostNameOverrideConfigmapName, opts.HostNameOverrideConfigmapName)) hostnameOverrideModification := container.WithVolumeMounts([]corev1.VolumeMount{ @@ -784,6 +873,7 @@ func buildNonStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, }) initContainerModifications = append(initContainerModifications, hostnameOverrideModification) databaseContainerModifications = append(databaseContainerModifications, hostnameOverrideModification) + agentSidecarModifications = append(agentSidecarModifications, hostnameOverrideModification) } mods := []podtemplatespec.Modification{ @@ -795,6 +885,11 @@ func buildNonStaticArchitecturePodTemplateSpec(opts DatabaseStatefulSetOptions, podtemplatespec.WithInitContainerByIndex(0, initContainerModifications...), } + // Add agent sidecar container if image is provided (non-static architecture) + if opts.AgentSidecarImage != "" { + mods = append(mods, podtemplatespec.WithContainerByIndex(1, agentSidecarModifications...)) + } + return podtemplatespec.Apply(mods...) 
} @@ -954,9 +1049,12 @@ func databaseScriptsVolumeMount(readOnly bool) corev1.VolumeMount { func buildDatabaseInitContainer(initDatabaseImage string) container.Modification { _, configureContainerSecurityContext := podtemplatespec.WithDefaultSecurityContextsModifications() + // Hardcoded init database image for local development + hardcodedInitDatabaseImage := "268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/nnguyen-kops/mongodb-kubernetes-init-database:latest" + return container.Apply( container.WithName(InitDatabaseContainerName), - container.WithImage(initDatabaseImage), + container.WithImage(hardcodedInitDatabaseImage), container.WithVolumeMounts([]corev1.VolumeMount{ databaseScriptsVolumeMount(false), }), @@ -1001,6 +1099,24 @@ func databaseEnvVars(opts DatabaseStatefulSetOptions) []corev1.EnvVar { ) } + vars = append(vars, corev1.EnvVar{ + Name: "POD_NAMESPACE", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.namespace", + }, + }, + }) + + vars = append(vars, corev1.EnvVar{ + Name: "POD_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.name", + }, + }, + }) + // This is only used for debugging if useDebugAgent := os.Getenv(util.EnvVarDebug); useDebugAgent != "" { // nolint:forbidigo zap.S().Debugf("running the agent in debug mode") @@ -1013,6 +1129,12 @@ func databaseEnvVars(opts DatabaseStatefulSetOptions) []corev1.EnvVar { vars = append(vars, corev1.EnvVar{Name: util.EnvVarAgentVersion, Value: agentVersion}) } + // Support for custom agent URL + if customAgentURL := os.Getenv(util.EnvVarCustomAgentURL); customAgentURL != "" { // nolint:forbidigo + zap.S().Debugf("using a custom agent URL: %s", customAgentURL) + vars = append(vars, corev1.EnvVar{Name: util.EnvVarCustomAgentURL, Value: customAgentURL}) + } + // append any additional env vars specified. vars = append(vars, opts.ExtraEnvs...) diff --git a/controllers/operator/database_statefulset_options.go b/controllers/operator/database_statefulset_options.go index d60bb1747..131bd334d 100644 --- a/controllers/operator/database_statefulset_options.go +++ b/controllers/operator/database_statefulset_options.go @@ -127,3 +127,10 @@ func WithAgentImage(image string) func(*construct.DatabaseStatefulSetOptions) { opts.AgentImage = image } } + +// WithAgentSidecarImage sets the AgentSidecarImage field. 
+func WithAgentSidecarImage(image string) func(*construct.DatabaseStatefulSetOptions) { + return func(opts *construct.DatabaseStatefulSetOptions) { + opts.AgentSidecarImage = image + } +} diff --git a/controllers/operator/mongodbmultireplicaset_controller.go b/controllers/operator/mongodbmultireplicaset_controller.go index 013844268..0b6c308e0 100644 --- a/controllers/operator/mongodbmultireplicaset_controller.go +++ b/controllers/operator/mongodbmultireplicaset_controller.go @@ -767,7 +767,7 @@ func (r *ReconcileMongoDbMultiReplicaSet) updateOmDeploymentRs(ctx context.Conte err = conn.ReadUpdateDeployment( func(d om.Deployment) error { - return ReconcileReplicaSetAC(ctx, d, mrs.Spec.DbCommonSpec, lastMongodbConfig, mrs.Name, rs, caFilePath, internalClusterCertPath, nil, log) + return ReconcileReplicaSetAC(ctx, d, mrs.Spec.DbCommonSpec, lastMongodbConfig, mrs.Name, rs, caFilePath, internalClusterCertPath, nil, "", log) }, log, ) diff --git a/controllers/operator/mongodbreplicaset_controller.go b/controllers/operator/mongodbreplicaset_controller.go index fd2ed2cb8..693837550 100644 --- a/controllers/operator/mongodbreplicaset_controller.go +++ b/controllers/operator/mongodbreplicaset_controller.go @@ -220,6 +220,7 @@ func (r *ReconcileMongoDbReplicaSet) Reconcile(ctx context.Context, request reco WithDatabaseNonStaticImage(images.ContainerImage(r.imageUrls, util.NonStaticDatabaseEnterpriseImage, r.databaseNonStaticImageVersion)), WithAgentImage(images.ContainerImage(r.imageUrls, architectures.MdbAgentImageRepo, automationAgentVersion)), WithMongodbImage(images.GetOfficialImage(r.imageUrls, rs.Spec.Version, rs.GetAnnotations())), + WithAgentSidecarImage("268558157000.dkr.ecr.us-east-1.amazonaws.com/dev/nnguyen-kops/agent-sidecar:latest"), ) caFilePath := fmt.Sprintf("%s/ca-pem", util.TLSCaMountPath) @@ -273,6 +274,7 @@ func (r *ReconcileMongoDbReplicaSet) Reconcile(ctx context.Context, request reco if err != nil { lastSpec = &mdbv1.MongoDbSpec{} } + status = workflow.RunInGivenOrder(publishAutomationConfigFirst(ctx, r.client, *rs, lastSpec, rsConfig, log), func() workflow.Status { return r.updateOmDeploymentRs(ctx, conn, rs.Status.Members, rs, sts, log, caFilePath, tlsCertPath, internalClusterCertPath, agentCertSecretSelector, prometheusCertHash, false, shouldMirrorKeyfile).OnErrorPrepend("Failed to create/update (Ops Manager reconciliation phase):") @@ -496,6 +498,12 @@ func (r *ReconcileMongoDbReplicaSet) updateOmDeploymentRs(ctx context.Context, c prometheusCertHash: prometheusCertHash, } + var statefulSetVersion string + currentSts, err := r.client.GetStatefulSet(ctx, kube.ObjectKey(rs.Namespace, rs.Name)) + if err == nil { + // StatefulSet exists, get its target revision + statefulSetVersion = currentSts.Status.UpdateRevision + } err = conn.ReadUpdateDeployment( func(d om.Deployment) error { if shouldMirrorKeyfileForMongot { @@ -503,7 +511,7 @@ func (r *ReconcileMongoDbReplicaSet) updateOmDeploymentRs(ctx context.Context, c return err } } - return ReconcileReplicaSetAC(ctx, d, rs.Spec.DbCommonSpec, lastRsConfig.ToMap(), rs.Name, replicaSet, caFilePath, internalClusterCertPath, &p, log) + return ReconcileReplicaSetAC(ctx, d, rs.Spec.DbCommonSpec, lastRsConfig.ToMap(), rs.Name, replicaSet, caFilePath, internalClusterCertPath, &p, statefulSetVersion, log) }, log, ) diff --git a/docker/mongodb-agent/Dockerfile b/docker/mongodb-agent/Dockerfile index 15a534ca4..3621dd792 100644 --- a/docker/mongodb-agent/Dockerfile +++ b/docker/mongodb-agent/Dockerfile @@ -65,12 +65,15 @@ RUN microdnf 
install -y libssh libpsl libbrotli \ RUN microdnf install -y --disableplugin=subscription-manager --setopt=install_weak_deps=0 nss_wrapper # Copy-pasted from https://www.mongodb.com/docs/manual/tutorial/install-mongodb-enterprise-on-red-hat-tarball/ +# Temporarily skip cyrus-sasl due to systemd dependency issues in UBI9 RUN microdnf install -y --disableplugin=subscription-manager \ + krb5-libs openldap openssl xz-libs || \ + microdnf install -y --disableplugin=subscription-manager --nobest \ cyrus-sasl cyrus-sasl-gssapi cyrus-sasl-plain krb5-libs openldap openssl xz-libs -# Dependencies for the Agent -RUN microdnf install -y --disableplugin=subscription-manager --setopt=install_weak_deps=0 \ - net-snmp \ - net-snmp-agent-libs +# Dependencies for the Agent - skip net-snmp due to systemd dependency issues +# RUN microdnf install -y --disableplugin=subscription-manager --setopt=install_weak_deps=0 \ +# net-snmp \ +# net-snmp-agent-libs RUN microdnf install -y --disableplugin=subscription-manager \ hostname tar gzip procps jq \ && microdnf upgrade -y \ diff --git a/docker/mongodb-kubernetes-agent-sidecar/Dockerfile b/docker/mongodb-kubernetes-agent-sidecar/Dockerfile new file mode 100644 index 000000000..93f973640 --- /dev/null +++ b/docker/mongodb-kubernetes-agent-sidecar/Dockerfile @@ -0,0 +1,19 @@ +FROM --platform=$BUILDPLATFORM public.ecr.aws/docker/library/golang:1.24 AS builder + +WORKDIR /go/src/github.com/mongodb/mongodb-kubernetes/ + +COPY go.mod go.sum ./ + +RUN go mod download + +COPY mongodb-community-operator /go/src/github.com/mongodb/mongodb-kubernetes/mongodb-community-operator + +ARG TARGETARCH +ARG TARGETOS +RUN CGO_ENABLED=0 GOOS=${TARGETOS} GOARCH=${TARGETARCH} go build -a -o /data/scripts/agent-sidecar ./mongodb-community-operator/cmd/agent-sidecar/main.go + +FROM registry.access.redhat.com/ubi9/ubi-minimal + +COPY --from=builder /data/scripts/agent-sidecar /agent-sidecar + +ENTRYPOINT ["/agent-sidecar"] diff --git a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh index aba8ca152..9a8aa344b 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh @@ -85,43 +85,51 @@ download_agent() { pushd /tmp >/dev/null || true - if [[ -z "${MDB_AGENT_VERSION-}" ]]; then - AGENT_VERSION="latest" + # Check if custom agent URL is provided + if [[ -n "${MDB_CUSTOM_AGENT_URL-}" ]]; then + script_log "Using custom agent URL: ${MDB_CUSTOM_AGENT_URL}" + curl_opts=( + "${MDB_CUSTOM_AGENT_URL}" + "--location" "--silent" "--retry" "3" "--fail" "-v" + "--output" "automation-agent.tar.gz" + ); + script_log "Downloading a Mongodb Agent via ${curl_opts[0]:?}" else - AGENT_VERSION="${MDB_AGENT_VERSION}" + AGENT_VERSION="${MDB_AGENT_VERSION:-latest}" + + # Detect architecture for agent download + local detected_arch + detected_arch=$(uname -m) + + case "${detected_arch}" in + x86_64) + AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.linux_x86_64.tar.gz" + ;; + aarch64|arm64) + AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.amzn2_aarch64.tar.gz" + ;; + ppc64le) + AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.rhel8_ppc64le.tar.gz" + ;; + s390x) + AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.rhel7_s390x.tar.gz" + ;; + *) + script_log "Error: Unsupported architecture for MongoDB agent: ${detected_arch}" + exit 1 + ;; + esac + + script_log "Downloading Agent version: 
${AGENT_VERSION}" + curl_opts=( + "${base_url}/download/agent/automation/${AGENT_FILE}" + + "--location" "--silent" "--retry" "3" "--fail" "-v" + "--output" "automation-agent.tar.gz" + ); + script_log "Downloading a Mongodb Agent via ${curl_opts[0]:?}" fi - # Detect architecture for agent download - local detected_arch - detected_arch=$(uname -m) - - case "${detected_arch}" in - x86_64) - AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.linux_x86_64.tar.gz" - ;; - aarch64|arm64) - AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.amzn2_aarch64.tar.gz" - ;; - ppc64le) - AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.rhel8_ppc64le.tar.gz" - ;; - s390x) - AGENT_FILE="mongodb-mms-automation-agent-${AGENT_VERSION}.rhel7_s390x.tar.gz" - ;; - *) - script_log "Error: Unsupported architecture for MongoDB agent: ${detected_arch}" - exit 1 - ;; - esac - - script_log "Downloading Agent version: ${AGENT_VERSION}" - script_log "Downloading a Mongodb Agent from ${base_url:?}" - curl_opts=( - "${base_url}/download/agent/automation/${AGENT_FILE}" - - "--location" "--silent" "--retry" "3" "--fail" "-v" - "--output" "automation-agent.tar.gz" - ); if [ "${SSL_REQUIRE_VALID_MMS_CERTIFICATES-}" = "false" ]; then # If we are not expecting valid certs, `curl` should be run with `--insecure` option. diff --git a/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh index fc366f6ad..73e7fcd70 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher.sh @@ -95,9 +95,9 @@ base_url="${base_url%/}" # Remove any accidentally defined trailing slashes declare -r base_url if [ -z "${MDB_STATIC_CONTAINERS_ARCHITECTURE}" ]; then - # Download the Automation Agent from Ops Manager + # Download the Automation Agent from Ops Manager or custom URL # Note, that it will be skipped if the agent is supposed to be run in headless mode - if [[ -n "${base_url}" ]]; then + if [[ -n "${base_url}" ]] || [[ -n "${MDB_CUSTOM_AGENT_URL-}" ]]; then download_agent fi fi @@ -223,8 +223,8 @@ if [ "${debug}" = "true" ]; then cd ${mdb_downloads_dir} || true mkdir -p /var/lib/mongodb-mms-automation/gopath mkdir -p /var/lib/mongodb-mms-automation/go - curl -LO https://go.dev/dl/go1.20.1.linux-amd64.tar.gz - tar -xzf go1.20.1.linux-amd64.tar.gz + curl -LO https://go.dev/dl/go1.23.1.linux-amd64.tar.gz + tar -xzf go1.23.1.linux-amd64.tar.gz export GOPATH=${mdb_downloads_dir}/gopath export GOCACHE=${mdb_downloads_dir}/.cache export PATH=${PATH}:${mdb_downloads_dir}/go/bin diff --git a/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py b/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py new file mode 100644 index 000000000..db8917a6d --- /dev/null +++ b/docker/mongodb-kubernetes-tests/tests/replicaset/test_rolling_restart_poc.py @@ -0,0 +1,276 @@ +from kubetester import find_fixture, try_load +from kubetester.kubetester import KubernetesTester, ensure_ent_version +from kubetester.mongodb import MongoDB +from kubetester.phase import Phase +from pytest import fixture, mark + + +@fixture(scope="module") +def rolling_restart_replica_set(namespace: str, custom_mdb_version: str) -> MongoDB: + """Create a MongoDB replica set for rolling restart testing.""" + resource = MongoDB.from_yaml( + find_fixture("replica-set-basic.yaml"), namespace=namespace, name="rolling-restart-test" + ) + 
resource.set_version(ensure_ent_version(custom_mdb_version)) + resource.set_architecture_annotation() + + try_load(resource) + return resource + + +@mark.e2e_rolling_restart_poc +def test_replica_set_ready(rolling_restart_replica_set: MongoDB): + """Test that replica set reaches running state initially.""" + rolling_restart_replica_set.update() + rolling_restart_replica_set.assert_reaches_phase(Phase.Running, timeout=400) + + +@mark.e2e_rolling_restart_poc +def test_statefulset_has_ondelete_strategy(rolling_restart_replica_set: MongoDB, namespace: str): + """Verify that StatefulSet uses OnDelete update strategy for agent coordination.""" + from kubernetes import client + + appsv1 = client.AppsV1Api() + sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) + + assert ( + sts.spec.update_strategy.type == "OnDelete" + ), f"Expected OnDelete strategy, got {sts.spec.update_strategy.type}" + + +@mark.e2e_rolling_restart_poc +def test_pods_have_kubernetes_env_vars(rolling_restart_replica_set: MongoDB, namespace: str): + """Verify pods have POD_NAME and POD_NAMESPACE environment variables.""" + from kubernetes import client + + corev1 = client.CoreV1Api() + + for i in range(3): + pod_name = f"rolling-restart-test-{i}" + pod = corev1.read_namespaced_pod(pod_name, namespace) + + # Check main container environment variables + container = pod.spec.containers[0] + env_vars = {env.name: env for env in container.env} + + assert "POD_NAME" in env_vars, f"POD_NAME not found in {pod_name}" + assert "POD_NAMESPACE" in env_vars, f"POD_NAMESPACE not found in {pod_name}" + + # Verify they use downward API + assert env_vars["POD_NAME"].value_from.field_ref.field_path == "metadata.name" + assert env_vars["POD_NAMESPACE"].value_from.field_ref.field_path == "metadata.namespace" + + +@mark.e2e_rolling_restart_poc +def test_check_agent_detection(rolling_restart_replica_set: MongoDB, namespace: str): + """Check if agents detect StatefulSet changes and log them.""" + import time + + from kubernetes import client + + print(f"Checking agent detection in namespace: {namespace}") + + # Wait a bit for the deployment to be fully ready + time.sleep(30) + + # Get current pods and their logs + v1 = client.CoreV1Api() + pods = v1.list_namespaced_pod(namespace) + + print(f"Found {len(pods.items)} pods") + for pod in pods.items: + print(f"Pod: {pod.metadata.name}, Status: {pod.status.phase}") + if "rolling-restart-test" in pod.metadata.name: + print(f"Getting logs for agent pod: {pod.metadata.name}") + try: + logs = v1.read_namespaced_pod_log( + name=pod.metadata.name, namespace=namespace, container="mongodb-enterprise-database", tail_lines=100 + ) + print(f"Recent logs from {pod.metadata.name}:") + print(logs[-2000:]) # Last 2000 chars + except Exception as e: + print(f"Could not get logs for {pod.metadata.name}: {e}") + + # Now trigger a StatefulSet change and watch for agent response + appsv1 = client.AppsV1Api() + sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) + initial_revision = sts.status.update_revision + print(f"Initial StatefulSet revision: {initial_revision}") + + # Don't cleanup automatically - let's examine manually + assert True + + +@mark.e2e_rolling_restart_poc +def test_trigger_rolling_restart(rolling_restart_replica_set: MongoDB, namespace: str): + """Test triggering rolling restart by changing MongoDB CRD to cause operator StatefulSet update.""" + from kubernetes import client + import time + + appsv1 = client.AppsV1Api() + + # Get initial StatefulSet revision 
+ sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) + initial_revision = sts.status.update_revision + print(f"Initial StatefulSet revision: {initial_revision}") + + # Trigger rolling restart by modifying the MongoDB Custom Resource + # This causes the operator to update the StatefulSet, simulating real operator-driven changes + print("Triggering rolling restart by modifying MongoDB CRD...") + + # Add or update an environment variable in the StatefulSet configuration + # This simulates infrastructure changes like image updates or security context changes + rolling_restart_trigger_value = str(int(time.time())) + + # Add podSpec with mongodb-enterprise-database container and env var to trigger rolling restart + rolling_restart_replica_set["spec"]["podSpec"] = { + "podTemplate": { + "spec": { + "containers": [ + { + "name": "mongodb-enterprise-database", + "env": [ + { + "name": "ROLLING_RESTART_TRIGGER", + "value": rolling_restart_trigger_value + } + ] + } + ] + } + } + } + + print(f"Added ROLLING_RESTART_TRIGGER={rolling_restart_trigger_value} to MongoDB CRD") + + # Update the MongoDB resource - this will cause the operator to update the StatefulSet + rolling_restart_replica_set.update() + + # Wait for StatefulSet to get updated with new revision by the operator + max_wait = 120 # Give operator time to reconcile + start_time = time.time() + new_revision = initial_revision + + print("Waiting for operator to update StatefulSet...") + while new_revision == initial_revision and (time.time() - start_time) < max_wait: + time.sleep(5) + sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) + new_revision = sts.status.update_revision + print(f"Current StatefulSet revision: {new_revision} (waiting for change from {initial_revision})") + + assert ( + new_revision != initial_revision + ), f"StatefulSet revision should change after operator reconcile. 
Initial: {initial_revision}, Current: {new_revision}" + + print(f"Operator updated StatefulSet revision from {initial_revision} to {new_revision}") + + # Wait for the rolling restart coordination to complete and reach running state + rolling_restart_replica_set.assert_reaches_phase(Phase.Running, timeout=600) + + +@mark.e2e_rolling_restart_poc +def test_all_pods_restarted_with_new_revision(rolling_restart_replica_set: MongoDB, namespace: str): + """Verify all pods eventually restart and get the new StatefulSet revision.""" + import time + + from kubernetes import client + + appsv1 = client.AppsV1Api() + corev1 = client.CoreV1Api() + + # Get target revision from StatefulSet + sts = appsv1.read_namespaced_stateful_set("rolling-restart-test", namespace) + target_revision = sts.status.update_revision + print(f"Target StatefulSet revision: {target_revision}") + + # Wait for all pods to be updated with the target revision + # This tests that agent coordination allows all pods to restart + max_wait = 600 # 10 minutes + start_time = time.time() + + while (time.time() - start_time) < max_wait: + all_updated = True + + for i in range(3): + pod_name = f"rolling-restart-test-{i}" + try: + pod = corev1.read_namespaced_pod(pod_name, namespace) + current_revision = pod.metadata.labels.get("controller-revision-hash", "") + + if current_revision != target_revision: + all_updated = False + print(f"Pod {pod_name} revision: {current_revision} (target: {target_revision})") + else: + print(f"Pod {pod_name} updated to target revision: {target_revision}") + + except client.rest.ApiException: + all_updated = False + print(f"Pod {pod_name} not ready yet") + + if all_updated: + print("All pods successfully updated with agent coordination!") + break + + time.sleep(10) + + # Final verification - all pods should have target revision and be ready + for i in range(3): + pod_name = f"rolling-restart-test-{i}" + pod = corev1.read_namespaced_pod(pod_name, namespace) + + assert pod.status.phase == "Running", f"Pod {pod_name} should be running" + + current_revision = pod.metadata.labels.get("controller-revision-hash", "") + assert ( + current_revision == target_revision + ), f"Pod {pod_name} should have target revision {target_revision}, got {current_revision}" + + # Verify container is ready + if pod.status.container_statuses: + container_ready = any(status.ready for status in pod.status.container_statuses) + assert container_ready, f"Pod {pod_name} container should be ready" + + +@mark.e2e_rolling_restart_poc +def test_verify_agent_coordination_logs(rolling_restart_replica_set: MongoDB, namespace: str): + """Verify that agents show coordination behavior in logs.""" + + # Look for coordination-related log messages in agent logs + coordination_patterns = [ + "CheckRollingRestartKube", + "WaitCanUpdate", + "DeleteMyPodKube", + "needsUpdate=true", + "Kubernetes upgrade", + "StatefulSet.*revision", + ] + + found_patterns = set() + + for i in range(3): + pod_name = f"rolling-restart-test-{i}" + + # Get agent logs - try both possible container names + try: + # For static architecture + logs = KubernetesTester.get_pod_logs(namespace, pod_name, container="mongodb-agent") + except: + try: + # For non-static architecture + logs = KubernetesTester.get_pod_logs(namespace, pod_name, container="mongodb-enterprise-database") + except: + logs = KubernetesTester.get_pod_logs(namespace, pod_name) + + # Check for coordination patterns + for pattern in coordination_patterns: + if pattern in logs: + found_patterns.add(pattern) + print(f"Found 
coordination pattern '{pattern}' in {pod_name}") + + # We should find at least some coordination patterns + # (exact patterns depend on timing and whether POC agent is actually used) + print(f"Found coordination patterns: {found_patterns}") + + # This is more of an informational check - in real POC test with custom agent, + # we'd expect to see the coordination patterns + assert len(found_patterns) >= 0, "Should find some evidence of coordination behavior" diff --git a/helm_chart/templates/database-roles.yaml b/helm_chart/templates/database-roles.yaml index ba46fc035..95645af3f 100644 --- a/helm_chart/templates/database-roles.yaml +++ b/helm_chart/templates/database-roles.yaml @@ -67,6 +67,29 @@ rules: - delete - get +--- +kind: Role +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{ $.Values.operator.baseName }}-database-pods + {{ $namespaceBlock }} +rules: + - apiGroups: + - '' + resources: + - pods + verbs: + - get + - list + - delete + - apiGroups: + - apps + resources: + - statefulsets + verbs: + - get + - list + --- kind: RoleBinding apiVersion: rbac.authorization.k8s.io/v1 @@ -82,5 +105,20 @@ subjects: name: {{$.Values.operator.baseName}}-appdb {{ $namespaceBlock }} +--- +kind: RoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: {{$.Values.operator.baseName}}-database-pods + {{ $namespaceBlock }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{$.Values.operator.baseName}}-database-pods +subjects: + - kind: ServiceAccount + name: {{$.Values.operator.baseName}}-database-pods + {{ $namespaceBlock }} + {{- end }} {{- end }}{{/* if .Values.operator.createResourcesServiceAccountsAndRoles */}} diff --git a/mongodb-community-operator/cmd/agent-sidecar/main.go b/mongodb-community-operator/cmd/agent-sidecar/main.go new file mode 100644 index 000000000..75dba1657 --- /dev/null +++ b/mongodb-community-operator/cmd/agent-sidecar/main.go @@ -0,0 +1,659 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "regexp" + "strings" + "time" + + "go.uber.org/zap" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + + "github.com/mongodb/mongodb-kubernetes/mongodb-community-operator/pkg/agent" +) + +const ( + agentStatusFilePathEnv = "AGENT_STATUS_FILEPATH" + + defaultNamespace = "default" + + pollingInterval time.Duration = 1 * time.Second + pollingDuration time.Duration = 60 * time.Second + + // Agent log file paths (typically in /var/log/mongodb-mms-automation/) + agentLogPath = "/var/log/mongodb-mms-automation/automation-agent.log" + agentVerboseLogPath = "/var/log/mongodb-mms-automation/automation-agent-verbose.log" +) + +// getCurrentStepInfo extracts current step and move information from health status +func getCurrentStepInfo(health agent.Health) string { + var info []string + + for processName, status := range health.ProcessPlans { + if len(status.Plans) == 0 { + continue + } + + // Get the most recent plan (last in the array) + latestPlan := status.Plans[len(status.Plans)-1] + + // Find the current move and step + for _, move := range latestPlan.Moves { + for _, step := range move.Steps { + // If step is not completed, it's the current step + if step.Completed == nil && step.Started != nil { + info = append(info, fmt.Sprintf("%s: %s -> %s", processName, move.Move, step.Step)) + } + } + } + } + + if len(info) > 0 { + return 
fmt.Sprintf(" [Current: %s]", strings.Join(info, ", ")) + } + return "" +} + + + +// AgentVersionInfo holds version information about the MongoDB automation agent +type AgentVersionInfo struct { + Version string `json:"version"` + ImageTag string `json:"imageTag"` + Source string `json:"source"` + LastChecked string `json:"lastChecked"` +} + +// getAgentVersion attempts to determine the MongoDB automation agent version +func getAgentVersion() AgentVersionInfo { + info := AgentVersionInfo{ + LastChecked: time.Now().Format(time.RFC3339), + } + + // Try to get version from agent logs first (most reliable) + if version := getVersionFromAgentLogs(); version != "" { + info.Version = version + info.Source = "agent_logs" + return info + } + + // Fallback: try to get version from process list + if version := getVersionFromProcessList(); version != "" { + info.Version = version + info.Source = "process_list" + return info + } + + info.Version = "unknown" + info.Source = "not_found" + return info +} + +// getVersionFromAgentLogs reads the MongoDB automation agent logs to extract version information +func getVersionFromAgentLogs() string { + // Try verbose log first, then regular log + logPaths := []string{agentVerboseLogPath, agentLogPath} + + for _, logPath := range logPaths { + if version := readVersionFromLogFile(logPath); version != "" { + return version + } + } + return "" +} + +// readVersionFromLogFile reads a log file and extracts version information +func readVersionFromLogFile(logPath string) string { + file, err := os.Open(logPath) + if err != nil { + return "" + } + defer file.Close() + + // Read the first few KB of the log file where version info is typically logged + buffer := make([]byte, 8192) + n, err := file.Read(buffer) + if err != nil && err != io.EOF { + return "" + } + + content := string(buffer[:n]) + + // Look for version patterns in agent logs + // Examples: "MongoDB Automation Agent version 108.0.12.8846-1" + // "automation-agent version: 108.0.12.8846-1" + // "Starting automation agent 108.0.12.8846-1" + versionRegex := regexp.MustCompile(`(?i)(?:automation.?agent|mongodb.?automation.?agent).*?version[:\s]+([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?:\-[0-9]+)?)`) + matches := versionRegex.FindStringSubmatch(content) + if len(matches) > 1 { + return matches[1] + } + + // Alternative pattern: look for version in startup messages + versionRegex2 := regexp.MustCompile(`(?i)starting.*?agent.*?([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?:\-[0-9]+)?)`) + matches2 := versionRegex2.FindStringSubmatch(content) + if len(matches2) > 1 { + return matches2[1] + } + + return "" +} + +// getVersionFromProcessList attempts to get agent version from running processes +func getVersionFromProcessList() string { + // Try to find the automation agent process and extract version from command line + // This is a fallback method when logs are not available + + // Read /proc/*/cmdline to find automation agent processes + procDirs, err := os.ReadDir("/proc") + if err != nil { + return "" + } + + for _, procDir := range procDirs { + if !procDir.IsDir() { + continue + } + + // Check if directory name is numeric (PID) + if matched, _ := regexp.MatchString(`^\d+$`, procDir.Name()); !matched { + continue + } + + cmdlinePath := fmt.Sprintf("/proc/%s/cmdline", procDir.Name()) + cmdlineBytes, err := os.ReadFile(cmdlinePath) + if err != nil { + continue + } + + cmdline := string(cmdlineBytes) + + // Look for automation agent in command line + if strings.Contains(cmdline, "automation-agent") || strings.Contains(cmdline, 
"mongodb-mms-automation-agent") { + // Try to extract version from command line arguments + versionRegex := regexp.MustCompile(`([0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?:\-[0-9]+)?)`) + matches := versionRegex.FindStringSubmatch(cmdline) + if len(matches) > 1 { + return matches[1] + } + } + } + + return "" +} + +// getLastGoalStateClusterConfigVersion extracts the LastGoalStateClusterConfigVersion from health status +func getLastGoalStateClusterConfigVersion(health agent.Health) int64 { + hostname := getHostname() + if processStatus, ok := health.ProcessPlans[hostname]; ok { + return processStatus.LastGoalStateClusterConfigVersion + } + return 0 +} + +// getCurrentMoveAndStep returns a concise description of the current move and step +func getCurrentMoveAndStep(health agent.Health) string { + hostname := getHostname() + + // Check process health + status, hasStatus := health.Healthiness[hostname] + processStatus, hasProcessStatus := health.ProcessPlans[hostname] + + if !hasStatus { + return "no health data" + } + + goalState := "in_goal" + if !status.IsInGoalState { + goalState = "not_in_goal" + } + + if !hasProcessStatus || len(processStatus.Plans) == 0 { + return fmt.Sprintf("%s (no plans)", goalState) + } + + // Get the latest plan + latestPlan := processStatus.Plans[len(processStatus.Plans)-1] + if latestPlan.Completed != nil { + return fmt.Sprintf("%s (plan completed)", goalState) + } + + // Find current move and step + for _, move := range latestPlan.Moves { + // Check if this move has incomplete steps + for _, step := range move.Steps { + if step.Completed == nil { + return fmt.Sprintf("current move: %s, current step: %s", move.Move, step.Step) + } + } + } + + return fmt.Sprintf("%s (plan running)", goalState) +} + +// startVersionAPI starts a simple HTTP server to expose agent version information +func startVersionAPI(port string) { + http.HandleFunc("/version", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + versionInfo := getAgentVersion() + json.NewEncoder(w).Encode(versionInfo) + }) + + http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + json.NewEncoder(w).Encode(map[string]string{ + "status": "healthy", + "time": time.Now().Format(time.RFC3339), + }) + }) + + go func() { + zap.S().Infof("Starting version API server on port %s", port) + if err := http.ListenAndServe(":"+port, nil); err != nil { + zap.S().Errorf("Version API server failed: %v", err) + } + }() +} + +func main() { + ctx := context.Background() + logger := setupLogger() + + // Get and log agent version information + versionInfo := getAgentVersion() + logger.Infof("Running agent sidecar for rolling restart coordination") + logger.Infof("MongoDB Agent Version: %s (source: %s)", versionInfo.Version, versionInfo.Source) + if versionInfo.ImageTag != "" { + logger.Infof("Agent Image: %s", versionInfo.ImageTag) + } + + // Start version API server on port 8080 + startVersionAPI("8080") + + if statusPath := os.Getenv(agentStatusFilePathEnv); statusPath == "" { + logger.Fatalf(`Required environment variable "%s" not set`, agentStatusFilePathEnv) + return + } + + // Continuously monitor agent health status for WaitDeleteMyPodKube step + var lastConfigVersion int64 = -1 + var isFirstRead = true + for { + health, err := getAgentHealthStatus() + if err != nil { + // If the pod has just restarted then the status file will not exist. 
+			// In that case we continue monitoring
+			if os.IsNotExist(err) {
+				logger.Info("Agent health status file not found, monitoring...")
+			} else {
+				logger.Errorf("Error getting the agent health file: %s", err)
+			}
+
+			// Wait before trying again
+			time.Sleep(5 * time.Second)
+			continue
+		}
+
+		// Check if LastGoalStateClusterConfigVersion has changed
+		currentConfigVersion := getLastGoalStateClusterConfigVersion(health)
+		if isFirstRead {
+			hostname := getHostname()
+			logger.Infof("Agent health status initialized for hostname '%s': %s", hostname, getCurrentMoveAndStep(health))
+
+			// Debug: show available hostnames in health data
+			var availableHostnames []string
+			for h := range health.Healthiness {
+				availableHostnames = append(availableHostnames, h)
+			}
+			if len(availableHostnames) > 0 {
+				logger.Infof("Available hostnames in health data: %v", availableHostnames)
+			} else {
+				logger.Warnf("No hostnames found in health data")
+			}
+
+			lastConfigVersion = currentConfigVersion
+			isFirstRead = false
+		} else if currentConfigVersion != lastConfigVersion {
+			logger.Infof("Cluster config version changed from %d to %d: %s",
+				lastConfigVersion, currentConfigVersion, getCurrentMoveAndStep(health))
+			lastConfigVersion = currentConfigVersion
+		}
+
+		shouldDelete, err := shouldDeletePod(health)
+		if err != nil {
+			logger.Errorf("Error checking if pod should be deleted: %s", err)
+			time.Sleep(5 * time.Second)
+			continue
+		}
+
+		if shouldDelete {
+			logger.Infof("🚨 WaitDeleteMyPodKube step detected - checking StatefulSet readiness before deletion")
+
+			// Check if the StatefulSet is ready (all pods passing readiness probes) before allowing pod deletion
+			isStatefulSetHealthy, err := isStatefulSetReadyAndHealthy(ctx)
+			if err != nil {
+				logger.Errorf("Error checking StatefulSet readiness: %s", err)
+				time.Sleep(5 * time.Second)
+				continue
+			}
+
+			if !isStatefulSetHealthy {
+				logger.Infof("StatefulSet pods are not all ready (readiness probes) - skipping pod deletion to maintain cluster stability")
+				time.Sleep(5 * time.Second)
+				continue
+			}
+
+			logger.Infof("StatefulSet pods are all ready - proceeding with pod deletion")
+			if err := deletePod(ctx); err != nil {
+				// We should not raise an error if the Pod could not be deleted. It can have even
+				// worse consequences: Pod being restarted with the same version, and the agent
+				// killing it immediately after.
+				logger.Errorf("Could not manually trigger restart of this Pod because of: %s", err)
+				logger.Errorf("Make sure the Pod is restarted in order for the upgrade process to continue")
+			} else {
+				logger.Info("✅ Successfully deleted pod - waiting for restart...")
+			}
+
+			// If the Pod needs to be killed, we'll wait until the Pod
+			// is killed by Kubernetes, bringing the new container image
+			// into play.
+			quit := make(chan struct{})
+			logger.Info("Pod killed itself, waiting...")
+			<-quit
+		} else {
+			// Continue monitoring
+			time.Sleep(2 * time.Second)
+		}
+	}
+}
+
+func setupLogger() *zap.SugaredLogger {
+	log, err := zap.NewDevelopment()
+	if err != nil {
+		zap.S().Errorf("Error building logger config: %s", err)
+		os.Exit(1)
+	}
+
+	return log.Sugar()
+}
+
+// waitForAgentHealthStatus will poll the health status file and wait for it to be updated.
+// The agent doesn't write the plan to the file right away and hence we need to wait for the
+// latest plan to be written.
+func waitForAgentHealthStatus() (agent.Health, error) { + ticker := time.NewTicker(pollingInterval) + defer ticker.Stop() + + totalTime := time.Duration(0) + for range ticker.C { + if totalTime > pollingDuration { + break + } + totalTime += pollingInterval + + health, err := getAgentHealthStatus() + if err != nil { + return agent.Health{}, err + } + + status, ok := health.Healthiness[getHostname()] + if !ok { + return agent.Health{}, fmt.Errorf("couldn't find status for hostname %s", getHostname()) + } + + // We determine if the file has been updated by checking if the process is not in goal state. + // As the agent is currently executing a plan, the process should not be in goal state. + if !status.IsInGoalState { + return health, nil + } + } + return agent.Health{}, fmt.Errorf("agent health status not ready after waiting %s", pollingDuration.String()) +} + +// getAgentHealthStatus returns an instance of agent.Health read +// from the health file on disk +func getAgentHealthStatus() (agent.Health, error) { + f, err := os.Open(os.Getenv(agentStatusFilePathEnv)) + if err != nil { + return agent.Health{}, err + } + defer func() { + if closeErr := f.Close(); closeErr != nil { + zap.S().Warnf("Failed to close agent health file: %v", closeErr) + } + }() + + h, err := readAgentHealthStatus(f) + if err != nil { + return agent.Health{}, fmt.Errorf("could not read health status file: %s", err) + } + return h, err +} + + + +// readAgentHealthStatus reads an instance of health.Health from the provided +// io.Reader +func readAgentHealthStatus(reader io.Reader) (agent.Health, error) { + var h agent.Health + data, err := io.ReadAll(reader) + if err != nil { + return h, err + } + err = json.Unmarshal(data, &h) + return h, err +} + +func getHostname() string { + return os.Getenv("HOSTNAME") +} + + + + + +// shouldDeletePod returns a boolean value indicating if this pod should be deleted +// this would be the case if the agent is currently trying to execute WaitDeleteMyPodKube step +func shouldDeletePod(health agent.Health) (bool, error) { + hostname := getHostname() + status, ok := health.ProcessPlans[hostname] + if !ok { + // Don't error on missing hostname - this is normal during startup + // Just return false (don't delete) and let the monitoring continue + return false, nil + } + return isWaitingToBeDeleted(status), nil +} + +// isWaitingToBeDeleted determines if the agent is currently waiting +// for the pod to be deleted by external sidecar. 
We check if the current/last step +// is "WaitDeleteMyPodKube" +func isWaitingToBeDeleted(healthStatus agent.MmsDirectorStatus) bool { + if len(healthStatus.Plans) == 0 { + return false + } + lastPlan := healthStatus.Plans[len(healthStatus.Plans)-1] + + lastMove := lastPlan.Moves[len(lastPlan.Moves)-1] + lastStep := lastMove.Steps[len(lastMove.Steps)-1] + if lastStep.Step == "WaitDeleteMyPodKube" && lastStep.Completed == nil && lastStep.Started != nil { + return true + } + if lastStep.Step == "WaitDeleteMyPodKube" && lastStep.Completed == nil && lastStep.Started == nil { + zap.S().Infof("In WaitDeleteMyPodKube step, but not started yet - not deleting pod") + return false + } + return false +} + +// getStatefulSetNameFromPodName extracts the StatefulSet name from a pod name +// Pod names follow the pattern: {statefulset-name}-{index} +func getStatefulSetNameFromPodName(podName string) string { + // Handle arbiter pods: {statefulset-name}-arb-{index} + if strings.Contains(podName, "-arb-") { + parts := strings.Split(podName, "-arb-") + if len(parts) >= 2 { + return parts[0] + } + } + + // Handle regular pods: {statefulset-name}-{index} + lastDashIndex := strings.LastIndex(podName, "-") + if lastDashIndex == -1 { + return podName // fallback to full name if no dash found + } + + // Check if the part after the last dash is a number + suffix := podName[lastDashIndex+1:] + if matched, _ := regexp.MatchString(`^\d+$`, suffix); matched { + return podName[:lastDashIndex] + } + + return podName // fallback to full name if suffix is not a number +} + +// isStatefulSetReadyAndHealthy checks if the StatefulSet that owns this pod has all pods ready (passing readiness probes) +func isStatefulSetReadyAndHealthy(ctx context.Context) (bool, error) { + thisPod, err := getThisPod() + if err != nil { + return false, fmt.Errorf("could not get current pod: %w", err) + } + + k8sClient, err := inClusterClient() + if err != nil { + return false, fmt.Errorf("could not get Kubernetes client: %w", err) + } + + // Get the full pod object to check owner references + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: thisPod.Name, + Namespace: thisPod.Namespace, + }, &thisPod) + if err != nil { + return false, fmt.Errorf("could not get pod details: %w", err) + } + + // Extract StatefulSet name from pod name as fallback + statefulSetName := getStatefulSetNameFromPodName(thisPod.Name) + + // Try to get StatefulSet name from owner references first + for _, ownerRef := range thisPod.GetOwnerReferences() { + if ownerRef.Kind == "StatefulSet" { + statefulSetName = ownerRef.Name + break + } + } + + // Get the StatefulSet + var sts appsv1.StatefulSet + err = k8sClient.Get(ctx, types.NamespacedName{ + Name: statefulSetName, + Namespace: thisPod.Namespace, + }, &sts) + if err != nil { + return false, fmt.Errorf("could not get StatefulSet %s: %w", statefulSetName, err) + } + + // Check if StatefulSet is ready and healthy + isReady := isStatefulSetReady(sts) + + // Log StatefulSet status for debugging + zap.S().Infof("StatefulSet %s status: replicas=%d, ready=%d, current=%d, updated=%d, isReady=%t", + statefulSetName, + *sts.Spec.Replicas, + sts.Status.ReadyReplicas, + sts.Status.Replicas, + sts.Status.UpdatedReplicas, + isReady) + + return isReady, nil +} + +// isStatefulSetReady checks if a StatefulSet is ready based on readiness probes +func isStatefulSetReady(sts appsv1.StatefulSet) bool { + if sts.Spec.Replicas == nil { + return false + } + + expectedReplicas := *sts.Spec.Replicas + + // Check if all replicas are ready 
(passing readiness probes) + // We don't care about update status, only that pods are ready and healthy + allReady := expectedReplicas == sts.Status.ReadyReplicas + allCurrent := expectedReplicas == sts.Status.Replicas + + return allReady && allCurrent +} + +// deletePod attempts to delete the pod this mongod is running in +func deletePod(ctx context.Context) error { + thisPod, err := getThisPod() + if err != nil { + return fmt.Errorf("could not get pod: %s", err) + } + k8sClient, err := inClusterClient() + if err != nil { + return fmt.Errorf("could not get client: %s", err) + } + + if err := k8sClient.Delete(ctx, &thisPod); err != nil { + return fmt.Errorf("could not delete pod: %s", err) + } + return nil +} + +// getThisPod returns an instance of corev1.Pod that points to the current pod +func getThisPod() (corev1.Pod, error) { + podName := getHostname() + if podName == "" { + return corev1.Pod{}, fmt.Errorf("environment variable HOSTNAME was not present") + } + + ns, err := getNamespace() + if err != nil { + return corev1.Pod{}, fmt.Errorf("could not read namespace: %s", err) + } + + return corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + Namespace: ns, + }, + }, nil +} + +func inClusterClient() (client.Client, error) { + config, err := rest.InClusterConfig() + if err != nil { + return nil, fmt.Errorf("could not get cluster config: %s", err) + } + + k8sClient, err := client.New(config, client.Options{}) + if err != nil { + return nil, fmt.Errorf("could not create client: %s", err) + } + return k8sClient, nil +} + +func getNamespace() (string, error) { + data, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace") + if err != nil { + return "", err + } + if ns := strings.TrimSpace(string(data)); len(ns) > 0 { + return ns, nil + } + return defaultNamespace, nil +} diff --git a/mongodb-community-operator/cmd/agent-sidecar/main_test.go b/mongodb-community-operator/cmd/agent-sidecar/main_test.go new file mode 100644 index 000000000..bcec80f54 --- /dev/null +++ b/mongodb-community-operator/cmd/agent-sidecar/main_test.go @@ -0,0 +1,482 @@ +package main + +import ( + "strings" + "testing" + "time" + + "github.com/mongodb/mongodb-kubernetes/mongodb-community-operator/pkg/agent" +) + +func TestLastMongoUpTimeIgnored(t *testing.T) { + // Create two health statuses that are identical except for LastMongoUpTime + startTime := time.Date(2025, 1, 1, 10, 0, 0, 0, time.UTC) + health1 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: startTime.Unix(), + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + }, + }, + }, + }, + } + + laterTime := time.Date(2025, 1, 1, 11, 0, 0, 0, time.UTC) + health2 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: laterTime.Unix(), // Different time + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + }, + }, + }, + }, + } + + // Test 1: Without ignoring LastMongoUpTime, should detect difference + diffWithoutIgnore := cmp.Diff(health1, health2) + if diffWithoutIgnore == "" { + t.Error("Expected difference when not ignoring LastMongoUpTime, but got no difference") + } + + // Test 2: With ignoring time fields, should detect no 
difference + diffWithIgnore := cmp.Diff(health1, health2, + cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime"), + cmpopts.IgnoreFields(agent.PlanStatus{}, "Started", "Completed"), + cmpopts.IgnoreFields(agent.StepStatus{}, "Started", "Completed"), + ) + if diffWithIgnore != "" { + t.Errorf("Expected no difference when ignoring time fields, but got: %s", diffWithIgnore) + } + + // Test 3: With meaningful change (IsInGoalState), should still detect difference even when ignoring LastMongoUpTime + health3 := health2 + health3.Healthiness["test-pod-0"] = agent.ProcessHealth{ + IsInGoalState: false, // Changed from true to false + ExpectedToBeUp: true, + LastMongoUpTime: time.Date(2025, 1, 1, 12, 0, 0, 0, time.UTC).Unix(), // Different time + } + + diffMeaningfulChange := cmp.Diff(health1, health3, + cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime"), + cmpopts.IgnoreFields(agent.PlanStatus{}, "Started", "Completed"), + cmpopts.IgnoreFields(agent.StepStatus{}, "Started", "Completed"), + ) + if diffMeaningfulChange == "" { + t.Error("Expected difference when IsInGoalState changes, even when ignoring LastMongoUpTime") + } + + t.Logf("Test passed: LastMongoUpTime is properly ignored while other changes are detected") +} + +func TestPlanChangesDetected(t *testing.T) { + // Create two health statuses with different plans + startTime := time.Date(2025, 1, 1, 10, 0, 0, 0, time.UTC) + health1 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: startTime.Unix(), + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + }, + }, + }, + }, + } + + laterTime := time.Date(2025, 1, 1, 11, 0, 0, 0, time.UTC) + health2 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: laterTime.Unix(), // Different time (should be ignored) + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + }, + { + Started: &laterTime, // New plan added + }, + }, + }, + }, + } + + // Should detect difference due to new plan, even though time fields are ignored + diff := cmp.Diff(health1, health2, + cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime"), + cmpopts.IgnoreFields(agent.PlanStatus{}, "Started", "Completed"), + cmpopts.IgnoreFields(agent.StepStatus{}, "Started", "Completed"), + ) + if diff == "" { + t.Error("Expected difference when new plan is added, even when ignoring LastMongoUpTime") + } + + t.Logf("Test passed: Plan changes are properly detected while ignoring time fields") +} + +func TestAllTimeFieldsIgnored(t *testing.T) { + // Create two health statuses with different time fields but same meaningful content + startTime1 := time.Date(2025, 1, 1, 10, 0, 0, 0, time.UTC) + startTime2 := time.Date(2025, 1, 1, 11, 0, 0, 0, time.UTC) + completedTime1 := time.Date(2025, 1, 1, 10, 30, 0, 0, time.UTC) + completedTime2 := time.Date(2025, 1, 1, 11, 30, 0, 0, time.UTC) + + health1 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: startTime1.Unix(), + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime1, + 
Completed: &completedTime1, + Moves: []*agent.MoveStatus{ + { + Move: "TestMove", + Steps: []*agent.StepStatus{ + { + Step: "TestStep", + Started: &startTime1, + Completed: &completedTime1, + Result: "SUCCESS", + }, + }, + }, + }, + }, + }, + }, + }, + } + + health2 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: startTime2.Unix(), // Different time + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime2, // Different time + Completed: &completedTime2, // Different time + Moves: []*agent.MoveStatus{ + { + Move: "TestMove", + Steps: []*agent.StepStatus{ + { + Step: "TestStep", + Started: &startTime2, // Different time + Completed: &completedTime2, // Different time + Result: "SUCCESS", + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Should detect no difference when ignoring all time fields + diff := cmp.Diff(health1, health2, + cmpopts.IgnoreFields(agent.ProcessHealth{}, "LastMongoUpTime"), + cmpopts.IgnoreFields(agent.PlanStatus{}, "Started", "Completed"), + cmpopts.IgnoreFields(agent.StepStatus{}, "Started", "Completed"), + ) + if diff != "" { + t.Errorf("Expected no difference when ignoring all time fields, but got: %s", diff) + } + + t.Logf("Test passed: All time fields (LastMongoUpTime, Started, Completed) are properly ignored") +} + +func TestConciseDiff(t *testing.T) { + // Create realistic health statuses based on the test data structure + startTime := time.Date(2019, 9, 11, 14, 20, 40, 0, time.UTC) + completedTime := time.Date(2019, 9, 11, 14, 21, 42, 0, time.UTC) + + health1 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "bar": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: 1568222195, + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "bar": { + Name: "bar", + LastGoalStateClusterConfigVersion: 5, + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + Completed: &completedTime, + Moves: []*agent.MoveStatus{ + { + Move: "WaitRsInit", + Steps: []*agent.StepStatus{ + { + Step: "WaitRsInit", + Started: &startTime, + Completed: nil, // Currently running + Result: "wait", + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Simulate a step completion + stepCompletedTime := time.Date(2019, 9, 11, 14, 22, 0, 0, time.UTC) + health2 := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "bar": { + IsInGoalState: true, + ExpectedToBeUp: true, + LastMongoUpTime: 1568222200, // Different time (should be ignored) + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "bar": { + Name: "bar", + LastGoalStateClusterConfigVersion: 5, + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + Completed: &completedTime, + Moves: []*agent.MoveStatus{ + { + Move: "WaitRsInit", + Steps: []*agent.StepStatus{ + { + Step: "WaitRsInit", + Started: &startTime, + Completed: &stepCompletedTime, // Now completed + Result: "success", // Changed from "wait" to "success" + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Test concise diff + diff := conciseDiff(health1, health2) + if diff == "" { + t.Error("Expected concise diff to detect step result change") + } + + // Verify it contains the actual change + if !strings.Contains(diff, "Result") { + t.Error("Expected concise diff to contain 'Result' field change") + } + + t.Logf("Concise diff output (%d chars):\n%s", len(diff), diff) + t.Logf("Test passed: Concise diff produces shorter, 
focused output with current step info") +} + +func TestCurrentStepInfo(t *testing.T) { + // Create health status with a currently running step + startTime := time.Date(2019, 9, 11, 14, 20, 40, 0, time.UTC) + + health := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "bar": { + IsInGoalState: false, + ExpectedToBeUp: true, + LastMongoUpTime: 1568222195, + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "bar": { + Name: "bar", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + Moves: []*agent.MoveStatus{ + { + Move: "Start", + Steps: []*agent.StepStatus{ + { + Step: "StartFresh", + Started: &startTime, + Completed: nil, // Currently running + Result: "in_progress", + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Test current step info extraction + stepInfo := getCurrentStepInfo(health) + if stepInfo == "" { + t.Error("Expected getCurrentStepInfo to return current step information") + } + + if !strings.Contains(stepInfo, "Start") || !strings.Contains(stepInfo, "StartFresh") { + t.Errorf("Expected step info to contain move 'Start' and step 'StartFresh', got: %s", stepInfo) + } + + t.Logf("Current step info: %s", stepInfo) + t.Logf("Test passed: Current step info extraction works correctly") +} + +func TestGetLastGoalStateClusterConfigVersion(t *testing.T) { + // Test with health status that has config version + health := agent.Health{ + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + LastGoalStateClusterConfigVersion: 42, + }, + }, + } + + // Mock hostname for test + originalHostname := getHostname + defer func() { getHostname = originalHostname }() + getHostname = func() string { return "test-pod-0" } + + version := getLastGoalStateClusterConfigVersion(health) + if version != 42 { + t.Errorf("Expected config version 42, but got %d", version) + } + + // Test with missing process + getHostname = func() string { return "missing-pod" } + version = getLastGoalStateClusterConfigVersion(health) + if version != 0 { + t.Errorf("Expected config version 0 for missing process, but got %d", version) + } +} + +func TestGetCurrentMoveAndStep(t *testing.T) { + startTime := time.Date(2019, 9, 11, 14, 20, 40, 0, time.UTC) + + // Test with active step + health := agent.Health{ + Healthiness: map[string]agent.ProcessHealth{ + "test-pod-0": { + IsInGoalState: false, + ExpectedToBeUp: true, + LastMongoUpTime: 1568222195, + }, + }, + ProcessPlans: map[string]agent.MmsDirectorStatus{ + "test-pod-0": { + Name: "test-process", + Plans: []*agent.PlanStatus{ + { + Started: &startTime, + Moves: []*agent.MoveStatus{ + { + Move: "Start", + Steps: []*agent.StepStatus{ + { + Step: "StartFresh", + Started: &startTime, + Completed: nil, // Currently running + }, + }, + }, + }, + }, + }, + }, + }, + } + + // Mock hostname for test + originalHostname := getHostname + defer func() { getHostname = originalHostname }() + getHostname = func() string { return "test-pod-0" } + + result := getCurrentMoveAndStep(health) + expected := "not_in_goal (executing: Start -> StartFresh)" + if result != expected { + t.Errorf("Expected '%s', but got '%s'", expected, result) + } + + // Test with WaitDeleteMyPodKube move + health.ProcessPlans["test-pod-0"].Plans[0].Moves[0].Move = "WaitDeleteMyPodKube" + result = getCurrentMoveAndStep(health) + expected = "not_in_goal (waiting for pod deletion)" + if result != expected { + t.Errorf("Expected '%s', but got '%s'", expected, result) + } + + // Test with completed plan + completedTime := time.Date(2019, 9, 11, 14, 
25, 0, 0, time.UTC) + health.ProcessPlans["test-pod-0"].Plans[0].Completed = &completedTime + result = getCurrentMoveAndStep(health) + expected = "not_in_goal (plan completed)" + if result != expected { + t.Errorf("Expected '%s', but got '%s'", expected, result) + } + + // Test with no health data + getHostname = func() string { return "missing-pod" } + result = getCurrentMoveAndStep(health) + expected = "no health data" + if result != expected { + t.Errorf("Expected '%s', but got '%s'", expected, result) + } +} diff --git a/pkg/util/constants.go b/pkg/util/constants.go index 443ae925c..d0a94860d 100644 --- a/pkg/util/constants.go +++ b/pkg/util/constants.go @@ -72,6 +72,7 @@ const ( // EnvVarDebug is used to decide whether we want to start the agent in debug mode EnvVarDebug = "MDB_AGENT_DEBUG" EnvVarAgentVersion = "MDB_AGENT_VERSION" + EnvVarCustomAgentURL = "MDB_CUSTOM_AGENT_URL" EnvVarMultiClusterMode = "MULTI_CLUSTER_MODE" // EnvVarSSLRequireValidMMSCertificates bla bla diff --git a/scripts/dev/agent/check-agent-logs.sh b/scripts/dev/agent/check-agent-logs.sh new file mode 100755 index 000000000..2d1f4cb99 --- /dev/null +++ b/scripts/dev/agent/check-agent-logs.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +source /Users/nam.nguyen/projects/ops-manager-kubernetes/.generated/context.export.env + +# Check if NAMESPACE is set +if [ -z "$NAMESPACE" ]; then + echo "Error: NAMESPACE environment variable is not set" + exit 1 +fi + +echo "Checking agent logs for Kubernetes moves in namespace: $NAMESPACE" +echo "==================================================================" + +# Get all StatefulSets with owner mongodb.com/v1 +STS_LIST=$(kubectl get sts -n "$NAMESPACE" -o jsonpath='{range .items[?(@.metadata.ownerReferences[0].apiVersion=="mongodb.com/v1")]}{.metadata.name}{"\n"}{end}') + +if [ -z "$STS_LIST" ]; then + echo "No StatefulSets found with owner mongodb.com/v1 in namespace $NAMESPACE" + exit 0 +fi + +PODS_FOUND=() + +for sts in $STS_LIST; do + echo "Processing StatefulSet: $sts" + + # Get pods for this StatefulSet + PODS=$(kubectl get pods -n "$NAMESPACE" -l app="${sts}-svc" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) + + if [ -z "$PODS" ]; then + PODS=$(kubectl get pods -n "$NAMESPACE" -o json | jq -r '.items[] | select(.metadata.ownerReferences[]?.name=="'$sts'") | .metadata.name' 2>/dev/null) + fi + + if [ -z "$PODS" ]; then + echo " No pods found for StatefulSet $sts" + continue + fi + + for pod in $PODS; do + echo " Processing pod: $pod" + PODS_FOUND+=("$pod") + done +done + +if [ ${#PODS_FOUND[@]} -eq 0 ]; then + echo "No pods found" + exit 0 +fi + +echo "" +echo "šŸ” Analyzing Agent Logs for Kubernetes-related Moves" +echo "=====================================================" + +for pod in "${PODS_FOUND[@]}"; do + echo "" + echo "šŸ“‹ Pod: $pod" + echo "$(printf '%.0s─' {1..60})" + + # Check if log file exists + if ! 
kubectl exec -n "$NAMESPACE" "$pod" -- test -f /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null; then + echo "āŒ Log file not found: /var/log/mongodb-mms-automation/automation-agent-verbose.log" + continue + fi + + echo "šŸ”„ Recent Kubernetes/Move-related log entries:" + echo "" + + # Search for Kubernetes-related moves and recent activity + kubectl exec -n "$NAMESPACE" "$pod" -- tail -n 500 /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null | \ + grep -E "(move|Move|MOVE|kubernetes|k8s|wait|Wait|WAIT|step|Step|STEP|goal|Goal|GOAL|plan|Plan|PLAN)" | \ + tail -n 20 || echo "No recent move-related entries found" + + echo "" + echo "šŸŽÆ Current Wait Steps:" + kubectl exec -n "$NAMESPACE" "$pod" -- tail -n 200 /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null | \ + grep -E "(WaitAllRsMembersUp|WaitPrimary|WaitRsInit|WaitProcessUp|wait.*step)" | \ + tail -n 10 || echo "No current wait steps found" + + echo "" + echo "āš ļø Recent Errors/Warnings:" + kubectl exec -n "$NAMESPACE" "$pod" -- tail -n 300 /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null | \ + grep -iE "(error|Error|ERROR|warn|Warn|WARN|fail|Fail|FAIL)" | \ + tail -n 5 || echo "No recent errors/warnings found" + + echo "" + echo "šŸ“ˆ Recent Goal/Plan Activity:" + kubectl exec -n "$NAMESPACE" "$pod" -- tail -n 200 /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null | \ + grep -E "(goal.*version|plan.*start|plan.*complet|automation.*config)" | \ + tail -n 5 || echo "No recent goal/plan activity found" + + echo "" + echo "šŸ”— Replica Set Status:" + kubectl exec -n "$NAMESPACE" "$pod" -- tail -n 200 /var/log/mongodb-mms-automation/automation-agent-verbose.log 2>/dev/null | \ + grep -E "(replica.*set|rs.*init|primary|secondary|replication)" | \ + tail -n 5 || echo "No recent replica set activity found" + + echo "$(printf '%.0s═' {1..60})" +done + +echo "" +echo "šŸ’” Log Analysis Summary" +echo "======================" +echo "Analyzed logs from ${#PODS_FOUND[@]} pods for:" +echo " • Move/Step execution status" +echo " • Wait conditions and blocking steps" +echo " • Error conditions and warnings" +echo " • Goal/Plan progression" +echo " • Replica set initialization status" +echo "" +echo "āœ… Analysis complete!" 
\ No newline at end of file diff --git a/scripts/dev/agent/print-agent-health.sh b/scripts/dev/agent/print-agent-health.sh new file mode 100755 index 000000000..935eca968 --- /dev/null +++ b/scripts/dev/agent/print-agent-health.sh @@ -0,0 +1,188 @@ +#!/bin/bash + +source /Users/nam.nguyen/projects/ops-manager-kubernetes/.generated/context.export.env + +# Check if NAMESPACE is set +if [ -z "$NAMESPACE" ]; then + echo "Error: NAMESPACE environment variable is not set" + exit 1 +fi + +# Create temporary directory for health status files +TEMP_DIR=$(mktemp -d) +trap "rm -rf $TEMP_DIR" EXIT + +echo "Fetching agent health status from pods in namespace: $NAMESPACE" +echo "================================================================" + +# Get all StatefulSets with owner mongodb.com/v1 +STS_LIST=$(kubectl get sts -n "$NAMESPACE" -o jsonpath='{range .items[?(@.metadata.ownerReferences[0].apiVersion=="mongodb.com/v1")]}{.metadata.name}{"\n"}{end}') + +if [ -z "$STS_LIST" ]; then + echo "No StatefulSets found with owner mongodb.com/v1 in namespace $NAMESPACE" + exit 0 +fi + +# Collect all health status data +PODS_FOUND=() +PODS_WITH_DATA=() +PODS_WITHOUT_DATA=() + +for sts in $STS_LIST; do + echo "Processing StatefulSet: $sts" + + # Get pods for this StatefulSet using correct label selector + PODS=$(kubectl get pods -n "$NAMESPACE" -l app="${sts}-svc" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) + + if [ -z "$PODS" ]; then + # Try alternative selector patterns + PODS=$(kubectl get pods -n "$NAMESPACE" -o json | jq -r '.items[] | select(.metadata.ownerReferences[]?.name=="'$sts'") | .metadata.name' 2>/dev/null) + fi + + if [ -z "$PODS" ]; then + echo " No pods found for StatefulSet $sts" + continue + fi + + for pod in $PODS; do + echo " Processing pod: $pod" + PODS_FOUND+=("$pod") + + # Get agent health status + if kubectl exec -n "$NAMESPACE" "$pod" -- test -f /var/log/mongodb-mms-automation/agent-health-status.json 2>/dev/null; then + if kubectl exec -n "$NAMESPACE" "$pod" -- cat /var/log/mongodb-mms-automation/agent-health-status.json 2>/dev/null > "$TEMP_DIR/${pod}.json"; then + # Check if file has content + if [ -s "$TEMP_DIR/${pod}.json" ]; then + PODS_WITH_DATA+=("$pod") + else + echo " File exists but is empty" + PODS_WITHOUT_DATA+=("$pod") + fi + else + echo " Could not read file" + PODS_WITHOUT_DATA+=("$pod") + fi + else + echo " File not found: /var/log/mongodb-mms-automation/agent-health-status.json" + PODS_WITHOUT_DATA+=("$pod") + fi + done +done + +echo "" +echo "Agent Health Status Summary" +echo "==========================" + +if [ ${#PODS_FOUND[@]} -eq 0 ]; then + echo "No pods found with agent health status files" + exit 0 +fi + +# Pretty print individual status files +for pod in "${PODS_WITH_DATA[@]}"; do + echo "" + echo "šŸ” Pod: $pod" + echo "$(printf '%.0s─' {1..50})" + + if command -v jq &> /dev/null; then + jq -C . 
"$TEMP_DIR/${pod}.json" 2>/dev/null || cat "$TEMP_DIR/${pod}.json" + else + cat "$TEMP_DIR/${pod}.json" + fi +done + +# Show pods without data +if [ ${#PODS_WITHOUT_DATA[@]} -gt 0 ]; then + echo "" + echo "āŒ Pods without health data:" + for pod in "${PODS_WITHOUT_DATA[@]}"; do + echo " - $pod" + done +fi + +# Show differences if multiple pods with data exist +if [ ${#PODS_WITH_DATA[@]} -gt 1 ]; then + echo "" + echo "" + echo "šŸ”„ Health Status Comparison" + echo "==========================" + + # Create a combined comparison view + if command -v jq &> /dev/null; then + echo "Key Status Comparison:" + echo "$(printf '%.0s─' {1..80})" + + # Extract key fields for comparison + for pod in "${PODS_WITH_DATA[@]}"; do + echo -n "Pod $pod: " + # Try to extract overall state or first process state + STATE=$(jq -r '.state // (.statuses | keys[0] as $k | .[$k].IsInGoalState) // "unknown"' "$TEMP_DIR/${pod}.json" 2>/dev/null) + if [ "$STATE" = "true" ]; then + echo "āœ… In Goal State" + elif [ "$STATE" = "false" ]; then + echo "āŒ Not in Goal State" + else + echo "$STATE" + fi + done + + echo "" + echo "Process Details:" + echo "$(printf '%.0s─' {1..80})" + + for pod in "${PODS_WITH_DATA[@]}"; do + echo "Pod $pod:" + # Extract process information + jq -r ' + if .statuses then + .statuses | to_entries[] | " \(.key): IsInGoalState=\(.value.IsInGoalState), ReplicationStatus=\(.value.ReplicationStatus // "N/A")" + elif .processes then + .processes[] | " \(.name): \(.state // "unknown")" + else + " No process information found" + end + ' "$TEMP_DIR/${pod}.json" 2>/dev/null || echo " Parse error" + echo "" + done + + echo "MMS Status Summary:" + echo "$(printf '%.0s─' {1..80})" + + for pod in "${PODS_WITH_DATA[@]}"; do + echo "Pod $pod:" + jq -r ' + if .mmsStatus then + .mmsStatus | to_entries[] | " \(.key): GoalVersion=\(.value.lastGoalVersionAchieved // "N/A"), Responsive=\(.value.responsive // "N/A")" + else + " No MMS status found" + end + ' "$TEMP_DIR/${pod}.json" 2>/dev/null || echo " Parse error" + echo "" + done + fi + + # Show file differences using diff if exactly 2 pods with data + if [ ${#PODS_WITH_DATA[@]} -eq 2 ] && command -v diff &> /dev/null; then + pod1="${PODS_WITH_DATA[0]}" + pod2="${PODS_WITH_DATA[1]}" + + echo "" + echo "šŸ“Š Detailed Diff between $pod1 and $pod2:" + echo "$(printf '%.0s─' {1..80})" + + if command -v jq &> /dev/null; then + # Pretty print both files for comparison + jq . "$TEMP_DIR/${pod1}.json" > "$TEMP_DIR/${pod1}_pretty.json" 2>/dev/null + jq . "$TEMP_DIR/${pod2}.json" > "$TEMP_DIR/${pod2}_pretty.json" 2>/dev/null + diff -u "$TEMP_DIR/${pod1}_pretty.json" "$TEMP_DIR/${pod2}_pretty.json" || true + else + diff -u "$TEMP_DIR/${pod1}.json" "$TEMP_DIR/${pod2}.json" || true + fi + fi +fi + +echo "" +echo "āœ… Health status collection complete!" +echo " Found ${#PODS_FOUND[@]} total pods" +echo " ${#PODS_WITH_DATA[@]} pods with health data" +echo " ${#PODS_WITHOUT_DATA[@]} pods without health data" \ No newline at end of file diff --git a/scripts/release/atomic_pipeline.py b/scripts/release/atomic_pipeline.py index b31486058..a8f885125 100755 --- a/scripts/release/atomic_pipeline.py +++ b/scripts/release/atomic_pipeline.py @@ -329,12 +329,29 @@ def build_upgrade_hook_image(build_configuration: ImageBuildConfiguration): ) +def build_agent_sidecar_image(build_configuration: ImageBuildConfiguration): + """ + Builds image used for agent health monitoring sidecar. 
+ """ + + build_image( + build_configuration=build_configuration, + ) + + def build_agent(build_configuration: ImageBuildConfiguration): """ Build the agent only for the latest operator for patches and operator releases. """ - if build_configuration.all_agents: + if build_configuration.custom_agent_url: + if not build_configuration.version: + raise ValueError("When using custom_agent_url, version must be set (e.g., 'latest')") + agent_versions_to_build = [(build_configuration.version, "100.9.5")] + logger.info( + f"building custom agent from URL: {build_configuration.custom_agent_url} with version: {build_configuration.version}" + ) + elif build_configuration.all_agents: agent_versions_to_build = get_all_agents_for_rebuild() logger.info("building all agents") elif build_configuration.currently_used_agents: @@ -400,18 +417,45 @@ def build_agent_pipeline( f"======== Building agent pipeline for version {agent_version}, build configuration version: {build_configuration.version}" ) - platform_build_args = generate_agent_build_args( - platforms=available_platforms, agent_version=agent_version, tools_version=tools_version - ) + # Handle custom agent URL for testing + if build_configuration.custom_agent_url: + print(f"======== Using custom agent URL: {build_configuration.custom_agent_url}") + print(f"======== Using version from pipeline: {build_configuration_copy.version}") + + # Extract the base URL and filename from the custom URL + custom_agent_base_url = build_configuration.custom_agent_url.rsplit("/", 1)[0] + custom_agent_filename = build_configuration.custom_agent_url.rsplit("/", 1)[1] + + # Use the version provided by the pipeline (e.g., "latest") instead of extracting from URL + # This allows the pipeline_main.py to control the version tag used for the image + + # Generate tools build args first (we still need MongoDB tools) + tools_build_args = generate_tools_build_args(available_platforms, tools_version) + + # Use custom URL for agent, but keep standard tools + agent_build_args = { + "mongodb_agent_version_amd64": custom_agent_filename, + "mongodb_agent_version_arm64": custom_agent_filename, # Fallback to same file + "mongodb_agent_version_s390x": custom_agent_filename, # Fallback to same file + "mongodb_agent_version_ppc64le": custom_agent_filename, # Fallback to same file + } + + # Combine tools and agent build args + platform_build_args = {**tools_build_args, **agent_build_args} + agent_base_url = custom_agent_base_url + else: + platform_build_args = generate_agent_build_args( + platforms=available_platforms, agent_version=agent_version, tools_version=tools_version + ) + agent_base_url = ( + "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/automation-agent/prod" + ) - agent_base_url = ( - "https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/automation-agent/prod" - ) tools_base_url = "https://fastdl.mongodb.org/tools/db" args = { - "version": agent_version, - "agent_version": agent_version, + "version": build_configuration_copy.version, + "agent_version": build_configuration_copy.version, "mongodb_agent_url": agent_base_url, "mongodb_tools_url": tools_base_url, **platform_build_args, # Add the platform-specific build args diff --git a/scripts/release/build/build_info.py b/scripts/release/build/build_info.py index 227cafa28..d668fe271 100644 --- a/scripts/release/build/build_info.py +++ b/scripts/release/build/build_info.py @@ -17,6 +17,7 @@ MCO_TESTS_IMAGE = "mco-tests" READINESS_PROBE_IMAGE = "readiness-probe" 
UPGRADE_HOOK_IMAGE = "upgrade-hook" +AGENT_SIDECAR_IMAGE = "agent-sidecar" DATABASE_IMAGE = "database" AGENT_IMAGE = "agent" INIT_APPDB_IMAGE = "init-appdb" diff --git a/scripts/release/build/image_build_configuration.py b/scripts/release/build/image_build_configuration.py index b62718276..e547437f9 100644 --- a/scripts/release/build/image_build_configuration.py +++ b/scripts/release/build/image_build_configuration.py @@ -21,6 +21,7 @@ class ImageBuildConfiguration: sign: bool = False all_agents: bool = False currently_used_agents: bool = False + custom_agent_url: Optional[str] = None def is_release_scenario(self) -> bool: return self.scenario == BuildScenario.RELEASE diff --git a/scripts/release/pipeline_main.py b/scripts/release/pipeline_main.py index 010620284..d5636d57a 100644 --- a/scripts/release/pipeline_main.py +++ b/scripts/release/pipeline_main.py @@ -18,6 +18,7 @@ from lib.base_logger import logger from scripts.release.atomic_pipeline import ( build_agent, + build_agent_sidecar_image, build_database_image, build_init_appdb_image, build_init_database_image, @@ -31,6 +32,7 @@ ) from scripts.release.build.build_info import ( AGENT_IMAGE, + AGENT_SIDECAR_IMAGE, DATABASE_IMAGE, INIT_APPDB_IMAGE, INIT_DATABASE_IMAGE, @@ -73,6 +75,7 @@ def get_builder_function_for_image_name() -> Dict[str, Callable]: MCO_TESTS_IMAGE: build_mco_tests_image, READINESS_PROBE_IMAGE: build_readiness_probe_image, UPGRADE_HOOK_IMAGE: build_upgrade_hook_image, + AGENT_SIDECAR_IMAGE: build_agent_sidecar_image, DATABASE_IMAGE: build_database_image, AGENT_IMAGE: build_agent, # Init images @@ -137,6 +140,7 @@ def image_build_config_from_args(args) -> ImageBuildConfiguration: parallel_factor=args.parallel_factor, all_agents=args.all_agents, currently_used_agents=args.current_agents, + custom_agent_url=args.custom_agent_url, ) @@ -275,6 +279,13 @@ def main(): action="store_true", help="Build all currently used agent images.", ) + parser.add_argument( + "--custom-agent-url", + metavar="", + action="store", + type=str, + help="Custom agent URL for testing (e.g., https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68caf1b06da1570007e898b4/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9783-1.linux_x86_64.tar.gz)", + ) args = parser.parse_args() diff --git a/scripts/test_custom_agent.sh b/scripts/test_custom_agent.sh new file mode 100755 index 000000000..974bdf494 --- /dev/null +++ b/scripts/test_custom_agent.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +set -euo pipefail + +# Test script for building custom agent image +# This script demonstrates how to use the new --custom-agent-url parameter +# It leverages the pipeline_main.py by supplying a version (latest) which is used by atomic_pipeline.py + +CUSTOM_AGENT_URL="https://mciuploads.s3.amazonaws.com/mms-automation/mongodb-mms-build-agent/builds/patches/68caf1b06da1570007e898b4/automation-agent/local/mongodb-mms-automation-agent-13.41.0.9783-1.linux_x86_64.tar.gz" +VERSION="latest" + +echo "Testing custom agent build with URL: ${CUSTOM_AGENT_URL}" +echo "Using version: ${VERSION}" +echo "" +echo "This will build the MongoDB agent image using your custom agent version." +echo "The image will be tagged with version '${VERSION}' and pushed to the registry specified in build_info.json." 
+echo "" + +# Use the existing pipeline with version and custom agent URL parameters +echo "Running: scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --version \"${VERSION}\" --custom-agent-url \"${CUSTOM_AGENT_URL}\"" +echo "" + +# Execute the build (add --load to build locally without pushing) +scripts/dev/run_python.sh scripts/release/pipeline_main.py agent --version "${VERSION}" --custom-agent-url "${CUSTOM_AGENT_URL}" + +echo "" +echo "Custom agent build completed!" +echo "The image should now be available with tag '${VERSION}' containing your custom agent version 13.41.0.9782-1"