From 8c957110f3ec71c7ca2d944ec1acf6852257d5cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Tue, 3 Jun 2025 13:23:48 +0200 Subject: [PATCH 1/9] Allow for more than one deployment per test under cloud-qa --- .../kubetester/mongodb.py | 49 +++++++++++- .../kubetester/omtester.py | 9 +-- ...multi_cluster_sharded_disaster_recovery.py | 7 +- ...d_cluster_operator_upgrade_v1_27_to_mck.py | 74 +++++++++++++++---- scripts/dev/contexts/variables/om80 | 2 + scripts/funcs/kubernetes | 9 +-- 6 files changed, 118 insertions(+), 32 deletions(-) diff --git a/docker/mongodb-kubernetes-tests/kubetester/mongodb.py b/docker/mongodb-kubernetes-tests/kubetester/mongodb.py index c4fd2a103..8665f357d 100644 --- a/docker/mongodb-kubernetes-tests/kubetester/mongodb.py +++ b/docker/mongodb-kubernetes-tests/kubetester/mongodb.py @@ -230,7 +230,18 @@ def __repr__(self): def configure( self, - om: MongoDBOpsManager, + om: Optional[MongoDBOpsManager], + project_name: str, + api_client: Optional[client.ApiClient] = None, + ) -> MongoDB: + if om is not None: + return self.configure_ops_manager(om, project_name, api_client=api_client) + else: + return self.configure_cloud_qa(project_name, api_client=api_client) + + def configure_ops_manager( + self, + om: Optional[MongoDBOpsManager], project_name: str, api_client: Optional[client.ApiClient] = None, ) -> MongoDB: @@ -247,6 +258,39 @@ def configure( self["spec"]["credentials"] = om.api_key_secret(self.namespace, api_client=api_client) return self + def configure_cloud_qa( + self, + project_name, + src_project_config_map_name: str = None, + api_client: Optional[client.ApiClient] = None, + ) -> MongoDB: + if "opsManager" in self["spec"]: + del self["spec"]["opsManager"] + + if src_project_config_map_name is None and "cloudManager" in self["spec"]: + src_project_config_map_name = self["spec"]["cloudManager"]["configMapRef"]["name"] + else: + # my-project cm and my-credentials secret are created by scripts/evergreen/e2e/configure_operator.sh + src_project_config_map_name = "my-project" + + try: + src_cm = read_configmap(self.namespace, src_project_config_map_name, api_client=api_client) + except client.ApiException as e: + if e.status == 404: + logger.debug("project config map is not specified, trying my-project as the source") + src_cm = read_configmap(self.namespace, "my-project", api_client=api_client) + else: + raise e + + new_project_config_map_name = f"{self.name}-project-config" + ensure_nested_objects(self, ["spec", "cloudManager", "configMapRef"]) + self["spec"]["cloudManager"]["configMapRef"]["name"] = new_project_config_map_name + + src_cm.update({"projectName": f"{self.namespace}-{project_name}"}) + create_or_update_configmap(self.namespace, new_project_config_map_name, src_cm, api_client=api_client) + + return self + def configure_backup(self, mode: str = "enabled") -> MongoDB: ensure_nested_objects(self, ["spec", "backup"]) self["spec"]["backup"]["mode"] = mode @@ -449,6 +493,9 @@ def get_external_domain(self): def config_map_name(self) -> str: if "opsManager" in self["spec"]: return self["spec"]["opsManager"]["configMapRef"]["name"] + elif "cloudManager" in self["spec"]: + return self["spec"]["cloudManager"]["configMapRef"]["name"] + return self["spec"]["project"] def shard_replicaset_names(self) -> List[str]: diff --git a/docker/mongodb-kubernetes-tests/kubetester/omtester.py b/docker/mongodb-kubernetes-tests/kubetester/omtester.py index 72874d2b6..3fd78bc78 100644 --- a/docker/mongodb-kubernetes-tests/kubetester/omtester.py +++ 
b/docker/mongodb-kubernetes-tests/kubetester/omtester.py @@ -20,16 +20,11 @@ from kubetester.om_queryable_backups import OMQueryableBackup from opentelemetry import trace from requests.adapters import HTTPAdapter, Retry +from tests.common.ops_manager.cloud_manager import is_cloud_qa from .kubetester import get_env_var_or_fail - -def running_cloud_manager(): - "Determines if the current test is running against Cloud Manager" - return get_env_var_or_fail("OM_HOST") == "https://cloud-qa.mongodb.com" - - -skip_if_cloud_manager = pytest.mark.skipif(running_cloud_manager(), reason="Do not run in Cloud Manager") +skip_if_cloud_manager = pytest.mark.skipif(is_cloud_qa(), reason="Do not run in Cloud Manager") class BackupStatus(str, Enum): diff --git a/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py b/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py index 0e119fa57..da5f51a0d 100644 --- a/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py +++ b/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py @@ -12,12 +12,10 @@ update_configmap, ) from kubetester.kubetester import ( - KubernetesTester, ensure_ent_version, ) from kubetester.kubetester import fixture as yaml_fixture from kubetester.kubetester import ( - get_env_var_or_fail, is_default_architecture_static, is_multi_cluster, run_periodically, @@ -29,6 +27,7 @@ from kubetester.phase import Phase from pytest import fixture, mark from tests import test_logger +from tests.common.ops_manager.cloud_manager import is_cloud_qa from tests.conftest import ( MULTI_CLUSTER_MEMBER_LIST_CONFIGMAP, get_central_cluster_client, @@ -54,10 +53,6 @@ # to reconfigure the deployment further. 
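
The configure_cloud_qa method added above is what allows several deployments per test under cloud-qa: each MongoDB resource copies the shared project ConfigMap (by default "my-project", created by scripts/evergreen/e2e/configure_operator.sh), overrides projectName with a namespace-scoped value, and points spec.cloudManager.configMapRef at the copy. A condensed sketch of that flow follows; the helper functions are the ones used in the diff, while the standalone wrapper itself is illustrative only, not the exact implementation:

    from kubetester import create_or_update_configmap, read_configmap
    from kubetester.kubetester import ensure_nested_objects

    def configure_cloud_qa_sketch(mdb, project_name, api_client=None):
        # Copy the shared source project ConfigMap so this deployment gets its own project.
        src_cm = read_configmap(mdb.namespace, "my-project", api_client=api_client)

        # A per-resource ConfigMap name and a namespace-scoped project name
        # prevent collisions when one test creates several deployments.
        new_cm_name = f"{mdb.name}-project-config"
        src_cm.update({"projectName": f"{mdb.namespace}-{project_name}"})
        create_or_update_configmap(mdb.namespace, new_cm_name, src_cm, api_client=api_client)

        # Switch the resource from opsManager to cloudManager and reference the new ConfigMap.
        mdb["spec"].pop("opsManager", None)
        ensure_nested_objects(mdb, ["spec", "cloudManager", "configMapRef"])
        mdb["spec"]["cloudManager"]["configMapRef"]["name"] = new_cm_name
        return mdb
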
-def is_cloud_qa() -> bool: - return os.getenv("ops_manager_version", "cloud_qa") == "cloud_qa" - - @mark.e2e_multi_cluster_sharded_disaster_recovery def test_install_operator(multi_cluster_operator: Operator): multi_cluster_operator.assert_is_running() diff --git a/docker/mongodb-kubernetes-tests/tests/upgrades/sharded_cluster_operator_upgrade_v1_27_to_mck.py b/docker/mongodb-kubernetes-tests/tests/upgrades/sharded_cluster_operator_upgrade_v1_27_to_mck.py index 4ed13d2a9..0b059386e 100644 --- a/docker/mongodb-kubernetes-tests/tests/upgrades/sharded_cluster_operator_upgrade_v1_27_to_mck.py +++ b/docker/mongodb-kubernetes-tests/tests/upgrades/sharded_cluster_operator_upgrade_v1_27_to_mck.py @@ -1,21 +1,16 @@ -from typing import Dict, Optional +from typing import Dict import pytest -from kubeobject import CustomObject -from kubernetes import client -from kubetester import create_or_update_configmap, read_configmap -from kubetester.certs import create_sharded_cluster_certs -from kubetester.kubetester import ensure_nested_objects +from kubetester import read_configmap, try_load +from kubetester.certs import create_mongodb_tls_certs, create_sharded_cluster_certs from kubetester.kubetester import fixture as yaml_fixture from kubetester.mongodb import MongoDB -from kubetester.mongotester import ShardedClusterTester -from kubetester.operator import Operator +from kubetester.mongotester import ReplicaSetTester, ShardedClusterTester from kubetester.phase import Phase from tests import test_logger from tests.conftest import ( LEGACY_OPERATOR_NAME, OPERATOR_NAME, - get_central_cluster_client, get_default_operator, install_legacy_deployment_state_meko, log_deployments_info, @@ -23,6 +18,7 @@ from tests.upgrades import downscale_operator_deployment MDB_RESOURCE = "sh001-base" +MDB_RS_RESOURCE = "rs" CERT_PREFIX = "prefix" logger = test_logger.get_test_logger(__name__) @@ -41,6 +37,8 @@ If the sharded cluster resource correctly reconciles after upgrade/downgrade and scaling steps, we assume it works correctly. 
""" + + # TODO CLOUDP-318100: this test should eventually be updated and not pinned to 1.27 anymore @@ -68,7 +66,7 @@ def server_certs(issuer: str, namespace: str) -> str: ) -@pytest.fixture(scope="module") +@pytest.fixture(scope="function") def sharded_cluster( issuer_ca_configmap: str, namespace: str, @@ -79,7 +77,11 @@ def sharded_cluster( yaml_fixture("sharded-cluster.yaml"), namespace=namespace, name=MDB_RESOURCE, - ) + ).configure(om=None, project_name=MDB_RESOURCE) + + if try_load(resource): + return resource + resource.set_version(custom_mdb_version) resource["spec"]["mongodsPerShardCount"] = 2 resource["spec"]["configServerCount"] = 2 @@ -87,7 +89,34 @@ def sharded_cluster( resource["spec"]["persistent"] = True resource.configure_custom_tls(issuer_ca_configmap, CERT_PREFIX) - return resource.update() + return resource + + +@pytest.fixture(scope="module") +def replica_set_certs(issuer: str, namespace: str): + return create_mongodb_tls_certs(issuer, namespace, MDB_RS_RESOURCE, f"prefix-{MDB_RS_RESOURCE}-cert") + + +@pytest.fixture(scope="module") +def replica_set( + issuer_ca_configmap: str, + namespace: str, + replica_set_certs: str, + custom_mdb_version: str, +): + resource = MongoDB.from_yaml( + yaml_fixture("replica-set-basic.yaml"), + namespace=namespace, + name=MDB_RS_RESOURCE, + ).configure(om=None, project_name=f"{MDB_RS_RESOURCE}") + + if try_load(resource): + return resource + + resource.set_version(custom_mdb_version) + resource.configure_custom_tls(issuer_ca_configmap, CERT_PREFIX) + + return resource @pytest.mark.e2e_sharded_cluster_operator_upgrade_v1_27_to_mck @@ -101,16 +130,23 @@ def test_install_legacy_deployment_state_meko( install_legacy_deployment_state_meko(namespace, managed_security_context, operator_installation_config) def test_create_sharded_cluster(self, sharded_cluster: MongoDB): + sharded_cluster.update() sharded_cluster.assert_reaches_phase(phase=Phase.Running, timeout=350) def test_scale_up_sharded_cluster(self, sharded_cluster: MongoDB): - sharded_cluster.load() sharded_cluster["spec"]["mongodsPerShardCount"] = 3 sharded_cluster["spec"]["configServerCount"] = 3 sharded_cluster.update() sharded_cluster.assert_reaches_phase(phase=Phase.Running, timeout=300) +@pytest.mark.e2e_sharded_cluster_operator_upgrade_v1_27_to_mck +class TestReplicaSetDeployment: + def test_create_replica_set(self, replica_set: MongoDB): + replica_set.update() + replica_set.assert_reaches_phase(phase=Phase.Running, timeout=350) + + @pytest.mark.e2e_sharded_cluster_operator_upgrade_v1_27_to_mck class TestOperatorUpgrade: @@ -137,6 +173,12 @@ def test_sharded_cluster_reconciled(self, sharded_cluster: MongoDB, namespace: s def test_assert_connectivity(self, ca_path: str): ShardedClusterTester(MDB_RESOURCE, 1, ssl=True, ca_path=ca_path).assert_connectivity() + def test_replica_set_reconciled(self, replica_set: MongoDB): + replica_set.assert_reaches_phase(phase=Phase.Running, timeout=850, ignore_errors=True) + + def test_assert_connectivity_replica_set(self, ca_path: str): + ReplicaSetTester(MDB_RS_RESOURCE, 3, ssl=True, ca_path=ca_path).assert_connectivity() + def test_scale_down_sharded_cluster(self, sharded_cluster: MongoDB, namespace: str): sharded_cluster.load() # Scale down both by 1 @@ -168,6 +210,12 @@ def test_sharded_cluster_reconciled(self, sharded_cluster: MongoDB): def test_assert_connectivity(self, ca_path: str): ShardedClusterTester(MDB_RESOURCE, 1, ssl=True, ca_path=ca_path).assert_connectivity() + def test_replica_set_reconciled(self, replica_set: MongoDB): + 
replica_set.assert_reaches_phase(phase=Phase.Running, timeout=850, ignore_errors=True) + + def test_assert_connectivity_replica_set(self, ca_path: str): + ReplicaSetTester(MDB_RS_RESOURCE, 3, ssl=True, ca_path=ca_path).assert_connectivity() + def test_scale_up_sharded_cluster(self, sharded_cluster: MongoDB): sharded_cluster.load() sharded_cluster["spec"]["mongodsPerShardCount"] = 3 diff --git a/scripts/dev/contexts/variables/om80 b/scripts/dev/contexts/variables/om80 index f0b677d90..06f10fe7a 100644 --- a/scripts/dev/contexts/variables/om80 +++ b/scripts/dev/contexts/variables/om80 @@ -19,3 +19,5 @@ export AGENT_IMAGE="${MDB_AGENT_IMAGE_REPOSITORY}:${AGENT_VERSION}" export CUSTOM_APPDB_VERSION=8.0.6-ent export TEST_MODE=opsmanager export OPS_MANAGER_REGISTRY="${BASE_REPO_URL}" + +export ops_manager_version="${CUSTOM_OM_VERSION}" diff --git a/scripts/funcs/kubernetes b/scripts/funcs/kubernetes index 11250422d..247da29b9 100644 --- a/scripts/funcs/kubernetes +++ b/scripts/funcs/kubernetes @@ -98,7 +98,7 @@ create_image_registries_secret() { context=$1 namespace=$2 secret_name=$3 - + # Detect the correct config file path based on container runtime local config_file local temp_config_file="" @@ -106,7 +106,7 @@ create_image_registries_secret() { # For Podman, use root's auth.json since minikube uses sudo podman config_file="/root/.config/containers/auth.json" echo "Using Podman config: ${config_file}" - + # Create a temporary copy that the current user can read temp_config_file=$(mktemp) sudo cp "${config_file}" "${temp_config_file}" @@ -117,7 +117,7 @@ create_image_registries_secret() { config_file="${HOME}/.docker/config.json" echo "Using Docker config: ${config_file}" fi - + # shellcheck disable=SC2154 if kubectl --context "${context}" get namespace "${namespace}"; then kubectl --context "${context}" -n "${namespace}" delete secret "${secret_name}" --ignore-not-found @@ -127,7 +127,7 @@ create_image_registries_secret() { else echo "Skipping creating pull secret in ${context}/${namespace}. The namespace doesn't exist yet." 
fi - + # Clean up temporary file if [[ -n "${temp_config_file}" ]] && [[ -f "${temp_config_file}" ]]; then rm -f "${temp_config_file}" @@ -255,7 +255,6 @@ run_script_with_wrapped_kubectl() { cat > "${wrapper_script}" << EOF #!/bin/bash # Define kubectl function to include the context -set -x kubectl() { command kubectl --context "${context}" "\$@" } From 84ff3d6b1e5d38cdc39f724c1b551f5b45f9d610 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Mon, 29 Sep 2025 09:42:48 +0200 Subject: [PATCH 2/9] Removed any changes to the existing tests --- .../kubetester/omtester.py | 9 ++- ...multi_cluster_sharded_disaster_recovery.py | 7 +- ...d_cluster_operator_upgrade_v1_27_to_mck.py | 74 ++++--------------- 3 files changed, 26 insertions(+), 64 deletions(-) diff --git a/docker/mongodb-kubernetes-tests/kubetester/omtester.py b/docker/mongodb-kubernetes-tests/kubetester/omtester.py index 3fd78bc78..72874d2b6 100644 --- a/docker/mongodb-kubernetes-tests/kubetester/omtester.py +++ b/docker/mongodb-kubernetes-tests/kubetester/omtester.py @@ -20,11 +20,16 @@ from kubetester.om_queryable_backups import OMQueryableBackup from opentelemetry import trace from requests.adapters import HTTPAdapter, Retry -from tests.common.ops_manager.cloud_manager import is_cloud_qa from .kubetester import get_env_var_or_fail -skip_if_cloud_manager = pytest.mark.skipif(is_cloud_qa(), reason="Do not run in Cloud Manager") + +def running_cloud_manager(): + "Determines if the current test is running against Cloud Manager" + return get_env_var_or_fail("OM_HOST") == "https://cloud-qa.mongodb.com" + + +skip_if_cloud_manager = pytest.mark.skipif(running_cloud_manager(), reason="Do not run in Cloud Manager") class BackupStatus(str, Enum): diff --git a/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py b/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py index da5f51a0d..0e119fa57 100644 --- a/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py +++ b/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py @@ -12,10 +12,12 @@ update_configmap, ) from kubetester.kubetester import ( + KubernetesTester, ensure_ent_version, ) from kubetester.kubetester import fixture as yaml_fixture from kubetester.kubetester import ( + get_env_var_or_fail, is_default_architecture_static, is_multi_cluster, run_periodically, @@ -27,7 +29,6 @@ from kubetester.phase import Phase from pytest import fixture, mark from tests import test_logger -from tests.common.ops_manager.cloud_manager import is_cloud_qa from tests.conftest import ( MULTI_CLUSTER_MEMBER_LIST_CONFIGMAP, get_central_cluster_client, @@ -53,6 +54,10 @@ # to reconfigure the deployment further. 
+def is_cloud_qa() -> bool: + return os.getenv("ops_manager_version", "cloud_qa") == "cloud_qa" + + @mark.e2e_multi_cluster_sharded_disaster_recovery def test_install_operator(multi_cluster_operator: Operator): multi_cluster_operator.assert_is_running() diff --git a/docker/mongodb-kubernetes-tests/tests/upgrades/sharded_cluster_operator_upgrade_v1_27_to_mck.py b/docker/mongodb-kubernetes-tests/tests/upgrades/sharded_cluster_operator_upgrade_v1_27_to_mck.py index 0b059386e..4ed13d2a9 100644 --- a/docker/mongodb-kubernetes-tests/tests/upgrades/sharded_cluster_operator_upgrade_v1_27_to_mck.py +++ b/docker/mongodb-kubernetes-tests/tests/upgrades/sharded_cluster_operator_upgrade_v1_27_to_mck.py @@ -1,16 +1,21 @@ -from typing import Dict +from typing import Dict, Optional import pytest -from kubetester import read_configmap, try_load -from kubetester.certs import create_mongodb_tls_certs, create_sharded_cluster_certs +from kubeobject import CustomObject +from kubernetes import client +from kubetester import create_or_update_configmap, read_configmap +from kubetester.certs import create_sharded_cluster_certs +from kubetester.kubetester import ensure_nested_objects from kubetester.kubetester import fixture as yaml_fixture from kubetester.mongodb import MongoDB -from kubetester.mongotester import ReplicaSetTester, ShardedClusterTester +from kubetester.mongotester import ShardedClusterTester +from kubetester.operator import Operator from kubetester.phase import Phase from tests import test_logger from tests.conftest import ( LEGACY_OPERATOR_NAME, OPERATOR_NAME, + get_central_cluster_client, get_default_operator, install_legacy_deployment_state_meko, log_deployments_info, @@ -18,7 +23,6 @@ from tests.upgrades import downscale_operator_deployment MDB_RESOURCE = "sh001-base" -MDB_RS_RESOURCE = "rs" CERT_PREFIX = "prefix" logger = test_logger.get_test_logger(__name__) @@ -37,8 +41,6 @@ If the sharded cluster resource correctly reconciles after upgrade/downgrade and scaling steps, we assume it works correctly. 
""" - - # TODO CLOUDP-318100: this test should eventually be updated and not pinned to 1.27 anymore @@ -66,7 +68,7 @@ def server_certs(issuer: str, namespace: str) -> str: ) -@pytest.fixture(scope="function") +@pytest.fixture(scope="module") def sharded_cluster( issuer_ca_configmap: str, namespace: str, @@ -77,11 +79,7 @@ def sharded_cluster( yaml_fixture("sharded-cluster.yaml"), namespace=namespace, name=MDB_RESOURCE, - ).configure(om=None, project_name=MDB_RESOURCE) - - if try_load(resource): - return resource - + ) resource.set_version(custom_mdb_version) resource["spec"]["mongodsPerShardCount"] = 2 resource["spec"]["configServerCount"] = 2 @@ -89,34 +87,7 @@ def sharded_cluster( resource["spec"]["persistent"] = True resource.configure_custom_tls(issuer_ca_configmap, CERT_PREFIX) - return resource - - -@pytest.fixture(scope="module") -def replica_set_certs(issuer: str, namespace: str): - return create_mongodb_tls_certs(issuer, namespace, MDB_RS_RESOURCE, f"prefix-{MDB_RS_RESOURCE}-cert") - - -@pytest.fixture(scope="module") -def replica_set( - issuer_ca_configmap: str, - namespace: str, - replica_set_certs: str, - custom_mdb_version: str, -): - resource = MongoDB.from_yaml( - yaml_fixture("replica-set-basic.yaml"), - namespace=namespace, - name=MDB_RS_RESOURCE, - ).configure(om=None, project_name=f"{MDB_RS_RESOURCE}") - - if try_load(resource): - return resource - - resource.set_version(custom_mdb_version) - resource.configure_custom_tls(issuer_ca_configmap, CERT_PREFIX) - - return resource + return resource.update() @pytest.mark.e2e_sharded_cluster_operator_upgrade_v1_27_to_mck @@ -130,23 +101,16 @@ def test_install_legacy_deployment_state_meko( install_legacy_deployment_state_meko(namespace, managed_security_context, operator_installation_config) def test_create_sharded_cluster(self, sharded_cluster: MongoDB): - sharded_cluster.update() sharded_cluster.assert_reaches_phase(phase=Phase.Running, timeout=350) def test_scale_up_sharded_cluster(self, sharded_cluster: MongoDB): + sharded_cluster.load() sharded_cluster["spec"]["mongodsPerShardCount"] = 3 sharded_cluster["spec"]["configServerCount"] = 3 sharded_cluster.update() sharded_cluster.assert_reaches_phase(phase=Phase.Running, timeout=300) -@pytest.mark.e2e_sharded_cluster_operator_upgrade_v1_27_to_mck -class TestReplicaSetDeployment: - def test_create_replica_set(self, replica_set: MongoDB): - replica_set.update() - replica_set.assert_reaches_phase(phase=Phase.Running, timeout=350) - - @pytest.mark.e2e_sharded_cluster_operator_upgrade_v1_27_to_mck class TestOperatorUpgrade: @@ -173,12 +137,6 @@ def test_sharded_cluster_reconciled(self, sharded_cluster: MongoDB, namespace: s def test_assert_connectivity(self, ca_path: str): ShardedClusterTester(MDB_RESOURCE, 1, ssl=True, ca_path=ca_path).assert_connectivity() - def test_replica_set_reconciled(self, replica_set: MongoDB): - replica_set.assert_reaches_phase(phase=Phase.Running, timeout=850, ignore_errors=True) - - def test_assert_connectivity_replica_set(self, ca_path: str): - ReplicaSetTester(MDB_RS_RESOURCE, 3, ssl=True, ca_path=ca_path).assert_connectivity() - def test_scale_down_sharded_cluster(self, sharded_cluster: MongoDB, namespace: str): sharded_cluster.load() # Scale down both by 1 @@ -210,12 +168,6 @@ def test_sharded_cluster_reconciled(self, sharded_cluster: MongoDB): def test_assert_connectivity(self, ca_path: str): ShardedClusterTester(MDB_RESOURCE, 1, ssl=True, ca_path=ca_path).assert_connectivity() - def test_replica_set_reconciled(self, replica_set: MongoDB): - 
replica_set.assert_reaches_phase(phase=Phase.Running, timeout=850, ignore_errors=True) - - def test_assert_connectivity_replica_set(self, ca_path: str): - ReplicaSetTester(MDB_RS_RESOURCE, 3, ssl=True, ca_path=ca_path).assert_connectivity() - def test_scale_up_sharded_cluster(self, sharded_cluster: MongoDB): sharded_cluster.load() sharded_cluster["spec"]["mongodsPerShardCount"] = 3 From ac898f4152a7af3b2911b8129bfd329b867daeba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Thu, 18 Sep 2025 23:16:49 +0200 Subject: [PATCH 3/9] Running search test under OM 8.2 --- .evergreen.yml | 10 + .../content/agent-launcher-lib.sh | 6 +- ...multi_cluster_sharded_disaster_recovery.py | 1 - .../fixtures/om_ops_manager_basic.yaml | 13 ++ .../enterprise-replicaset-sample-mflix.yaml | 25 ++- .../tests/search/om_deployment.py | 31 +++ .../tests/search/search_enterprise_tls.py | 181 +++++++++++++----- 7 files changed, 201 insertions(+), 66 deletions(-) create mode 100644 docker/mongodb-kubernetes-tests/tests/search/om_deployment.py diff --git a/.evergreen.yml b/.evergreen.yml index 5918cdf0f..a6a3b8407 100644 --- a/.evergreen.yml +++ b/.evergreen.yml @@ -1131,6 +1131,15 @@ task_groups: - e2e_om_ops_manager_prometheus <<: *teardown_group + # Tests features only supported on OM80 + - name: e2e_ops_manager_kind_8_0_only_task_group + max_hosts: -1 + <<: *setup_group + <<: *setup_and_teardown_task + tasks: + - e2e_search_enterprise_tls + <<: *teardown_group + # Tests features only supported on OM70 and OM80, its only upgrade test as we test upgrading from 6 to 7 or 7 to 8 - name: e2e_ops_manager_upgrade_only_task_group max_hosts: -1 @@ -1341,6 +1350,7 @@ buildvariants: - name: e2e_ops_manager_kind_5_0_only_task_group_without_queryable_backup - name: e2e_ops_manager_kind_6_0_only_task_group - name: e2e_ops_manager_upgrade_only_task_group + - name: e2e_ops_manager_kind_8_0_only_task_group - name: e2e_static_om80_kind_ubi display_name: e2e_static_om80_kind_ubi diff --git a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh index aba8ca152..f60d37bc0 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh @@ -115,7 +115,7 @@ download_agent() { esac script_log "Downloading Agent version: ${AGENT_VERSION}" - script_log "Downloading a Mongodb Agent from ${base_url:?}" + script_log "Downloading a Mongodb Agent from ${base_url:?}/download/agent/automation/${AGENT_FILE}" curl_opts=( "${base_url}/download/agent/automation/${AGENT_FILE}" @@ -133,11 +133,13 @@ download_agent() { curl_opts+=("--cacert" "${SSL_TRUSTED_MMS_SERVER_CERTIFICATE}") fi + echo "Executing curl command: curl ${curl_opts[*]}" if ! 
curl "${curl_opts[@]}" &>"${MMS_LOG_DIR}/curl.log"; then script_log "Error while downloading the Mongodb agent" exit 1 fi - json_log 'agent-launcher-script' <"${MMS_LOG_DIR}/curl.log" >>"${MDB_LOG_FILE_AGENT_LAUNCHER_SCRIPT}" + + grep -v -E "bytes data\]|\[no content\]" "${MMS_LOG_DIR}/curl.log" | json_log 'agent-launcher-script' >>"${MDB_LOG_FILE_AGENT_LAUNCHER_SCRIPT}" rm "${MMS_LOG_DIR}/curl.log" 2>/dev/null || true script_log "The Mongodb Agent binary downloaded, unpacking" diff --git a/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py b/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py index 0e119fa57..02f6e8c1e 100644 --- a/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py +++ b/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py @@ -1,4 +1,3 @@ -import os import time from typing import Optional diff --git a/docker/mongodb-kubernetes-tests/tests/opsmanager/fixtures/om_ops_manager_basic.yaml b/docker/mongodb-kubernetes-tests/tests/opsmanager/fixtures/om_ops_manager_basic.yaml index daa8b85f5..a4b6ba3d1 100644 --- a/docker/mongodb-kubernetes-tests/tests/opsmanager/fixtures/om_ops_manager_basic.yaml +++ b/docker/mongodb-kubernetes-tests/tests/opsmanager/fixtures/om_ops_manager_basic.yaml @@ -15,3 +15,16 @@ spec: backup: enabled: false + + # adding this just to avoid wizard when opening OM UI + configuration: + automation.versions.source: mongodb + mms.adminEmailAddr: cloud-manager-support@mongodb.com + mms.fromEmailAddr: cloud-manager-support@mongodb.com + mms.ignoreInitialUiSetup: "true" + mms.mail.hostname: email-smtp.us-east-1.amazonaws.com + mms.mail.port: "465" + mms.mail.ssl: "true" + mms.mail.transport: smtp + mms.minimumTLSVersion: TLSv1.2 + mms.replyToEmailAddr: cloud-manager-support@mongodb.com diff --git a/docker/mongodb-kubernetes-tests/tests/search/fixtures/enterprise-replicaset-sample-mflix.yaml b/docker/mongodb-kubernetes-tests/tests/search/fixtures/enterprise-replicaset-sample-mflix.yaml index cf58035f1..926aed93d 100644 --- a/docker/mongodb-kubernetes-tests/tests/search/fixtures/enterprise-replicaset-sample-mflix.yaml +++ b/docker/mongodb-kubernetes-tests/tests/search/fixtures/enterprise-replicaset-sample-mflix.yaml @@ -19,16 +19,15 @@ spec: - SCRAM agent: logLevel: DEBUG - statefulSet: - spec: - template: - spec: - containers: - - name: mongodb-enterprise-database - resources: - limits: - cpu: "2" - memory: 2Gi - requests: - cpu: "1" - memory: 1Gi + podSpec: + podTemplate: + spec: + containers: + - name: mongodb-enterprise-database + resources: + limits: + cpu: "2" + memory: 2Gi + requests: + cpu: "1" + memory: 1Gi diff --git a/docker/mongodb-kubernetes-tests/tests/search/om_deployment.py b/docker/mongodb-kubernetes-tests/tests/search/om_deployment.py new file mode 100644 index 000000000..f8212a22e --- /dev/null +++ b/docker/mongodb-kubernetes-tests/tests/search/om_deployment.py @@ -0,0 +1,31 @@ +from typing import Optional + +from kubetester import try_load +from kubetester.kubetester import fixture as yaml_fixture +from kubetester.kubetester import is_multi_cluster +from kubetester.opsmanager import MongoDBOpsManager +from pytest import fixture +from tests.common.ops_manager.cloud_manager import is_cloud_qa +from tests.conftest import get_custom_appdb_version, get_custom_om_version +from tests.opsmanager.withMonitoredAppDB.conftest 
import enable_multi_cluster_deployment + + +def get_ops_manager(namespace: str) -> Optional[MongoDBOpsManager]: + if is_cloud_qa(): + return None + + resource: MongoDBOpsManager = MongoDBOpsManager.from_yaml( + yaml_fixture("om_ops_manager_basic.yaml"), namespace=namespace + ) + + if try_load(resource): + return resource + + resource.set_version(get_custom_om_version()) + resource.set_appdb_version(get_custom_appdb_version()) + resource.allow_mdb_rc_versions() + + if is_multi_cluster(): + enable_multi_cluster_deployment(resource) + + return resource diff --git a/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py b/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py index 887a8e5d0..e99a544db 100644 --- a/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py +++ b/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py @@ -1,19 +1,22 @@ import pymongo import yaml -from kubetester import create_or_update_secret, try_load +from kubetester import create_or_update_secret, run_periodically, try_load from kubetester.certs import create_mongodb_tls_certs, create_tls_certs from kubetester.kubetester import KubernetesTester from kubetester.kubetester import fixture as yaml_fixture from kubetester.mongodb import MongoDB from kubetester.mongodb_search import MongoDBSearch from kubetester.mongodb_user import MongoDBUser +from kubetester.omtester import skip_if_cloud_manager from kubetester.phase import Phase +from mypyc.irbuild.function import check_native_override from pytest import fixture, mark from tests import test_logger from tests.common.search import movies_search_helper -from tests.common.search.movies_search_helper import SampleMoviesSearchHelper from tests.common.search.search_tester import SearchTester -from tests.conftest import get_default_operator +from tests.conftest import get_default_operator, get_issuer_ca_filepath +from tests.opsmanager.conftest import custom_om_prev_version +from tests.search.om_deployment import get_ops_manager logger = test_logger.get_test_logger(__name__) @@ -26,11 +29,14 @@ USER_NAME = "mdb-user" USER_PASSWORD = f"{USER_NAME}-password" -MDB_RESOURCE_NAME = "mdb-rs" +MDB_RESOURCE_NAME = "mdb-ent-tls" # MongoDBSearch TLS configuration MDBS_TLS_SECRET_NAME = "mdbs-tls-secret" +MDB_VERSION_WITHOUT_BUILT_IN_ROLE = "8.0.10-ent" +MDB_VERSION_WITH_BUILT_IN_ROLE = "8.2.0-ent" + @fixture(scope="function") def mdb(namespace: str, issuer_ca_configmap: str) -> MongoDB: @@ -39,6 +45,8 @@ def mdb(namespace: str, issuer_ca_configmap: str) -> MongoDB: name=MDB_RESOURCE_NAME, namespace=namespace, ) + resource.configure(om=get_ops_manager(namespace), project_name=MDB_RESOURCE_NAME) + resource.set_version(MDB_VERSION_WITHOUT_BUILT_IN_ROLE) if try_load(resource): return resource @@ -73,6 +81,7 @@ def admin_user(namespace: str) -> MongoDBUser: if try_load(resource): return resource + resource["spec"]["mongodbResourceRef"]["name"] = MDB_RESOURCE_NAME resource["spec"]["username"] = resource.name resource["spec"]["passwordSecretKeyRef"]["name"] = f"{resource.name}-password" @@ -86,6 +95,7 @@ def user(namespace: str) -> MongoDBUser: if try_load(resource): return resource + resource["spec"]["mongodbResourceRef"]["name"] = MDB_RESOURCE_NAME resource["spec"]["username"] = resource.name resource["spec"]["passwordSecretKeyRef"]["name"] = f"{resource.name}-password" @@ -103,6 +113,7 @@ def mongot_user(namespace: str, mdbs: MongoDBSearch) -> MongoDBUser: if try_load(resource): return resource + 
resource["spec"]["mongodbResourceRef"]["name"] = MDB_RESOURCE_NAME resource["spec"]["username"] = MONGOT_USER_NAME resource["spec"]["passwordSecretKeyRef"]["name"] = f"{resource.name}-password" @@ -115,6 +126,16 @@ def test_install_operator(namespace: str, operator_installation_config: dict[str operator.assert_is_running() +@mark.e2e_search_enterprise_tls +@skip_if_cloud_manager +def test_create_ops_manager(namespace: str): + ops_manager = get_ops_manager(namespace) + if ops_manager is not None: + ops_manager.update() + ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=1200) + ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=600) + + @mark.e2e_search_enterprise_tls def test_install_tls_secrets_and_configmaps(namespace: str, mdb: MongoDB, mdbs: MongoDBSearch, issuer: str): create_mongodb_tls_certs(issuer, namespace, mdb.name, f"certs-{mdb.name}-cert", mdb.get_members()) @@ -144,19 +165,19 @@ def test_create_users( create_or_update_secret( namespace, name=admin_user["spec"]["passwordSecretKeyRef"]["name"], data={"password": ADMIN_USER_PASSWORD} ) - admin_user.create() + admin_user.update() admin_user.assert_reaches_phase(Phase.Updated, timeout=300) create_or_update_secret( namespace, name=user["spec"]["passwordSecretKeyRef"]["name"], data={"password": USER_PASSWORD} ) - user.create() + user.update() user.assert_reaches_phase(Phase.Updated, timeout=300) create_or_update_secret( namespace, name=mongot_user["spec"]["passwordSecretKeyRef"]["name"], data={"password": MONGOT_USER_PASSWORD} ) - mongot_user.create() + mongot_user.update() # we deliberately don't wait for this user to be ready, because to be reconciled successfully it needs the searchCoordinator role # which the ReplicaSet reconciler will only define in the automation config after the MongoDBSearch resource is created. 
@@ -169,27 +190,117 @@ def test_create_search_resource(mdbs: MongoDBSearch): @mark.e2e_search_enterprise_tls def test_wait_for_database_resource_ready(mdb: MongoDB): - mdb.assert_abandons_phase(Phase.Running, timeout=300) mdb.assert_reaches_phase(Phase.Running, timeout=300) - for idx in range(mdb.get_members()): - mongod_config = yaml.safe_load( - KubernetesTester.run_command_in_pod_container( - f"{mdb.name}-{idx}", mdb.namespace, ["cat", "/data/automation-mongod.conf"] + +@mark.e2e_search_enterprise_tls +def test_wait_for_mongod_parameters(mdb: MongoDB): + def check_mongod_parameters(): + parameters_are_set = True + pod_parameters = [] + for idx in range(mdb.get_members()): + mongod_config = yaml.safe_load( + KubernetesTester.run_command_in_pod_container( + f"{mdb.name}-{idx}", mdb.namespace, ["cat", "/data/automation-mongod.conf"] + ) ) - ) - setParameter = mongod_config.get("setParameter", {}) - assert ( - "mongotHost" in setParameter and "searchIndexManagementHostAndPort" in setParameter - ), "mongot parameters not found in mongod config" + set_parameter = mongod_config.get("setParameter", {}) + parameters_are_set = parameters_are_set and ( + "mongotHost" in set_parameter and "searchIndexManagementHostAndPort" in set_parameter + ) + pod_parameters.append(f"pod {idx} setParameter: {set_parameter}") + + return parameters_are_set, f'Not all pods have mongot parameters set:\n{"\n".join(pod_parameters)}' + + run_periodically(lambda: check_mongod_parameters(), timeout=200) + + +@mark.e2e_search_enterprise_tls +def test_wait_for_database_resource_ready2(mdb: MongoDB): + mdb.assert_reaches_phase(Phase.Running, timeout=300) + + +@mark.e2e_search_enterprise_tls +def test_validate_tls_connections(mdb: MongoDB, mdbs: MongoDBSearch, namespace: str): + validate_tls_connections(mdb, mdbs, namespace) + + +@mark.e2e_search_enterprise_tls +def test_search_restore_sample_database(mdb: MongoDB): + get_admin_sample_movies_helper(mdb).restore_sample_database() + + +@mark.e2e_search_enterprise_tls +def test_search_create_search_index(mdb: MongoDB): + get_user_sample_movies_helper(mdb).create_search_index() + + +@mark.e2e_search_enterprise_tls +def test_search_assert_search_query(mdb: MongoDB): + get_user_sample_movies_helper(mdb).assert_search_query(retry_timeout=60) @mark.e2e_search_enterprise_tls -def test_validate_tls_connections(mdb: MongoDB, mdbs: MongoDBSearch, namespace: str, issuer_ca_filepath: str): +class TestUpgradeMongod: + def test_check_polyfilled_role_in_ac(self, mdb: MongoDB): + custom_roles = mdb.get_automation_config_tester().automation_config.get("roles", []) + assert len(custom_roles) > 0 + assert "searchCoordinator" in [role["role"] for role in custom_roles] + + def test_mongod_version(self, mdb: MongoDB): + mdb.tester(ca_path=get_issuer_ca_filepath(), use_ssl=True).assert_version(MDB_VERSION_WITHOUT_BUILT_IN_ROLE) + + def test_upgrade_to_mongo_8_2(self, mdb: MongoDB): + mdb.set_version(MDB_VERSION_WITH_BUILT_IN_ROLE) + mdb.update() + mdb.assert_reaches_phase(Phase.Running, timeout=600) + + def test_check_polyfilled_role_not_in_ac(self, mdb: MongoDB): + custom_roles = mdb.get_automation_config_tester().automation_config.get("roles", []) + assert len(custom_roles) >= 0 + assert "searchCoordinator" not in [role["role"] for role in custom_roles] + + def test_mongod_version_after_upgrade(self, mdb: MongoDB): + mdb_tester = mdb.tester(ca_path=get_issuer_ca_filepath(), use_ssl=True) + mdb_tester.assert_scram_sha_authentication( + ADMIN_USER_NAME, ADMIN_USER_PASSWORD, "SCRAM-SHA-256", 1, 
ssl=True, tlsCAFile=get_issuer_ca_filepath() + ) + # TODO check why assert version works without auth for 8.0 and not for 8.2 + mdb_tester.assert_version(MDB_VERSION_WITH_BUILT_IN_ROLE) + + +@mark.e2e_search_enterprise_tlssh +def test_search_assert_search_query_2(mdb: MongoDB): + get_user_sample_movies_helper(mdb).assert_search_query(retry_timeout=60) + + +def get_connection_string(mdb: MongoDB, user_name: str, user_password: str) -> str: + return f"mongodb://{user_name}:{user_password}@{mdb.name}-0.{mdb.name}-svc.{mdb.namespace}.svc.cluster.local:27017/?replicaSet={mdb.name}" + + +def get_admin_sample_movies_helper(mdb): + return movies_search_helper.SampleMoviesSearchHelper( + SearchTester( + get_connection_string(mdb, ADMIN_USER_NAME, ADMIN_USER_PASSWORD), + use_ssl=True, + ca_path=get_issuer_ca_filepath(), + ) + ) + + +def get_user_sample_movies_helper(mdb): + return movies_search_helper.SampleMoviesSearchHelper( + SearchTester( + get_connection_string(mdb, USER_NAME, USER_PASSWORD), use_ssl=True, ca_path=get_issuer_ca_filepath() + ) + ) + + +def validate_tls_connections(mdb: MongoDB, mdbs: MongoDBSearch, namespace: str): with pymongo.MongoClient( f"mongodb://{mdb.name}-0.{mdb.name}-svc.{namespace}.svc.cluster.local:27017/?replicaSet={mdb.name}", tls=True, - tlsCAFile=issuer_ca_filepath, + tlsCAFile=get_issuer_ca_filepath(), tlsAllowInvalidHostnames=False, serverSelectionTimeoutMS=30000, connectTimeoutMS=20000, @@ -200,40 +311,10 @@ def test_validate_tls_connections(mdb: MongoDB, mdbs: MongoDBSearch, namespace: with pymongo.MongoClient( f"mongodb://{mdbs.name}-search-svc.{namespace}.svc.cluster.local:27027", tls=True, - tlsCAFile=issuer_ca_filepath, + tlsCAFile=get_issuer_ca_filepath(), tlsAllowInvalidHostnames=False, serverSelectionTimeoutMS=10000, connectTimeoutMS=10000, ) as search_client: search_info = search_client.admin.command("hello") assert search_info.get("ok") == 1, "MongoDBSearch connection failed" - - -@mark.e2e_search_enterprise_tls -def test_search_restore_sample_database(mdb: MongoDB, issuer_ca_filepath: str): - sample_movies_helper = movies_search_helper.SampleMoviesSearchHelper( - SearchTester( - get_connection_string(mdb, ADMIN_USER_NAME, ADMIN_USER_PASSWORD), use_ssl=True, ca_path=issuer_ca_filepath - ) - ) - sample_movies_helper.restore_sample_database() - - -@mark.e2e_search_enterprise_tls -def test_search_create_search_index(mdb: MongoDB, issuer_ca_filepath: str): - sample_movies_helper = movies_search_helper.SampleMoviesSearchHelper( - SearchTester(get_connection_string(mdb, USER_NAME, USER_PASSWORD), use_ssl=True, ca_path=issuer_ca_filepath) - ) - sample_movies_helper.create_search_index() - - -@mark.e2e_search_enterprise_tls -def test_search_assert_search_query(mdb: MongoDB, issuer_ca_filepath: str): - sample_movies_helper = movies_search_helper.SampleMoviesSearchHelper( - SearchTester(get_connection_string(mdb, USER_NAME, USER_PASSWORD), use_ssl=True, ca_path=issuer_ca_filepath) - ) - sample_movies_helper.assert_search_query(retry_timeout=60) - - -def get_connection_string(mdb: MongoDB, user_name: str, user_password: str) -> str: - return f"mongodb://{user_name}:{user_password}@{mdb.name}-0.{mdb.name}-svc.{mdb.namespace}.svc.cluster.local:27017/?replicaSet={mdb.name}" From d6d7c70d2544bb0dd726e7ca7b992de1ef3a7a57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Tue, 23 Sep 2025 08:02:02 +0200 Subject: [PATCH 4/9] Review fixes --- .../content/agent-launcher-lib.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git 
a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh index f60d37bc0..aba8ca152 100755 --- a/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh +++ b/docker/mongodb-kubernetes-init-database/content/agent-launcher-lib.sh @@ -115,7 +115,7 @@ download_agent() { esac script_log "Downloading Agent version: ${AGENT_VERSION}" - script_log "Downloading a Mongodb Agent from ${base_url:?}/download/agent/automation/${AGENT_FILE}" + script_log "Downloading a Mongodb Agent from ${base_url:?}" curl_opts=( "${base_url}/download/agent/automation/${AGENT_FILE}" @@ -133,13 +133,11 @@ download_agent() { curl_opts+=("--cacert" "${SSL_TRUSTED_MMS_SERVER_CERTIFICATE}") fi - echo "Executing curl command: curl ${curl_opts[*]}" if ! curl "${curl_opts[@]}" &>"${MMS_LOG_DIR}/curl.log"; then script_log "Error while downloading the Mongodb agent" exit 1 fi - - grep -v -E "bytes data\]|\[no content\]" "${MMS_LOG_DIR}/curl.log" | json_log 'agent-launcher-script' >>"${MDB_LOG_FILE_AGENT_LAUNCHER_SCRIPT}" + json_log 'agent-launcher-script' <"${MMS_LOG_DIR}/curl.log" >>"${MDB_LOG_FILE_AGENT_LAUNCHER_SCRIPT}" rm "${MMS_LOG_DIR}/curl.log" 2>/dev/null || true script_log "The Mongodb Agent binary downloaded, unpacking" From 2046f891c7085dab052a4ec051a44f1521ceff86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Mon, 29 Sep 2025 10:47:23 +0200 Subject: [PATCH 5/9] Incorporated cloud-qa changes --- .../tests/common/ops_manager/cloud_manager.py | 5 +++++ .../multi_cluster_sharded_disaster_recovery.py | 1 + .../tests/search/search_enterprise_tls.py | 10 +--------- 3 files changed, 7 insertions(+), 9 deletions(-) create mode 100644 docker/mongodb-kubernetes-tests/tests/common/ops_manager/cloud_manager.py diff --git a/docker/mongodb-kubernetes-tests/tests/common/ops_manager/cloud_manager.py b/docker/mongodb-kubernetes-tests/tests/common/ops_manager/cloud_manager.py new file mode 100644 index 000000000..ed936858d --- /dev/null +++ b/docker/mongodb-kubernetes-tests/tests/common/ops_manager/cloud_manager.py @@ -0,0 +1,5 @@ +import os + + +def is_cloud_qa() -> bool: + return os.getenv("ops_manager_version", "cloud_qa") == "cloud_qa" diff --git a/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py b/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py index 02f6e8c1e..0e119fa57 100644 --- a/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py +++ b/docker/mongodb-kubernetes-tests/tests/multicluster_shardedcluster/multi_cluster_sharded_disaster_recovery.py @@ -1,3 +1,4 @@ +import os import time from typing import Optional diff --git a/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py b/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py index e99a544db..18ddfe7e3 100644 --- a/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py +++ b/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py @@ -9,13 +9,11 @@ from kubetester.mongodb_user import MongoDBUser from kubetester.omtester import skip_if_cloud_manager from kubetester.phase import Phase -from mypyc.irbuild.function import check_native_override from pytest import fixture, mark from tests import test_logger from tests.common.search import movies_search_helper from tests.common.search.search_tester import SearchTester from 
tests.conftest import get_default_operator, get_issuer_ca_filepath -from tests.opsmanager.conftest import custom_om_prev_version from tests.search.om_deployment import get_ops_manager logger = test_logger.get_test_logger(__name__) @@ -130,8 +128,7 @@ def test_install_operator(namespace: str, operator_installation_config: dict[str @skip_if_cloud_manager def test_create_ops_manager(namespace: str): ops_manager = get_ops_manager(namespace) - if ops_manager is not None: - ops_manager.update() + ops_manager.update() ops_manager.om_status().assert_reaches_phase(Phase.Running, timeout=1200) ops_manager.appdb_status().assert_reaches_phase(Phase.Running, timeout=600) @@ -215,11 +212,6 @@ def check_mongod_parameters(): run_periodically(lambda: check_mongod_parameters(), timeout=200) -@mark.e2e_search_enterprise_tls -def test_wait_for_database_resource_ready2(mdb: MongoDB): - mdb.assert_reaches_phase(Phase.Running, timeout=300) - - @mark.e2e_search_enterprise_tls def test_validate_tls_connections(mdb: MongoDB, mdbs: MongoDBSearch, namespace: str): validate_tls_connections(mdb, mdbs, namespace) From 48c5aaa4391728b0048f3232970f94e434f3c0d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Mon, 29 Sep 2025 12:15:57 +0200 Subject: [PATCH 6/9] Test fixes --- .../mongodb-kubernetes-tests/kubetester/omtester.py | 11 ++--------- .../tests/search/om_deployment.py | 1 - 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/docker/mongodb-kubernetes-tests/kubetester/omtester.py b/docker/mongodb-kubernetes-tests/kubetester/omtester.py index 72874d2b6..5798a89d7 100644 --- a/docker/mongodb-kubernetes-tests/kubetester/omtester.py +++ b/docker/mongodb-kubernetes-tests/kubetester/omtester.py @@ -20,16 +20,9 @@ from kubetester.om_queryable_backups import OMQueryableBackup from opentelemetry import trace from requests.adapters import HTTPAdapter, Retry +from tests.common.ops_manager.cloud_manager import is_cloud_qa -from .kubetester import get_env_var_or_fail - - -def running_cloud_manager(): - "Determines if the current test is running against Cloud Manager" - return get_env_var_or_fail("OM_HOST") == "https://cloud-qa.mongodb.com" - - -skip_if_cloud_manager = pytest.mark.skipif(running_cloud_manager(), reason="Do not run in Cloud Manager") +skip_if_cloud_manager = pytest.mark.skipif(is_cloud_qa(), reason="Do not run in Cloud Manager") class BackupStatus(str, Enum): diff --git a/docker/mongodb-kubernetes-tests/tests/search/om_deployment.py b/docker/mongodb-kubernetes-tests/tests/search/om_deployment.py index f8212a22e..a027b6f98 100644 --- a/docker/mongodb-kubernetes-tests/tests/search/om_deployment.py +++ b/docker/mongodb-kubernetes-tests/tests/search/om_deployment.py @@ -23,7 +23,6 @@ def get_ops_manager(namespace: str) -> Optional[MongoDBOpsManager]: resource.set_version(get_custom_om_version()) resource.set_appdb_version(get_custom_appdb_version()) - resource.allow_mdb_rc_versions() if is_multi_cluster(): enable_multi_cluster_deployment(resource) From e3fa3e7d66ee657f0a7494be01ed5861a262b125 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Fri, 3 Oct 2025 08:47:36 +0200 Subject: [PATCH 7/9] Review fixes --- .../tests/search/search_enterprise_tls.py | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py b/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py index 18ddfe7e3..72aa65e10 100644 --- 
a/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py +++ b/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py @@ -1,6 +1,7 @@ import pymongo import yaml -from kubetester import create_or_update_secret, run_periodically, try_load +from kubernetes import client +from kubetester import create_or_update_secret, run_periodically, try_load, wait_until from kubetester.certs import create_mongodb_tls_certs, create_tls_certs from kubetester.kubetester import KubernetesTester from kubetester.kubetester import fixture as yaml_fixture @@ -163,12 +164,13 @@ def test_create_users( namespace, name=admin_user["spec"]["passwordSecretKeyRef"]["name"], data={"password": ADMIN_USER_PASSWORD} ) admin_user.update() - admin_user.assert_reaches_phase(Phase.Updated, timeout=300) create_or_update_secret( namespace, name=user["spec"]["passwordSecretKeyRef"]["name"], data={"password": USER_PASSWORD} ) user.update() + + admin_user.assert_reaches_phase(Phase.Updated, timeout=300) user.assert_reaches_phase(Phase.Updated, timeout=300) create_or_update_secret( @@ -192,6 +194,9 @@ def test_wait_for_database_resource_ready(mdb: MongoDB): @mark.e2e_search_enterprise_tls def test_wait_for_mongod_parameters(mdb: MongoDB): + # After search CR is deployed, MongoDB controller will pick it up + # and start adding searchCoordinator role and search-related + # parameters to the automation config. def check_mongod_parameters(): parameters_are_set = True pod_parameters = [] @@ -209,7 +214,7 @@ def check_mongod_parameters(): return parameters_are_set, f'Not all pods have mongot parameters set:\n{"\n".join(pod_parameters)}' - run_periodically(lambda: check_mongod_parameters(), timeout=200) + run_periodically(check_mongod_parameters, timeout=200) @mark.e2e_search_enterprise_tls @@ -233,6 +238,11 @@ def test_search_assert_search_query(mdb: MongoDB): @mark.e2e_search_enterprise_tls +# This test class verifies if mongodb <8.2 can be upgraded to mongodb >=8.2 +# For mongod <8.2 the operator is automatically creating searchCoordinator customRole. +# We test here that the role exists before upgrade, because +# after mongodb is upgraded, the role should be removed from AC +# From 8.2 searchCoordinator role is a built-in role. class TestUpgradeMongod: def test_check_polyfilled_role_in_ac(self, mdb: MongoDB): custom_roles = mdb.get_automation_config_tester().automation_config.get("roles", []) @@ -240,6 +250,12 @@ def test_check_polyfilled_role_in_ac(self, mdb: MongoDB): assert "searchCoordinator" in [role["role"] for role in custom_roles] def test_mongod_version(self, mdb: MongoDB): + # This test is redundant when looking at the context of the full test file, + # as we deploy MDB_VERSION_WITHOUT_BUILT_IN_ROLE initially + # But it makes sense if we take into consideration TestUpgradeMongod test class alone. + # This checks the most important prerequisite for this test class to work. + # We check the version in case the test class is reused in another place + # or executed again when running locally. 
mdb.tester(ca_path=get_issuer_ca_filepath(), use_ssl=True).assert_version(MDB_VERSION_WITHOUT_BUILT_IN_ROLE) def test_upgrade_to_mongo_8_2(self, mdb: MongoDB): @@ -257,13 +273,10 @@ def test_mongod_version_after_upgrade(self, mdb: MongoDB): mdb_tester.assert_scram_sha_authentication( ADMIN_USER_NAME, ADMIN_USER_PASSWORD, "SCRAM-SHA-256", 1, ssl=True, tlsCAFile=get_issuer_ca_filepath() ) - # TODO check why assert version works without auth for 8.0 and not for 8.2 mdb_tester.assert_version(MDB_VERSION_WITH_BUILT_IN_ROLE) - -@mark.e2e_search_enterprise_tlssh -def test_search_assert_search_query_2(mdb: MongoDB): - get_user_sample_movies_helper(mdb).assert_search_query(retry_timeout=60) + def test_search_assert_search_query_after_upgrade(self, mdb: MongoDB): + get_user_sample_movies_helper(mdb).assert_search_query(retry_timeout=60) def get_connection_string(mdb: MongoDB, user_name: str, user_password: str) -> str: From 853a47f5afc9e8f5a933a251c2865fc5d16e28de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Fri, 3 Oct 2025 10:55:28 +0200 Subject: [PATCH 8/9] Review fixes --- .../tests/search/search_enterprise_tls.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py b/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py index 72aa65e10..0ecb1a961 100644 --- a/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py +++ b/docker/mongodb-kubernetes-tests/tests/search/search_enterprise_tls.py @@ -187,11 +187,6 @@ def test_create_search_resource(mdbs: MongoDBSearch): mdbs.assert_reaches_phase(Phase.Running, timeout=300) -@mark.e2e_search_enterprise_tls -def test_wait_for_database_resource_ready(mdb: MongoDB): - mdb.assert_reaches_phase(Phase.Running, timeout=300) - - @mark.e2e_search_enterprise_tls def test_wait_for_mongod_parameters(mdb: MongoDB): # After search CR is deployed, MongoDB controller will pick it up @@ -216,6 +211,13 @@ def check_mongod_parameters(): run_periodically(check_mongod_parameters, timeout=200) +# After picking up MongoDBSearch CR, MongoDB reconciler will add mongod parameters. +# But it will not immediately mark the MongoDB CR as Pending +# spinning +@mark.e2e_search_enterprise_tls +def test_wait_for_database_resource_ready(mdb: MongoDB): + mdb.assert_reaches_phase(Phase.Running, timeout=300) + @mark.e2e_search_enterprise_tls def test_validate_tls_connections(mdb: MongoDB, mdbs: MongoDBSearch, namespace: str): @@ -244,11 +246,6 @@ def test_search_assert_search_query(mdb: MongoDB): # after mongodb is upgraded, the role should be removed from AC # From 8.2 searchCoordinator role is a built-in role. class TestUpgradeMongod: - def test_check_polyfilled_role_in_ac(self, mdb: MongoDB): - custom_roles = mdb.get_automation_config_tester().automation_config.get("roles", []) - assert len(custom_roles) > 0 - assert "searchCoordinator" in [role["role"] for role in custom_roles] - def test_mongod_version(self, mdb: MongoDB): # This test is redundant when looking at the context of the full test file, # as we deploy MDB_VERSION_WITHOUT_BUILT_IN_ROLE initially @@ -258,6 +255,11 @@ def test_mongod_version(self, mdb: MongoDB): # or executed again when running locally. 
mdb.tester(ca_path=get_issuer_ca_filepath(), use_ssl=True).assert_version(MDB_VERSION_WITHOUT_BUILT_IN_ROLE) + def test_check_polyfilled_role_in_ac(self, mdb: MongoDB): + custom_roles = mdb.get_automation_config_tester().automation_config.get("roles", []) + assert len(custom_roles) > 0 + assert "searchCoordinator" in [role["role"] for role in custom_roles] + def test_upgrade_to_mongo_8_2(self, mdb: MongoDB): mdb.set_version(MDB_VERSION_WITH_BUILT_IN_ROLE) mdb.update() From d6d17ddc4374c1043051e42d246ac39b94b03a9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=81ukasz=20Sierant?= Date: Fri, 3 Oct 2025 16:10:44 +0200 Subject: [PATCH 9/9] Check wait for the goal state without blocking --- controllers/om/automation_status.go | 45 +++++++++++++++++++ controllers/om/replicaset/om_replicaset.go | 2 +- .../operator/appdbreplicaset_controller.go | 5 +++ .../operator/authentication/authentication.go | 5 ++- controllers/operator/common_controller.go | 15 +++++-- .../mongodbmultireplicaset_controller.go | 12 ++++- .../operator/mongodbreplicaset_controller.go | 13 +++++- .../mongodbshardedcluster_controller.go | 36 +++++++++------ .../operator/mongodbstandalone_controller.go | 8 +++- 9 files changed, 117 insertions(+), 24 deletions(-) diff --git a/controllers/om/automation_status.go b/controllers/om/automation_status.go index 502628cfa..15e54e6af 100644 --- a/controllers/om/automation_status.go +++ b/controllers/om/automation_status.go @@ -2,7 +2,9 @@ package om import ( "encoding/json" + "errors" "fmt" + "github.com/mongodb/mongodb-kubernetes/controllers/operator/workflow" "maps" "slices" "sort" @@ -40,6 +42,14 @@ func buildAutomationStatusFromBytes(b []byte) (*AutomationStatus, error) { return as, nil } +type PendingErr struct { + msg string +} + +func (e PendingErr) Error() string { + return e.msg +} + // WaitForReadyState waits until the agents for relevant processes reach their state func WaitForReadyState(oc Connection, processNames []string, supressErrors bool, log *zap.SugaredLogger) error { if len(processNames) == 0 { @@ -72,6 +82,41 @@ func WaitForReadyState(oc Connection, processNames []string, supressErrors bool, return nil } +func CheckForReadyState(oc Connection, processNames []string, log *zap.SugaredLogger) workflow.Status { + err := CheckForReadyStateReturningError(oc, processNames, log) + + if err != nil { + pendingErr := PendingErr{} + if ok := errors.As(err, &pendingErr); ok { + return workflow.Pending(pendingErr.Error()) + } + + return workflow.Failed(err) + } + + return workflow.OK() +} + +func CheckForReadyStateReturningError(oc Connection, processNames []string, log *zap.SugaredLogger) error { + if len(processNames) == 0 { + log.Infow("Not checking for MongoDB agents to reach READY state (no expected processes to check)") + return nil + } + + log.Infow("Checking if MongoDB agents reached READY state...", "processes", processNames) + as, err := oc.ReadAutomationStatus() + if err != nil { + return xerrors.Errorf("Error reading Automation Agents status: %s", err) + } + + if allReachedGoalState, msg := checkAutomationStatusIsGoal(as, processNames, log); allReachedGoalState { + log.Info("MongoDB agents have reached READY state") + return nil + } else { + return PendingErr{fmt.Sprintf("MongoDB agents haven't reached READY state; %s", msg)} + } +} + // CheckAutomationStatusIsGoal returns true if all the relevant processes are in Goal // state. 
// Note, that the function is quite tolerant to any situations except for non-matching goal state, for example diff --git a/controllers/om/replicaset/om_replicaset.go b/controllers/om/replicaset/om_replicaset.go index 2e72d2c3e..b98a122da 100644 --- a/controllers/om/replicaset/om_replicaset.go +++ b/controllers/om/replicaset/om_replicaset.go @@ -58,7 +58,7 @@ func PrepareScaleDownFromMap(omClient om.Connection, rsMembers map[string][]stri return xerrors.Errorf("unable to set votes, priority to 0 in Ops Manager, hosts: %v, err: %w", processes, err) } - if err := om.WaitForReadyState(omClient, processesToWaitForGoalState, false, log); err != nil { + if err := om.CheckForReadyStateReturningError(omClient, processesToWaitForGoalState, log); err != nil { return err } diff --git a/controllers/operator/appdbreplicaset_controller.go b/controllers/operator/appdbreplicaset_controller.go index fe68e07c6..b51607f5f 100644 --- a/controllers/operator/appdbreplicaset_controller.go +++ b/controllers/operator/appdbreplicaset_controller.go @@ -2,6 +2,7 @@ package operator import ( "context" + "errors" "fmt" "path" "sort" @@ -550,6 +551,10 @@ func (r *ReconcileAppDbReplicaSet) ReconcileAppDB(ctx context.Context, opsManage // it's possible that Ops Manager will not be available when we attempt to configure AppDB monitoring // in Ops Manager. This is not a blocker to continue with the rest of the reconciliation. if err != nil { + pendingErr := om.PendingErr{} + if ok := errors.As(err, &pendingErr); ok { + return r.updateStatus(ctx, opsManager, workflow.Pending(pendingErr.Error()), log, omStatusOption) + } log.Errorf("Unable to configure monitoring of AppDB: %s, configuration will be attempted next reconciliation.", err) if podVars.ProjectID != "" { diff --git a/controllers/operator/authentication/authentication.go b/controllers/operator/authentication/authentication.go index c2e36735b..3b11e4a1c 100644 --- a/controllers/operator/authentication/authentication.go +++ b/controllers/operator/authentication/authentication.go @@ -91,7 +91,7 @@ func Configure(conn om.Connection, opts Options, isRecovering bool, log *zap.Sug if isRecovering { return nil } - return om.WaitForReadyState(conn, opts.ProcessNames, false, log) + return om.CheckForReadyStateReturningError(conn, opts.ProcessNames, log) } // we need to make sure the desired authentication mechanism for the agent exists. 
If the desired agent @@ -172,6 +172,7 @@ func Disable(conn om.Connection, opts Options, deleteUsers bool, log *zap.Sugare return xerrors.Errorf("error read/updating automation config: %w", err) } + // Disable is called also onDelete, so we cannot requeue here, we must wait if err := om.WaitForReadyState(conn, opts.ProcessNames, false, log); err != nil { return xerrors.Errorf("error waiting for ready state: %w", err) } @@ -222,7 +223,7 @@ func Disable(conn om.Connection, opts Options, deleteUsers bool, log *zap.Sugare return xerrors.Errorf("error read/updating backup agent config: %w", err) } - if err := om.WaitForReadyState(conn, opts.ProcessNames, false, log); err != nil { + if err := om.CheckForReadyStateReturningError(conn, opts.ProcessNames, log); err != nil { return xerrors.Errorf("error waiting for ready state: %w", err) } diff --git a/controllers/operator/common_controller.go b/controllers/operator/common_controller.go index e76cf4e81..43594d0df 100644 --- a/controllers/operator/common_controller.go +++ b/controllers/operator/common_controller.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "encoding/pem" + "errors" "fmt" "path/filepath" "reflect" @@ -427,9 +428,12 @@ func (r *ReconcileCommonController) updateOmAuthentication(ctx context.Context, return workflow.Failed(err), false } - // we need to wait for all agents to be ready before configuring any authentication settings - if err := om.WaitForReadyState(conn, processNames, isRecovering, log); err != nil { - return workflow.Failed(err), false + if !isRecovering { + if workflowStatus := om.CheckForReadyState(conn, processNames, log); !workflowStatus.IsOK() { + return workflowStatus, false + } + } else { + log.Warnf("Ignoring checking for ready state due to recovering") } clientCerts := util.OptionalClientCertficates @@ -515,6 +519,10 @@ func (r *ReconcileCommonController) updateOmAuthentication(ctx context.Context, } if err := authentication.Configure(conn, authOpts, isRecovering, log); err != nil { + pendingErr := om.PendingErr{} + if ok := errors.As(err, &pendingErr); ok { + return workflow.Pending(pendingErr.Error()), false + } return workflow.Failed(err), false } } else if wantToEnableAuthentication { @@ -534,6 +542,7 @@ func (r *ReconcileCommonController) updateOmAuthentication(ctx context.Context, authOpts.UserOptions = userOpts if err := authentication.Disable(conn, authOpts, false, log); err != nil { + return workflow.Failed(err), false } } diff --git a/controllers/operator/mongodbmultireplicaset_controller.go b/controllers/operator/mongodbmultireplicaset_controller.go index 5d963c00e..c42376035 100644 --- a/controllers/operator/mongodbmultireplicaset_controller.go +++ b/controllers/operator/mongodbmultireplicaset_controller.go @@ -3,6 +3,7 @@ package operator import ( "context" "encoding/json" + "errors" "fmt" "reflect" "sort" @@ -213,6 +214,10 @@ func (r *ReconcileMongoDbMultiReplicaSet) Reconcile(ctx context.Context, request status := workflow.RunInGivenOrder(publishAutomationConfigFirst, func() workflow.Status { if err := r.updateOmDeploymentRs(ctx, conn, mrs, agentCertPath, tlsCertPath, internalClusterCertPath, false, log); err != nil { + pendingErr := om.PendingErr{} + if ok := errors.As(err, &pendingErr); ok { + return workflow.Pending(pendingErr.Error()) + } return workflow.Failed(err) } return workflow.OK() @@ -789,9 +794,14 @@ func (r *ReconcileMongoDbMultiReplicaSet) updateOmDeploymentRs(ctx context.Conte reachableProcessNames = append(reachableProcessNames, proc.Name()) } } - if err := 
om.WaitForReadyState(conn, reachableProcessNames, isRecovering, log); err != nil && !isRecovering { + if isRecovering { + return nil + } + + if err := om.CheckForReadyStateReturningError(conn, reachableProcessNames, log); err != nil { return err } + return nil } diff --git a/controllers/operator/mongodbreplicaset_controller.go b/controllers/operator/mongodbreplicaset_controller.go index 470e56716..cc6111a1f 100644 --- a/controllers/operator/mongodbreplicaset_controller.go +++ b/controllers/operator/mongodbreplicaset_controller.go @@ -2,6 +2,7 @@ package operator import ( "context" + goerrors "errors" "fmt" "go.uber.org/zap" @@ -240,6 +241,10 @@ func (r *ReconcileMongoDbReplicaSet) Reconcile(ctx context.Context, request reco if scale.ReplicasThisReconciliation(rs) < rs.Status.Members { if err := replicaset.PrepareScaleDownFromStatefulSet(conn, sts, rs, log); err != nil { + pendingErr := om.PendingErr{} + if ok := goerrors.As(err, &pendingErr); ok { + return r.updateStatus(ctx, rs, workflow.Pending(pendingErr.Error()), log) + } return r.updateStatus(ctx, rs, workflow.Failed(xerrors.Errorf("Failed to prepare Replica Set for scaling down using Ops Manager: %w", err)), log) } } @@ -512,8 +517,12 @@ func (r *ReconcileMongoDbReplicaSet) updateOmDeploymentRs(ctx context.Context, c return workflow.Failed(err) } - if err := om.WaitForReadyState(conn, processNames, isRecovering, log); err != nil { - return workflow.Failed(err) + if !isRecovering { + if workflowStatus := om.CheckForReadyState(conn, processNames, log); !workflowStatus.IsOK() { + return workflowStatus + } + } else { + log.Warnf("Ignoring checking for ready state due to recovering") } reconcileResult, _ := ReconcileLogRotateSetting(conn, rs.Spec.Agent, log) diff --git a/controllers/operator/mongodbshardedcluster_controller.go b/controllers/operator/mongodbshardedcluster_controller.go index abe384b16..c9ca7880f 100644 --- a/controllers/operator/mongodbshardedcluster_controller.go +++ b/controllers/operator/mongodbshardedcluster_controller.go @@ -2,7 +2,9 @@ package operator import ( "context" + goerrors "errors" "fmt" + "k8s.io/apimachinery/pkg/api/errors" "slices" "sort" "strings" @@ -11,7 +13,6 @@ import ( "github.com/hashicorp/go-multierror" "go.uber.org/zap" "golang.org/x/xerrors" - "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/ptr" @@ -1611,6 +1612,7 @@ func (r *ShardedClusterReconcileHelper) cleanOpsManagerState(ctx context.Context } logDiffOfProcessNames(processNames, r.getHealthyProcessNames(), log.With("ctx", "cleanOpsManagerState")) + // we're onDelete, we cannot requeue, so we need to wait if err := om.WaitForReadyState(conn, r.getHealthyProcessNames(), false, log); err != nil { return err } @@ -1849,13 +1851,12 @@ func (r *ShardedClusterReconcileHelper) updateOmDeploymentShardedCluster(ctx con healthyProcessesToWaitForReadyState := r.getHealthyProcessNamesToWaitForReadyState(conn, log) logDiffOfProcessNames(processNames, healthyProcessesToWaitForReadyState, log.With("ctx", "updateOmDeploymentShardedCluster")) - if err = om.WaitForReadyState(conn, healthyProcessesToWaitForReadyState, isRecovering, log); err != nil { - if !isRecovering { - if shardsRemoving { - return workflow.Pending("automation agents haven't reached READY state: shards removal in progress: %v", err) - } - return workflow.Failed(err) + + if !isRecovering { + if workflowStatus := om.CheckForReadyState(conn, healthyProcessesToWaitForReadyState, log); !workflowStatus.IsOK() { + return 
workflowStatus
 		}
+	} else {
 		logWarnIgnoredDueToRecovery(log, err)
 	}
 
@@ -1873,12 +1874,16 @@ func (r *ShardedClusterReconcileHelper) updateOmDeploymentShardedCluster(ctx con
 
 			healthyProcessesToWaitForReadyState := r.getHealthyProcessNamesToWaitForReadyState(conn, log)
 			logDiffOfProcessNames(processNames, healthyProcessesToWaitForReadyState, log.With("ctx", "shardsRemoving"))
-			if err = om.WaitForReadyState(conn, healthyProcessesToWaitForReadyState, isRecovering, log); err != nil {
-				if !isRecovering {
-					return workflow.Failed(xerrors.Errorf("automation agents haven't reached READY state while cleaning replica set and processes: %w", err))
-				}
+			if isRecovering {
 				logWarnIgnoredDueToRecovery(log, err)
 			}
+			if err = om.CheckForReadyStateReturningError(conn, healthyProcessesToWaitForReadyState, log); err != nil {
+				pendingErr := om.PendingErr{}
+				if ok := goerrors.As(err, &pendingErr); ok {
+					return workflow.Pending(pendingErr.Error())
+				}
+				return workflow.Failed(err)
+			}
 		}
 
 	currentHosts := r.getAllHostnames(false)
@@ -2042,8 +2047,13 @@ func (r *ShardedClusterReconcileHelper) publishDeployment(ctx context.Context, c
 
 		healthyProcessesToWaitForReadyState = r.getHealthyProcessNamesToWaitForReadyState(conn, log)
 		logDiffOfProcessNames(opts.processNames, healthyProcessesToWaitForReadyState, log.With("ctx", "publishDeployment"))
-		if err := om.WaitForReadyState(conn, healthyProcessesToWaitForReadyState, isRecovering, log); err != nil {
-			return nil, shardsRemoving, workflow.Failed(err)
+
+		if !isRecovering {
+			if workflowStatus := om.CheckForReadyState(conn, healthyProcessesToWaitForReadyState, log); !workflowStatus.IsOK() {
+				return nil, shardsRemoving, workflowStatus
+			}
+		} else {
+			log.Warnf("Ignoring checking for ready state due to recovering")
 		}
 
 		if additionalReconciliationRequired {
diff --git a/controllers/operator/mongodbstandalone_controller.go b/controllers/operator/mongodbstandalone_controller.go
index 47ff489bb..ad5b20c13 100644
--- a/controllers/operator/mongodbstandalone_controller.go
+++ b/controllers/operator/mongodbstandalone_controller.go
@@ -353,8 +353,12 @@ func (r *ReconcileMongoDbStandalone) updateOmDeployment(ctx context.Context, con
 		return workflow.Failed(err)
 	}
 
-	if err := om.WaitForReadyState(conn, []string{set.Name}, isRecovering, log); err != nil {
-		return workflow.Failed(err)
+	if !isRecovering {
+		if workflowStatus := om.CheckForReadyState(conn, []string{set.Name}, log); !workflowStatus.IsOK() {
+			return workflowStatus
+		}
+	} else {
+		log.Warnf("Ignoring checking for ready state due to recovering")
 	}
 
 	if additionalReconciliationRequired {
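
// A minimal sketch of the call-site pattern this last patch converges on. The two helper
// functions below are hypothetical; only om.CheckForReadyState, om.PendingErr, the workflow
// helpers, and the recovery warning come from the hunks above, and the import paths are
// inferred from the file layout in this series.
package operator

import (
	"errors"

	"go.uber.org/zap"

	"github.com/mongodb/mongodb-kubernetes/controllers/om"
	"github.com/mongodb/mongodb-kubernetes/controllers/operator/workflow"
)

// ensureAgentsReady stands in for a blocking om.WaitForReadyState call: it returns
// workflow.Pending (requeue) while agents are still converging, workflow.Failed on real
// errors, and workflow.OK() once all processes have reached goal state.
func ensureAgentsReady(conn om.Connection, processNames []string, isRecovering bool, log *zap.SugaredLogger) workflow.Status {
	if isRecovering {
		// During automatic recovery the readiness gate is skipped, mirroring the controllers above.
		log.Warnf("Ignoring checking for ready state due to recovering")
		return workflow.OK()
	}
	return om.CheckForReadyState(conn, processNames, log)
}

// toWorkflowStatus shows how call sites that still deal in plain errors can map
// om.PendingErr to a requeue instead of a failure.
func toWorkflowStatus(err error) workflow.Status {
	if err == nil {
		return workflow.OK()
	}
	pendingErr := om.PendingErr{}
	if errors.As(err, &pendingErr) {
		return workflow.Pending(pendingErr.Error())
	}
	return workflow.Failed(err)
}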