From be21a5d6af803026361650e33cd05ff44d76afde Mon Sep 17 00:00:00 2001
From: Natalia Marukovich
Date: Sat, 11 Feb 2023 13:32:11 +0400
Subject: [PATCH 1/5] add new tags

---
 cloud/jenkins/psmdb_operator_eks_latest.groovy  | 1 +
 cloud/jenkins/psmdb_operator_eks_version.groovy | 1 +
 cloud/jenkins/pxc_operator_eks_latest.groovy    | 1 +
 cloud/jenkins/pxc_operator_eks_version.groovy   | 1 +
 4 files changed, 4 insertions(+)

diff --git a/cloud/jenkins/psmdb_operator_eks_latest.groovy b/cloud/jenkins/psmdb_operator_eks_latest.groovy
index aa583af752..3a819fef28 100644
--- a/cloud/jenkins/psmdb_operator_eks_latest.groovy
+++ b/cloud/jenkins/psmdb_operator_eks_latest.groovy
@@ -41,6 +41,7 @@ nodeGroups:
     tags:
         'iit-billing-tag': 'jenkins-eks'
         'delete-cluster-after-hours': '10'
+        'team': 'cloud'
 EOF
 """

diff --git a/cloud/jenkins/psmdb_operator_eks_version.groovy b/cloud/jenkins/psmdb_operator_eks_version.groovy
index 0dc6637a23..7dd1975b61 100644
--- a/cloud/jenkins/psmdb_operator_eks_version.groovy
+++ b/cloud/jenkins/psmdb_operator_eks_version.groovy
@@ -41,6 +41,7 @@ nodeGroups:
     tags:
         'iit-billing-tag': 'jenkins-eks'
         'delete-cluster-after-hours': '10'
+        'team': 'cloud'
 EOF
 """

diff --git a/cloud/jenkins/pxc_operator_eks_latest.groovy b/cloud/jenkins/pxc_operator_eks_latest.groovy
index 292acabbf1..17027098d4 100644
--- a/cloud/jenkins/pxc_operator_eks_latest.groovy
+++ b/cloud/jenkins/pxc_operator_eks_latest.groovy
@@ -41,6 +41,7 @@ nodeGroups:
     tags:
         'iit-billing-tag': 'jenkins-eks'
        'delete-cluster-after-hours': '10'
+        'team': 'cloud'
 EOF
 """

diff --git a/cloud/jenkins/pxc_operator_eks_version.groovy b/cloud/jenkins/pxc_operator_eks_version.groovy
index ca96f78e5d..4b1675674c 100644
--- a/cloud/jenkins/pxc_operator_eks_version.groovy
+++ b/cloud/jenkins/pxc_operator_eks_version.groovy
@@ -41,6 +41,7 @@ nodeGroups:
     tags:
         'iit-billing-tag': 'jenkins-eks'
        'delete-cluster-after-hours': '10'
+        'team': 'cloud'
 EOF
 """
-x "local.settings.json" ".funcignore" "**/__pycache__/*" ".git/*" ".venv/*" + +az functionapp deployment source config-zip --resource-group percona-operators --name DeleteOrpanedK8sResources --src ../aks-cleanup.zip + +`` diff --git a/cloud/azure/cmd/.funcignore b/cloud/azure/cmd/.funcignore new file mode 100644 index 0000000000..41740c9736 --- /dev/null +++ b/cloud/azure/cmd/.funcignore @@ -0,0 +1,6 @@ +.venv/ +.env +__pycache__/ +.local/ +bin/ +obj/ diff --git a/cloud/azure/cmd/aks-cleanup-function/__init__.py b/cloud/azure/cmd/aks-cleanup-function/__init__.py new file mode 100644 index 0000000000..704cc4d799 --- /dev/null +++ b/cloud/azure/cmd/aks-cleanup-function/__init__.py @@ -0,0 +1,169 @@ +# Remove expired AKS clusters (Azure, cluster-only) + +import os +import math +import logging +import datetime +import time +import azure.functions as func +from typing import List, Dict, Optional + +from azure.identity import DefaultAzureCredential +from azure.mgmt.resource import ResourceManagementClient +from azure.mgmt.containerservice import ContainerServiceClient +from azure.core.exceptions import ResourceNotFoundError, HttpResponseError + +DRY_RUN = os.getenv("DRY_RUN", "true").lower() == "true" + +credential: Optional[DefaultAzureCredential] = None +resource_groups_client: Optional[ResourceManagementClient] = None +aks_client: Optional[ContainerServiceClient] = None + +# Resolve RG for a cluster name +CLUSTER_RG_MAP: Dict[str, str] = {} + + +def parse_epoch_creation_time(tags: dict) -> Optional[datetime.datetime]: + """Try parse tags['creation-time'] (epoch seconds) into aware datetime UTC.""" + raw = (tags or {}).get("creation-time") + if not raw: + return None + try: + ts = float(raw) + return datetime.datetime.fromtimestamp(ts, tz=datetime.timezone.utc) + except Exception: + logging.warning("Invalid creation-time tag: %r", raw) + return None + + +def is_cluster_to_terminate(cluster) -> bool: + """ + Delete rules: + - requires tag team=cloud (case-insensitive) + - if TTL tag missing -> True (delete by policy) + - else TTL must be an integer number of hours + - delete when (now - creation-time[tag]) in hours > TTL + - if TTL present but creation-time missing/invalid -> safe skip + """ + tags = cluster.tags or {} + name = getattr(cluster, "name", "") + logging.info("Cluster %s tags: %s", name, tags) + + if tags.get("team", "").lower() != "cloud": + return False + + ttl_hours = tags.get("delete-cluster-after-hours") + if ttl_hours is None: + logging.info("Cluster %s has no TTL tag — marked for deletion by policy", name) + return True + + created_at = parse_epoch_creation_time(tags) + logging.info("Cluster %s created_at: %s", cluster.name, created_at) + if created_at is None: + logging.info("Cluster %s has TTL but no valid creation-time tag — skipping", name) + return False + now = datetime.datetime.now(datetime.timezone.utc) + lifetime_hours = int(math.ceil((now - created_at).total_seconds() / 3600.0)) + + return lifetime_hours > int(ttl_hours) + + +def get_clusters_to_terminate() -> List[str]: + """ + Scan all resource groups, return cluster names to delete. + Also populate CLUSTER_RG_MAP[name] = rg for later deletion. 
+ """ + clusters_for_deletion: List[str] = [] + CLUSTER_RG_MAP.clear() + + for rg in resource_groups_client.resource_groups.list(): + rg_name = rg.name + try: + for mc in aks_client.managed_clusters.list_by_resource_group(rg_name): + if is_cluster_to_terminate(mc): + clusters_for_deletion.append(mc.name) + CLUSTER_RG_MAP[mc.name] = rg_name + except HttpResponseError as e: + logging.warning("Failed to list AKS in RG %s: %s", rg_name, e) + + if not clusters_for_deletion: + logging.info("There are no clusters for deletion") + return clusters_for_deletion + + +def wait_for_cluster_delete(cluster_name: str, timeout: int = 300, sleep_time: int = 10): + """Poll until the AKS cluster disappears (or timeout).""" + attempts = timeout // sleep_time + for attempt in range(attempts): + rg_name = CLUSTER_RG_MAP.get(cluster_name) + if not rg_name: + logging.info("Cluster %s RG mapping missing; assuming deleted", cluster_name) + return + try: + _ = aks_client.managed_clusters.get(rg_name, cluster_name) + logging.info( + "Cluster %s still exists. Attempt %d/%d. Sleeping %ds.", + cluster_name, attempt + 1, attempts, sleep_time + ) + time.sleep(sleep_time) + except ResourceNotFoundError: + logging.info("Cluster %s was successfully deleted.", cluster_name) + return + except HttpResponseError as e: + status = getattr(e, "status_code", None) + if status == 404 or "NotFound" in str(e) or "404" in str(e): + logging.info("Cluster %s was successfully deleted.", cluster_name) + return + logging.warning("Error checking cluster %s: %s", cluster_name, e) + time.sleep(sleep_time) + logging.error("Cluster %s was not deleted in %d seconds.", cluster_name, timeout) + + +def delete_cluster(cluster_name: str): + """ + Resolve RG from CLUSTER_RG_MAP (or scan), then delete the AKS cluster. 
+ """ + rg_name = CLUSTER_RG_MAP.get(cluster_name) + if not rg_name: + # Slow path: try to resolve by scanning RGs + for rg in resource_groups_client.resource_groups.list(): + try: + _ = aks_client.managed_clusters.get(rg.name, cluster_name) + rg_name = rg.name + CLUSTER_RG_MAP[cluster_name] = rg_name + break + except Exception: + continue + + if not rg_name: + logging.info("Cluster %s not found — skipping", cluster_name) + return + + if DRY_RUN: + logging.info("[DRY-RUN] Would delete cluster %s/%s", rg_name, cluster_name) + return + + aks_client.managed_clusters.begin_delete(rg_name, cluster_name) + wait_for_cluster_delete(cluster_name) + + +def main(mytimer: func.TimerRequest) -> None: + + global credential, resource_groups_client, aks_client + + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") + + subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID") + if not subscription_id: + logging.error("AZURE_SUBSCRIPTION_ID is not set") + return + + credential = DefaultAzureCredential() + resource_groups_client = ResourceManagementClient(credential, subscription_id) + aks_client = ContainerServiceClient(credential, subscription_id) + + logging.info("Searching for AKS clusters to remove.") + clusters = get_clusters_to_terminate() + for cluster in clusters: + logging.info("Terminating %s", cluster) + delete_cluster(cluster) diff --git a/cloud/azure/cmd/aks-cleanup-function/function.json b/cloud/azure/cmd/aks-cleanup-function/function.json new file mode 100644 index 0000000000..53551154e0 --- /dev/null +++ b/cloud/azure/cmd/aks-cleanup-function/function.json @@ -0,0 +1,12 @@ +{ + "scriptFile": "__init__.py", + "entryPoint": "main", + "bindings": [ + { + "name": "mytimer", + "type": "timerTrigger", + "direction": "in", + "schedule": "0 0 * * * *" + } + ] +} \ No newline at end of file diff --git a/cloud/azure/cmd/host.json b/cloud/azure/cmd/host.json new file mode 100644 index 0000000000..5ce595737f --- /dev/null +++ b/cloud/azure/cmd/host.json @@ -0,0 +1 @@ +{ "version": "2.0" } diff --git a/cloud/azure/cmd/local.settings.json b/cloud/azure/cmd/local.settings.json new file mode 100644 index 0000000000..be1644756d --- /dev/null +++ b/cloud/azure/cmd/local.settings.json @@ -0,0 +1,10 @@ +{ + "IsEncrypted": false, + "Values": { + "AzureWebJobsStorage": "UseDevelopmentStorage=true", + "FUNCTIONS_WORKER_RUNTIME": "python", + "DRY_RUN": "true", + "SLEEP_BETWEEN_DELETES_SECONDS": "0.2", + "DELETE_START_MAX_RETRIES": "3" + } +} \ No newline at end of file diff --git a/cloud/azure/cmd/requirements.txt b/cloud/azure/cmd/requirements.txt new file mode 100644 index 0000000000..79e6a4d781 --- /dev/null +++ b/cloud/azure/cmd/requirements.txt @@ -0,0 +1,5 @@ +azure-functions>=1.18.0 +azure-identity>=1.17.1 +azure-mgmt-containerservice>=31.0.0 +azure-mgmt-resource>=23.1.1 +azure-core>=1.30.0 From 0accc27d6d8d2267e18d831820fc0415ebbe701f Mon Sep 17 00:00:00 2001 From: Natalia Marukovich Date: Mon, 3 Nov 2025 12:09:03 +0100 Subject: [PATCH 3/5] add tags to jenkins jobs --- cloud/jenkins/pgo_aks.groovy | 1 + cloud/jenkins/psmdbo_aks.groovy | 1 + cloud/jenkins/pxco_aks.groovy | 1 + 3 files changed, 3 insertions(+) diff --git a/cloud/jenkins/pgo_aks.groovy b/cloud/jenkins/pgo_aks.groovy index 27850c4296..109fafc4ed 100644 --- a/cloud/jenkins/pgo_aks.groovy +++ b/cloud/jenkins/pgo_aks.groovy @@ -220,6 +220,7 @@ void createCluster(String CLUSTER_SUFFIX) { --enable-cluster-autoscaler \ --outbound-type loadbalancer \ --kubernetes-version $PLATFORM_VER \ + --tags team=cloud 
From b47394d35a3b30999d3981b0840d1d95344b4bac Mon Sep 17 00:00:00 2001
From: Natalia Marukovich
Date: Wed, 26 Nov 2025 14:51:13 +0100
Subject: [PATCH 4/5] fix readme

---
 cloud/azure/README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/cloud/azure/README.md b/cloud/azure/README.md
index 8267326a1e..3d1e9a6bc7 100644
--- a/cloud/azure/README.md
+++ b/cloud/azure/README.md
@@ -1,4 +1,11 @@
-# To redeploy function run in cmd folder:
+1. In the Azure portal, search for **Function App** (make sure the subscription is set to **eng-cloud-dev**).
+   In the list, find the Function App named **DeleteOrpanedK8sResources**.
+
+2. Open this Function App and select the function **aks-cleanup-function**.
+
+3. To update this function, modify the code **locally** and then **redeploy** it to Azure.
+
+# To redeploy function run in jenkins-pipelines/cloud/azure/cmd folder:
 ``
 zip -r ../aks-cleanup.zip . -x "local.settings.json" ".funcignore" "**/__pycache__/*" ".git/*" ".venv/*"
 
-x "local.settings.json" ".funcignore" "**/__pycache az functionapp deployment source config-zip --resource-group percona-operators --name DeleteOrpanedK8sResources --src ../aks-cleanup.zip `` + + diff --git a/cloud/azure/cmd/aks-cleanup-function/__init__.py b/cloud/azure/cmd/aks-cleanup-function/__init__.py index 704cc4d799..8ad2ce3d8c 100644 --- a/cloud/azure/cmd/aks-cleanup-function/__init__.py +++ b/cloud/azure/cmd/aks-cleanup-function/__init__.py @@ -4,7 +4,6 @@ import math import logging import datetime -import time import azure.functions as func from typing import List, Dict, Optional @@ -14,6 +13,7 @@ from azure.core.exceptions import ResourceNotFoundError, HttpResponseError DRY_RUN = os.getenv("DRY_RUN", "true").lower() == "true" +DELETE_TIMEOUT = int(os.getenv("DELETE_TIMEOUT", "600")) # 10 minutes default credential: Optional[DefaultAzureCredential] = None resource_groups_client: Optional[ResourceManagementClient] = None @@ -91,37 +91,10 @@ def get_clusters_to_terminate() -> List[str]: return clusters_for_deletion -def wait_for_cluster_delete(cluster_name: str, timeout: int = 300, sleep_time: int = 10): - """Poll until the AKS cluster disappears (or timeout).""" - attempts = timeout // sleep_time - for attempt in range(attempts): - rg_name = CLUSTER_RG_MAP.get(cluster_name) - if not rg_name: - logging.info("Cluster %s RG mapping missing; assuming deleted", cluster_name) - return - try: - _ = aks_client.managed_clusters.get(rg_name, cluster_name) - logging.info( - "Cluster %s still exists. Attempt %d/%d. Sleeping %ds.", - cluster_name, attempt + 1, attempts, sleep_time - ) - time.sleep(sleep_time) - except ResourceNotFoundError: - logging.info("Cluster %s was successfully deleted.", cluster_name) - return - except HttpResponseError as e: - status = getattr(e, "status_code", None) - if status == 404 or "NotFound" in str(e) or "404" in str(e): - logging.info("Cluster %s was successfully deleted.", cluster_name) - return - logging.warning("Error checking cluster %s: %s", cluster_name, e) - time.sleep(sleep_time) - logging.error("Cluster %s was not deleted in %d seconds.", cluster_name, timeout) - - -def delete_cluster(cluster_name: str): +def terminate_cluster(cluster_name: str): """ Resolve RG from CLUSTER_RG_MAP (or scan), then delete the AKS cluster. + Uses .result() to wait for deletion completion. 
""" rg_name = CLUSTER_RG_MAP.get(cluster_name) if not rg_name: @@ -143,9 +116,17 @@ def delete_cluster(cluster_name: str): logging.info("[DRY-RUN] Would delete cluster %s/%s", rg_name, cluster_name) return - aks_client.managed_clusters.begin_delete(rg_name, cluster_name) - wait_for_cluster_delete(cluster_name) + try: + logging.info("Starting deletion of cluster %s/%s", rg_name, cluster_name) + + aks_client.managed_clusters.begin_delete(rg_name, cluster_name).result(timeout=DELETE_TIMEOUT) + + logging.info("Cluster %s was successfully deleted", cluster_name) + except TimeoutError: + logging.error("Cluster %s deletion timed out after %d seconds", cluster_name, DELETE_TIMEOUT) + except Exception as e: + logging.error("Failed to delete cluster %s: %s", cluster_name, e) def main(mytimer: func.TimerRequest) -> None: @@ -162,8 +143,8 @@ def main(mytimer: func.TimerRequest) -> None: resource_groups_client = ResourceManagementClient(credential, subscription_id) aks_client = ContainerServiceClient(credential, subscription_id) - logging.info("Searching for AKS clusters to remove.") + logging.info("Searching for AKS clusters to terminate.") clusters = get_clusters_to_terminate() for cluster in clusters: logging.info("Terminating %s", cluster) - delete_cluster(cluster) + terminate_cluster(cluster) \ No newline at end of file