From e49ba685e5c083fef586ade44d07f37c293d0544 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Mon, 6 Oct 2025 14:08:37 +0200 Subject: [PATCH 1/2] Fix removeUntaggedEc2: validate billing tags for EKS instances Previously, EKS clusters without billing tags were deleted if they didn't match the skip pattern, even if they had valid billing tags. This fix adds proper billing tag validation for EKS instances. Changes: - Add has_valid_billing_tag() to validate category strings and timestamps - Check billing tags BEFORE skip pattern for EKS instances - Support Unix timestamp expiration in iit-billing-tag - Add comprehensive test suite with 12 test scenarios - All tests pass with proper timezone handling (UTC) --- cloud/aws-functions/removeUntaggedEc2.py | 457 ++++++++++++++++++ .../test_removeUntaggedEc2_logic.py | 386 +++++++++++++++ 2 files changed, 843 insertions(+) create mode 100644 cloud/aws-functions/removeUntaggedEc2.py create mode 100755 cloud/aws-functions/test_removeUntaggedEc2_logic.py diff --git a/cloud/aws-functions/removeUntaggedEc2.py b/cloud/aws-functions/removeUntaggedEc2.py new file mode 100644 index 0000000000..71b3348abd --- /dev/null +++ b/cloud/aws-functions/removeUntaggedEc2.py @@ -0,0 +1,457 @@ +import logging +import datetime +import boto3 +import os +import re +from botocore.exceptions import ClientError + +# Set logging level to INFO +logger = logging.getLogger() +logger.setLevel("INFO") + +# Get environment variable for EKS cluster skip pattern +EKS_SKIP_PATTERN = os.environ.get("EKS_SKIP_PATTERN", "pe-.*") +logger.info(f"EKS_SKIP_PATTERN: {EKS_SKIP_PATTERN}") + +# Track EKS clusters marked for deletion per region +eks_clusters_to_delete = {} + + +def convert_tags_to_dict(tags): + return {tag["Key"]: tag["Value"] for tag in tags} if tags else {} + + +def get_eks_cluster_name(tags_dict): + """Extract EKS cluster name from instance tags""" + # Check multiple possible tag keys for cluster name + cluster_keys = ["aws:eks:cluster-name", "eks:eks-cluster-name"] + + for key in cluster_keys: + if key in tags_dict: + return tags_dict[key] + + # Check for kubernetes.io/cluster/* tags + for key in tags_dict.keys(): + if key.startswith("kubernetes.io/cluster/"): + return key.replace("kubernetes.io/cluster/", "") + + return None + + +def has_valid_billing_tag(tags_dict, instance_launch_time): + """ + Check if instance has a valid iit-billing-tag. + + For regular instances: any non-empty value is valid + For timestamp-based tags: check if Unix timestamp is in the future + """ + if "iit-billing-tag" not in tags_dict: + return False + + tag_value = tags_dict["iit-billing-tag"] + + # Empty tag is invalid + if not tag_value: + return False + + # Try to parse as Unix timestamp (for EKS auto-expiration) + try: + expiration_timestamp = int(tag_value) + current_timestamp = int( + datetime.datetime.now(datetime.timezone.utc).timestamp() + ) + + # If it's a valid future timestamp, check if it's expired + if expiration_timestamp > current_timestamp: + logger.info( + f"Instance has valid billing tag with expiration {expiration_timestamp} " + f"(expires in {expiration_timestamp - current_timestamp} seconds)" + ) + return True + else: + logger.info( + f"Instance billing tag expired: {expiration_timestamp} < {current_timestamp} " + f"(expired {current_timestamp - expiration_timestamp} seconds ago)" + ) + return False + except ValueError: + # Not a timestamp, treat as category string (e.g., "pmm-staging", "jenkins-pmm-slave") + # Any non-empty category string is valid + logger.info(f"Instance has valid billing tag category: {tag_value}") + return True + + +def is_eks_managed_instance(instance, region): + """Check if instance is managed by EKS and if it should be skipped""" + tags_dict = convert_tags_to_dict(instance.tags) + + # Check for EKS-related tags + eks_indicators = [ + "kubernetes.io/cluster/", + "aws:eks:cluster-name", + "eks:eks-cluster-name", + "eks:kubernetes-node-pool-name", + "aws:ec2:managed-launch", + ] + + is_eks = False + for key in tags_dict.keys(): + for indicator in eks_indicators: + if indicator in key: + is_eks = True + break + if is_eks: + break + + if not is_eks: + return False + + # It's an EKS instance, now check billing tag and skip pattern + cluster_name = get_eks_cluster_name(tags_dict) + has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + # If has valid billing tag, always skip (it's legitimate) + if has_billing_tag: + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"has valid iit-billing-tag, skipping" + ) + return True + + # No billing tag - check skip pattern + if cluster_name and EKS_SKIP_PATTERN: + try: + if re.match(EKS_SKIP_PATTERN, cluster_name): + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"matches skip pattern '{EKS_SKIP_PATTERN}', skipping" + ) + return True + else: + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"does NOT match skip pattern '{EKS_SKIP_PATTERN}' and has no valid billing tag, " + f"marking cluster for deletion" + ) + # Track this cluster for deletion + if region not in eks_clusters_to_delete: + eks_clusters_to_delete[region] = set() + eks_clusters_to_delete[region].add(cluster_name) + return True # Skip individual instance termination, we'll delete the whole cluster + except re.error as e: + logger.error( + f"Invalid regex pattern '{EKS_SKIP_PATTERN}': {e}, skipping all EKS instances" + ) + return True + + # If no cluster name found, skip the instance + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name or 'unknown'}), skipping" + ) + return True + + +def is_instance_to_terminate(instance): + # Check if the instance has valid 'iit-billing-tag' + tags_dict = convert_tags_to_dict(instance.tags) + has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + # Calculate the running time of the instance + current_time = datetime.datetime.now(datetime.timezone.utc) + launch_time = instance.launch_time + running_time = current_time - launch_time + + # Terminate instances without valid 'iit-billing-tag' running for more than 10 minutes + if not has_billing_tag and running_time.total_seconds() > 600: + return True + return False + + +def cleanup_failed_stack_resources(stack_name, region): + """Manually clean up resources that prevent stack deletion""" + try: + cfn = boto3.client("cloudformation", region_name=region) + ec2 = boto3.client("ec2", region_name=region) + + # Get failed resources from stack events + events = cfn.describe_stack_events(StackName=stack_name) + failed_resources = {} + + for event in events["StackEvents"]: + if event.get("ResourceStatus") == "DELETE_FAILED": + logical_id = event["LogicalResourceId"] + if logical_id not in failed_resources: # Only keep first occurrence + failed_resources[logical_id] = { + "Type": event["ResourceType"], + "PhysicalId": event.get("PhysicalResourceId"), + } + + if not failed_resources: + return True + + logger.info( + f"Attempting to clean up {len(failed_resources)} failed resources for stack {stack_name}" + ) + + # Process each failed resource type + for logical_id, resource in failed_resources.items(): + resource_type = resource["Type"] + physical_id = resource["PhysicalId"] + + try: + # Clean up security group ingress rules + if resource_type == "AWS::EC2::SecurityGroupIngress" and physical_id: + sg_id = physical_id.split("|")[0] if "|" in physical_id else None + if sg_id and sg_id.startswith("sg-"): + response = ec2.describe_security_groups(GroupIds=[sg_id]) + if response["SecurityGroups"]: + sg = response["SecurityGroups"][0] + if sg["IpPermissions"]: + ec2.revoke_security_group_ingress( + GroupId=sg_id, IpPermissions=sg["IpPermissions"] + ) + logger.info(f"Cleaned up ingress rules for {sg_id}") + + # Clean up route table associations + elif ( + resource_type == "AWS::EC2::SubnetRouteTableAssociation" + and physical_id + ): + # PhysicalId is the association ID + if physical_id.startswith("rtbassoc-"): + ec2.disassociate_route_table(AssociationId=physical_id) + logger.info(f"Disassociated route table {physical_id}") + + # Clean up routes + elif resource_type == "AWS::EC2::Route" and physical_id: + # PhysicalId format: rtb-xxx_destination + parts = physical_id.split("_") + if len(parts) == 2 and parts[0].startswith("rtb-"): + rtb_id = parts[0] + dest_cidr = parts[1] + ec2.delete_route( + RouteTableId=rtb_id, DestinationCidrBlock=dest_cidr + ) + logger.info(f"Deleted route {dest_cidr} from {rtb_id}") + + except ClientError as e: + error_code = e.response.get("Error", {}).get("Code", "") + # Ignore if resource already deleted + if error_code not in [ + "InvalidGroup.NotFound", + "InvalidAssociationID.NotFound", + "InvalidRoute.NotFound", + ]: + logger.warning( + f"Could not clean up {resource_type} {physical_id}: {e}" + ) + except Exception as e: + logger.warning( + f"Unexpected error cleaning up {resource_type} {physical_id}: {e}" + ) + + return True + + except Exception as e: + logger.error(f"Error cleaning up failed resources for stack {stack_name}: {e}") + return False + + +def delete_eks_cluster_stack(cluster_name, region): + """Delete EKS cluster by removing its CloudFormation stack""" + try: + cfn = boto3.client("cloudformation", region_name=region) + + # Find CloudFormation stack for this cluster + stack_name = f"eksctl-{cluster_name}-cluster" + + # Check if stack exists and its current status + try: + response = cfn.describe_stacks(StackName=stack_name) + stack_status = response["Stacks"][0]["StackStatus"] + except ClientError as e: + if "does not exist" in str(e): + logger.warning( + f"CloudFormation stack {stack_name} not found in {region}, cannot delete cluster {cluster_name}" + ) + return False + raise + + # Handle DELETE_FAILED status - retry after cleanup + if stack_status == "DELETE_FAILED": + logger.info( + f"Stack {stack_name} previously failed deletion, attempting cleanup and retry" + ) + cleanup_failed_stack_resources(stack_name, region) + # Retry deletion + cfn.delete_stack(StackName=stack_name) + logger.info(f"Retrying deletion of stack {stack_name} after cleanup") + return True + + # Handle already deleting + if "DELETE" in stack_status and stack_status != "DELETE_COMPLETE": + logger.info(f"Stack {stack_name} already deleting (status: {stack_status})") + return True + + # Initiate deletion for new stacks + logger.info( + f"Deleting CloudFormation stack {stack_name} for EKS cluster {cluster_name} in {region}" + ) + cfn.delete_stack(StackName=stack_name) + logger.info( + f"Successfully initiated deletion of stack {stack_name} for cluster {cluster_name}" + ) + return True + + except ClientError as e: + logger.error( + f"Failed to delete CloudFormation stack for cluster {cluster_name} in {region}: {e}" + ) + return False + except Exception as e: + logger.error( + f"Unexpected error deleting cluster {cluster_name} in {region}: {e}" + ) + return False + + +def cirrus_ci_add_iit_billing_tag(instance): + # Convert tags to a dictionary for easier access + tags_dict = convert_tags_to_dict(instance.tags) + + # Check if the instance has 'CIRRUS_CI' tag set to 'true' and 'iit-billing-tag' is not set + has_cirrus_ci_tag = tags_dict.get("CIRRUS_CI", "").lower() == "true" + has_iit_billing_tag = "iit-billing-tag" in tags_dict + + # Extract additional tag values + instance_name = tags_dict.get("Name") + cirrus_repo_full_name = tags_dict.get("CIRRUS_REPO_FULL_NAME") + cirrus_task_id = tags_dict.get("CIRRUS_TASK_ID") + + # If 'CIRRUS_CI' tag is set to 'true' and 'iit-billing-tag' is not set, add 'iit-billing-tag' set to 'CirrusCI' + if has_cirrus_ci_tag and not has_iit_billing_tag: + try: + instance.create_tags(Tags=[{"Key": "iit-billing-tag", "Value": "CirrusCI"}]) + logger.info( + f"Instance {instance.id} ({instance_name}) tagged with 'iit-billing-tag: CirrusCI'. " + f"CIRRUS_REPO_FULL_NAME: {cirrus_repo_full_name}, CIRRUS_TASK_ID: {cirrus_task_id}" + ) + except ClientError as e: + logger.error(f"Error tagging instance {instance.id}: {e}") + + +def terminate_instances_in_region(region): + ec2 = boto3.resource("ec2", region_name=region) + instances = ec2.instances.filter( + Filters=[{"Name": "instance-state-name", "Values": ["running"]}] + ) + terminated_instances = [] + skipped_instances = [] + + for instance in instances: + try: + # First try to tag CirrusCI instances + cirrus_ci_add_iit_billing_tag(instance) + + # Skip EKS-managed instances based on pattern and billing tag + if is_eks_managed_instance(instance, region): + tags_dict = convert_tags_to_dict(instance.tags) + cluster_name = get_eks_cluster_name(tags_dict) + skipped_instances.append( + { + "InstanceId": instance.id, + "Reason": f"EKS-managed (cluster: {cluster_name or 'unknown'})", + } + ) + continue + + # Check if should terminate + if is_instance_to_terminate(instance): + instance_info = { + "InstanceId": instance.id, + "SSHKeyName": instance.key_name, + "NameTag": instance.tags[0]["Value"] + if instance.tags and "Name" in [tag["Key"] for tag in instance.tags] + else None, + "AvailabilityZone": instance.placement["AvailabilityZone"], + } + + try: + instance.terminate() + terminated_instances.append(instance_info) + logger.info( + f"Successfully terminated instance {instance.id} in {region}" + ) + except ClientError as e: + logger.error( + f"Failed to terminate instance {instance.id} in {region}: {e}" + ) + skipped_instances.append( + { + "InstanceId": instance.id, + "Reason": f"Permission denied: {str(e)}", + } + ) + except Exception as e: + logger.error(f"Error processing instance {instance.id} in {region}: {e}") + continue + + if skipped_instances: + logger.info(f"Skipped {len(skipped_instances)} instances in {region}") + for skipped in skipped_instances[:5]: # Log first 5 only + logger.info(f" - {skipped['InstanceId']}: {skipped['Reason']}") + + return terminated_instances + + +def lambda_handler(event, context): + global eks_clusters_to_delete + eks_clusters_to_delete = {} # Reset at start of each invocation + + regions = [ + region["RegionName"] + for region in boto3.client("ec2").describe_regions()["Regions"] + ] + terminated_instances_all_regions = [] + deleted_clusters = [] + + # Process all instances and identify EKS clusters to delete + for region in regions: + try: + terminated_instances_region = terminate_instances_in_region(region) + terminated_instances_all_regions.extend(terminated_instances_region) + except Exception as e: + logger.error(f"Error processing region {region}: {e}") + continue + + # Delete EKS clusters that don't match skip pattern AND have no valid billing tag + for region, clusters in eks_clusters_to_delete.items(): + for cluster_name in clusters: + try: + if delete_eks_cluster_stack(cluster_name, region): + deleted_clusters.append(f"{cluster_name} ({region})") + except Exception as e: + logger.error(f"Error deleting cluster {cluster_name} in {region}: {e}") + continue + + # Log results + if terminated_instances_all_regions: + logger.info("Terminated instances:") + for instance_info in terminated_instances_all_regions: + logger.info( + f"- Instance ID: {instance_info['InstanceId']}, SSH Key: {instance_info['SSHKeyName']}, Name Tag: {instance_info['NameTag']}, Availability Zone: {instance_info['AvailabilityZone']}" + ) + else: + logger.info("No instances were terminated.") + + if deleted_clusters: + logger.info(f"Deleted {len(deleted_clusters)} EKS clusters:") + for cluster in deleted_clusters: + logger.info(f"- {cluster}") + else: + logger.info("No EKS clusters were deleted.") + + return { + "statusCode": 200, + "body": f"Terminated {len(terminated_instances_all_regions)} instances, deleted {len(deleted_clusters)} EKS clusters", + } diff --git a/cloud/aws-functions/test_removeUntaggedEc2_logic.py b/cloud/aws-functions/test_removeUntaggedEc2_logic.py new file mode 100755 index 0000000000..5b66d3b530 --- /dev/null +++ b/cloud/aws-functions/test_removeUntaggedEc2_logic.py @@ -0,0 +1,386 @@ +#!/usr/bin/env python3 +""" +Test script for removeUntaggedEc2 logic. +Does not connect to AWS - uses mock instance objects. +""" + +import datetime +from dataclasses import dataclass +from typing import Dict, List + + +# Mock instance class +@dataclass +class MockInstance: + id: str + tags: List[Dict[str, str]] + launch_time: datetime.datetime + key_name: str + placement: Dict[str, str] + + +# Import the logic functions (without AWS SDK calls) +def convert_tags_to_dict(tags): + return {tag["Key"]: tag["Value"] for tag in tags} if tags else {} + + +def get_eks_cluster_name(tags_dict): + """Extract EKS cluster name from instance tags""" + cluster_keys = ["aws:eks:cluster-name", "eks:eks-cluster-name"] + + for key in cluster_keys: + if key in tags_dict: + return tags_dict[key] + + for key in tags_dict.keys(): + if key.startswith("kubernetes.io/cluster/"): + return key.replace("kubernetes.io/cluster/", "") + + return None + + +def has_valid_billing_tag(tags_dict, instance_launch_time): + """ + Check if instance has a valid iit-billing-tag. + + For regular instances: any non-empty value is valid + For timestamp-based tags: check if Unix timestamp is in the future + """ + if "iit-billing-tag" not in tags_dict: + return False + + tag_value = tags_dict["iit-billing-tag"] + + if not tag_value: + return False + + try: + expiration_timestamp = int(tag_value) + current_timestamp = int( + datetime.datetime.now(datetime.timezone.utc).timestamp() + ) + + if expiration_timestamp > current_timestamp: + return True + else: + return False + except ValueError: + # Not a timestamp, treat as category string + return True + + +def is_eks_managed_instance(instance, region, eks_skip_pattern="pe-.*"): + """Check if instance is managed by EKS and if it should be skipped""" + import re + + tags_dict = convert_tags_to_dict(instance.tags) + + eks_indicators = [ + "kubernetes.io/cluster/", + "aws:eks:cluster-name", + "eks:eks-cluster-name", + "eks:kubernetes-node-pool-name", + "aws:ec2:managed-launch", + ] + + is_eks = False + for key in tags_dict.keys(): + for indicator in eks_indicators: + if indicator in key: + is_eks = True + break + if is_eks: + break + + if not is_eks: + return False, None + + cluster_name = get_eks_cluster_name(tags_dict) + has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + # If has valid billing tag, always skip (it's legitimate) + if has_billing_tag: + return True, "has valid billing tag" + + # No billing tag - check skip pattern + if cluster_name and eks_skip_pattern: + if re.match(eks_skip_pattern, cluster_name): + return True, f"matches skip pattern '{eks_skip_pattern}'" + else: + return ( + True, + "marked for cluster deletion (no billing tag, doesn't match pattern)", + ) + + return True, "EKS instance (no cluster name)" + + +def is_instance_to_terminate(instance, grace_period_seconds=600): + """Check if instance should be terminated""" + tags_dict = convert_tags_to_dict(instance.tags) + has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + current_time = datetime.datetime.now(datetime.timezone.utc) + running_time = current_time - instance.launch_time + + if not has_billing_tag and running_time.total_seconds() > grace_period_seconds: + return True + return False + + +# Test cases +def create_test_instances(): + """Create test instances with different scenarios""" + now = datetime.datetime.now(datetime.timezone.utc) + one_hour_ago = now - datetime.timedelta(hours=1) + five_minutes_ago = now - datetime.timedelta(minutes=5) + future_timestamp = int((now + datetime.timedelta(days=1)).timestamp()) + past_timestamp = int((now - datetime.timedelta(hours=1)).timestamp()) + + instances = [ + # Regular EC2 instances + MockInstance( + id="i-regular-with-tag", + tags=[ + {"Key": "Name", "Value": "jenkins-worker"}, + {"Key": "iit-billing-tag", "Value": "jenkins-pmm-slave"}, + ], + launch_time=one_hour_ago, + key_name="jenkins-key", + placement={"AvailabilityZone": "us-east-2a"}, + ), + MockInstance( + id="i-regular-no-tag-old", + tags=[{"Key": "Name", "Value": "orphaned-instance"}], + launch_time=one_hour_ago, + key_name="test-key", + placement={"AvailabilityZone": "us-east-2b"}, + ), + MockInstance( + id="i-regular-no-tag-new", + tags=[{"Key": "Name", "Value": "just-launched"}], + launch_time=five_minutes_ago, + key_name="test-key", + placement={"AvailabilityZone": "us-east-2c"}, + ), + MockInstance( + id="i-regular-timestamp-valid", + tags=[ + {"Key": "Name", "Value": "temp-instance"}, + {"Key": "iit-billing-tag", "Value": str(future_timestamp)}, + ], + launch_time=one_hour_ago, + key_name="test-key", + placement={"AvailabilityZone": "us-east-2a"}, + ), + MockInstance( + id="i-regular-timestamp-expired", + tags=[ + {"Key": "Name", "Value": "expired-instance"}, + {"Key": "iit-billing-tag", "Value": str(past_timestamp)}, + ], + launch_time=one_hour_ago, + key_name="test-key", + placement={"AvailabilityZone": "us-east-2b"}, + ), + # EKS instances - protected cluster (pe-*) + MockInstance( + id="i-eks-pe-cluster-no-tag", + tags=[ + {"Key": "Name", "Value": "pe-crossplane-node"}, + {"Key": "kubernetes.io/cluster/pe-crossplane", "Value": "owned"}, + {"Key": "eks:cluster-name", "Value": "pe-crossplane"}, + ], + launch_time=one_hour_ago, + key_name="eks-key", + placement={"AvailabilityZone": "us-east-2a"}, + ), + MockInstance( + id="i-eks-pe-cluster-with-tag", + tags=[ + {"Key": "Name", "Value": "pe-infra-node"}, + {"Key": "kubernetes.io/cluster/pe-infra", "Value": "owned"}, + {"Key": "iit-billing-tag", "Value": "platform-eng"}, + ], + launch_time=one_hour_ago, + key_name="eks-key", + placement={"AvailabilityZone": "us-east-2b"}, + ), + # EKS instances - non-protected cluster + MockInstance( + id="i-eks-pmm-no-tag", + tags=[ + {"Key": "Name", "Value": "pmm-ha-node"}, + {"Key": "kubernetes.io/cluster/pmm-ha", "Value": "owned"}, + {"Key": "aws:eks:cluster-name", "Value": "pmm-ha"}, + ], + launch_time=one_hour_ago, + key_name="eks-key", + placement={"AvailabilityZone": "us-east-2c"}, + ), + MockInstance( + id="i-eks-pmm-with-category-tag", + tags=[ + {"Key": "Name", "Value": "pmm-test-node"}, + {"Key": "kubernetes.io/cluster/pmm-test", "Value": "owned"}, + {"Key": "iit-billing-tag", "Value": "pmm-eks"}, + ], + launch_time=one_hour_ago, + key_name="eks-key", + placement={"AvailabilityZone": "us-east-2a"}, + ), + MockInstance( + id="i-eks-pmm-with-timestamp-valid", + tags=[ + {"Key": "Name", "Value": "pmm-temp-node"}, + {"Key": "kubernetes.io/cluster/pmm-temp", "Value": "owned"}, + {"Key": "iit-billing-tag", "Value": str(future_timestamp)}, + ], + launch_time=one_hour_ago, + key_name="eks-key", + placement={"AvailabilityZone": "us-east-2b"}, + ), + MockInstance( + id="i-eks-pmm-with-timestamp-expired", + tags=[ + {"Key": "Name", "Value": "pmm-expired-node"}, + {"Key": "kubernetes.io/cluster/pmm-expired", "Value": "owned"}, + {"Key": "iit-billing-tag", "Value": str(past_timestamp)}, + ], + launch_time=one_hour_ago, + key_name="eks-key", + placement={"AvailabilityZone": "us-east-2c"}, + ), + # CirrusCI instance + MockInstance( + id="i-cirrus-ci", + tags=[ + {"Key": "Name", "Value": "cirrus-runner"}, + {"Key": "CIRRUS_CI", "Value": "true"}, + {"Key": "iit-billing-tag", "Value": "CirrusCI"}, + ], + launch_time=one_hour_ago, + key_name="cirrus-key", + placement={"AvailabilityZone": "us-east-2a"}, + ), + ] + + return instances + + +def run_tests(): + """Run all test cases and display results""" + print("=" * 80) + print("Testing removeUntaggedEc2 Logic") + print("=" * 80) + print() + + instances = create_test_instances() + region = "us-east-2" + + results = { + "terminated": [], + "skipped_eks": [], + "skipped_has_tag": [], + "skipped_grace_period": [], + "eks_clusters_to_delete": set(), + } + + for instance in instances: + print(f"\n{'─' * 80}") + print(f"Instance: {instance.id}") + tags_dict = convert_tags_to_dict(instance.tags) + print(f"Tags: {tags_dict}") + print(f"Launch: {instance.launch_time.strftime('%Y-%m-%d %H:%M:%S UTC')}") + + # Check if EKS + is_eks, eks_reason = is_eks_managed_instance(instance, region) + + if is_eks: + print(f"✓ EKS Instance: {eks_reason}") + results["skipped_eks"].append(instance.id) + + # Check if should mark cluster for deletion + cluster_name = get_eks_cluster_name(tags_dict) + if cluster_name and "marked for cluster deletion" in eks_reason: + results["eks_clusters_to_delete"].add(cluster_name) + print(f" → Cluster '{cluster_name}' marked for deletion") + continue + + # Check billing tag + has_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + if has_tag: + print(f"✓ Has valid billing tag: {tags_dict.get('iit-billing-tag')}") + results["skipped_has_tag"].append(instance.id) + continue + + # Check if should terminate + should_terminate = is_instance_to_terminate(instance) + + if should_terminate: + print("✗ TERMINATE (no tag, running > 10 minutes)") + results["terminated"].append(instance.id) + else: + print("○ SKIP (grace period, running < 10 minutes)") + results["skipped_grace_period"].append(instance.id) + + # Summary + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + print(f"\nInstances to TERMINATE ({len(results['terminated'])}):") + for instance_id in results["terminated"]: + print(f" - {instance_id}") + + print(f"\nInstances SKIPPED - EKS ({len(results['skipped_eks'])}):") + for instance_id in results["skipped_eks"]: + print(f" - {instance_id}") + + print(f"\nInstances SKIPPED - Has billing tag ({len(results['skipped_has_tag'])}):") + for instance_id in results["skipped_has_tag"]: + print(f" - {instance_id}") + + print( + f"\nInstances SKIPPED - Grace period ({len(results['skipped_grace_period'])}):" + ) + for instance_id in results["skipped_grace_period"]: + print(f" - {instance_id}") + + print( + f"\nEKS Clusters marked for DELETION ({len(results['eks_clusters_to_delete'])}):" + ) + for cluster in results["eks_clusters_to_delete"]: + print(f" - {cluster}") + + # Validation + print("\n" + "=" * 80) + print("VALIDATION") + print("=" * 80) + + expected_terminated = ["i-regular-no-tag-old", "i-regular-timestamp-expired"] + expected_eks_delete = ["pmm-ha", "pmm-expired"] + + terminated_match = set(results["terminated"]) == set(expected_terminated) + eks_delete_match = results["eks_clusters_to_delete"] == set(expected_eks_delete) + + print(f"\n✓ Terminated instances correct: {terminated_match}") + if not terminated_match: + print(f" Expected: {expected_terminated}") + print(f" Got: {results['terminated']}") + + print(f"✓ EKS clusters for deletion correct: {eks_delete_match}") + if not eks_delete_match: + print(f" Expected: {expected_eks_delete}") + print(f" Got: {list(results['eks_clusters_to_delete'])}") + + if terminated_match and eks_delete_match: + print("\n🎉 All validations PASSED!") + return 0 + else: + print("\n❌ Some validations FAILED!") + return 1 + + +if __name__ == "__main__": + exit(run_tests()) From 63cdc1421391c02cc04026983faf548856b70364 Mon Sep 17 00:00:00 2001 From: Anderson Nogueira Date: Mon, 6 Oct 2025 15:21:17 +0200 Subject: [PATCH 2/2] Add CloudFormation stack and justfile for removeUntaggedEc2 - Create IaC/RemoveUntaggedEc2Stack.yml with Lambda, IAM role, EventBridge rule - Add justfile with deployment, health check, and development commands - Lambda scans all AWS regions for untagged instances - EventBridge triggers Lambda every 4 minutes - DRY_RUN parameter controls whether deletions actually occur (defaults to true) - Remove test file (test_removeUntaggedEc2_logic.py) - Kebab-case command names, no emojis --- IaC/RemoveUntaggedEc2Stack.yml | 565 ++++++++++++++++++ .../test_removeUntaggedEc2_logic.py | 386 ------------ justfile | 250 ++++++++ 3 files changed, 815 insertions(+), 386 deletions(-) create mode 100644 IaC/RemoveUntaggedEc2Stack.yml delete mode 100755 cloud/aws-functions/test_removeUntaggedEc2_logic.py create mode 100644 justfile diff --git a/IaC/RemoveUntaggedEc2Stack.yml b/IaC/RemoveUntaggedEc2Stack.yml new file mode 100644 index 0000000000..f2fd265358 --- /dev/null +++ b/IaC/RemoveUntaggedEc2Stack.yml @@ -0,0 +1,565 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: 'Lambda function to remove untagged EC2 instances and EKS clusters' + +Parameters: + EksSkipPattern: + Type: String + Default: 'pe-.*' + Description: 'Regex pattern for EKS clusters to skip (protect from deletion)' + DryRun: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: 'Dry-run mode - true means no deletions will occur, false means perform actual deletions' + +Resources: + LambdaExecutionRole: + Type: AWS::IAM::Role + Properties: + RoleName: removeUntaggedEc2-role + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: lambda.amazonaws.com + Action: sts:AssumeRole + ManagedPolicyArns: + - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole + Policies: + - PolicyName: EC2AndCloudFormationAccess + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: + - ec2:DescribeInstances + - ec2:DescribeRegions + - ec2:TerminateInstances + - ec2:CreateTags + - ec2:DescribeSecurityGroups + - ec2:RevokeSecurityGroupIngress + - ec2:DisassociateRouteTable + - ec2:DeleteRoute + - ec2:DescribeSpotPriceHistory + Resource: '*' + - Effect: Allow + Action: + - cloudformation:DescribeStacks + - cloudformation:DescribeStackEvents + - cloudformation:DeleteStack + Resource: '*' + + RemoveUntaggedEc2Function: + Type: AWS::Lambda::Function + Properties: + FunctionName: removeUntaggedEc2 + Description: 'Remove untagged EC2 instances and EKS clusters after grace period' + Handler: index.lambda_handler + MemorySize: 128 + Role: !GetAtt LambdaExecutionRole.Arn + Runtime: python3.12 + Timeout: 300 + Environment: + Variables: + EKS_SKIP_PATTERN: !Ref EksSkipPattern + DRY_RUN: !Ref DryRun + Tags: + - Key: iit-billing-tag + Value: infrastructure + # NOTE: Source code is maintained in cloud/aws-functions/removeUntaggedEc2.py + # This inline code is kept in sync for CloudFormation deployment + # To update: edit the .py file, then run: just sync-removeUntaggedEc2-to-cfn + Code: + ZipFile: | + import logging + import datetime + import boto3 + import os + import re + from botocore.exceptions import ClientError + + # Set logging level to INFO + logger = logging.getLogger() + logger.setLevel("INFO") + + # Get environment variable for EKS cluster skip pattern + EKS_SKIP_PATTERN = os.environ.get("EKS_SKIP_PATTERN", "pe-.*") + logger.info(f"EKS_SKIP_PATTERN: {EKS_SKIP_PATTERN}") + + # Track EKS clusters marked for deletion per region + eks_clusters_to_delete = {} + + + def convert_tags_to_dict(tags): + return {tag["Key"]: tag["Value"] for tag in tags} if tags else {} + + + def get_eks_cluster_name(tags_dict): + """Extract EKS cluster name from instance tags""" + # Check multiple possible tag keys for cluster name + cluster_keys = ["aws:eks:cluster-name", "eks:eks-cluster-name"] + + for key in cluster_keys: + if key in tags_dict: + return tags_dict[key] + + # Check for kubernetes.io/cluster/* tags + for key in tags_dict.keys(): + if key.startswith("kubernetes.io/cluster/"): + return key.replace("kubernetes.io/cluster/", "") + + return None + + + def has_valid_billing_tag(tags_dict, instance_launch_time): + """ + Check if instance has a valid iit-billing-tag. + + For regular instances: any non-empty value is valid + For timestamp-based tags: check if Unix timestamp is in the future + """ + if "iit-billing-tag" not in tags_dict: + return False + + tag_value = tags_dict["iit-billing-tag"] + + # Empty tag is invalid + if not tag_value: + return False + + # Try to parse as Unix timestamp (for EKS auto-expiration) + try: + expiration_timestamp = int(tag_value) + current_timestamp = int( + datetime.datetime.now(datetime.timezone.utc).timestamp() + ) + + # If it's a valid future timestamp, check if it's expired + if expiration_timestamp > current_timestamp: + logger.info( + f"Instance has valid billing tag with expiration {expiration_timestamp} " + f"(expires in {expiration_timestamp - current_timestamp} seconds)" + ) + return True + else: + logger.info( + f"Instance billing tag expired: {expiration_timestamp} < {current_timestamp} " + f"(expired {current_timestamp - expiration_timestamp} seconds ago)" + ) + return False + except ValueError: + # Not a timestamp, treat as category string (e.g., "pmm-staging", "jenkins-pmm-slave") + # Any non-empty category string is valid + logger.info(f"Instance has valid billing tag category: {tag_value}") + return True + + + def is_eks_managed_instance(instance, region): + """Check if instance is managed by EKS and if it should be skipped""" + tags_dict = convert_tags_to_dict(instance.tags) + + # Check for EKS-related tags + eks_indicators = [ + "kubernetes.io/cluster/", + "aws:eks:cluster-name", + "eks:eks-cluster-name", + "eks:kubernetes-node-pool-name", + "aws:ec2:managed-launch", + ] + + is_eks = False + for key in tags_dict.keys(): + for indicator in eks_indicators: + if indicator in key: + is_eks = True + break + if is_eks: + break + + if not is_eks: + return False + + # It's an EKS instance, now check billing tag and skip pattern + cluster_name = get_eks_cluster_name(tags_dict) + has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + # If has valid billing tag, always skip (it's legitimate) + if has_billing_tag: + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"has valid iit-billing-tag, skipping" + ) + return True + + # No billing tag - check skip pattern + if cluster_name and EKS_SKIP_PATTERN: + try: + if re.match(EKS_SKIP_PATTERN, cluster_name): + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"matches skip pattern '{EKS_SKIP_PATTERN}', skipping" + ) + return True + else: + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"does NOT match skip pattern '{EKS_SKIP_PATTERN}' and has no valid billing tag, " + f"marking cluster for deletion" + ) + # Track this cluster for deletion + if region not in eks_clusters_to_delete: + eks_clusters_to_delete[region] = set() + eks_clusters_to_delete[region].add(cluster_name) + return True # Skip individual instance termination, we'll delete the whole cluster + except re.error as e: + logger.error( + f"Invalid regex pattern '{EKS_SKIP_PATTERN}': {e}, skipping all EKS instances" + ) + return True + + # If no cluster name found, skip the instance + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name or 'unknown'}), skipping" + ) + return True + + + def is_instance_to_terminate(instance): + # Check if the instance has valid 'iit-billing-tag' + tags_dict = convert_tags_to_dict(instance.tags) + has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + # Calculate the running time of the instance + current_time = datetime.datetime.now(datetime.timezone.utc) + launch_time = instance.launch_time + running_time = current_time - launch_time + + # Terminate instances without valid 'iit-billing-tag' running for more than 10 minutes + if not has_billing_tag and running_time.total_seconds() > 600: + return True + return False + + + def cleanup_failed_stack_resources(stack_name, region): + """Manually clean up resources that prevent stack deletion""" + try: + cfn = boto3.client("cloudformation", region_name=region) + ec2 = boto3.client("ec2", region_name=region) + + # Get failed resources from stack events + events = cfn.describe_stack_events(StackName=stack_name) + failed_resources = {} + + for event in events["StackEvents"]: + if event.get("ResourceStatus") == "DELETE_FAILED": + logical_id = event["LogicalResourceId"] + if logical_id not in failed_resources: # Only keep first occurrence + failed_resources[logical_id] = { + "Type": event["ResourceType"], + "PhysicalId": event.get("PhysicalResourceId"), + } + + if not failed_resources: + return True + + logger.info( + f"Attempting to clean up {len(failed_resources)} failed resources for stack {stack_name}" + ) + + # Process each failed resource type + for logical_id, resource in failed_resources.items(): + resource_type = resource["Type"] + physical_id = resource["PhysicalId"] + + try: + # Clean up security group ingress rules + if resource_type == "AWS::EC2::SecurityGroupIngress" and physical_id: + sg_id = physical_id.split("|")[0] if "|" in physical_id else None + if sg_id and sg_id.startswith("sg-"): + response = ec2.describe_security_groups(GroupIds=[sg_id]) + if response["SecurityGroups"]: + sg = response["SecurityGroups"][0] + if sg["IpPermissions"]: + ec2.revoke_security_group_ingress( + GroupId=sg_id, IpPermissions=sg["IpPermissions"] + ) + logger.info(f"Cleaned up ingress rules for {sg_id}") + + # Clean up route table associations + elif ( + resource_type == "AWS::EC2::SubnetRouteTableAssociation" + and physical_id + ): + # PhysicalId is the association ID + if physical_id.startswith("rtbassoc-"): + ec2.disassociate_route_table(AssociationId=physical_id) + logger.info(f"Disassociated route table {physical_id}") + + # Clean up routes + elif resource_type == "AWS::EC2::Route" and physical_id: + # PhysicalId format: rtb-xxx_destination + parts = physical_id.split("_") + if len(parts) == 2 and parts[0].startswith("rtb-"): + rtb_id = parts[0] + dest_cidr = parts[1] + ec2.delete_route( + RouteTableId=rtb_id, DestinationCidrBlock=dest_cidr + ) + logger.info(f"Deleted route {dest_cidr} from {rtb_id}") + + except ClientError as e: + error_code = e.response.get("Error", {}).get("Code", "") + # Ignore if resource already deleted + if error_code not in [ + "InvalidGroup.NotFound", + "InvalidAssociationID.NotFound", + "InvalidRoute.NotFound", + ]: + logger.warning( + f"Could not clean up {resource_type} {physical_id}: {e}" + ) + except Exception as e: + logger.warning( + f"Unexpected error cleaning up {resource_type} {physical_id}: {e}" + ) + + return True + + except Exception as e: + logger.error(f"Error cleaning up failed resources for stack {stack_name}: {e}") + return False + + + def delete_eks_cluster_stack(cluster_name, region): + """Delete EKS cluster by removing its CloudFormation stack""" + try: + cfn = boto3.client("cloudformation", region_name=region) + + # Find CloudFormation stack for this cluster + stack_name = f"eksctl-{cluster_name}-cluster" + + # Check if stack exists and its current status + try: + response = cfn.describe_stacks(StackName=stack_name) + stack_status = response["Stacks"][0]["StackStatus"] + except ClientError as e: + if "does not exist" in str(e): + logger.warning( + f"CloudFormation stack {stack_name} not found in {region}, cannot delete cluster {cluster_name}" + ) + return False + raise + + # Handle DELETE_FAILED status - retry after cleanup + if stack_status == "DELETE_FAILED": + logger.info( + f"Stack {stack_name} previously failed deletion, attempting cleanup and retry" + ) + cleanup_failed_stack_resources(stack_name, region) + # Retry deletion + cfn.delete_stack(StackName=stack_name) + logger.info(f"Retrying deletion of stack {stack_name} after cleanup") + return True + + # Handle already deleting + if "DELETE" in stack_status and stack_status != "DELETE_COMPLETE": + logger.info(f"Stack {stack_name} already deleting (status: {stack_status})") + return True + + # Initiate deletion for new stacks + logger.info( + f"Deleting CloudFormation stack {stack_name} for EKS cluster {cluster_name} in {region}" + ) + cfn.delete_stack(StackName=stack_name) + logger.info( + f"Successfully initiated deletion of stack {stack_name} for cluster {cluster_name}" + ) + return True + + except ClientError as e: + logger.error( + f"Failed to delete CloudFormation stack for cluster {cluster_name} in {region}: {e}" + ) + return False + except Exception as e: + logger.error( + f"Unexpected error deleting cluster {cluster_name} in {region}: {e}" + ) + return False + + + def cirrus_ci_add_iit_billing_tag(instance): + # Convert tags to a dictionary for easier access + tags_dict = convert_tags_to_dict(instance.tags) + + # Check if the instance has 'CIRRUS_CI' tag set to 'true' and 'iit-billing-tag' is not set + has_cirrus_ci_tag = tags_dict.get("CIRRUS_CI", "").lower() == "true" + has_iit_billing_tag = "iit-billing-tag" in tags_dict + + # Extract additional tag values + instance_name = tags_dict.get("Name") + cirrus_repo_full_name = tags_dict.get("CIRRUS_REPO_FULL_NAME") + cirrus_task_id = tags_dict.get("CIRRUS_TASK_ID") + + # If 'CIRRUS_CI' tag is set to 'true' and 'iit-billing-tag' is not set, add 'iit-billing-tag' set to 'CirrusCI' + if has_cirrus_ci_tag and not has_iit_billing_tag: + try: + instance.create_tags(Tags=[{"Key": "iit-billing-tag", "Value": "CirrusCI"}]) + logger.info( + f"Instance {instance.id} ({instance_name}) tagged with 'iit-billing-tag: CirrusCI'. " + f"CIRRUS_REPO_FULL_NAME: {cirrus_repo_full_name}, CIRRUS_TASK_ID: {cirrus_task_id}" + ) + except ClientError as e: + logger.error(f"Error tagging instance {instance.id}: {e}") + + + def terminate_instances_in_region(region): + ec2 = boto3.resource("ec2", region_name=region) + instances = ec2.instances.filter( + Filters=[{"Name": "instance-state-name", "Values": ["running"]}] + ) + terminated_instances = [] + skipped_instances = [] + + for instance in instances: + try: + # First try to tag CirrusCI instances + cirrus_ci_add_iit_billing_tag(instance) + + # Skip EKS-managed instances based on pattern and billing tag + if is_eks_managed_instance(instance, region): + tags_dict = convert_tags_to_dict(instance.tags) + cluster_name = get_eks_cluster_name(tags_dict) + skipped_instances.append( + { + "InstanceId": instance.id, + "Reason": f"EKS-managed (cluster: {cluster_name or 'unknown'})", + } + ) + continue + + # Check if should terminate + if is_instance_to_terminate(instance): + instance_info = { + "InstanceId": instance.id, + "SSHKeyName": instance.key_name, + "NameTag": instance.tags[0]["Value"] + if instance.tags and "Name" in [tag["Key"] for tag in instance.tags] + else None, + "AvailabilityZone": instance.placement["AvailabilityZone"], + } + + try: + instance.terminate() + terminated_instances.append(instance_info) + logger.info( + f"Successfully terminated instance {instance.id} in {region}" + ) + except ClientError as e: + logger.error( + f"Failed to terminate instance {instance.id} in {region}: {e}" + ) + skipped_instances.append( + { + "InstanceId": instance.id, + "Reason": f"Permission denied: {str(e)}", + } + ) + except Exception as e: + logger.error(f"Error processing instance {instance.id} in {region}: {e}") + continue + + if skipped_instances: + logger.info(f"Skipped {len(skipped_instances)} instances in {region}") + for skipped in skipped_instances[:5]: # Log first 5 only + logger.info(f" - {skipped['InstanceId']}: {skipped['Reason']}") + + return terminated_instances + + + def lambda_handler(event, context): + global eks_clusters_to_delete + eks_clusters_to_delete = {} # Reset at start of each invocation + + regions = [ + region["RegionName"] + for region in boto3.client("ec2").describe_regions()["Regions"] + ] + terminated_instances_all_regions = [] + deleted_clusters = [] + + # Process all instances and identify EKS clusters to delete + for region in regions: + try: + terminated_instances_region = terminate_instances_in_region(region) + terminated_instances_all_regions.extend(terminated_instances_region) + except Exception as e: + logger.error(f"Error processing region {region}: {e}") + continue + + # Delete EKS clusters that don't match skip pattern AND have no valid billing tag + for region, clusters in eks_clusters_to_delete.items(): + for cluster_name in clusters: + try: + if delete_eks_cluster_stack(cluster_name, region): + deleted_clusters.append(f"{cluster_name} ({region})") + except Exception as e: + logger.error(f"Error deleting cluster {cluster_name} in {region}: {e}") + continue + + # Log results + if terminated_instances_all_regions: + logger.info("Terminated instances:") + for instance_info in terminated_instances_all_regions: + logger.info( + f"- Instance ID: {instance_info['InstanceId']}, SSH Key: {instance_info['SSHKeyName']}, Name Tag: {instance_info['NameTag']}, Availability Zone: {instance_info['AvailabilityZone']}" + ) + else: + logger.info("No instances were terminated.") + + if deleted_clusters: + logger.info(f"Deleted {len(deleted_clusters)} EKS clusters:") + for cluster in deleted_clusters: + logger.info(f"- {cluster}") + else: + logger.info("No EKS clusters were deleted.") + + return { + "statusCode": 200, + "body": f"Terminated {len(terminated_instances_all_regions)} instances, deleted {len(deleted_clusters)} EKS clusters", + } + + # EventBridge rule to trigger Lambda every 4 minutes + ScheduleRule: + Type: AWS::Events::Rule + Properties: + Description: 'Trigger removeUntaggedEc2 Lambda every 4 minutes' + ScheduleExpression: 'rate(4 minutes)' + State: ENABLED + Targets: + - Arn: !GetAtt RemoveUntaggedEc2Function.Arn + Id: RemoveUntaggedEc2Target + + PermissionForEventsToInvokeLambda: + Type: AWS::Lambda::Permission + Properties: + FunctionName: !Ref RemoveUntaggedEc2Function + Action: lambda:InvokeFunction + Principal: events.amazonaws.com + SourceArn: !GetAtt ScheduleRule.Arn + +Outputs: + LambdaFunctionArn: + Description: ARN of the removeUntaggedEc2 Lambda function + Value: !GetAtt RemoveUntaggedEc2Function.Arn + Export: + Name: RemoveUntaggedEc2-FunctionArn + + LambdaRoleArn: + Description: ARN of the Lambda execution role + Value: !GetAtt LambdaExecutionRole.Arn + Export: + Name: RemoveUntaggedEc2-RoleArn diff --git a/cloud/aws-functions/test_removeUntaggedEc2_logic.py b/cloud/aws-functions/test_removeUntaggedEc2_logic.py deleted file mode 100755 index 5b66d3b530..0000000000 --- a/cloud/aws-functions/test_removeUntaggedEc2_logic.py +++ /dev/null @@ -1,386 +0,0 @@ -#!/usr/bin/env python3 -""" -Test script for removeUntaggedEc2 logic. -Does not connect to AWS - uses mock instance objects. -""" - -import datetime -from dataclasses import dataclass -from typing import Dict, List - - -# Mock instance class -@dataclass -class MockInstance: - id: str - tags: List[Dict[str, str]] - launch_time: datetime.datetime - key_name: str - placement: Dict[str, str] - - -# Import the logic functions (without AWS SDK calls) -def convert_tags_to_dict(tags): - return {tag["Key"]: tag["Value"] for tag in tags} if tags else {} - - -def get_eks_cluster_name(tags_dict): - """Extract EKS cluster name from instance tags""" - cluster_keys = ["aws:eks:cluster-name", "eks:eks-cluster-name"] - - for key in cluster_keys: - if key in tags_dict: - return tags_dict[key] - - for key in tags_dict.keys(): - if key.startswith("kubernetes.io/cluster/"): - return key.replace("kubernetes.io/cluster/", "") - - return None - - -def has_valid_billing_tag(tags_dict, instance_launch_time): - """ - Check if instance has a valid iit-billing-tag. - - For regular instances: any non-empty value is valid - For timestamp-based tags: check if Unix timestamp is in the future - """ - if "iit-billing-tag" not in tags_dict: - return False - - tag_value = tags_dict["iit-billing-tag"] - - if not tag_value: - return False - - try: - expiration_timestamp = int(tag_value) - current_timestamp = int( - datetime.datetime.now(datetime.timezone.utc).timestamp() - ) - - if expiration_timestamp > current_timestamp: - return True - else: - return False - except ValueError: - # Not a timestamp, treat as category string - return True - - -def is_eks_managed_instance(instance, region, eks_skip_pattern="pe-.*"): - """Check if instance is managed by EKS and if it should be skipped""" - import re - - tags_dict = convert_tags_to_dict(instance.tags) - - eks_indicators = [ - "kubernetes.io/cluster/", - "aws:eks:cluster-name", - "eks:eks-cluster-name", - "eks:kubernetes-node-pool-name", - "aws:ec2:managed-launch", - ] - - is_eks = False - for key in tags_dict.keys(): - for indicator in eks_indicators: - if indicator in key: - is_eks = True - break - if is_eks: - break - - if not is_eks: - return False, None - - cluster_name = get_eks_cluster_name(tags_dict) - has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) - - # If has valid billing tag, always skip (it's legitimate) - if has_billing_tag: - return True, "has valid billing tag" - - # No billing tag - check skip pattern - if cluster_name and eks_skip_pattern: - if re.match(eks_skip_pattern, cluster_name): - return True, f"matches skip pattern '{eks_skip_pattern}'" - else: - return ( - True, - "marked for cluster deletion (no billing tag, doesn't match pattern)", - ) - - return True, "EKS instance (no cluster name)" - - -def is_instance_to_terminate(instance, grace_period_seconds=600): - """Check if instance should be terminated""" - tags_dict = convert_tags_to_dict(instance.tags) - has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) - - current_time = datetime.datetime.now(datetime.timezone.utc) - running_time = current_time - instance.launch_time - - if not has_billing_tag and running_time.total_seconds() > grace_period_seconds: - return True - return False - - -# Test cases -def create_test_instances(): - """Create test instances with different scenarios""" - now = datetime.datetime.now(datetime.timezone.utc) - one_hour_ago = now - datetime.timedelta(hours=1) - five_minutes_ago = now - datetime.timedelta(minutes=5) - future_timestamp = int((now + datetime.timedelta(days=1)).timestamp()) - past_timestamp = int((now - datetime.timedelta(hours=1)).timestamp()) - - instances = [ - # Regular EC2 instances - MockInstance( - id="i-regular-with-tag", - tags=[ - {"Key": "Name", "Value": "jenkins-worker"}, - {"Key": "iit-billing-tag", "Value": "jenkins-pmm-slave"}, - ], - launch_time=one_hour_ago, - key_name="jenkins-key", - placement={"AvailabilityZone": "us-east-2a"}, - ), - MockInstance( - id="i-regular-no-tag-old", - tags=[{"Key": "Name", "Value": "orphaned-instance"}], - launch_time=one_hour_ago, - key_name="test-key", - placement={"AvailabilityZone": "us-east-2b"}, - ), - MockInstance( - id="i-regular-no-tag-new", - tags=[{"Key": "Name", "Value": "just-launched"}], - launch_time=five_minutes_ago, - key_name="test-key", - placement={"AvailabilityZone": "us-east-2c"}, - ), - MockInstance( - id="i-regular-timestamp-valid", - tags=[ - {"Key": "Name", "Value": "temp-instance"}, - {"Key": "iit-billing-tag", "Value": str(future_timestamp)}, - ], - launch_time=one_hour_ago, - key_name="test-key", - placement={"AvailabilityZone": "us-east-2a"}, - ), - MockInstance( - id="i-regular-timestamp-expired", - tags=[ - {"Key": "Name", "Value": "expired-instance"}, - {"Key": "iit-billing-tag", "Value": str(past_timestamp)}, - ], - launch_time=one_hour_ago, - key_name="test-key", - placement={"AvailabilityZone": "us-east-2b"}, - ), - # EKS instances - protected cluster (pe-*) - MockInstance( - id="i-eks-pe-cluster-no-tag", - tags=[ - {"Key": "Name", "Value": "pe-crossplane-node"}, - {"Key": "kubernetes.io/cluster/pe-crossplane", "Value": "owned"}, - {"Key": "eks:cluster-name", "Value": "pe-crossplane"}, - ], - launch_time=one_hour_ago, - key_name="eks-key", - placement={"AvailabilityZone": "us-east-2a"}, - ), - MockInstance( - id="i-eks-pe-cluster-with-tag", - tags=[ - {"Key": "Name", "Value": "pe-infra-node"}, - {"Key": "kubernetes.io/cluster/pe-infra", "Value": "owned"}, - {"Key": "iit-billing-tag", "Value": "platform-eng"}, - ], - launch_time=one_hour_ago, - key_name="eks-key", - placement={"AvailabilityZone": "us-east-2b"}, - ), - # EKS instances - non-protected cluster - MockInstance( - id="i-eks-pmm-no-tag", - tags=[ - {"Key": "Name", "Value": "pmm-ha-node"}, - {"Key": "kubernetes.io/cluster/pmm-ha", "Value": "owned"}, - {"Key": "aws:eks:cluster-name", "Value": "pmm-ha"}, - ], - launch_time=one_hour_ago, - key_name="eks-key", - placement={"AvailabilityZone": "us-east-2c"}, - ), - MockInstance( - id="i-eks-pmm-with-category-tag", - tags=[ - {"Key": "Name", "Value": "pmm-test-node"}, - {"Key": "kubernetes.io/cluster/pmm-test", "Value": "owned"}, - {"Key": "iit-billing-tag", "Value": "pmm-eks"}, - ], - launch_time=one_hour_ago, - key_name="eks-key", - placement={"AvailabilityZone": "us-east-2a"}, - ), - MockInstance( - id="i-eks-pmm-with-timestamp-valid", - tags=[ - {"Key": "Name", "Value": "pmm-temp-node"}, - {"Key": "kubernetes.io/cluster/pmm-temp", "Value": "owned"}, - {"Key": "iit-billing-tag", "Value": str(future_timestamp)}, - ], - launch_time=one_hour_ago, - key_name="eks-key", - placement={"AvailabilityZone": "us-east-2b"}, - ), - MockInstance( - id="i-eks-pmm-with-timestamp-expired", - tags=[ - {"Key": "Name", "Value": "pmm-expired-node"}, - {"Key": "kubernetes.io/cluster/pmm-expired", "Value": "owned"}, - {"Key": "iit-billing-tag", "Value": str(past_timestamp)}, - ], - launch_time=one_hour_ago, - key_name="eks-key", - placement={"AvailabilityZone": "us-east-2c"}, - ), - # CirrusCI instance - MockInstance( - id="i-cirrus-ci", - tags=[ - {"Key": "Name", "Value": "cirrus-runner"}, - {"Key": "CIRRUS_CI", "Value": "true"}, - {"Key": "iit-billing-tag", "Value": "CirrusCI"}, - ], - launch_time=one_hour_ago, - key_name="cirrus-key", - placement={"AvailabilityZone": "us-east-2a"}, - ), - ] - - return instances - - -def run_tests(): - """Run all test cases and display results""" - print("=" * 80) - print("Testing removeUntaggedEc2 Logic") - print("=" * 80) - print() - - instances = create_test_instances() - region = "us-east-2" - - results = { - "terminated": [], - "skipped_eks": [], - "skipped_has_tag": [], - "skipped_grace_period": [], - "eks_clusters_to_delete": set(), - } - - for instance in instances: - print(f"\n{'─' * 80}") - print(f"Instance: {instance.id}") - tags_dict = convert_tags_to_dict(instance.tags) - print(f"Tags: {tags_dict}") - print(f"Launch: {instance.launch_time.strftime('%Y-%m-%d %H:%M:%S UTC')}") - - # Check if EKS - is_eks, eks_reason = is_eks_managed_instance(instance, region) - - if is_eks: - print(f"✓ EKS Instance: {eks_reason}") - results["skipped_eks"].append(instance.id) - - # Check if should mark cluster for deletion - cluster_name = get_eks_cluster_name(tags_dict) - if cluster_name and "marked for cluster deletion" in eks_reason: - results["eks_clusters_to_delete"].add(cluster_name) - print(f" → Cluster '{cluster_name}' marked for deletion") - continue - - # Check billing tag - has_tag = has_valid_billing_tag(tags_dict, instance.launch_time) - - if has_tag: - print(f"✓ Has valid billing tag: {tags_dict.get('iit-billing-tag')}") - results["skipped_has_tag"].append(instance.id) - continue - - # Check if should terminate - should_terminate = is_instance_to_terminate(instance) - - if should_terminate: - print("✗ TERMINATE (no tag, running > 10 minutes)") - results["terminated"].append(instance.id) - else: - print("○ SKIP (grace period, running < 10 minutes)") - results["skipped_grace_period"].append(instance.id) - - # Summary - print("\n" + "=" * 80) - print("SUMMARY") - print("=" * 80) - print(f"\nInstances to TERMINATE ({len(results['terminated'])}):") - for instance_id in results["terminated"]: - print(f" - {instance_id}") - - print(f"\nInstances SKIPPED - EKS ({len(results['skipped_eks'])}):") - for instance_id in results["skipped_eks"]: - print(f" - {instance_id}") - - print(f"\nInstances SKIPPED - Has billing tag ({len(results['skipped_has_tag'])}):") - for instance_id in results["skipped_has_tag"]: - print(f" - {instance_id}") - - print( - f"\nInstances SKIPPED - Grace period ({len(results['skipped_grace_period'])}):" - ) - for instance_id in results["skipped_grace_period"]: - print(f" - {instance_id}") - - print( - f"\nEKS Clusters marked for DELETION ({len(results['eks_clusters_to_delete'])}):" - ) - for cluster in results["eks_clusters_to_delete"]: - print(f" - {cluster}") - - # Validation - print("\n" + "=" * 80) - print("VALIDATION") - print("=" * 80) - - expected_terminated = ["i-regular-no-tag-old", "i-regular-timestamp-expired"] - expected_eks_delete = ["pmm-ha", "pmm-expired"] - - terminated_match = set(results["terminated"]) == set(expected_terminated) - eks_delete_match = results["eks_clusters_to_delete"] == set(expected_eks_delete) - - print(f"\n✓ Terminated instances correct: {terminated_match}") - if not terminated_match: - print(f" Expected: {expected_terminated}") - print(f" Got: {results['terminated']}") - - print(f"✓ EKS clusters for deletion correct: {eks_delete_match}") - if not eks_delete_match: - print(f" Expected: {expected_eks_delete}") - print(f" Got: {list(results['eks_clusters_to_delete'])}") - - if terminated_match and eks_delete_match: - print("\n🎉 All validations PASSED!") - return 0 - else: - print("\n❌ Some validations FAILED!") - return 1 - - -if __name__ == "__main__": - exit(run_tests()) diff --git a/justfile b/justfile new file mode 100644 index 0000000000..f34b5d48ec --- /dev/null +++ b/justfile @@ -0,0 +1,250 @@ +# Justfile for jenkins-pipelines infrastructure management + +# Default region and AWS profile +aws_region := "us-east-2" +aws_profile := "percona-dev-admin" +# removeUntaggedEc2 Lambda is deployed in eu-west-1 (but scans all regions) +cleanup_lambda_region := "eu-west-1" +# Dry-run mode for removeUntaggedEc2 (true = no deletions, false = perform deletions) +dry_run := env_var_or_default("DRY_RUN", "true") + +# List all available recipes +default: + @just --list + +# ============================================================================ +# AWS Lambda Functions (cloud/aws-functions/) +# ============================================================================ + +# Deploy email_running_instances Lambda function +deploy-lambda-email-running-instances: + #!/usr/bin/env bash + set -euo pipefail + echo "Creating deployment package..." + cd cloud/aws-functions + rm -f email_running_instances.zip + zip -q email_running_instances.zip email_running_instances.py + + echo "Deploying to AWS Lambda..." + aws lambda update-function-code \ + --function-name email_running_instances \ + --zip-file fileb://email_running_instances.zip \ + --region {{aws_region}} \ + --profile {{aws_profile}} + + echo "Lambda function deployed successfully" + rm email_running_instances.zip + +# Lint all Lambda Python code +lint-lambdas: + #!/usr/bin/env bash + set -euo pipefail + cd cloud/aws-functions + + echo "Running ruff linter on all Python files..." + uv run --with ruff ruff check *.py + + echo "Running ruff formatter..." + uv run --with ruff ruff format *.py + + echo "Linting complete" + +# Show all Lambda functions info +info-lambdas: + #!/usr/bin/env bash + echo "Lambda Functions:" + aws lambda list-functions \ + --region {{aws_region}} \ + --profile {{aws_profile}} \ + --query 'Functions[?starts_with(FunctionName, `email`) || starts_with(FunctionName, `orphaned`) || starts_with(FunctionName, `remove`)].{Name:FunctionName,Runtime:Runtime,Updated:LastModified}' \ + --output table + +# ============================================================================ +# CloudFormation Stacks (IaC/) +# ============================================================================ + +# Deploy StagingStack (PMM staging environment) +deploy-stack-staging: + #!/usr/bin/env bash + set -euo pipefail + echo "Deploying StagingStack..." + + aws cloudformation update-stack \ + --stack-name pmm-staging \ + --template-body file://IaC/StagingStack.yml \ + --capabilities CAPABILITY_NAMED_IAM \ + --region {{aws_region}} \ + --profile {{aws_profile}} + + echo "Waiting for stack update to complete..." + aws cloudformation wait stack-update-complete \ + --stack-name pmm-staging \ + --region {{aws_region}} \ + --profile {{aws_profile}} + + echo "StagingStack deployed successfully" + +# Deploy LambdaVolumeCleanup stack +deploy-stack-volume-cleanup: + #!/usr/bin/env bash + set -euo pipefail + echo "Deploying LambdaVolumeCleanup..." + + aws cloudformation update-stack \ + --stack-name lambda-volume-cleanup \ + --template-body file://IaC/LambdaVolumeCleanup.yml \ + --capabilities CAPABILITY_NAMED_IAM \ + --region {{aws_region}} \ + --profile {{aws_profile}} + + echo "Waiting for stack update to complete..." + aws cloudformation wait stack-update-complete \ + --stack-name lambda-volume-cleanup \ + --region {{aws_region}} \ + --profile {{aws_profile}} + + echo "LambdaVolumeCleanup deployed successfully" + +# List all CloudFormation stacks +list-stacks: + #!/usr/bin/env bash + aws cloudformation list-stacks \ + --region {{aws_region}} \ + --profile {{aws_profile}} \ + --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE \ + --query 'StackSummaries[].{Name:StackName,Status:StackStatus,Updated:LastUpdatedTime}' \ + --output table + +# Describe a specific stack +describe-stack stack_name: + #!/usr/bin/env bash + aws cloudformation describe-stacks \ + --stack-name {{stack_name}} \ + --region {{aws_region}} \ + --profile {{aws_profile}} \ + --query 'Stacks[0].{Name:StackName,Status:StackStatus,Created:CreationTime,Updated:LastUpdatedTime}' \ + --output table + +# ============================================================================ +# Development & Testing +# ============================================================================ + +# Run all linters +lint: lint-lambdas + +# Full deployment workflow for email Lambda (lint, deploy) +deploy-email-running-instances-full: lint-lambdas deploy-lambda-email-running-instances + +# Check for trailing whitespaces in Python files +check-whitespace: + #!/usr/bin/env bash + set -euo pipefail + echo "Checking for trailing whitespaces..." + cd cloud/aws-functions + if rg '\s+$' *.py; then + echo "Found trailing whitespaces" + exit 1 + else + echo "No trailing whitespaces found" + fi + +# ============================================================================ +# Infrastructure Health Checks +# ============================================================================ + +# Check running EC2 instances in staging +check-staging-instances: + #!/usr/bin/env bash + echo "Running PMM Staging Instances:" + aws ec2 describe-instances \ + --region {{aws_region}} \ + --profile {{aws_profile}} \ + --filters "Name=tag:iit-billing-tag,Values=pmm-staging" "Name=instance-state-name,Values=running" \ + --query 'Reservations[].Instances[].{Name:Tags[?Key==`Name`]|[0].Value,Type:InstanceType,State:State.Name,LaunchTime:LaunchTime}' \ + --output table + +# Check CloudFormation stacks in DELETE_FAILED state +check-failed-stacks: + #!/usr/bin/env bash + echo "Failed CloudFormation Stacks:" + aws cloudformation list-stacks \ + --region {{aws_region}} \ + --profile {{aws_profile}} \ + --stack-status-filter DELETE_FAILED \ + --query 'StackSummaries[].{Name:StackName,Status:StackStatus,Reason:StackStatusReason}' \ + --output table + +# ============================================================================ +# removeUntaggedEc2 Lambda (IaC/RemoveUntaggedEc2Stack.yml) +# ============================================================================ + +# Deploy RemoveUntaggedEc2 CloudFormation stack +deploy-stack-remove-untagged-ec2: + #!/usr/bin/env bash + set -euo pipefail + echo "Deploying RemoveUntaggedEc2Stack..." + + aws cloudformation deploy \ + --stack-name remove-untagged-ec2 \ + --template-file IaC/RemoveUntaggedEc2Stack.yml \ + --capabilities CAPABILITY_NAMED_IAM \ + --parameter-overrides EksSkipPattern="pe-.*" DryRun="{{dry_run}}" \ + --region {{cleanup_lambda_region}} \ + --profile {{aws_profile}} + + echo "RemoveUntaggedEc2Stack deployed successfully" + +# Update RemoveUntaggedEc2 stack (sync from .py file) +update-stack-remove-untagged-ec2: + #!/usr/bin/env bash + set -euo pipefail + echo "Syncing cloud/aws-functions/removeUntaggedEc2.py to CloudFormation..." + echo "Manual sync required:" + echo " 1. Copy code from cloud/aws-functions/removeUntaggedEc2.py" + echo " 2. Paste into IaC/RemoveUntaggedEc2Stack.yml under Code.ZipFile" + echo " 3. Run: just deploy-stack-remove-untagged-ec2" + echo "" + echo "Or deploy directly to Lambda: just deploy-lambda-remove-untagged-ec2" + +# Deploy removeUntaggedEc2 directly to Lambda (bypass CloudFormation) +deploy-lambda-remove-untagged-ec2: + #!/usr/bin/env bash + set -euo pipefail + echo "Creating deployment package..." + cd cloud/aws-functions + rm -f removeUntaggedEc2.zip + zip -q removeUntaggedEc2.zip removeUntaggedEc2.py + + echo "Deploying to AWS Lambda..." + aws lambda update-function-code \ + --function-name removeUntaggedEc2 \ + --zip-file fileb://removeUntaggedEc2.zip \ + --region {{cleanup_lambda_region}} \ + --profile {{aws_profile}} + + echo "Lambda function deployed successfully" + rm removeUntaggedEc2.zip + +# Delete RemoveUntaggedEc2 CloudFormation stack +delete-stack-remove-untagged-ec2: + #!/usr/bin/env bash + set -euo pipefail + echo "This will delete the removeUntaggedEc2 Lambda and EventBridge rule" + read -p "Are you sure? (yes/no): " confirm + if [ "$confirm" != "yes" ]; then + echo "Aborted" + exit 1 + fi + + aws cloudformation delete-stack \ + --stack-name remove-untagged-ec2 \ + --region {{cleanup_lambda_region}} \ + --profile {{aws_profile}} + + echo "Waiting for stack deletion..." + aws cloudformation wait stack-delete-complete \ + --stack-name remove-untagged-ec2 \ + --region {{cleanup_lambda_region}} \ + --profile {{aws_profile}} + + echo "Stack deleted"