diff --git a/IaC/RemoveUntaggedEc2Stack.yml b/IaC/RemoveUntaggedEc2Stack.yml new file mode 100644 index 0000000000..f2fd265358 --- /dev/null +++ b/IaC/RemoveUntaggedEc2Stack.yml @@ -0,0 +1,565 @@ +AWSTemplateFormatVersion: '2010-09-09' +Description: 'Lambda function to remove untagged EC2 instances and EKS clusters' + +Parameters: + EksSkipPattern: + Type: String + Default: 'pe-.*' + Description: 'Regex pattern for EKS clusters to skip (protect from deletion)' + DryRun: + Type: String + Default: 'true' + AllowedValues: + - 'true' + - 'false' + Description: 'Dry-run mode - true means no deletions will occur, false means perform actual deletions' + +Resources: + LambdaExecutionRole: + Type: AWS::IAM::Role + Properties: + RoleName: removeUntaggedEc2-role + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: lambda.amazonaws.com + Action: sts:AssumeRole + ManagedPolicyArns: + - arn:aws:iam::aws:policy/service-role/AWSLambdaBasicExecutionRole + Policies: + - PolicyName: EC2AndCloudFormationAccess + PolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Action: + - ec2:DescribeInstances + - ec2:DescribeRegions + - ec2:TerminateInstances + - ec2:CreateTags + - ec2:DescribeSecurityGroups + - ec2:RevokeSecurityGroupIngress + - ec2:DisassociateRouteTable + - ec2:DeleteRoute + - ec2:DescribeSpotPriceHistory + Resource: '*' + - Effect: Allow + Action: + - cloudformation:DescribeStacks + - cloudformation:DescribeStackEvents + - cloudformation:DeleteStack + Resource: '*' + + RemoveUntaggedEc2Function: + Type: AWS::Lambda::Function + Properties: + FunctionName: removeUntaggedEc2 + Description: 'Remove untagged EC2 instances and EKS clusters after grace period' + Handler: index.lambda_handler + MemorySize: 128 + Role: !GetAtt LambdaExecutionRole.Arn + Runtime: python3.12 + Timeout: 300 + Environment: + Variables: + EKS_SKIP_PATTERN: !Ref EksSkipPattern + DRY_RUN: !Ref DryRun + Tags: + - Key: iit-billing-tag + Value: infrastructure + # NOTE: Source code is maintained in cloud/aws-functions/removeUntaggedEc2.py + # This inline code is kept in sync for CloudFormation deployment + # To update: edit the .py file, then run: just sync-removeUntaggedEc2-to-cfn + Code: + ZipFile: | + import logging + import datetime + import boto3 + import os + import re + from botocore.exceptions import ClientError + + # Set logging level to INFO + logger = logging.getLogger() + logger.setLevel("INFO") + + # Get environment variable for EKS cluster skip pattern + EKS_SKIP_PATTERN = os.environ.get("EKS_SKIP_PATTERN", "pe-.*") + logger.info(f"EKS_SKIP_PATTERN: {EKS_SKIP_PATTERN}") + + # Track EKS clusters marked for deletion per region + eks_clusters_to_delete = {} + + + def convert_tags_to_dict(tags): + return {tag["Key"]: tag["Value"] for tag in tags} if tags else {} + + + def get_eks_cluster_name(tags_dict): + """Extract EKS cluster name from instance tags""" + # Check multiple possible tag keys for cluster name + cluster_keys = ["aws:eks:cluster-name", "eks:eks-cluster-name"] + + for key in cluster_keys: + if key in tags_dict: + return tags_dict[key] + + # Check for kubernetes.io/cluster/* tags + for key in tags_dict.keys(): + if key.startswith("kubernetes.io/cluster/"): + return key.replace("kubernetes.io/cluster/", "") + + return None + + + def has_valid_billing_tag(tags_dict, instance_launch_time): + """ + Check if instance has a valid iit-billing-tag. 
+ + For regular instances: any non-empty value is valid + For timestamp-based tags: check if Unix timestamp is in the future + """ + if "iit-billing-tag" not in tags_dict: + return False + + tag_value = tags_dict["iit-billing-tag"] + + # Empty tag is invalid + if not tag_value: + return False + + # Try to parse as Unix timestamp (for EKS auto-expiration) + try: + expiration_timestamp = int(tag_value) + current_timestamp = int( + datetime.datetime.now(datetime.timezone.utc).timestamp() + ) + + # If it's a valid future timestamp, check if it's expired + if expiration_timestamp > current_timestamp: + logger.info( + f"Instance has valid billing tag with expiration {expiration_timestamp} " + f"(expires in {expiration_timestamp - current_timestamp} seconds)" + ) + return True + else: + logger.info( + f"Instance billing tag expired: {expiration_timestamp} < {current_timestamp} " + f"(expired {current_timestamp - expiration_timestamp} seconds ago)" + ) + return False + except ValueError: + # Not a timestamp, treat as category string (e.g., "pmm-staging", "jenkins-pmm-slave") + # Any non-empty category string is valid + logger.info(f"Instance has valid billing tag category: {tag_value}") + return True + + + def is_eks_managed_instance(instance, region): + """Check if instance is managed by EKS and if it should be skipped""" + tags_dict = convert_tags_to_dict(instance.tags) + + # Check for EKS-related tags + eks_indicators = [ + "kubernetes.io/cluster/", + "aws:eks:cluster-name", + "eks:eks-cluster-name", + "eks:kubernetes-node-pool-name", + "aws:ec2:managed-launch", + ] + + is_eks = False + for key in tags_dict.keys(): + for indicator in eks_indicators: + if indicator in key: + is_eks = True + break + if is_eks: + break + + if not is_eks: + return False + + # It's an EKS instance, now check billing tag and skip pattern + cluster_name = get_eks_cluster_name(tags_dict) + has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + # If has valid billing tag, always skip (it's legitimate) + if has_billing_tag: + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"has valid iit-billing-tag, skipping" + ) + return True + + # No billing tag - check skip pattern + if cluster_name and EKS_SKIP_PATTERN: + try: + if re.match(EKS_SKIP_PATTERN, cluster_name): + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"matches skip pattern '{EKS_SKIP_PATTERN}', skipping" + ) + return True + else: + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"does NOT match skip pattern '{EKS_SKIP_PATTERN}' and has no valid billing tag, " + f"marking cluster for deletion" + ) + # Track this cluster for deletion + if region not in eks_clusters_to_delete: + eks_clusters_to_delete[region] = set() + eks_clusters_to_delete[region].add(cluster_name) + return True # Skip individual instance termination, we'll delete the whole cluster + except re.error as e: + logger.error( + f"Invalid regex pattern '{EKS_SKIP_PATTERN}': {e}, skipping all EKS instances" + ) + return True + + # If no cluster name found, skip the instance + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name or 'unknown'}), skipping" + ) + return True + + + def is_instance_to_terminate(instance): + # Check if the instance has valid 'iit-billing-tag' + tags_dict = convert_tags_to_dict(instance.tags) + has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + # Calculate the running time of the instance + 
current_time = datetime.datetime.now(datetime.timezone.utc) + launch_time = instance.launch_time + running_time = current_time - launch_time + + # Terminate instances without valid 'iit-billing-tag' running for more than 10 minutes + if not has_billing_tag and running_time.total_seconds() > 600: + return True + return False + + + def cleanup_failed_stack_resources(stack_name, region): + """Manually clean up resources that prevent stack deletion""" + try: + cfn = boto3.client("cloudformation", region_name=region) + ec2 = boto3.client("ec2", region_name=region) + + # Get failed resources from stack events + events = cfn.describe_stack_events(StackName=stack_name) + failed_resources = {} + + for event in events["StackEvents"]: + if event.get("ResourceStatus") == "DELETE_FAILED": + logical_id = event["LogicalResourceId"] + if logical_id not in failed_resources: # Only keep first occurrence + failed_resources[logical_id] = { + "Type": event["ResourceType"], + "PhysicalId": event.get("PhysicalResourceId"), + } + + if not failed_resources: + return True + + logger.info( + f"Attempting to clean up {len(failed_resources)} failed resources for stack {stack_name}" + ) + + # Process each failed resource type + for logical_id, resource in failed_resources.items(): + resource_type = resource["Type"] + physical_id = resource["PhysicalId"] + + try: + # Clean up security group ingress rules + if resource_type == "AWS::EC2::SecurityGroupIngress" and physical_id: + sg_id = physical_id.split("|")[0] if "|" in physical_id else None + if sg_id and sg_id.startswith("sg-"): + response = ec2.describe_security_groups(GroupIds=[sg_id]) + if response["SecurityGroups"]: + sg = response["SecurityGroups"][0] + if sg["IpPermissions"]: + ec2.revoke_security_group_ingress( + GroupId=sg_id, IpPermissions=sg["IpPermissions"] + ) + logger.info(f"Cleaned up ingress rules for {sg_id}") + + # Clean up route table associations + elif ( + resource_type == "AWS::EC2::SubnetRouteTableAssociation" + and physical_id + ): + # PhysicalId is the association ID + if physical_id.startswith("rtbassoc-"): + ec2.disassociate_route_table(AssociationId=physical_id) + logger.info(f"Disassociated route table {physical_id}") + + # Clean up routes + elif resource_type == "AWS::EC2::Route" and physical_id: + # PhysicalId format: rtb-xxx_destination + parts = physical_id.split("_") + if len(parts) == 2 and parts[0].startswith("rtb-"): + rtb_id = parts[0] + dest_cidr = parts[1] + ec2.delete_route( + RouteTableId=rtb_id, DestinationCidrBlock=dest_cidr + ) + logger.info(f"Deleted route {dest_cidr} from {rtb_id}") + + except ClientError as e: + error_code = e.response.get("Error", {}).get("Code", "") + # Ignore if resource already deleted + if error_code not in [ + "InvalidGroup.NotFound", + "InvalidAssociationID.NotFound", + "InvalidRoute.NotFound", + ]: + logger.warning( + f"Could not clean up {resource_type} {physical_id}: {e}" + ) + except Exception as e: + logger.warning( + f"Unexpected error cleaning up {resource_type} {physical_id}: {e}" + ) + + return True + + except Exception as e: + logger.error(f"Error cleaning up failed resources for stack {stack_name}: {e}") + return False + + + def delete_eks_cluster_stack(cluster_name, region): + """Delete EKS cluster by removing its CloudFormation stack""" + try: + cfn = boto3.client("cloudformation", region_name=region) + + # Find CloudFormation stack for this cluster + stack_name = f"eksctl-{cluster_name}-cluster" + + # Check if stack exists and its current status + try: + response = 
cfn.describe_stacks(StackName=stack_name) + stack_status = response["Stacks"][0]["StackStatus"] + except ClientError as e: + if "does not exist" in str(e): + logger.warning( + f"CloudFormation stack {stack_name} not found in {region}, cannot delete cluster {cluster_name}" + ) + return False + raise + + # Handle DELETE_FAILED status - retry after cleanup + if stack_status == "DELETE_FAILED": + logger.info( + f"Stack {stack_name} previously failed deletion, attempting cleanup and retry" + ) + cleanup_failed_stack_resources(stack_name, region) + # Retry deletion + cfn.delete_stack(StackName=stack_name) + logger.info(f"Retrying deletion of stack {stack_name} after cleanup") + return True + + # Handle already deleting + if "DELETE" in stack_status and stack_status != "DELETE_COMPLETE": + logger.info(f"Stack {stack_name} already deleting (status: {stack_status})") + return True + + # Initiate deletion for new stacks + logger.info( + f"Deleting CloudFormation stack {stack_name} for EKS cluster {cluster_name} in {region}" + ) + cfn.delete_stack(StackName=stack_name) + logger.info( + f"Successfully initiated deletion of stack {stack_name} for cluster {cluster_name}" + ) + return True + + except ClientError as e: + logger.error( + f"Failed to delete CloudFormation stack for cluster {cluster_name} in {region}: {e}" + ) + return False + except Exception as e: + logger.error( + f"Unexpected error deleting cluster {cluster_name} in {region}: {e}" + ) + return False + + + def cirrus_ci_add_iit_billing_tag(instance): + # Convert tags to a dictionary for easier access + tags_dict = convert_tags_to_dict(instance.tags) + + # Check if the instance has 'CIRRUS_CI' tag set to 'true' and 'iit-billing-tag' is not set + has_cirrus_ci_tag = tags_dict.get("CIRRUS_CI", "").lower() == "true" + has_iit_billing_tag = "iit-billing-tag" in tags_dict + + # Extract additional tag values + instance_name = tags_dict.get("Name") + cirrus_repo_full_name = tags_dict.get("CIRRUS_REPO_FULL_NAME") + cirrus_task_id = tags_dict.get("CIRRUS_TASK_ID") + + # If 'CIRRUS_CI' tag is set to 'true' and 'iit-billing-tag' is not set, add 'iit-billing-tag' set to 'CirrusCI' + if has_cirrus_ci_tag and not has_iit_billing_tag: + try: + instance.create_tags(Tags=[{"Key": "iit-billing-tag", "Value": "CirrusCI"}]) + logger.info( + f"Instance {instance.id} ({instance_name}) tagged with 'iit-billing-tag: CirrusCI'. 
" + f"CIRRUS_REPO_FULL_NAME: {cirrus_repo_full_name}, CIRRUS_TASK_ID: {cirrus_task_id}" + ) + except ClientError as e: + logger.error(f"Error tagging instance {instance.id}: {e}") + + + def terminate_instances_in_region(region): + ec2 = boto3.resource("ec2", region_name=region) + instances = ec2.instances.filter( + Filters=[{"Name": "instance-state-name", "Values": ["running"]}] + ) + terminated_instances = [] + skipped_instances = [] + + for instance in instances: + try: + # First try to tag CirrusCI instances + cirrus_ci_add_iit_billing_tag(instance) + + # Skip EKS-managed instances based on pattern and billing tag + if is_eks_managed_instance(instance, region): + tags_dict = convert_tags_to_dict(instance.tags) + cluster_name = get_eks_cluster_name(tags_dict) + skipped_instances.append( + { + "InstanceId": instance.id, + "Reason": f"EKS-managed (cluster: {cluster_name or 'unknown'})", + } + ) + continue + + # Check if should terminate + if is_instance_to_terminate(instance): + instance_info = { + "InstanceId": instance.id, + "SSHKeyName": instance.key_name, + "NameTag": instance.tags[0]["Value"] + if instance.tags and "Name" in [tag["Key"] for tag in instance.tags] + else None, + "AvailabilityZone": instance.placement["AvailabilityZone"], + } + + try: + instance.terminate() + terminated_instances.append(instance_info) + logger.info( + f"Successfully terminated instance {instance.id} in {region}" + ) + except ClientError as e: + logger.error( + f"Failed to terminate instance {instance.id} in {region}: {e}" + ) + skipped_instances.append( + { + "InstanceId": instance.id, + "Reason": f"Permission denied: {str(e)}", + } + ) + except Exception as e: + logger.error(f"Error processing instance {instance.id} in {region}: {e}") + continue + + if skipped_instances: + logger.info(f"Skipped {len(skipped_instances)} instances in {region}") + for skipped in skipped_instances[:5]: # Log first 5 only + logger.info(f" - {skipped['InstanceId']}: {skipped['Reason']}") + + return terminated_instances + + + def lambda_handler(event, context): + global eks_clusters_to_delete + eks_clusters_to_delete = {} # Reset at start of each invocation + + regions = [ + region["RegionName"] + for region in boto3.client("ec2").describe_regions()["Regions"] + ] + terminated_instances_all_regions = [] + deleted_clusters = [] + + # Process all instances and identify EKS clusters to delete + for region in regions: + try: + terminated_instances_region = terminate_instances_in_region(region) + terminated_instances_all_regions.extend(terminated_instances_region) + except Exception as e: + logger.error(f"Error processing region {region}: {e}") + continue + + # Delete EKS clusters that don't match skip pattern AND have no valid billing tag + for region, clusters in eks_clusters_to_delete.items(): + for cluster_name in clusters: + try: + if delete_eks_cluster_stack(cluster_name, region): + deleted_clusters.append(f"{cluster_name} ({region})") + except Exception as e: + logger.error(f"Error deleting cluster {cluster_name} in {region}: {e}") + continue + + # Log results + if terminated_instances_all_regions: + logger.info("Terminated instances:") + for instance_info in terminated_instances_all_regions: + logger.info( + f"- Instance ID: {instance_info['InstanceId']}, SSH Key: {instance_info['SSHKeyName']}, Name Tag: {instance_info['NameTag']}, Availability Zone: {instance_info['AvailabilityZone']}" + ) + else: + logger.info("No instances were terminated.") + + if deleted_clusters: + logger.info(f"Deleted {len(deleted_clusters)} EKS 
clusters:") + for cluster in deleted_clusters: + logger.info(f"- {cluster}") + else: + logger.info("No EKS clusters were deleted.") + + return { + "statusCode": 200, + "body": f"Terminated {len(terminated_instances_all_regions)} instances, deleted {len(deleted_clusters)} EKS clusters", + } + + # EventBridge rule to trigger Lambda every 4 minutes + ScheduleRule: + Type: AWS::Events::Rule + Properties: + Description: 'Trigger removeUntaggedEc2 Lambda every 4 minutes' + ScheduleExpression: 'rate(4 minutes)' + State: ENABLED + Targets: + - Arn: !GetAtt RemoveUntaggedEc2Function.Arn + Id: RemoveUntaggedEc2Target + + PermissionForEventsToInvokeLambda: + Type: AWS::Lambda::Permission + Properties: + FunctionName: !Ref RemoveUntaggedEc2Function + Action: lambda:InvokeFunction + Principal: events.amazonaws.com + SourceArn: !GetAtt ScheduleRule.Arn + +Outputs: + LambdaFunctionArn: + Description: ARN of the removeUntaggedEc2 Lambda function + Value: !GetAtt RemoveUntaggedEc2Function.Arn + Export: + Name: RemoveUntaggedEc2-FunctionArn + + LambdaRoleArn: + Description: ARN of the Lambda execution role + Value: !GetAtt LambdaExecutionRole.Arn + Export: + Name: RemoveUntaggedEc2-RoleArn diff --git a/cloud/aws-functions/removeUntaggedEc2.py b/cloud/aws-functions/removeUntaggedEc2.py new file mode 100644 index 0000000000..71b3348abd --- /dev/null +++ b/cloud/aws-functions/removeUntaggedEc2.py @@ -0,0 +1,457 @@ +import logging +import datetime +import boto3 +import os +import re +from botocore.exceptions import ClientError + +# Set logging level to INFO +logger = logging.getLogger() +logger.setLevel("INFO") + +# Get environment variable for EKS cluster skip pattern +EKS_SKIP_PATTERN = os.environ.get("EKS_SKIP_PATTERN", "pe-.*") +logger.info(f"EKS_SKIP_PATTERN: {EKS_SKIP_PATTERN}") + +# Track EKS clusters marked for deletion per region +eks_clusters_to_delete = {} + + +def convert_tags_to_dict(tags): + return {tag["Key"]: tag["Value"] for tag in tags} if tags else {} + + +def get_eks_cluster_name(tags_dict): + """Extract EKS cluster name from instance tags""" + # Check multiple possible tag keys for cluster name + cluster_keys = ["aws:eks:cluster-name", "eks:eks-cluster-name"] + + for key in cluster_keys: + if key in tags_dict: + return tags_dict[key] + + # Check for kubernetes.io/cluster/* tags + for key in tags_dict.keys(): + if key.startswith("kubernetes.io/cluster/"): + return key.replace("kubernetes.io/cluster/", "") + + return None + + +def has_valid_billing_tag(tags_dict, instance_launch_time): + """ + Check if instance has a valid iit-billing-tag. 
+ + For regular instances: any non-empty value is valid + For timestamp-based tags: check if Unix timestamp is in the future + """ + if "iit-billing-tag" not in tags_dict: + return False + + tag_value = tags_dict["iit-billing-tag"] + + # Empty tag is invalid + if not tag_value: + return False + + # Try to parse as Unix timestamp (for EKS auto-expiration) + try: + expiration_timestamp = int(tag_value) + current_timestamp = int( + datetime.datetime.now(datetime.timezone.utc).timestamp() + ) + + # If it's a valid future timestamp, check if it's expired + if expiration_timestamp > current_timestamp: + logger.info( + f"Instance has valid billing tag with expiration {expiration_timestamp} " + f"(expires in {expiration_timestamp - current_timestamp} seconds)" + ) + return True + else: + logger.info( + f"Instance billing tag expired: {expiration_timestamp} < {current_timestamp} " + f"(expired {current_timestamp - expiration_timestamp} seconds ago)" + ) + return False + except ValueError: + # Not a timestamp, treat as category string (e.g., "pmm-staging", "jenkins-pmm-slave") + # Any non-empty category string is valid + logger.info(f"Instance has valid billing tag category: {tag_value}") + return True + + +def is_eks_managed_instance(instance, region): + """Check if instance is managed by EKS and if it should be skipped""" + tags_dict = convert_tags_to_dict(instance.tags) + + # Check for EKS-related tags + eks_indicators = [ + "kubernetes.io/cluster/", + "aws:eks:cluster-name", + "eks:eks-cluster-name", + "eks:kubernetes-node-pool-name", + "aws:ec2:managed-launch", + ] + + is_eks = False + for key in tags_dict.keys(): + for indicator in eks_indicators: + if indicator in key: + is_eks = True + break + if is_eks: + break + + if not is_eks: + return False + + # It's an EKS instance, now check billing tag and skip pattern + cluster_name = get_eks_cluster_name(tags_dict) + has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + # If has valid billing tag, always skip (it's legitimate) + if has_billing_tag: + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"has valid iit-billing-tag, skipping" + ) + return True + + # No billing tag - check skip pattern + if cluster_name and EKS_SKIP_PATTERN: + try: + if re.match(EKS_SKIP_PATTERN, cluster_name): + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"matches skip pattern '{EKS_SKIP_PATTERN}', skipping" + ) + return True + else: + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name}), " + f"does NOT match skip pattern '{EKS_SKIP_PATTERN}' and has no valid billing tag, " + f"marking cluster for deletion" + ) + # Track this cluster for deletion + if region not in eks_clusters_to_delete: + eks_clusters_to_delete[region] = set() + eks_clusters_to_delete[region].add(cluster_name) + return True # Skip individual instance termination, we'll delete the whole cluster + except re.error as e: + logger.error( + f"Invalid regex pattern '{EKS_SKIP_PATTERN}': {e}, skipping all EKS instances" + ) + return True + + # If no cluster name found, skip the instance + logger.info( + f"Instance {instance.id} is EKS-managed (cluster: {cluster_name or 'unknown'}), skipping" + ) + return True + + +def is_instance_to_terminate(instance): + # Check if the instance has valid 'iit-billing-tag' + tags_dict = convert_tags_to_dict(instance.tags) + has_billing_tag = has_valid_billing_tag(tags_dict, instance.launch_time) + + # Calculate the running time of the instance + 
current_time = datetime.datetime.now(datetime.timezone.utc) + launch_time = instance.launch_time + running_time = current_time - launch_time + + # Terminate instances without valid 'iit-billing-tag' running for more than 10 minutes + if not has_billing_tag and running_time.total_seconds() > 600: + return True + return False + + +def cleanup_failed_stack_resources(stack_name, region): + """Manually clean up resources that prevent stack deletion""" + try: + cfn = boto3.client("cloudformation", region_name=region) + ec2 = boto3.client("ec2", region_name=region) + + # Get failed resources from stack events + events = cfn.describe_stack_events(StackName=stack_name) + failed_resources = {} + + for event in events["StackEvents"]: + if event.get("ResourceStatus") == "DELETE_FAILED": + logical_id = event["LogicalResourceId"] + if logical_id not in failed_resources: # Only keep first occurrence + failed_resources[logical_id] = { + "Type": event["ResourceType"], + "PhysicalId": event.get("PhysicalResourceId"), + } + + if not failed_resources: + return True + + logger.info( + f"Attempting to clean up {len(failed_resources)} failed resources for stack {stack_name}" + ) + + # Process each failed resource type + for logical_id, resource in failed_resources.items(): + resource_type = resource["Type"] + physical_id = resource["PhysicalId"] + + try: + # Clean up security group ingress rules + if resource_type == "AWS::EC2::SecurityGroupIngress" and physical_id: + sg_id = physical_id.split("|")[0] if "|" in physical_id else None + if sg_id and sg_id.startswith("sg-"): + response = ec2.describe_security_groups(GroupIds=[sg_id]) + if response["SecurityGroups"]: + sg = response["SecurityGroups"][0] + if sg["IpPermissions"]: + ec2.revoke_security_group_ingress( + GroupId=sg_id, IpPermissions=sg["IpPermissions"] + ) + logger.info(f"Cleaned up ingress rules for {sg_id}") + + # Clean up route table associations + elif ( + resource_type == "AWS::EC2::SubnetRouteTableAssociation" + and physical_id + ): + # PhysicalId is the association ID + if physical_id.startswith("rtbassoc-"): + ec2.disassociate_route_table(AssociationId=physical_id) + logger.info(f"Disassociated route table {physical_id}") + + # Clean up routes + elif resource_type == "AWS::EC2::Route" and physical_id: + # PhysicalId format: rtb-xxx_destination + parts = physical_id.split("_") + if len(parts) == 2 and parts[0].startswith("rtb-"): + rtb_id = parts[0] + dest_cidr = parts[1] + ec2.delete_route( + RouteTableId=rtb_id, DestinationCidrBlock=dest_cidr + ) + logger.info(f"Deleted route {dest_cidr} from {rtb_id}") + + except ClientError as e: + error_code = e.response.get("Error", {}).get("Code", "") + # Ignore if resource already deleted + if error_code not in [ + "InvalidGroup.NotFound", + "InvalidAssociationID.NotFound", + "InvalidRoute.NotFound", + ]: + logger.warning( + f"Could not clean up {resource_type} {physical_id}: {e}" + ) + except Exception as e: + logger.warning( + f"Unexpected error cleaning up {resource_type} {physical_id}: {e}" + ) + + return True + + except Exception as e: + logger.error(f"Error cleaning up failed resources for stack {stack_name}: {e}") + return False + + +def delete_eks_cluster_stack(cluster_name, region): + """Delete EKS cluster by removing its CloudFormation stack""" + try: + cfn = boto3.client("cloudformation", region_name=region) + + # Find CloudFormation stack for this cluster + stack_name = f"eksctl-{cluster_name}-cluster" + + # Check if stack exists and its current status + try: + response = 
cfn.describe_stacks(StackName=stack_name) + stack_status = response["Stacks"][0]["StackStatus"] + except ClientError as e: + if "does not exist" in str(e): + logger.warning( + f"CloudFormation stack {stack_name} not found in {region}, cannot delete cluster {cluster_name}" + ) + return False + raise + + # Handle DELETE_FAILED status - retry after cleanup + if stack_status == "DELETE_FAILED": + logger.info( + f"Stack {stack_name} previously failed deletion, attempting cleanup and retry" + ) + cleanup_failed_stack_resources(stack_name, region) + # Retry deletion + cfn.delete_stack(StackName=stack_name) + logger.info(f"Retrying deletion of stack {stack_name} after cleanup") + return True + + # Handle already deleting + if "DELETE" in stack_status and stack_status != "DELETE_COMPLETE": + logger.info(f"Stack {stack_name} already deleting (status: {stack_status})") + return True + + # Initiate deletion for new stacks + logger.info( + f"Deleting CloudFormation stack {stack_name} for EKS cluster {cluster_name} in {region}" + ) + cfn.delete_stack(StackName=stack_name) + logger.info( + f"Successfully initiated deletion of stack {stack_name} for cluster {cluster_name}" + ) + return True + + except ClientError as e: + logger.error( + f"Failed to delete CloudFormation stack for cluster {cluster_name} in {region}: {e}" + ) + return False + except Exception as e: + logger.error( + f"Unexpected error deleting cluster {cluster_name} in {region}: {e}" + ) + return False + + +def cirrus_ci_add_iit_billing_tag(instance): + # Convert tags to a dictionary for easier access + tags_dict = convert_tags_to_dict(instance.tags) + + # Check if the instance has 'CIRRUS_CI' tag set to 'true' and 'iit-billing-tag' is not set + has_cirrus_ci_tag = tags_dict.get("CIRRUS_CI", "").lower() == "true" + has_iit_billing_tag = "iit-billing-tag" in tags_dict + + # Extract additional tag values + instance_name = tags_dict.get("Name") + cirrus_repo_full_name = tags_dict.get("CIRRUS_REPO_FULL_NAME") + cirrus_task_id = tags_dict.get("CIRRUS_TASK_ID") + + # If 'CIRRUS_CI' tag is set to 'true' and 'iit-billing-tag' is not set, add 'iit-billing-tag' set to 'CirrusCI' + if has_cirrus_ci_tag and not has_iit_billing_tag: + try: + instance.create_tags(Tags=[{"Key": "iit-billing-tag", "Value": "CirrusCI"}]) + logger.info( + f"Instance {instance.id} ({instance_name}) tagged with 'iit-billing-tag: CirrusCI'. 
" + f"CIRRUS_REPO_FULL_NAME: {cirrus_repo_full_name}, CIRRUS_TASK_ID: {cirrus_task_id}" + ) + except ClientError as e: + logger.error(f"Error tagging instance {instance.id}: {e}") + + +def terminate_instances_in_region(region): + ec2 = boto3.resource("ec2", region_name=region) + instances = ec2.instances.filter( + Filters=[{"Name": "instance-state-name", "Values": ["running"]}] + ) + terminated_instances = [] + skipped_instances = [] + + for instance in instances: + try: + # First try to tag CirrusCI instances + cirrus_ci_add_iit_billing_tag(instance) + + # Skip EKS-managed instances based on pattern and billing tag + if is_eks_managed_instance(instance, region): + tags_dict = convert_tags_to_dict(instance.tags) + cluster_name = get_eks_cluster_name(tags_dict) + skipped_instances.append( + { + "InstanceId": instance.id, + "Reason": f"EKS-managed (cluster: {cluster_name or 'unknown'})", + } + ) + continue + + # Check if should terminate + if is_instance_to_terminate(instance): + instance_info = { + "InstanceId": instance.id, + "SSHKeyName": instance.key_name, + "NameTag": instance.tags[0]["Value"] + if instance.tags and "Name" in [tag["Key"] for tag in instance.tags] + else None, + "AvailabilityZone": instance.placement["AvailabilityZone"], + } + + try: + instance.terminate() + terminated_instances.append(instance_info) + logger.info( + f"Successfully terminated instance {instance.id} in {region}" + ) + except ClientError as e: + logger.error( + f"Failed to terminate instance {instance.id} in {region}: {e}" + ) + skipped_instances.append( + { + "InstanceId": instance.id, + "Reason": f"Permission denied: {str(e)}", + } + ) + except Exception as e: + logger.error(f"Error processing instance {instance.id} in {region}: {e}") + continue + + if skipped_instances: + logger.info(f"Skipped {len(skipped_instances)} instances in {region}") + for skipped in skipped_instances[:5]: # Log first 5 only + logger.info(f" - {skipped['InstanceId']}: {skipped['Reason']}") + + return terminated_instances + + +def lambda_handler(event, context): + global eks_clusters_to_delete + eks_clusters_to_delete = {} # Reset at start of each invocation + + regions = [ + region["RegionName"] + for region in boto3.client("ec2").describe_regions()["Regions"] + ] + terminated_instances_all_regions = [] + deleted_clusters = [] + + # Process all instances and identify EKS clusters to delete + for region in regions: + try: + terminated_instances_region = terminate_instances_in_region(region) + terminated_instances_all_regions.extend(terminated_instances_region) + except Exception as e: + logger.error(f"Error processing region {region}: {e}") + continue + + # Delete EKS clusters that don't match skip pattern AND have no valid billing tag + for region, clusters in eks_clusters_to_delete.items(): + for cluster_name in clusters: + try: + if delete_eks_cluster_stack(cluster_name, region): + deleted_clusters.append(f"{cluster_name} ({region})") + except Exception as e: + logger.error(f"Error deleting cluster {cluster_name} in {region}: {e}") + continue + + # Log results + if terminated_instances_all_regions: + logger.info("Terminated instances:") + for instance_info in terminated_instances_all_regions: + logger.info( + f"- Instance ID: {instance_info['InstanceId']}, SSH Key: {instance_info['SSHKeyName']}, Name Tag: {instance_info['NameTag']}, Availability Zone: {instance_info['AvailabilityZone']}" + ) + else: + logger.info("No instances were terminated.") + + if deleted_clusters: + logger.info(f"Deleted {len(deleted_clusters)} EKS 
clusters:") + for cluster in deleted_clusters: + logger.info(f"- {cluster}") + else: + logger.info("No EKS clusters were deleted.") + + return { + "statusCode": 200, + "body": f"Terminated {len(terminated_instances_all_regions)} instances, deleted {len(deleted_clusters)} EKS clusters", + } diff --git a/justfile b/justfile new file mode 100644 index 0000000000..f34b5d48ec --- /dev/null +++ b/justfile @@ -0,0 +1,250 @@ +# Justfile for jenkins-pipelines infrastructure management + +# Default region and AWS profile +aws_region := "us-east-2" +aws_profile := "percona-dev-admin" +# removeUntaggedEc2 Lambda is deployed in eu-west-1 (but scans all regions) +cleanup_lambda_region := "eu-west-1" +# Dry-run mode for removeUntaggedEc2 (true = no deletions, false = perform deletions) +dry_run := env_var_or_default("DRY_RUN", "true") + +# List all available recipes +default: + @just --list + +# ============================================================================ +# AWS Lambda Functions (cloud/aws-functions/) +# ============================================================================ + +# Deploy email_running_instances Lambda function +deploy-lambda-email-running-instances: + #!/usr/bin/env bash + set -euo pipefail + echo "Creating deployment package..." + cd cloud/aws-functions + rm -f email_running_instances.zip + zip -q email_running_instances.zip email_running_instances.py + + echo "Deploying to AWS Lambda..." + aws lambda update-function-code \ + --function-name email_running_instances \ + --zip-file fileb://email_running_instances.zip \ + --region {{aws_region}} \ + --profile {{aws_profile}} + + echo "Lambda function deployed successfully" + rm email_running_instances.zip + +# Lint all Lambda Python code +lint-lambdas: + #!/usr/bin/env bash + set -euo pipefail + cd cloud/aws-functions + + echo "Running ruff linter on all Python files..." + uv run --with ruff ruff check *.py + + echo "Running ruff formatter..." + uv run --with ruff ruff format *.py + + echo "Linting complete" + +# Show all Lambda functions info +info-lambdas: + #!/usr/bin/env bash + echo "Lambda Functions:" + aws lambda list-functions \ + --region {{aws_region}} \ + --profile {{aws_profile}} \ + --query 'Functions[?starts_with(FunctionName, `email`) || starts_with(FunctionName, `orphaned`) || starts_with(FunctionName, `remove`)].{Name:FunctionName,Runtime:Runtime,Updated:LastModified}' \ + --output table + +# ============================================================================ +# CloudFormation Stacks (IaC/) +# ============================================================================ + +# Deploy StagingStack (PMM staging environment) +deploy-stack-staging: + #!/usr/bin/env bash + set -euo pipefail + echo "Deploying StagingStack..." + + aws cloudformation update-stack \ + --stack-name pmm-staging \ + --template-body file://IaC/StagingStack.yml \ + --capabilities CAPABILITY_NAMED_IAM \ + --region {{aws_region}} \ + --profile {{aws_profile}} + + echo "Waiting for stack update to complete..." + aws cloudformation wait stack-update-complete \ + --stack-name pmm-staging \ + --region {{aws_region}} \ + --profile {{aws_profile}} + + echo "StagingStack deployed successfully" + +# Deploy LambdaVolumeCleanup stack +deploy-stack-volume-cleanup: + #!/usr/bin/env bash + set -euo pipefail + echo "Deploying LambdaVolumeCleanup..." 
+ + aws cloudformation update-stack \ + --stack-name lambda-volume-cleanup \ + --template-body file://IaC/LambdaVolumeCleanup.yml \ + --capabilities CAPABILITY_NAMED_IAM \ + --region {{aws_region}} \ + --profile {{aws_profile}} + + echo "Waiting for stack update to complete..." + aws cloudformation wait stack-update-complete \ + --stack-name lambda-volume-cleanup \ + --region {{aws_region}} \ + --profile {{aws_profile}} + + echo "LambdaVolumeCleanup deployed successfully" + +# List all CloudFormation stacks +list-stacks: + #!/usr/bin/env bash + aws cloudformation list-stacks \ + --region {{aws_region}} \ + --profile {{aws_profile}} \ + --stack-status-filter CREATE_COMPLETE UPDATE_COMPLETE \ + --query 'StackSummaries[].{Name:StackName,Status:StackStatus,Updated:LastUpdatedTime}' \ + --output table + +# Describe a specific stack +describe-stack stack_name: + #!/usr/bin/env bash + aws cloudformation describe-stacks \ + --stack-name {{stack_name}} \ + --region {{aws_region}} \ + --profile {{aws_profile}} \ + --query 'Stacks[0].{Name:StackName,Status:StackStatus,Created:CreationTime,Updated:LastUpdatedTime}' \ + --output table + +# ============================================================================ +# Development & Testing +# ============================================================================ + +# Run all linters +lint: lint-lambdas + +# Full deployment workflow for email Lambda (lint, deploy) +deploy-email-running-instances-full: lint-lambdas deploy-lambda-email-running-instances + +# Check for trailing whitespaces in Python files +check-whitespace: + #!/usr/bin/env bash + set -euo pipefail + echo "Checking for trailing whitespaces..." + cd cloud/aws-functions + if rg '\s+$' *.py; then + echo "Found trailing whitespaces" + exit 1 + else + echo "No trailing whitespaces found" + fi + +# ============================================================================ +# Infrastructure Health Checks +# ============================================================================ + +# Check running EC2 instances in staging +check-staging-instances: + #!/usr/bin/env bash + echo "Running PMM Staging Instances:" + aws ec2 describe-instances \ + --region {{aws_region}} \ + --profile {{aws_profile}} \ + --filters "Name=tag:iit-billing-tag,Values=pmm-staging" "Name=instance-state-name,Values=running" \ + --query 'Reservations[].Instances[].{Name:Tags[?Key==`Name`]|[0].Value,Type:InstanceType,State:State.Name,LaunchTime:LaunchTime}' \ + --output table + +# Check CloudFormation stacks in DELETE_FAILED state +check-failed-stacks: + #!/usr/bin/env bash + echo "Failed CloudFormation Stacks:" + aws cloudformation list-stacks \ + --region {{aws_region}} \ + --profile {{aws_profile}} \ + --stack-status-filter DELETE_FAILED \ + --query 'StackSummaries[].{Name:StackName,Status:StackStatus,Reason:StackStatusReason}' \ + --output table + +# ============================================================================ +# removeUntaggedEc2 Lambda (IaC/RemoveUntaggedEc2Stack.yml) +# ============================================================================ + +# Deploy RemoveUntaggedEc2 CloudFormation stack +deploy-stack-remove-untagged-ec2: + #!/usr/bin/env bash + set -euo pipefail + echo "Deploying RemoveUntaggedEc2Stack..." 
+ + aws cloudformation deploy \ + --stack-name remove-untagged-ec2 \ + --template-file IaC/RemoveUntaggedEc2Stack.yml \ + --capabilities CAPABILITY_NAMED_IAM \ + --parameter-overrides EksSkipPattern="pe-.*" DryRun="{{dry_run}}" \ + --region {{cleanup_lambda_region}} \ + --profile {{aws_profile}} + + echo "RemoveUntaggedEc2Stack deployed successfully" + +# Update RemoveUntaggedEc2 stack (sync from .py file) +update-stack-remove-untagged-ec2: + #!/usr/bin/env bash + set -euo pipefail + echo "Syncing cloud/aws-functions/removeUntaggedEc2.py to CloudFormation..." + echo "Manual sync required:" + echo " 1. Copy code from cloud/aws-functions/removeUntaggedEc2.py" + echo " 2. Paste into IaC/RemoveUntaggedEc2Stack.yml under Code.ZipFile" + echo " 3. Run: just deploy-stack-remove-untagged-ec2" + echo "" + echo "Or deploy directly to Lambda: just deploy-lambda-remove-untagged-ec2" + +# Deploy removeUntaggedEc2 directly to Lambda (bypass CloudFormation) +deploy-lambda-remove-untagged-ec2: + #!/usr/bin/env bash + set -euo pipefail + echo "Creating deployment package..." + cd cloud/aws-functions + rm -f removeUntaggedEc2.zip + zip -q removeUntaggedEc2.zip removeUntaggedEc2.py + + echo "Deploying to AWS Lambda..." + aws lambda update-function-code \ + --function-name removeUntaggedEc2 \ + --zip-file fileb://removeUntaggedEc2.zip \ + --region {{cleanup_lambda_region}} \ + --profile {{aws_profile}} + + echo "Lambda function deployed successfully" + rm removeUntaggedEc2.zip + +# Delete RemoveUntaggedEc2 CloudFormation stack +delete-stack-remove-untagged-ec2: + #!/usr/bin/env bash + set -euo pipefail + echo "This will delete the removeUntaggedEc2 Lambda and EventBridge rule" + read -p "Are you sure? (yes/no): " confirm + if [ "$confirm" != "yes" ]; then + echo "Aborted" + exit 1 + fi + + aws cloudformation delete-stack \ + --stack-name remove-untagged-ec2 \ + --region {{cleanup_lambda_region}} \ + --profile {{aws_profile}} + + echo "Waiting for stack deletion..." + aws cloudformation wait stack-delete-complete \ + --stack-name remove-untagged-ec2 \ + --region {{cleanup_lambda_region}} \ + --profile {{aws_profile}} + + echo "Stack deleted"
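A small verification recipe along these lines could be appended after deployment; it is a sketch rather than part of the recipe set above, and it assumes the function keeps the name removeUntaggedEc2, that its logs land in the default log group /aws/lambda/removeUntaggedEc2, and that AWS CLI v2 is available (aws logs tail is a v2 command):

# Manually invoke removeUntaggedEc2 and show recent logs (sketch; assumes default log group naming and AWS CLI v2)
invoke-remove-untagged-ec2:
    #!/usr/bin/env bash
    set -euo pipefail
    echo "Invoking removeUntaggedEc2..."
    # Synchronous invocation; the positional argument is the file that receives the function response
    aws lambda invoke \
        --function-name removeUntaggedEc2 \
        --region {{cleanup_lambda_region}} \
        --profile {{aws_profile}} \
        /tmp/removeUntaggedEc2-response.json
    echo "Response body:"
    cat /tmp/removeUntaggedEc2-response.json
    echo ""
    echo "Recent logs:"
    # Show the last 15 minutes of CloudWatch logs for the function
    aws logs tail /aws/lambda/removeUntaggedEc2 \
        --since 15m \
        --region {{cleanup_lambda_region}} \
        --profile {{aws_profile}}

Run with: just invoke-remove-untagged-ec2. The response body reports how many instances were terminated and how many EKS cluster stacks were deleted, which makes it a quick way to confirm behaviour while DryRun is still set to "true".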