Skip to content

Commit 79cd828

Browse files
committed
fix/byosg: remove managed SG on BYOSG scenario for CLB
Fix the "managed" (controller-owned) security group leak when user provided security group is added to an existing Service type-loadBalancer CLB.
1 parent e955e76 commit 79cd828

File tree

3 files changed

+187
-25
lines changed

3 files changed

+187
-25
lines changed

pkg/providers/v1/aws.go

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2734,19 +2734,9 @@ func (c *Cloud) updateInstanceSecurityGroupsForLoadBalancer(ctx context.Context,
27342734
loadBalancerSecurityGroupID := lbSecurityGroupIDs[0]
27352735

27362736
// Get the actual list of groups that allow ingress from the load-balancer
2737-
actualGroups := make(map[*ec2types.SecurityGroup]bool)
2738-
{
2739-
describeRequest := &ec2.DescribeSecurityGroupsInput{}
2740-
describeRequest.Filters = []ec2types.Filter{
2741-
newEc2Filter("ip-permission.group-id", loadBalancerSecurityGroupID),
2742-
}
2743-
response, err := c.ec2.DescribeSecurityGroups(ctx, describeRequest)
2744-
if err != nil {
2745-
return fmt.Errorf("error querying security groups for ELB: %q", err)
2746-
}
2747-
for _, sg := range response {
2748-
actualGroups[&sg] = c.tagging.hasClusterTag(sg.Tags)
2749-
}
2737+
actualGroups, _, err := c.buildSecurityGroupRuleReferences(ctx, loadBalancerSecurityGroupID)
2738+
if err != nil {
2739+
return fmt.Errorf("error building security group rule references: %w", err)
27502740
}
27512741

27522742
// Open the firewall from the load balancer to the instance

pkg/providers/v1/aws_loadbalancer.go

Lines changed: 153 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"time"
3030

3131
"github.com/aws/aws-sdk-go-v2/aws"
32+
"github.com/aws/aws-sdk-go-v2/service/ec2"
3233
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
3334

3435
elb "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing"
@@ -1082,24 +1083,24 @@ func (c *Cloud) ensureLoadBalancer(ctx context.Context, namespacedName types.Nam
10821083

10831084
{
10841085
// Sync security groups
1085-
expected := sets.New[string](securityGroupIDs...)
1086-
actual := stringSetFromList(loadBalancer.SecurityGroups)
1086+
expected := sets.New(securityGroupIDs...)
1087+
actual := sets.New(loadBalancer.SecurityGroups...)
10871088

10881089
if !expected.Equal(actual) {
10891090
// This call just replaces the security groups, unlike e.g. subnets (!)
1090-
request := &elb.ApplySecurityGroupsToLoadBalancerInput{}
1091-
request.LoadBalancerName = aws.String(loadBalancerName)
1092-
if securityGroupIDs == nil {
1093-
request.SecurityGroups = nil
1094-
} else {
1095-
request.SecurityGroups = securityGroupIDs
1096-
}
1097-
klog.V(2).Info("Applying updated security groups to load balancer")
1098-
_, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, request)
1099-
if err != nil {
1091+
klog.V(2).Infof("Applying updated security groups to load balancer %q", loadBalancerName)
1092+
if _, err := c.elb.ApplySecurityGroupsToLoadBalancer(ctx, &elb.ApplySecurityGroupsToLoadBalancerInput{
1093+
LoadBalancerName: aws.String(loadBalancerName),
1094+
SecurityGroups: securityGroupIDs,
1095+
}); err != nil {
11001096
return nil, fmt.Errorf("error applying AWS loadbalancer security groups: %q", err)
11011097
}
11021098
dirty = true
1099+
1100+
// Ensure the replaced security groups are removed from AWS when owned by the controller.
1101+
if errs := c.removeOwnedSecurityGroups(ctx, loadBalancerName, actual.UnsortedList()); len(errs) > 0 {
1102+
return nil, fmt.Errorf("error removing owned security groups: %v", errs)
1103+
}
11031104
}
11041105
}
11051106

@@ -1704,3 +1705,143 @@ func ValidateHealthCheck(s *elbtypes.HealthCheck) error {
17041705

17051706
return nil
17061707
}
1708+
1709+
// isOwnedSecurityGroup checks if the security group is owned by the controller
1710+
// by checking if the security group has the cluster ownership tag
1711+
// (kubernetes.io/cluster/<clusterID>=owned).
1712+
//
1713+
// Parameters:
1714+
// - `ctx`: The context for the operation.
1715+
// - `securityGroupID`: The ID of the security group to check.
1716+
//
1717+
// Returns:
1718+
// - `bool`: True if the security group is owned by the controller, false otherwise.
1719+
// - `error`: An error if the security group cannot be retrieved, is not found,
1720+
// or if multiple security groups are found with the same ID (unexpected).
1721+
func (c *Cloud) isOwnedSecurityGroup(ctx context.Context, securityGroupID string) (bool, error) {
1722+
groups, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
1723+
GroupIds: []string{securityGroupID},
1724+
})
1725+
if err != nil {
1726+
return false, fmt.Errorf("error retrieving security group %q: %w", securityGroupID, err)
1727+
}
1728+
if len(groups) == 0 {
1729+
return false, fmt.Errorf("security group %q not found", securityGroupID)
1730+
}
1731+
if len(groups) != 1 {
1732+
// This should not be possible - ids should be unique
1733+
return false, fmt.Errorf("[BUG] multiple security groups(%d) found with same id %q", len(groups), securityGroupID)
1734+
}
1735+
return c.tagging.hasClusterTagOwned(groups[0].Tags)
1736+
}
1737+
1738+
// buildSecurityGroupRuleReferences finds all security groups that have ingress rules
1739+
// referencing the specified security group ID, and categorizes them based on cluster tagging.
1740+
// This is used to identify dependencies before removing a security group.
1741+
//
1742+
// Parameters:
1743+
// - ctx: The context for the request.
1744+
// - sgID: The ID of the security group to find references for.
1745+
//
1746+
// Returns:
1747+
// - map[*ec2types.SecurityGroup]bool: All security groups with ingress rules referencing sgID, mapped to their cluster tag status (true/false).
1748+
// - map[*ec2types.SecurityGroup]IPPermissionSet: Only cluster-tagged security groups mapped to their ingress rules that reference sgID.
1749+
// - error: An error if the AWS DescribeSecurityGroups API call fails.
1750+
func (c *Cloud) buildSecurityGroupRuleReferences(ctx context.Context, sgID string) (map[*ec2types.SecurityGroup]bool, map[*ec2types.SecurityGroup]IPPermissionSet, error) {
1751+
groupsHasTags := make(map[*ec2types.SecurityGroup]bool)
1752+
groupsLinkedPermissions := make(map[*ec2types.SecurityGroup]IPPermissionSet)
1753+
sgsOut, err := c.ec2.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
1754+
Filters: []ec2types.Filter{
1755+
newEc2Filter("ip-permission.group-id", sgID),
1756+
},
1757+
})
1758+
if err != nil {
1759+
return groupsHasTags, groupsLinkedPermissions, fmt.Errorf("error querying security groups for ELB: %q", err)
1760+
}
1761+
1762+
for _, sg := range sgsOut {
1763+
groupsHasTags[&sg] = c.tagging.hasClusterTag(sg.Tags)
1764+
1765+
groupsLinkedPermissions[&sg] = NewIPPermissionSet()
1766+
for _, rule := range sg.IpPermissions {
1767+
if rule.UserIdGroupPairs != nil {
1768+
for _, pair := range rule.UserIdGroupPairs {
1769+
if pair.GroupId != nil && aws.ToString(pair.GroupId) == sgID {
1770+
groupsLinkedPermissions[&sg].Insert(rule)
1771+
}
1772+
}
1773+
}
1774+
}
1775+
1776+
}
1777+
return groupsHasTags, groupsLinkedPermissions, nil
1778+
}
1779+
1780+
// removeOwnedSecurityGroups removes the CLB owned/managed security groups from AWS.
1781+
// It revokes ingress rules that reference the security groups to be removed,
1782+
// then deletes the security groups that are owned by the controller.
1783+
// This is used when updating load balancer security groups to clean up orphaned ones.
1784+
//
1785+
// Parameters:
1786+
// - `ctx`: The context for the operation.
1787+
// - `loadBalancerName`: The name of the load balancer (used for logging and deletion operations).
1788+
// - `securityGroups`: The list of security group IDs to process for removal.
1789+
//
1790+
// Returns:
1791+
// - `[]error`: Collection of all errors encountered during the removal process.
1792+
func (c *Cloud) removeOwnedSecurityGroups(ctx context.Context, loadBalancerName string, securityGroups []string) []error {
1793+
allErrs := []error{}
1794+
sgMap := make(map[string]struct{})
1795+
1796+
// Validate each security group references building a reading list to be deleted.
1797+
for _, sg := range securityGroups {
1798+
isOwned, err := c.isOwnedSecurityGroup(ctx, sg)
1799+
if err != nil {
1800+
allErrs = append(allErrs, fmt.Errorf("unable to validate if security group %q is owned by the controller: %w", sg, err))
1801+
continue
1802+
}
1803+
1804+
groupsWithClusterTag, groupsLinkedPermissions, err := c.buildSecurityGroupRuleReferences(ctx, sg)
1805+
if err != nil {
1806+
allErrs = append(allErrs, fmt.Errorf("error building security group rule references for %q: %w", sg, err))
1807+
continue
1808+
}
1809+
1810+
// Revoke ingress rules referencing the security group to be deleted
1811+
// from cluster-tagged security groups, when the referenced security
1812+
// group has no cluster tag, skip the revoke assuming it is user-managed.
1813+
for sgTarget, sgPerms := range groupsLinkedPermissions {
1814+
if !groupsWithClusterTag[sgTarget] {
1815+
klog.Warningf("security group %q has no cluster tag, skipping remove lifecycle after update", sg)
1816+
continue
1817+
}
1818+
1819+
klog.Infof("revoking security group ingress references of %q from %q", sg, aws.ToString(sgTarget.GroupId))
1820+
if _, err := c.ec2.RevokeSecurityGroupIngress(ctx, &ec2.RevokeSecurityGroupIngressInput{
1821+
GroupId: sgTarget.GroupId,
1822+
IpPermissions: sgPerms.List(),
1823+
}); err != nil {
1824+
allErrs = append(allErrs, fmt.Errorf("error revoking security group ingress rules from %q: %w", aws.ToString(sgTarget.GroupId), err))
1825+
continue
1826+
}
1827+
}
1828+
1829+
// Skip security group removal when the security group is not owned by the controller.
1830+
if !isOwned {
1831+
klog.Warningf("security group %q is not owned by the controller, skipping remove lifecycle after update", sg)
1832+
continue
1833+
}
1834+
1835+
klog.Infof("making loadbalancer owned security group %q ready for deletion", sg)
1836+
sgMap[sg] = struct{}{}
1837+
}
1838+
if len(sgMap) == 0 {
1839+
return allErrs
1840+
}
1841+
1842+
if err := c.deleteSecurityGroupsWithBackoff(ctx, loadBalancerName, sgMap); err != nil {
1843+
return append(allErrs, fmt.Errorf("error deleting security groups %v: %v", sgMap, err))
1844+
}
1845+
klog.Infof("loadbalancer owned security groups deleted!")
1846+
return nil
1847+
}

pkg/providers/v1/tags.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,37 @@ func (t *awsTagging) hasClusterTag(tags []ec2types.Tag) bool {
154154
return false
155155
}
156156

157+
// hasClusterTagOwned checks if the resource has the cluster tag with the value "owned"
158+
// This is used to determine if the resource is owned by the controller.
159+
// It checks both legacy and new tags.
160+
//
161+
// Returns:
162+
// - bool: true if the resource has the cluster tag with the value "owned",
163+
// otherwise false.
164+
// - error: nil if the resource has the cluster tag with a valid value,
165+
// otherwise error with the reason
166+
func (t *awsTagging) hasClusterTagOwned(tags []ec2types.Tag) (bool, error) {
167+
if len(t.ClusterID) == 0 {
168+
return false, fmt.Errorf("cannot check cluster tag owned: clusterID is empty")
169+
}
170+
clusterTagKey := t.clusterTagKey()
171+
for _, tag := range tags {
172+
tagKey := aws.ToString(tag.Key)
173+
tagValue := aws.ToString(tag.Value)
174+
175+
// For legacy tags, the cluster ID is the value, not "owned"
176+
if (tagKey == TagNameKubernetesClusterLegacy) && (tagValue == t.ClusterID) {
177+
return true, nil
178+
}
179+
180+
// For new tags, check if it's the cluster tag with "owned" value
181+
if tagKey == clusterTagKey && tagValue == string(ResourceLifecycleOwned) {
182+
return true, nil
183+
}
184+
}
185+
return false, nil
186+
}
187+
157188
func (t *awsTagging) hasNoClusterPrefixTag(tags []ec2types.Tag) bool {
158189
for _, tag := range tags {
159190
if strings.HasPrefix(aws.ToString(tag.Key), TagNameKubernetesClusterPrefix) {

0 commit comments

Comments
 (0)