From b5b1a9faf820cc7a9f975f24074abd4a5385346c Mon Sep 17 00:00:00 2001 From: Dawid Wysocki Date: Thu, 31 Jul 2025 08:24:15 +0000 Subject: [PATCH] Calculate new subsetPerZone with the exclusion of existing endpoints This change is there to fix edge cases when the number of nodes in the zone fluctuates while being nearly equal. Those fluctations cause the `pickSubsetMinRemovals` zone ordering to change - causing constant detach/attach of an endpoint. This "moving" endpoint then causes constant connection draining for ~1/25 of traffic. The fix is to keep existing endpoints as long as the node/subnet pair attached to the endpoint still exists, and only consider unused endpoints for `pickSubsetMinRemovals`. --- pkg/neg/syncers/endpoints_calculator_test.go | 235 +++++++++++++++++++ pkg/neg/syncers/subsets.go | 89 ++++++- 2 files changed, 314 insertions(+), 10 deletions(-) diff --git a/pkg/neg/syncers/endpoints_calculator_test.go b/pkg/neg/syncers/endpoints_calculator_test.go index 7210b3708d..e388bf7c04 100644 --- a/pkg/neg/syncers/endpoints_calculator_test.go +++ b/pkg/neg/syncers/endpoints_calculator_test.go @@ -618,6 +618,241 @@ func TestClusterWantedNEGsCount(t *testing.T) { } } +// TestClusterNEGsStability verifies that we limit the number of endpoints removal. +// Removing endpoint from a node causes 30 second connection draining. +// For that reason we should avoid removing endpoints for as long as possible. +func TestClusterNEGsStability(t *testing.T) { + t.Parallel() + + type stage struct { + desc string + nodes map[string][]*nodeWithSubnet + want map[negtypes.NEGLocation]negtypes.NetworkEndpointSet + } + + // For clarity of test cases we limit the size to 7 - which will still split unevenly in 3 zones + const subsetSize = 7 + + scenarios := []struct { + desc string + stages []stage + }{{ + desc: "equal nodes", + stages: []stage{{ + desc: "start", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 10), + "zone2": makeNodes(2000, 10), + "zone3": makeNodes(3000, 10), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1003"}: {}, {Node: "node1009"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2003"}: {}, {Node: "node2004"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3003"}: {}, {Node: "node3009"}: {}}, + }, + }, { + desc: "no change", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 10), + "zone2": makeNodes(2000, 10), + "zone3": makeNodes(3000, 10), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1003"}: {}, {Node: "node1009"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2003"}: {}, {Node: "node2004"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3003"}: {}, {Node: "node3009"}: {}}, + }, + }, { + desc: "increase nodes in all zones, no change expected", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 100), + "zone2": makeNodes(2000, 100), + "zone3": makeNodes(3000, 100), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1003"}: {}, {Node: "node1009"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2003"}: {}, {Node: "node2004"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3003"}: {}, {Node: "node3009"}: {}}, + }, + }, { + desc: "decrease all in all zones, no change expected", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 10), + "zone2": makeNodes(2000, 10), + "zone3": makeNodes(3000, 100), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1003"}: {}, {Node: "node1009"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2003"}: {}, {Node: "node2004"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3003"}: {}, {Node: "node3009"}: {}}, + }, + }, { + desc: "decrease, node disapear, change expected", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 5), + "zone2": makeNodes(2000, 5), + "zone3": makeNodes(3000, 5), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1002"}: {}, {Node: "node1003"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2003"}: {}, {Node: "node2004"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3002"}: {}, {Node: "node3003"}: {}}, + }, + }}, + }, { + desc: "unequal nodes", + stages: []stage{{ + desc: "start", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 10), + "zone2": makeNodes(2000, 20), + "zone3": makeNodes(3000, 10), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1003"}: {}, {Node: "node1009"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2011"}: {}, {Node: "node2017"}: {}, {Node: "node2019"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3003"}: {}}, + }, + }, { + desc: "no change", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 10), + "zone2": makeNodes(2000, 20), + "zone3": makeNodes(3000, 10), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1003"}: {}, {Node: "node1009"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2011"}: {}, {Node: "node2017"}: {}, {Node: "node2019"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3003"}: {}}, + }, + }, { + desc: "make zone1 largest, no change expected", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 30), + "zone2": makeNodes(2000, 20), + "zone3": makeNodes(3000, 10), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1003"}: {}, {Node: "node1009"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2011"}: {}, {Node: "node2017"}: {}, {Node: "node2019"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3003"}: {}}, + }, + }, { + desc: "shrink zone1, move endpoints from zone1", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 1), + "zone2": makeNodes(2000, 20), + "zone3": makeNodes(3000, 10), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {}, + {Zone: "zone2", Subnet: "default"}: { + {Node: "node2004"}: {}, // New + {Node: "node2011"}: {}, + {Node: "node2017"}: {}, + {Node: "node2019"}: {}, + }, + {Zone: "zone3", Subnet: "default"}: { + {Node: "node3000"}: {}, + {Node: "node3003"}: {}, + {Node: "node3009"}: {}, // New + }, + }, + }}, + }, { + desc: "zones flickering under subsetlimit", + stages: []stage{{ + desc: "start", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 2), + "zone2": makeNodes(2000, 2), + "zone3": makeNodes(3000, 2), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1000"}: {}, {Node: "node1001"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2000"}: {}, {Node: "node2001"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3001"}: {}}, + }, + }, { + desc: "zone1 bump", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 3), + "zone2": makeNodes(2000, 2), + "zone3": makeNodes(3000, 2), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1000"}: {}, {Node: "node1001"}: {}, {Node: "node1002"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2000"}: {}, {Node: "node2001"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3001"}: {}}, + }, + }, { + desc: "zone2 bump", + nodes: map[string][]*nodeWithSubnet{ + "zone1": makeNodes(1000, 2), + "zone2": makeNodes(2000, 3), + "zone3": makeNodes(3000, 2), + }, + want: map[negtypes.NEGLocation]negtypes.NetworkEndpointSet{ + {Zone: "zone1", Subnet: "default"}: {{Node: "node1000"}: {}, {Node: "node1001"}: {}}, + {Zone: "zone2", Subnet: "default"}: {{Node: "node2000"}: {}, {Node: "node2001"}: {}, {Node: "node2002"}: {}}, + {Zone: "zone3", Subnet: "default"}: {{Node: "node3000"}: {}, {Node: "node3001"}: {}}, + }, + }}, + }} + + for _, tc := range scenarios { + t.Run(tc.desc, func(t *testing.T) { + t.Parallel() + + // Arrange + nodeInformer := zonegetter.FakeNodeInformer() + zoneGetter, err := zonegetter.NewFakeZoneGetter(nodeInformer, nodeInformer, defaultTestSubnetURL, false) + if err != nil { + t.Fatalf("failed to initialize zone getter: %v", err) + } + defaultNetwork := network.NetworkInfo{IsDefault: true, K8sNetwork: "default", SubnetworkURL: defaultTestSubnetURL} + + // We use ILB so that it doesn't trigger code that linearly calculates number of NEGs needed + // based on actual number of Pods. + c := NewClusterL4EndpointsCalculator( + listers.NewNodeLister(nodeInformer.GetIndexer()), + zoneGetter, "svc", klog.TODO(), &defaultNetwork, negtypes.L4InternalLB, + ) + c.subsetSizeLimit = subsetSize + + var endpointsMap map[negtypes.NEGLocation]negtypes.NetworkEndpointSet + for _, stage := range tc.stages { + t.Run(stage.desc, func(t *testing.T) { + // Arrange + // Set up nodes + for zone, nodes := range stage.nodes { + zonegetter.DeleteFakeNodesInZone(t, zone, zoneGetter) + names := make([]string, 0, len(nodes)) + for _, node := range nodes { + names = append(names, node.node.Name) + } + if err := zonegetter.AddFakeNodes(zoneGetter, zone, names...); err != nil { + t.Fatalf("failed to add fake node: %v", err) + } + } + + // Act + res, _, _, err := c.CalculateEndpoints(nil, endpointsMap) + if err != nil { + t.Fatalf("expected no err, got %v", err) + } + + // Assert + if diff := cmp.Diff(stage.want, res); diff != "" { + t.Errorf("want != got, -want +got:\n%s", diff) + } + endpointsMap = stage.want + }) + } + }) + } +} + func TestValidateEndpoints(t *testing.T) { testPortName := "" emptyNamedPort := "" diff --git a/pkg/neg/syncers/subsets.go b/pkg/neg/syncers/subsets.go index 890fbcf3a2..af737f62b7 100644 --- a/pkg/neg/syncers/subsets.go +++ b/pkg/neg/syncers/subsets.go @@ -177,22 +177,24 @@ func newNodeWithSubnet(node *v1.Node, subnet string) *nodeWithSubnet { // getSubsetPerZone creates a subset of nodes from the given list of nodes, for each zone provided. // The output is a map of zone string to NEG subset. // In order to pick as many nodes as possible given the total limit, the following algorithm is used: -// 1) The zones are sorted in increasing order of the total number of nodes. -// 2) The number of nodes to be selected is divided equally among the zones. If there are 4 zones and the limit is 250, +// A) Leave existing endpoints where they are. Detaching endpoints is time consuming and causes connection draining. We want to avoid that. +// B) For the rest of endpoints following algorithm is used: // -// the algorithm attempts to pick 250/4 from the first zone. If 'n' nodes were selected from zone1, the limit for -// zone2 is (250 - n)/3. For the third zone, it is (250 - n - m)/2, if m nodes were picked from zone2. -// Since the number of nodes will keep increasing in successive zones due to the sorting, even if fewer nodes were -// present in some zones, more nodes will be picked from other nodes, taking the total subset size to the given limit -// whenever possible. +// 1. The zones are sorted in increasing order of the total number of nodes. +// +// 2. The number of nodes to be selected is divided equally among the zones. If there are 4 zones and the limit is 250, +// +// the algorithm attempts to pick 250/4 from the first zone. If 'n' nodes were selected from zone1, the limit for +// zone2 is (250 - n)/3. For the third zone, it is (250 - n - m)/2, if m nodes were picked from zone2. +// Since the number of nodes will keep increasing in successive zones due to the sorting, even if fewer nodes were +// present in some zones, more nodes will be picked from other nodes, taking the total subset size to the given limit +// whenever possible. func getSubsetPerZone(nodesPerZone map[string][]*nodeWithSubnet, totalLimit int, svcID string, currentMap map[negtypes.NEGLocation]negtypes.NetworkEndpointSet, logger klog.Logger, networkInfo *network.NetworkInfo) (map[negtypes.NEGLocation]negtypes.NetworkEndpointSet, error) { result := make(map[negtypes.NEGLocation]negtypes.NetworkEndpointSet) subsetSize := 0 // initialize zonesRemaining to the total number of zones. zonesRemaining := len(nodesPerZone) - // Sort zones in increasing order of node count. - zoneList := sortZones(nodesPerZone) defaultSubnet, err := utils.KeyName(networkInfo.SubnetworkURL) if err != nil { @@ -200,9 +202,19 @@ func getSubsetPerZone(nodesPerZone map[string][]*nodeWithSubnet, totalLimit int, return nil, err } + // Remove nodes that are already in use in currentMap + totalLimit, nodesPerZone = pickOutUsedEndpoints(currentMap, nodesPerZone, totalLimit, result) + + // Sort zones in increasing order of the remaining node count. + zoneList := sortZones(nodesPerZone) + for _, zone := range zoneList { // make sure there is an entry for the defaultSubnet in each zone, even if there will be no endpoints in there (maintains the old behavior). - result[negtypes.NEGLocation{Zone: zone.Name, Subnet: defaultSubnet}] = negtypes.NewNetworkEndpointSet() + defaultSubnetLocation := negtypes.NEGLocation{Zone: zone.Name, Subnet: defaultSubnet} + if _, ok := result[defaultSubnetLocation]; !ok { + result[defaultSubnetLocation] = negtypes.NewNetworkEndpointSet() + } + // split the limit across the leftover zones. subsetSize = totalLimit / zonesRemaining logger.Info("Picking subset for a zone", "subsetSize", subsetSize, "zone", zone, "svcID", svcID) @@ -230,6 +242,63 @@ func getSubsetPerZone(nodesPerZone map[string][]*nodeWithSubnet, totalLimit int, return result, nil } +// Removes nodes that are already used in the currentMap. +// +// Adds endpoints to the result in place. +// Returns totalLimit left and nodesPerZone after removal. +func pickOutUsedEndpoints(currentMap map[negtypes.NEGLocation]negtypes.NetworkEndpointSet, nodesPerZone map[string][]*nodeWithSubnet, totalLimit int, result map[negtypes.NEGLocation]negtypes.NetworkEndpointSet) (int, map[string][]*nodeWithSubnet) { + // We can use map to have O(1) find and delete + m := nodesToMap(nodesPerZone) + + for location, endpoints := range currentMap { + for endpoint := range endpoints { + key := nameSubnetKey{endpoint.Node, location.Subnet} + if _, ok := m[location.Zone][key]; !ok { + continue + } + + delete(m[location.Zone], key) + + if _, ok := result[location]; !ok { + result[location] = negtypes.NewNetworkEndpointSet() + } + result[location].Insert(endpoint) + totalLimit-- + } + } + + return totalLimit, mapToNodes(m) +} + +type nameSubnetKey struct { + name string + subnet string +} + +func nodesToMap(nodesPerZone map[string][]*nodeWithSubnet) map[string]map[nameSubnetKey]*nodeWithSubnet { + m := make(map[string]map[nameSubnetKey]*nodeWithSubnet) + for zone, nodes := range nodesPerZone { + m[zone] = make(map[nameSubnetKey]*nodeWithSubnet) + for _, node := range nodes { + m[zone][nameSubnetKey{node.node.Name, node.subnet}] = node + } + } + return m +} + +func mapToNodes(m map[string]map[nameSubnetKey]*nodeWithSubnet) map[string][]*nodeWithSubnet { + nodesPerZone := make(map[string][]*nodeWithSubnet) + for zone, nodes := range m { + // We NEED to have at least an empty slice for each zone + // as some code depends on this behavior. + nodesPerZone[zone] = make([]*nodeWithSubnet, 0) + for _, node := range nodes { + nodesPerZone[zone] = append(nodesPerZone[zone], node) + } + } + return nodesPerZone +} + // getNetworkEndpointsForZone gets all endpoints for a matching zone. // it will get all nodes in the zone no matter which subnet the nodes are in. func getNetworkEndpointsForZone(zone string, currentMap map[negtypes.NEGLocation]negtypes.NetworkEndpointSet) []negtypes.NetworkEndpoint {