diff --git a/test/integration/lrp/lrp_fqdn_test.go b/test/integration/lrp/lrp_fqdn_test.go index 93bca2439b..18ddc1caba 100644 --- a/test/integration/lrp/lrp_fqdn_test.go +++ b/test/integration/lrp/lrp_fqdn_test.go @@ -102,7 +102,7 @@ func TestLRPFQDN(t *testing.T) { for _, tt := range tests { tt := tt t.Run(tt.name, func(t *testing.T) { - testLRPCase(t, ctx, *selectedPod, tt.command, tt.expectedMsgContains, tt.expectedErrMsgContains, tt.shouldError, tt.countIncreases) + testLRPCase(t, ctx, *selectedPod, tt.command, tt.expectedMsgContains, tt.expectedErrMsgContains, tt.shouldError, tt.countIncreases, getPrometheusAddress(initialPrometheusPort)) }) } } diff --git a/test/integration/lrp/lrp_test.go b/test/integration/lrp/lrp_test.go index 59fd974114..9d473f9188 100644 --- a/test/integration/lrp/lrp_test.go +++ b/test/integration/lrp/lrp_test.go @@ -4,6 +4,7 @@ package lrp import ( "context" + "fmt" "os" "strings" "testing" @@ -13,11 +14,16 @@ import ( "github.com/Azure/azure-container-networking/test/integration/prometheus" "github.com/Azure/azure-container-networking/test/internal/kubernetes" "github.com/Azure/azure-container-networking/test/internal/retry" + ciliumv2 "github.com/cilium/cilium/pkg/k8s/apis/cilium.io/v2" ciliumClientset "github.com/cilium/cilium/pkg/k8s/client/clientset/versioned" "github.com/pkg/errors" "github.com/stretchr/testify/require" "golang.org/x/exp/rand" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sclient "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "sigs.k8s.io/yaml" ) const ( @@ -28,11 +34,13 @@ const ( dnsService = "kube-dns" retryAttempts = 10 retryDelay = 5 * time.Second - promAddress = "http://localhost:9253/metrics" nodeLocalDNSLabelSelector = "k8s-app=node-local-dns" clientLabelSelector = "lrp-test=true" coreDNSRequestCountTotal = "coredns_dns_request_count_total" clientContainer = "no-op" + // Port constants for prometheus endpoints + initialPrometheusPort = 9253 + recreatedPrometheusPort = 9254 ) var ( @@ -47,6 +55,11 @@ var ( clientPath = ciliumManifestsDir + "client-ds.yaml" ) +// getPrometheusAddress returns the prometheus metrics URL for the given port +func getPrometheusAddress(port int) string { + return fmt.Sprintf("http://localhost:%d/metrics", port) +} + func setupLRP(t *testing.T, ctx context.Context) (*corev1.Pod, func()) { var cleanUpFns []func() success := false @@ -132,8 +145,8 @@ func setupLRP(t *testing.T, ctx context.Context) (*corev1.Pod, func()) { pf, err := k8s.NewPortForwarder(config, k8s.PortForwardingOpts{ Namespace: nodeLocalDNSDS.Namespace, PodName: selectedLocalDNSPod, - LocalPort: 9253, - DestPort: 9253, + LocalPort: initialPrometheusPort, + DestPort: initialPrometheusPort, }) require.NoError(t, err) pctx := context.Background() @@ -154,7 +167,7 @@ func setupLRP(t *testing.T, ctx context.Context) (*corev1.Pod, func()) { } func testLRPCase(t *testing.T, ctx context.Context, clientPod corev1.Pod, clientCmd []string, expectResponse, expectErrMsg string, - shouldError, countShouldIncrease bool) { + shouldError, countShouldIncrease bool, prometheusAddress string) { config := kubernetes.MustGetRestConfig() cs := kubernetes.MustGetClientset() @@ -167,9 +180,11 @@ func testLRPCase(t *testing.T, ctx context.Context, clientPod corev1.Pod, client "zone": ".", } - // curl localhost:9253/metrics - beforeMetric, err := prometheus.GetMetric(promAddress, coreDNSRequestCountTotal, metricLabels) + // curl to the specified prometheus address + beforeMetric, err := prometheus.GetMetric(prometheusAddress, coreDNSRequestCountTotal, metricLabels) require.NoError(t, err) + beforeValue := beforeMetric.GetCounter().GetValue() + t.Logf("Before DNS request - metric count: %.0f", beforeValue) t.Log("calling command from client") @@ -187,13 +202,15 @@ func testLRPCase(t *testing.T, ctx context.Context, clientPod corev1.Pod, client time.Sleep(500 * time.Millisecond) // curl again and see count diff - afterMetric, err := prometheus.GetMetric(promAddress, coreDNSRequestCountTotal, metricLabels) + afterMetric, err := prometheus.GetMetric(prometheusAddress, coreDNSRequestCountTotal, metricLabels) require.NoError(t, err) + afterValue := afterMetric.GetCounter().GetValue() + t.Logf("After DNS request - metric count: %.0f (diff: %.0f)", afterValue, afterValue-beforeValue) if countShouldIncrease { - require.Greater(t, afterMetric.GetCounter().GetValue(), beforeMetric.GetCounter().GetValue(), "dns metric count did not increase after command") + require.Greater(t, afterValue, beforeValue, "dns metric count did not increase after command - before: %.0f, after: %.0f", beforeValue, afterValue) } else { - require.Equal(t, afterMetric.GetCounter().GetValue(), beforeMetric.GetCounter().GetValue(), "dns metric count increased after command") + require.Equal(t, afterValue, beforeValue, "dns metric count increased after command - before: %.0f, after: %.0f", beforeValue, afterValue) } } @@ -210,9 +227,282 @@ func TestLRP(t *testing.T) { defer cleanupFn() require.NotNil(t, selectedPod) + // Get the kube-dns service IP for DNS requests + cs := kubernetes.MustGetClientset() + svc, err := kubernetes.GetService(ctx, cs, kubeSystemNamespace, dnsService) + require.NoError(t, err) + kubeDNS := svc.Spec.ClusterIP + + t.Logf("LRP Test Starting...") + + // Basic LRP test - using initial port from setupLRP testLRPCase(t, ctx, *selectedPod, []string{ - "nslookup", "google.com", "10.0.0.10", - }, "", "", false, true) + "nslookup", "google.com", kubeDNS, + }, "", "", false, true, getPrometheusAddress(initialPrometheusPort)) + + t.Logf("LRP Test Completed") + + t.Logf("LRP Lifecycle Test Starting") + + // Run LRP Lifecycle test + testLRPLifecycle(t, ctx, *selectedPod, kubeDNS) + + t.Logf("LRP Lifecycle Test Completed") +} + +// testLRPLifecycle performs testing of Local Redirect Policy functionality +// including pod restarts, resource recreation, and cilium command validation +func testLRPLifecycle(t *testing.T, ctx context.Context, clientPod corev1.Pod, kubeDNS string) { + config := kubernetes.MustGetRestConfig() + cs := kubernetes.MustGetClientset() + + + // Step 1: Validate LRP using cilium commands + t.Log("Step 1: Validating LRP using cilium commands") + validateCiliumLRP(t, ctx, cs, config) + + // Step 2: Restart busybox pods and verify LRP still works + t.Log("Step 2: Restarting client pods to test persistence") + restartedPod := restartClientPodsAndGetPod(t, ctx, cs, clientPod) + + // Step 3: Verify metrics after restart + t.Log("Step 3: Verifying LRP functionality after pod restart") + testLRPCase(t, ctx, restartedPod, []string{ + "nslookup", "google.com", kubeDNS, + }, "", "", false, true, getPrometheusAddress(initialPrometheusPort)) + + // Step 4: Validate cilium commands still show LRP + t.Log("Step 4: Re-validating cilium LRP after restart") + validateCiliumLRP(t, ctx, cs, config) + + // Step 5: Delete and recreate resources & restart nodelocaldns daemonset + t.Log("Step 5: Testing resource deletion and recreation") + recreatedPod := deleteAndRecreateResources(t, ctx, cs, clientPod) + + // Step 6: Re-establish port forward to new node-local-dns pod and validate metrics + t.Log("Step 6: Re-establishing port forward to new node-local-dns pod for metrics validation") + + // Get the new node-local-dns pod on the same node as our recreated client pod + nodeName := recreatedPod.Spec.NodeName + newNodeLocalDNSPods, err := kubernetes.GetPodsByNode(ctx, cs, kubeSystemNamespace, nodeLocalDNSLabelSelector, nodeName) + require.NoError(t, err) + require.NotEmpty(t, newNodeLocalDNSPods.Items, "No node-local-dns pod found on node %s after restart", nodeName) + + newNodeLocalDNSPod := TakeOne(newNodeLocalDNSPods.Items) + t.Logf("Setting up port forward to new node-local-dns pod: %s", newNodeLocalDNSPod.Name) + + // Setup new port forward to the new node-local-dns pod + newPf, err := k8s.NewPortForwarder(config, k8s.PortForwardingOpts{ + Namespace: newNodeLocalDNSPod.Namespace, + PodName: newNodeLocalDNSPod.Name, + LocalPort: recreatedPrometheusPort, // Use different port to avoid conflicts + DestPort: initialPrometheusPort, + }) + require.NoError(t, err) + + newPortForwardCtx, newCancel := context.WithTimeout(ctx, (retryAttempts+1)*retryDelay) + defer newCancel() + + err = defaultRetrier.Do(newPortForwardCtx, func() error { + t.Logf("attempting port forward to new node-local-dns pod %s...", newNodeLocalDNSPod.Name) + return errors.Wrap(newPf.Forward(newPortForwardCtx), "could not start port forward to new pod") + }) + require.NoError(t, err, "could not start port forward to new node-local-dns pod") + defer newPf.Stop() + + t.Log("Port forward to new node-local-dns pod established") + + // Use testLRPCase function with the new prometheus address + t.Log("Validating metrics with new node-local-dns pod") + testLRPCase(t, ctx, recreatedPod, []string{ + "nslookup", "github.com", kubeDNS, + }, "", "", false, true, getPrometheusAddress(recreatedPrometheusPort)) + + t.Logf("SUCCESS: Metrics validation passed - traffic is being redirected to new node-local-dns pod %s", newNodeLocalDNSPod.Name) + + // Step 7: Final cilium validation after node-local-dns restart + t.Log("Step 7: Final cilium validation - ensuring LRP is still active after node-local-dns restart") + validateCiliumLRP(t, ctx, cs, config) + +} + +// validateCiliumLRP checks that LRP is properly configured in cilium +func validateCiliumLRP(t *testing.T, ctx context.Context, cs *k8sclient.Clientset, config *rest.Config) { + ciliumPods, err := cs.CoreV1().Pods(kubeSystemNamespace).List(ctx, metav1.ListOptions{ + LabelSelector: "k8s-app=cilium", + }) + require.NoError(t, err) + require.NotEmpty(t, ciliumPods.Items) + ciliumPod := TakeOne(ciliumPods.Items) + + // Get Kubernetes version to determine validation approach + serverVersion, err := cs.Discovery().ServerVersion() + require.NoError(t, err) + t.Logf("Detected Kubernetes version: %s", serverVersion.String()) + + // Get kube-dns service IP for validation + svc, err := kubernetes.GetService(ctx, cs, kubeSystemNamespace, dnsService) + require.NoError(t, err) + kubeDNSIP := svc.Spec.ClusterIP + + // IMPORTANT: Get node-local-dns pod IP on the SAME node as the cilium pod we're using + selectedNode := ciliumPod.Spec.NodeName + t.Logf("Using cilium pod %s on node %s for validation", ciliumPod.Name, selectedNode) + + // Get node-local-dns pod specifically on the same node as our cilium pod + nodeLocalDNSPods, err := kubernetes.GetPodsByNode(ctx, cs, kubeSystemNamespace, nodeLocalDNSLabelSelector, selectedNode) + require.NoError(t, err) + require.NotEmpty(t, nodeLocalDNSPods.Items, "No node-local-dns pod found on node %s", selectedNode) + + // Use the first (and should be only) node-local-dns pod on this node + nodeLocalDNSPod := nodeLocalDNSPods.Items[0] + nodeLocalDNSIP := nodeLocalDNSPod.Status.PodIP + require.NotEmpty(t, nodeLocalDNSIP, "node-local-dns pod %s has no IP address", nodeLocalDNSPod.Name) + + t.Logf("Validating LRP: kubeDNS IP=%s, nodeLocalDNS IP=%s (pod: %s), node=%s", + kubeDNSIP, nodeLocalDNSIP, nodeLocalDNSPod.Name, selectedNode) + + // Check cilium lrp list + lrpListCmd := []string{"cilium", "lrp", "list"} + lrpOutput, _, err := kubernetes.ExecCmdOnPod(ctx, cs, ciliumPod.Namespace, ciliumPod.Name, "cilium-agent", lrpListCmd, config, false) + require.NoError(t, err) + + // Validate the LRP output structure more thoroughly + lrpOutputStr := string(lrpOutput) + require.Contains(t, lrpOutputStr, "nodelocaldns", "LRP not found in cilium lrp list") + + // Parse LRP list output to validate structure + lrpLines := strings.Split(lrpOutputStr, "\n") + nodelocaldnsFound := false + + for _, line := range lrpLines { + line = strings.TrimSpace(line) + if strings.Contains(line, "nodelocaldns") && strings.Contains(line, "kube-system") { + // Validate that the line contains expected components + require.Contains(t, line, "kube-dns", "LRP line should reference kube-dns service") + nodelocaldnsFound = true + t.Logf("Found nodelocaldns LRP entry: %s", line) + break + } + } + + require.True(t, nodelocaldnsFound, "nodelocaldns LRP entry not found with expected structure in output: %s", lrpOutputStr) + + // Check cilium service list for localredirect + serviceListCmd := []string{"cilium", "service", "list"} + serviceOutput, _, err := kubernetes.ExecCmdOnPod(ctx, cs, ciliumPod.Namespace, ciliumPod.Name, "cilium-agent", serviceListCmd, config, false) + require.NoError(t, err) + require.Contains(t, string(serviceOutput), "LocalRedirect", "LocalRedirect not found in cilium service list") + + // Validate LocalRedirect entries + serviceLines := strings.Split(string(serviceOutput), "\n") + tcpFound := false + udpFound := false + legacyFound := false + + for _, line := range serviceLines { + if strings.Contains(line, "LocalRedirect") && strings.Contains(line, kubeDNSIP) { + // Check if this line contains the expected frontend (kube-dns) and backend (node-local-dns) IPs + if strings.Contains(line, nodeLocalDNSIP) { + // Check for both modern format (with /TCP or /UDP) and legacy format (without protocol) + if strings.Contains(line, "/TCP") { + tcpFound = true + t.Logf("Found TCP LocalRedirect: %s", strings.TrimSpace(line)) + } else if strings.Contains(line, "/UDP") { + udpFound = true + t.Logf("Found UDP LocalRedirect: %s", strings.TrimSpace(line)) + } else { + legacyFound = true + t.Logf("Found legacy LocalRedirect: %s", strings.TrimSpace(line)) + } + } + } + } + + // Validate that we found either legacy format or modern format entries + t.Log("Validating LocalRedirect entries - accepting either legacy format or modern TCP/UDP format") + require.True(t, legacyFound || (tcpFound && udpFound), "Either legacy LocalRedirect entry OR both TCP and UDP entries must be found with frontend IP %s and backend IP %s on node %s", kubeDNSIP, nodeLocalDNSIP, selectedNode) + + t.Logf("Cilium LRP List Output:\n%s", string(lrpOutput)) + t.Logf("Cilium Service List Output:\n%s", string(serviceOutput)) +} + +// restartClientPodsAndGetPod restarts the client daemonset and returns a new pod reference +func restartClientPodsAndGetPod(t *testing.T, ctx context.Context, cs *k8sclient.Clientset, originalPod corev1.Pod) corev1.Pod { + // Get the node name for consistent testing + nodeName := originalPod.Spec.NodeName + + // Restart the daemonset (assumes it's named "lrp-test" based on the manifest) + err := kubernetes.MustRestartDaemonset(ctx, cs, originalPod.Namespace, "lrp-test") + require.NoError(t, err) + + // Wait for the daemonset to be ready + kubernetes.WaitForPodDaemonset(ctx, cs, originalPod.Namespace, "lrp-test", clientLabelSelector) + + // Get the new pod on the same node + clientPods, err := kubernetes.GetPodsByNode(ctx, cs, originalPod.Namespace, clientLabelSelector, nodeName) + require.NoError(t, err) + require.NotEmpty(t, clientPods.Items) + + return TakeOne(clientPods.Items) +} + +// deleteAndRecreateResources deletes and recreates client pods and LRP, returning new pod +func deleteAndRecreateResources(t *testing.T, ctx context.Context, cs *k8sclient.Clientset, originalPod corev1.Pod) corev1.Pod { + config := kubernetes.MustGetRestConfig() + ciliumCS, err := ciliumClientset.NewForConfig(config) + require.NoError(t, err) + + nodeName := originalPod.Spec.NodeName + + // Delete client daemonset + dsClient := cs.AppsV1().DaemonSets(originalPod.Namespace) + clientDS := kubernetes.MustParseDaemonSet(clientPath) + kubernetes.MustDeleteDaemonset(ctx, dsClient, clientDS) + + // Delete LRP + lrpContent, err := os.ReadFile(lrpPath) + require.NoError(t, err) + var lrp ciliumv2.CiliumLocalRedirectPolicy + err = yaml.Unmarshal(lrpContent, &lrp) + require.NoError(t, err) + + lrpClient := ciliumCS.CiliumV2().CiliumLocalRedirectPolicies(lrp.Namespace) + kubernetes.MustDeleteCiliumLocalRedirectPolicy(ctx, lrpClient, lrp) + + // Wait for client pods to be deleted + t.Log("Waiting for client pods to be deleted...") + err = kubernetes.WaitForPodsDelete(ctx, cs, originalPod.Namespace, clientLabelSelector) + require.NoError(t, err) + + // Wait for LRP to be deleted by polling + t.Log("Waiting for LRP to be deleted...") + err = kubernetes.WaitForLRPDelete(ctx, ciliumCS, lrp) + require.NoError(t, err) + + // Recreate LRP + _, cleanupLRP := kubernetes.MustSetupLRP(ctx, ciliumCS, lrpPath) + t.Cleanup(cleanupLRP) + + // Restart node-local-dns pods to pick up new LRP configuration + t.Log("Restarting node-local-dns pods after LRP recreation") + err = kubernetes.MustRestartDaemonset(ctx, cs, kubeSystemNamespace, "node-local-dns") + require.NoError(t, err) + kubernetes.WaitForPodDaemonset(ctx, cs, kubeSystemNamespace, "node-local-dns", nodeLocalDNSLabelSelector) + + // Recreate client daemonset + _, cleanupClient := kubernetes.MustSetupDaemonset(ctx, cs, clientPath) + t.Cleanup(cleanupClient) + + // Wait for pods to be ready + kubernetes.WaitForPodDaemonset(ctx, cs, clientDS.Namespace, clientDS.Name, clientLabelSelector) + + // Get new pod on the same node + clientPods, err := kubernetes.GetPodsByNode(ctx, cs, clientDS.Namespace, clientLabelSelector, nodeName) + require.NoError(t, err) + require.NotEmpty(t, clientPods.Items) + + return TakeOne(clientPods.Items) } // TakeOne takes one item from the slice randomly; if empty, it returns the empty value for the type diff --git a/test/integration/manifests/cilium/v1.13/cilium-config/cilium-config.yaml b/test/integration/manifests/cilium/v1.13/cilium-config/cilium-config.yaml index 198074750b..add4b386b7 100644 --- a/test/integration/manifests/cilium/v1.13/cilium-config/cilium-config.yaml +++ b/test/integration/manifests/cilium/v1.13/cilium-config/cilium-config.yaml @@ -34,7 +34,7 @@ data: enable-l2-neigh-discovery: "true" enable-l7-proxy: "false" enable-local-node-route: "false" - enable-local-redirect-policy: "true" # set to true for lrp test + enable-local-redirect-policy: "false" enable-metrics: "true" enable-policy: default enable-remote-node-identity: "true" diff --git a/test/internal/kubernetes/utils.go b/test/internal/kubernetes/utils.go index ac35efeada..27aa58e5bd 100644 --- a/test/internal/kubernetes/utils.go +++ b/test/internal/kubernetes/utils.go @@ -365,6 +365,24 @@ func WaitForDeploymentToDelete(ctx context.Context, deploymentsClient typedappsv return errors.Wrapf(retrier.Do(ctx, assertDeploymentNotFound), "could not assert deployment %s isNotFound", d.Name) } +func WaitForLRPDelete(ctx context.Context, ciliumClientset *cilium.Clientset, lrp ciliumv2.CiliumLocalRedirectPolicy) error { + lrpClient := ciliumClientset.CiliumV2().CiliumLocalRedirectPolicies(lrp.Namespace) + + checkLRPDeleted := func() error { + _, err := lrpClient.Get(ctx, lrp.Name, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + return nil + } + if err != nil { + return errors.Wrapf(err, "could not get LRP %s", lrp.Name) + } + return errors.Errorf("LRP %s still present", lrp.Name) + } + + retrier := retry.Retrier{Attempts: RetryAttempts, Delay: RetryDelay} + return errors.Wrap(retrier.Do(ctx, checkLRPDeleted), "failed to wait for LRP to delete") +} + func WaitForPodDaemonset(ctx context.Context, clientset *kubernetes.Clientset, namespace, daemonsetName, podLabelSelector string) error { podsClient := clientset.CoreV1().Pods(namespace) daemonsetClient := clientset.AppsV1().DaemonSets(namespace)