diff --git a/internal/pkg/metrics/alerts/alerts.go b/internal/pkg/metrics/alerts/alerts.go index b0e991fcd..823d4c5dd 100644 --- a/internal/pkg/metrics/alerts/alerts.go +++ b/internal/pkg/metrics/alerts/alerts.go @@ -51,8 +51,9 @@ func (rb *ruleBuilder) kernelDrops() (*monitoringv1.Rule, error) { ) metric, totalMetric := rb.getMetricsForAlert() - metricsRate := promQLRateFromMetric(metric, "", "", "2m", "") - totalRate := promQLRateFromMetric(totalMetric, "", "", "2m", "") + filter := rb.buildLabelFilter("") + metricsRate := promQLRateFromMetric(metric, "", filter, "2m", "") + totalRate := promQLRateFromMetric(totalMetric, "", filter, "2m", "") metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "") totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "") promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold) @@ -118,8 +119,9 @@ func (rb *ruleBuilder) ipsecErrors() (*monitoringv1.Rule, error) { ) metric, totalMetric := rb.getMetricsForAlert() - metricsRate := promQLRateFromMetric(metric, "", "", "2m", "") - totalRate := promQLRateFromMetric(totalMetric, "", "", "2m", "") + filter := rb.buildLabelFilter("") + metricsRate := promQLRateFromMetric(metric, "", filter, "2m", "") + totalRate := promQLRateFromMetric(totalMetric, "", filter, "2m", "") metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "") totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "") promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold) @@ -140,8 +142,10 @@ func (rb *ruleBuilder) dnsErrors() (*monitoringv1.Rule, error) { ) metric, totalMetric := rb.getMetricsForAlert() - metricsRate := promQLRateFromMetric(metric, "_count", `{DnsFlagsResponseCode!="NoError"}`, "2m", "") - totalRate := promQLRateFromMetric(totalMetric, "_count", "", "2m", "") + metricsFilter := rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`) + totalFilter := rb.buildLabelFilter("") + metricsRate := promQLRateFromMetric(metric, "_count", metricsFilter, "2m", "") + totalRate := promQLRateFromMetric(totalMetric, "_count", totalFilter, "2m", "") metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "") totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "") promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold) @@ -160,8 +164,10 @@ func (rb *ruleBuilder) netpolDenied() (*monitoringv1.Rule, error) { ) metric, totalMetric := rb.getMetricsForAlert() - metricsRate := promQLRateFromMetric(metric, "", `{action="drop"}`, "2m", "") - totalRate := promQLRateFromMetric(totalMetric, "", "", "2m", "") + metricsFilter := rb.buildLabelFilter(`action="drop"`) + totalFilter := rb.buildLabelFilter("") + metricsRate := promQLRateFromMetric(metric, "", metricsFilter, "2m", "") + totalRate := promQLRateFromMetric(totalMetric, "", totalFilter, "2m", "") metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "") totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "") promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold) @@ -180,8 +186,9 @@ func (rb *ruleBuilder) latencyTrend() (*monitoringv1.Rule, error) { ) metric, baseline := rb.getMetricsForAlert() - metricsRate := promQLRateFromMetric(metric, "_bucket", "", "2m", "") - baselineRate := promQLRateFromMetric(baseline, "_bucket", "", duration, " offset "+offset) + filter := rb.buildLabelFilter("") + metricsRate := promQLRateFromMetric(metric, "_bucket", filter, "2m", "") + baselineRate := promQLRateFromMetric(baseline, "_bucket", filter, duration, " offset "+offset) metricQuantile := histogramQuantile(metricsRate, rb.alert.GroupBy, rb.side, "0.9") baselineQuantile := histogramQuantile(baselineRate, rb.alert.GroupBy, rb.side, "0.9") promql := baselineIncreasePromQL(metricQuantile, baselineQuantile, rb.threshold, rb.upperThreshold) diff --git a/internal/pkg/metrics/alerts/alerts_test.go b/internal/pkg/metrics/alerts/alerts_test.go index e5fae1bd7..0a5c84f3e 100644 --- a/internal/pkg/metrics/alerts/alerts_test.go +++ b/internal/pkg/metrics/alerts/alerts_test.go @@ -280,11 +280,11 @@ func TestLatencyPromql(t *testing.T) { assert.Equal(t, `100 * `+ `((histogram_quantile(0.9, `+ - `sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket[2m]), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+ + `sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!=""}[2m]), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+ ` - (histogram_quantile(0.9, `+ - `sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le))))`+ + `sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!=""}[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le))))`+ ` / (histogram_quantile(0.9, `+ - `sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+ + `sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!=""}[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+ ` > 100`, rules[0].Expr.StrVal, ) diff --git a/internal/pkg/metrics/alerts/builder.go b/internal/pkg/metrics/alerts/builder.go index a1cf83933..cef84894c 100644 --- a/internal/pkg/metrics/alerts/builder.go +++ b/internal/pkg/metrics/alerts/builder.go @@ -214,6 +214,33 @@ func buildLabels(severity string, forHealth bool) map[string]string { return m } +func (rb *ruleBuilder) buildLabelFilter(additionalFilter string) string { + var filters []string + + // Build label matchers to filter out metrics where K8s labels don't exist or are empty + // This prevents alerts from firing with empty namespace/workload/node labels + switch rb.alert.GroupBy { + case flowslatest.GroupByNode: + filters = append(filters, string(rb.side)+`K8S_HostName!=""`) + case flowslatest.GroupByNamespace: + filters = append(filters, string(rb.side)+`K8S_Namespace!=""`) + case flowslatest.GroupByWorkload: + filters = append(filters, string(rb.side)+`K8S_Namespace!=""`) + filters = append(filters, string(rb.side)+`K8S_OwnerName!=""`) + filters = append(filters, string(rb.side)+`K8S_OwnerType!=""`) + } + + // Add additional business logic filters + if additionalFilter != "" { + filters = append(filters, additionalFilter) + } + + if len(filters) == 0 { + return "" + } + return "{" + strings.Join(filters, ",") + "}" +} + func (rb *ruleBuilder) getMetricsForAlert() (string, string) { var reqMetric1, reqMetric2 string reqMetrics1, reqMetrics2 := flowslatest.GetElligibleMetricsForAlert(rb.template, rb.alert) diff --git a/internal/pkg/metrics/alerts/builder_test.go b/internal/pkg/metrics/alerts/builder_test.go new file mode 100644 index 000000000..073ea404d --- /dev/null +++ b/internal/pkg/metrics/alerts/builder_test.go @@ -0,0 +1,59 @@ +package alerts + +import ( + "testing" + + flowslatest "github.com/netobserv/network-observability-operator/api/flowcollector/v1beta2" + "github.com/stretchr/testify/assert" +) + +func TestBuildLabelFilter(t *testing.T) { + // Test GroupByNode with source side + rb := &ruleBuilder{ + alert: &flowslatest.AlertVariant{ + GroupBy: flowslatest.GroupByNode, + }, + side: asSource, + } + filter := rb.buildLabelFilter("") + assert.Equal(t, `{SrcK8S_HostName!=""}`, filter) + + // Test GroupByNode with destination side + rb.side = asDest + filter = rb.buildLabelFilter("") + assert.Equal(t, `{DstK8S_HostName!=""}`, filter) + + // Test GroupByNamespace + rb.alert.GroupBy = flowslatest.GroupByNamespace + rb.side = asSource + filter = rb.buildLabelFilter("") + assert.Equal(t, `{SrcK8S_Namespace!=""}`, filter) + + // Test GroupByWorkload + rb.alert.GroupBy = flowslatest.GroupByWorkload + rb.side = asDest + filter = rb.buildLabelFilter("") + assert.Equal(t, `{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!=""}`, filter) + + // Test with additional filter + rb.alert.GroupBy = flowslatest.GroupByNamespace + rb.side = asSource + filter = rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`) + assert.Equal(t, `{SrcK8S_Namespace!="",DnsFlagsResponseCode!="NoError"}`, filter) + + // Test with action filter (netpol) + rb.alert.GroupBy = flowslatest.GroupByWorkload + rb.side = asDest + filter = rb.buildLabelFilter(`action="drop"`) + assert.Equal(t, `{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!="",action="drop"}`, filter) + + // Test no grouping (global) + rb.alert.GroupBy = "" + rb.side = "" + filter = rb.buildLabelFilter("") + assert.Equal(t, "", filter) + + // Test no grouping with additional filter + filter = rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`) + assert.Equal(t, `{DnsFlagsResponseCode!="NoError"}`, filter) +} diff --git a/internal/pkg/metrics/alerts/promql_test.go b/internal/pkg/metrics/alerts/promql_test.go index 9e08207f8..547b48c8a 100644 --- a/internal/pkg/metrics/alerts/promql_test.go +++ b/internal/pkg/metrics/alerts/promql_test.go @@ -8,20 +8,46 @@ import ( ) func TestSumBy(t *testing.T) { - pql := sumBy("rate(my_metric[1m])", flowslatest.GroupByNode, asSource, "") + // Test GroupByNode with source side - filters are now added via buildLabelFilter before promQLRateFromMetric + // sumBy should only handle label replacement and aggregation + pql := sumBy(`rate(my_metric{SrcK8S_HostName!=""}[1m])`, flowslatest.GroupByNode, asSource, "") assert.Equal(t, - `sum(label_replace(rate(my_metric[1m]), "node", "$1", "SrcK8S_HostName", "(.*)")) by (node)`, + `sum(label_replace(rate(my_metric{SrcK8S_HostName!=""}[1m]), "node", "$1", "SrcK8S_HostName", "(.*)")) by (node)`, pql, ) + assert.Contains(t, pql, `SrcK8S_HostName!=""`, "should preserve filters from input") - pql = sumBy("rate(my_metric[1m])", flowslatest.GroupByWorkload, asDest, "") + // Test GroupByWorkload with destination side - filters come from buildLabelFilter + pql = sumBy(`rate(my_metric{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!=""}[1m])`, flowslatest.GroupByWorkload, asDest, "") assert.Equal(t, - `sum(label_replace(label_replace(label_replace(rate(my_metric[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)"), "workload", "$1", "DstK8S_OwnerName", "(.*)"), "kind", "$1", "DstK8S_OwnerType", "(.*)")) by (namespace,workload,kind)`, + `sum(label_replace(label_replace(label_replace(rate(my_metric{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!=""}[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)"), "workload", "$1", "DstK8S_OwnerName", "(.*)"), "kind", "$1", "DstK8S_OwnerType", "(.*)")) by (namespace,workload,kind)`, pql, ) + assert.Contains(t, pql, `DstK8S_Namespace!=""`, "should preserve DstK8S_Namespace filter from input") + assert.Contains(t, pql, `DstK8S_OwnerName!=""`, "should preserve DstK8S_OwnerName filter from input") + assert.Contains(t, pql, `DstK8S_OwnerType!=""`, "should preserve DstK8S_OwnerType filter from input") + // Test GroupByNamespace - filters from buildLabelFilter + pql = sumBy(`rate(my_metric{DstK8S_Namespace!=""}[1m])`, flowslatest.GroupByNamespace, asDest, "") + assert.Equal(t, + `sum(label_replace(rate(my_metric{DstK8S_Namespace!=""}[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)")) by (namespace)`, + pql, + ) + assert.Contains(t, pql, `DstK8S_Namespace!=""`, "should preserve namespace filter from input") + + // Test no grouping - should NOT add any filters pql = sumBy("rate(my_metric[1m])", "", "", "") assert.Equal(t, `sum(rate(my_metric[1m]))`, pql) + assert.NotContains(t, pql, "K8S_", "should not add K8s label filters when not grouping") + + // Test with existing label selector (like DNS errors) - filters are merged in buildLabelFilter + pql = sumBy(`rate(my_metric{DstK8S_Namespace!="",DnsFlagsResponseCode!="NoError"}[1m])`, flowslatest.GroupByNamespace, asDest, "") + assert.Equal(t, + `sum(label_replace(rate(my_metric{DstK8S_Namespace!="",DnsFlagsResponseCode!="NoError"}[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)")) by (namespace)`, + pql, + ) + assert.Contains(t, pql, `DstK8S_Namespace!=""`, "should preserve namespace filter") + assert.Contains(t, pql, `DnsFlagsResponseCode!="NoError"`, "should preserve business logic filter") } func TestPercentagePromQL(t *testing.T) {