Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 17 additions & 10 deletions internal/pkg/metrics/alerts/alerts.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ func (rb *ruleBuilder) kernelDrops() (*monitoringv1.Rule, error) {
)

metric, totalMetric := rb.getMetricsForAlert()
metricsRate := promQLRateFromMetric(metric, "", "", "2m", "")
totalRate := promQLRateFromMetric(totalMetric, "", "", "2m", "")
filter := rb.buildLabelFilter("")
metricsRate := promQLRateFromMetric(metric, "", filter, "2m", "")
totalRate := promQLRateFromMetric(totalMetric, "", filter, "2m", "")
metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold)
Expand Down Expand Up @@ -118,8 +119,9 @@ func (rb *ruleBuilder) ipsecErrors() (*monitoringv1.Rule, error) {
)

metric, totalMetric := rb.getMetricsForAlert()
metricsRate := promQLRateFromMetric(metric, "", "", "2m", "")
totalRate := promQLRateFromMetric(totalMetric, "", "", "2m", "")
filter := rb.buildLabelFilter("")
metricsRate := promQLRateFromMetric(metric, "", filter, "2m", "")
totalRate := promQLRateFromMetric(totalMetric, "", filter, "2m", "")
metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold)
Expand All @@ -140,8 +142,10 @@ func (rb *ruleBuilder) dnsErrors() (*monitoringv1.Rule, error) {
)

metric, totalMetric := rb.getMetricsForAlert()
metricsRate := promQLRateFromMetric(metric, "_count", `{DnsFlagsResponseCode!="NoError"}`, "2m", "")
totalRate := promQLRateFromMetric(totalMetric, "_count", "", "2m", "")
metricsFilter := rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`)
totalFilter := rb.buildLabelFilter("")
metricsRate := promQLRateFromMetric(metric, "_count", metricsFilter, "2m", "")
totalRate := promQLRateFromMetric(totalMetric, "_count", totalFilter, "2m", "")
metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold)
Expand All @@ -160,8 +164,10 @@ func (rb *ruleBuilder) netpolDenied() (*monitoringv1.Rule, error) {
)

metric, totalMetric := rb.getMetricsForAlert()
metricsRate := promQLRateFromMetric(metric, "", `{action="drop"}`, "2m", "")
totalRate := promQLRateFromMetric(totalMetric, "", "", "2m", "")
metricsFilter := rb.buildLabelFilter(`action="drop"`)
totalFilter := rb.buildLabelFilter("")
metricsRate := promQLRateFromMetric(metric, "", metricsFilter, "2m", "")
totalRate := promQLRateFromMetric(totalMetric, "", totalFilter, "2m", "")
metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
promql := percentagePromQL(metricsSumBy, totalSumBy, rb.threshold, rb.upperThreshold, rb.alert.LowVolumeThreshold)
Expand All @@ -180,8 +186,9 @@ func (rb *ruleBuilder) latencyTrend() (*monitoringv1.Rule, error) {
)

metric, baseline := rb.getMetricsForAlert()
metricsRate := promQLRateFromMetric(metric, "_bucket", "", "2m", "")
baselineRate := promQLRateFromMetric(baseline, "_bucket", "", duration, " offset "+offset)
filter := rb.buildLabelFilter("")
metricsRate := promQLRateFromMetric(metric, "_bucket", filter, "2m", "")
baselineRate := promQLRateFromMetric(baseline, "_bucket", filter, duration, " offset "+offset)
metricQuantile := histogramQuantile(metricsRate, rb.alert.GroupBy, rb.side, "0.9")
baselineQuantile := histogramQuantile(baselineRate, rb.alert.GroupBy, rb.side, "0.9")
promql := baselineIncreasePromQL(metricQuantile, baselineQuantile, rb.threshold, rb.upperThreshold)
Expand Down
6 changes: 3 additions & 3 deletions internal/pkg/metrics/alerts/alerts_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -280,11 +280,11 @@ func TestLatencyPromql(t *testing.T) {
assert.Equal(t,
`100 * `+
`((histogram_quantile(0.9, `+
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket[2m]), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!=""}[2m]), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+
` - (histogram_quantile(0.9, `+
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le))))`+
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!=""}[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le))))`+
` / (histogram_quantile(0.9, `+
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+
`sum(label_replace(rate(netobserv_namespace_rtt_seconds_bucket{SrcK8S_Namespace!=""}[2h] offset 24h), "namespace", "$1", "SrcK8S_Namespace", "(.*)")) by (namespace,le)))`+
` > 100`,
rules[0].Expr.StrVal,
)
Expand Down
27 changes: 27 additions & 0 deletions internal/pkg/metrics/alerts/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,33 @@ func buildLabels(severity string, forHealth bool) map[string]string {
return m
}

func (rb *ruleBuilder) buildLabelFilter(additionalFilter string) string {
var filters []string

// Build label matchers to filter out metrics where K8s labels don't exist or are empty
// This prevents alerts from firing with empty namespace/workload/node labels
switch rb.alert.GroupBy {
case flowslatest.GroupByNode:
filters = append(filters, string(rb.side)+`K8S_HostName!=""`)
case flowslatest.GroupByNamespace:
filters = append(filters, string(rb.side)+`K8S_Namespace!=""`)
case flowslatest.GroupByWorkload:
filters = append(filters, string(rb.side)+`K8S_Namespace!=""`)
filters = append(filters, string(rb.side)+`K8S_OwnerName!=""`)
filters = append(filters, string(rb.side)+`K8S_OwnerType!=""`)
}

// Add additional business logic filters
if additionalFilter != "" {
filters = append(filters, additionalFilter)
}

if len(filters) == 0 {
return ""
}
return "{" + strings.Join(filters, ",") + "}"
}

func (rb *ruleBuilder) getMetricsForAlert() (string, string) {
var reqMetric1, reqMetric2 string
reqMetrics1, reqMetrics2 := flowslatest.GetElligibleMetricsForAlert(rb.template, rb.alert)
Expand Down
59 changes: 59 additions & 0 deletions internal/pkg/metrics/alerts/builder_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package alerts

import (
"testing"

flowslatest "github.com/netobserv/network-observability-operator/api/flowcollector/v1beta2"
"github.com/stretchr/testify/assert"
)

func TestBuildLabelFilter(t *testing.T) {
// Test GroupByNode with source side
rb := &ruleBuilder{
alert: &flowslatest.AlertVariant{
GroupBy: flowslatest.GroupByNode,
},
side: asSource,
}
filter := rb.buildLabelFilter("")
assert.Equal(t, `{SrcK8S_HostName!=""}`, filter)

// Test GroupByNode with destination side
rb.side = asDest
filter = rb.buildLabelFilter("")
assert.Equal(t, `{DstK8S_HostName!=""}`, filter)

// Test GroupByNamespace
rb.alert.GroupBy = flowslatest.GroupByNamespace
rb.side = asSource
filter = rb.buildLabelFilter("")
assert.Equal(t, `{SrcK8S_Namespace!=""}`, filter)

// Test GroupByWorkload
rb.alert.GroupBy = flowslatest.GroupByWorkload
rb.side = asDest
filter = rb.buildLabelFilter("")
assert.Equal(t, `{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!=""}`, filter)

// Test with additional filter
rb.alert.GroupBy = flowslatest.GroupByNamespace
rb.side = asSource
filter = rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`)
assert.Equal(t, `{SrcK8S_Namespace!="",DnsFlagsResponseCode!="NoError"}`, filter)

// Test with action filter (netpol)
rb.alert.GroupBy = flowslatest.GroupByWorkload
rb.side = asDest
filter = rb.buildLabelFilter(`action="drop"`)
assert.Equal(t, `{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!="",action="drop"}`, filter)

// Test no grouping (global)
rb.alert.GroupBy = ""
rb.side = ""
filter = rb.buildLabelFilter("")
assert.Equal(t, "", filter)

// Test no grouping with additional filter
filter = rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`)
assert.Equal(t, `{DnsFlagsResponseCode!="NoError"}`, filter)
}
34 changes: 30 additions & 4 deletions internal/pkg/metrics/alerts/promql_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,46 @@ import (
)

func TestSumBy(t *testing.T) {
pql := sumBy("rate(my_metric[1m])", flowslatest.GroupByNode, asSource, "")
// Test GroupByNode with source side - filters are now added via buildLabelFilter before promQLRateFromMetric
// sumBy should only handle label replacement and aggregation
pql := sumBy(`rate(my_metric{SrcK8S_HostName!=""}[1m])`, flowslatest.GroupByNode, asSource, "")
assert.Equal(t,
`sum(label_replace(rate(my_metric[1m]), "node", "$1", "SrcK8S_HostName", "(.*)")) by (node)`,
`sum(label_replace(rate(my_metric{SrcK8S_HostName!=""}[1m]), "node", "$1", "SrcK8S_HostName", "(.*)")) by (node)`,
pql,
)
assert.Contains(t, pql, `SrcK8S_HostName!=""`, "should preserve filters from input")

pql = sumBy("rate(my_metric[1m])", flowslatest.GroupByWorkload, asDest, "")
// Test GroupByWorkload with destination side - filters come from buildLabelFilter
pql = sumBy(`rate(my_metric{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!=""}[1m])`, flowslatest.GroupByWorkload, asDest, "")
assert.Equal(t,
`sum(label_replace(label_replace(label_replace(rate(my_metric[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)"), "workload", "$1", "DstK8S_OwnerName", "(.*)"), "kind", "$1", "DstK8S_OwnerType", "(.*)")) by (namespace,workload,kind)`,
`sum(label_replace(label_replace(label_replace(rate(my_metric{DstK8S_Namespace!="",DstK8S_OwnerName!="",DstK8S_OwnerType!=""}[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)"), "workload", "$1", "DstK8S_OwnerName", "(.*)"), "kind", "$1", "DstK8S_OwnerType", "(.*)")) by (namespace,workload,kind)`,
pql,
)
assert.Contains(t, pql, `DstK8S_Namespace!=""`, "should preserve DstK8S_Namespace filter from input")
assert.Contains(t, pql, `DstK8S_OwnerName!=""`, "should preserve DstK8S_OwnerName filter from input")
assert.Contains(t, pql, `DstK8S_OwnerType!=""`, "should preserve DstK8S_OwnerType filter from input")

// Test GroupByNamespace - filters from buildLabelFilter
pql = sumBy(`rate(my_metric{DstK8S_Namespace!=""}[1m])`, flowslatest.GroupByNamespace, asDest, "")
assert.Equal(t,
`sum(label_replace(rate(my_metric{DstK8S_Namespace!=""}[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)")) by (namespace)`,
pql,
)
assert.Contains(t, pql, `DstK8S_Namespace!=""`, "should preserve namespace filter from input")

// Test no grouping - should NOT add any filters
pql = sumBy("rate(my_metric[1m])", "", "", "")
assert.Equal(t, `sum(rate(my_metric[1m]))`, pql)
assert.NotContains(t, pql, "K8S_", "should not add K8s label filters when not grouping")

// Test with existing label selector (like DNS errors) - filters are merged in buildLabelFilter
pql = sumBy(`rate(my_metric{DstK8S_Namespace!="",DnsFlagsResponseCode!="NoError"}[1m])`, flowslatest.GroupByNamespace, asDest, "")
assert.Equal(t,
`sum(label_replace(rate(my_metric{DstK8S_Namespace!="",DnsFlagsResponseCode!="NoError"}[1m]), "namespace", "$1", "DstK8S_Namespace", "(.*)")) by (namespace)`,
pql,
)
assert.Contains(t, pql, `DstK8S_Namespace!=""`, "should preserve namespace filter")
assert.Contains(t, pql, `DnsFlagsResponseCode!="NoError"`, "should preserve business logic filter")
}

func TestPercentagePromQL(t *testing.T) {
Expand Down