
Commit 75c0670

convert cache to a ttl cache
1 parent eddbbb3 commit 75c0670

3 files changed: +31 -39 lines changed

internal/cortex/frontend/transport/handler.go
Lines changed: 3 additions & 5 deletions

@@ -48,6 +48,7 @@ type HandlerConfig struct {
     QueryStatsEnabled bool `yaml:"query_stats_enabled"`
     LogFailedQueries bool `yaml:"log_failed_queries"`
     FailedQueryCacheCapacity int `yaml:"failed_query_cache_capacity"`
+    FailedQueryTTL time.Duration `yaml:"failed_query_ttl"`
 }
 
 // Handler accepts queries and forwards them to RoundTripper. It can log slow queries,
@@ -76,11 +77,8 @@ func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logge
     }
 
     if cfg.FailedQueryCacheCapacity > 0 {
-        level.Info(log).Log("msg", "Creating failed query cache", "capacity", cfg.FailedQueryCacheCapacity)
-        FailedQueryCache, errQueryCache := utils.NewFailedQueryCache(cfg.FailedQueryCacheCapacity, reg)
-        if errQueryCache != nil {
-            level.Warn(log).Log(errQueryCache.Error())
-        }
+        level.Info(log).Log("msg", "Creating failed query cache", "capacity", cfg.FailedQueryCacheCapacity, "ttl", cfg.FailedQueryTTL.String())
+        FailedQueryCache := utils.NewFailedQueryCache(cfg.FailedQueryCacheCapacity, cfg.FailedQueryTTL, reg)
         h.failedQueryCache = FailedQueryCache
     }
 
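For context, the new field slots in next to the existing cache options. The following is a hypothetical wiring sketch, not part of this commit; it uses only the field names visible in the diff above, and the concrete values are illustrative. Per the guard in NewHandler, the cache is only created when FailedQueryCacheCapacity is greater than zero.

// Hypothetical example (not from the commit): setting the new TTL alongside
// the existing capacity option. Values are illustrative.
cfg := HandlerConfig{
    QueryStatsEnabled:        true,
    LogFailedQueries:         true,
    FailedQueryCacheCapacity: 1000,            // cache is created only when > 0
    FailedQueryTTL:           5 * time.Minute, // cached failures expire after this duration
}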
internal/cortex/frontend/transport/utils/utils.go
Lines changed: 19 additions & 22 deletions

@@ -6,13 +6,13 @@ package utils
 
 import (
     "fmt"
+    "github.com/hashicorp/golang-lru/v2/expirable"
     "net/http"
     "net/url"
     "regexp"
     "strconv"
     "time"
 
-    lru "github.com/hashicorp/golang-lru"
     "github.com/prometheus/client_golang/prometheus"
     "github.com/prometheus/client_golang/prometheus/promauto"
 )
@@ -25,39 +25,35 @@ var (
 type FailedQueryCache struct {
     regex        *regexp.Regexp
     errorExtract *regexp.Regexp
-    lruCache     *lru.Cache
+    lruCache     *expirable.LRU[string, int]
     cachedHits    prometheus.Counter
     cachedQueries prometheus.Gauge
 }
 
-func NewFailedQueryCache(capacity int, reg prometheus.Registerer) (*FailedQueryCache, error) {
+func NewFailedQueryCache(capacity int, ttlDuration time.Duration, reg prometheus.Registerer) *FailedQueryCache {
     regex := regexp.MustCompile(`[\s\n\t]+`)
     errorExtract := regexp.MustCompile(`Code\((\d+)\)`)
-    lruCache, err := lru.New(capacity)
-    if err != nil {
-        lruCache = nil
-        err = fmt.Errorf("failed to create lru cache: %s", err)
-        return nil, err
-    }
+    lruCacheWithTTL := expirable.NewLRU[string, int](capacity, nil, ttlDuration)
+
     cachedHits := promauto.With(reg).NewCounter(prometheus.CounterOpts{
         Namespace: "cortex",
-        Name: "cached_failed_queries_count",
-        Help: "Total number of queries that hit the failed query cache.",
+        Name:      "cached_failed_queries_count",
+        Help:      "Total number of queries that hit the failed query cache.",
     })
     cachedQueries := promauto.With(reg).NewGauge(prometheus.GaugeOpts{
         Namespace: "cortex",
-        Name: "failed_query_cache_size",
-        Help: "How many queries are cached in the failed query cache.",
+        Name:      "failed_query_cache_size",
+        Help:      "How many queries are cached in the failed query cache.",
     })
     cachedQueries.Set(0)
 
     return &FailedQueryCache{
         regex:         regex,
         errorExtract:  errorExtract,
-        lruCache:      lruCache,
+        lruCache:      lruCacheWithTTL,
         cachedHits:    cachedHits,
         cachedQueries: cachedQueries,
-    }, err
+    }
 }
 
 // UpdateFailedQueryCache returns true if query is cached so that callsite can increase counter, returns message as a string for callsite to log outcome
@@ -92,19 +88,20 @@ func (f *FailedQueryCache) updateFailedQueryCache(err error, queryExpressionNorm
 
 func (f *FailedQueryCache) addCacheEntry(queryExpressionNormalized string, queryExpressionRangeLength int) {
     // Checks if queryExpression is already in cache, and updates time range length value to min of stored and new value.
-    if contains, _ := f.lruCache.ContainsOrAdd(queryExpressionNormalized, queryExpressionRangeLength); contains {
+    if contains := f.lruCache.Contains(queryExpressionNormalized); contains {
         if oldValue, ok := f.lruCache.Get(queryExpressionNormalized); ok {
-            queryExpressionRangeLength = min(queryExpressionRangeLength, oldValue.(int))
+            queryExpressionRangeLength = min(queryExpressionRangeLength, oldValue)
         }
-        f.lruCache.Add(queryExpressionNormalized, queryExpressionRangeLength)
     }
+    f.lruCache.Add(queryExpressionNormalized, queryExpressionRangeLength)
+
     f.cachedQueries.Set(float64(f.lruCache.Len()))
 }
 
 // QueryHitCache checks if the lru cache is hit and returns whether to increment counter for cache hits along with appropriate message.
-func queryHitCache(queryExpressionNormalized string, queryExpressionRangeLength int, lruCache *lru.Cache, cachedHits prometheus.Counter) (bool, string) {
-    if value, ok := lruCache.Get(queryExpressionNormalized); ok && value.(int) <= queryExpressionRangeLength {
-        cachedQueryRangeSeconds := value.(int)
+func queryHitCache(queryExpressionNormalized string, queryExpressionRangeLength int, lruCache *expirable.LRU[string, int], cachedHits prometheus.Counter) (bool, string) {
+    if value, ok := lruCache.Get(queryExpressionNormalized); ok && value <= queryExpressionRangeLength {
+        cachedQueryRangeSeconds := value
         message := createLogMessage("Retrieved query from cache", queryExpressionNormalized, cachedQueryRangeSeconds, queryExpressionRangeLength, nil)
         cachedHits.Inc()
         return true, message
@@ -159,7 +156,7 @@ func (f *FailedQueryCache) UpdateFailedQueryCache(err error, query url.Values, q
     queryExpressionRangeLength := getQueryRangeSeconds(query)
     // TODO(hc.zhu): add a flag for the threshold
     // The current gateway timeout is 5 minutes, so we cache the failed query running longer than 5 minutes - 10 seconds.
-    if queryResponseTime > time.Second * (60 * 5 - 10) {
+    if queryResponseTime > time.Second*(60*5-10) {
        // Cache long running queries regardless of the error code. The most common case is "context canceled".
        f.addCacheEntry(queryExpressionNormalized, queryExpressionRangeLength)
        message := createLogMessage("Cached a failed long running query", queryExpressionNormalized, -1, queryExpressionRangeLength, err)
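The behavioral core of this change is the switch from the untyped hashicorp/golang-lru cache to the generic, expirable variant. Below is a minimal standalone sketch of the TTL semantics the cache now relies on, using only calls that appear in the diff (NewLRU, Add, Get); the two-second TTL and the key are illustrative.

package main

import (
    "fmt"
    "time"

    "github.com/hashicorp/golang-lru/v2/expirable"
)

func main() {
    // Capacity of 10 entries, no eviction callback, entries expire
    // roughly two seconds after they are added.
    cache := expirable.NewLRU[string, int](10, nil, 2*time.Second)

    cache.Add("up", 300)
    if v, ok := cache.Get("up"); ok {
        fmt.Println("fresh entry:", v) // prints 300
    }

    time.Sleep(3 * time.Second)
    if _, ok := cache.Get("up"); !ok {
        fmt.Println("entry expired") // expected once the TTL has elapsed
    }
}

A side benefit of the typed LRU[string, int] is that the value.(int) type assertions the old code needed in addCacheEntry and queryHitCache are gone.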

internal/cortex/frontend/transport/utils/utils_test.go
Lines changed: 9 additions & 12 deletions

@@ -34,19 +34,16 @@ func verifyMetricCount(t *testing.T, reg *prometheus.Registry, expectedCount int
 
 func TestNewFailedQueryCache(t *testing.T) {
     reg := prometheus.NewRegistry()
-    cache, err := NewFailedQueryCache(2, reg)
+    cache := NewFailedQueryCache(2, 0, reg)
     if cache == nil {
         t.Fatalf("Expected cache to be created, but got nil")
     }
-    if err != nil {
-        t.Fatalf("Expected no error message, but got: %s", err.Error())
-    }
     verifyMetricCount(t, reg, 2)
 }
 
 func TestUpdateFailedQueryCache(t *testing.T) {
     reg := prometheus.NewRegistry()
-    cache, _ := NewFailedQueryCache(3, reg)
+    cache := NewFailedQueryCache(3, 0, reg)
 
     tests := []struct {
         name string
@@ -206,7 +203,7 @@ func TestUpdateFailedQueryCache(t *testing.T) {
 // TestQueryHitCache tests the QueryHitCache method
 func TestQueryHitCache(t *testing.T) {
     reg := prometheus.NewRegistry()
-    cache, _ := NewFailedQueryCache(2, reg)
+    cache := NewFailedQueryCache(2, 0, reg)
     lruCache := cache.lruCache
 
     lruCache.Add("test_query", 100)
@@ -289,7 +286,7 @@ func TestQueryHitCache(t *testing.T) {
 
 func TestCacheCounterVec(t *testing.T) {
     reg := prometheus.NewRegistry()
-    cache, _ := NewFailedQueryCache(2, reg)
+    cache := NewFailedQueryCache(2, 0, reg)
     lruCache := cache.lruCache
 
     lruCache.Add("test_query", 100)
@@ -371,12 +368,12 @@ func TestCacheCounterVec(t *testing.T) {
 
 func TestCacheLongRunningFailedQuery(t *testing.T) {
     reg := prometheus.NewRegistry()
-    cache, _ := NewFailedQueryCache(3, reg)
+    cache := NewFailedQueryCache(3, 0, reg)
 
     tests := []struct {
-        name string
-        err error
-        query url.Values
+        name  string
+        err   error
+        query url.Values
     }{
         {
             name: "No error code in error message",
@@ -401,7 +398,7 @@ func TestCacheLongRunningFailedQuery(t *testing.T) {
     for _, tt := range tests {
         t.Run(tt.name, func(t *testing.T) {
             // Long running failed query without an error code
-            cached, _ := cache.UpdateFailedQueryCache(tt.err, tt.query, time.Second*(5 * 60 - 1))
+            cached, _ := cache.UpdateFailedQueryCache(tt.err, tt.query, time.Second*(5*60-1))
             if !cached {
                 t.Errorf("Should cache short running failed query without an error code")
             }
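All of the updated tests pass a TTL of 0, which, to my understanding of the expirable LRU, leaves entries unexpired, so they only exercise the old capacity-based behavior. A hypothetical follow-up test along the lines below could cover expiry itself; it is not part of this commit, the 50 ms TTL and the sleep are illustrative, and it reuses only identifiers already present in this file (NewFailedQueryCache, cache.lruCache, prometheus.NewRegistry).

// Hypothetical addition, not in the commit: verify that a cached failed
// query disappears once its TTL elapses.
func TestFailedQueryCacheTTLExpiry(t *testing.T) {
    reg := prometheus.NewRegistry()
    cache := NewFailedQueryCache(2, 50*time.Millisecond, reg)

    cache.lruCache.Add("test_query", 100)
    if _, ok := cache.lruCache.Get("test_query"); !ok {
        t.Fatalf("expected the entry to be present before the TTL expires")
    }

    time.Sleep(100 * time.Millisecond)
    if _, ok := cache.lruCache.Get("test_query"); ok {
        t.Errorf("expected the entry to expire after the TTL")
    }
}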
