Skip to content

Commit 5e9a263

Browse files
committed
search: statistic skips for regex constraint
Signed-off-by: Michael Hoffmann <[email protected]>
1 parent 2bf03c4 commit 5e9a263

File tree

2 files changed

+139
-37
lines changed

2 files changed

+139
-37
lines changed

search/constraint.go

Lines changed: 120 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@ import (
2121
"sort"
2222

2323
"github.com/parquet-go/parquet-go"
24-
"github.com/pkg/errors"
2524
"github.com/prometheus/prometheus/model/labels"
2625

2726
"github.com/prometheus-community/parquet-common/schema"
@@ -47,28 +46,48 @@ type Constraint interface {
4746
func MatchersToConstraints(matchers ...*labels.Matcher) ([]Constraint, error) {
4847
r := make([]Constraint, 0, len(matchers))
4948
for _, matcher := range matchers {
49+
var c Constraint
50+
S:
5051
switch matcher.Type {
5152
case labels.MatchEqual:
52-
r = append(r, Equal(schema.LabelToColumn(matcher.Name), parquet.ValueOf(matcher.Value)))
53+
c = Equal(schema.LabelToColumn(matcher.Name), parquet.ValueOf(matcher.Value))
5354
case labels.MatchNotEqual:
54-
r = append(r, Not(Equal(schema.LabelToColumn(matcher.Name), parquet.ValueOf(matcher.Value))))
55+
c = Not(Equal(schema.LabelToColumn(matcher.Name), parquet.ValueOf(matcher.Value)))
5556
case labels.MatchRegexp:
56-
res, err := labels.NewFastRegexMatcher(matcher.Value)
57+
if matcher.GetRegexString() == ".+" {
58+
c = Not(Equal(schema.LabelToColumn(matcher.Name), parquet.ValueOf("")))
59+
break S
60+
}
61+
if set := matcher.SetMatches(); len(set) == 1 {
62+
c = Equal(schema.LabelToColumn(matcher.Name), parquet.ValueOf(set[0]))
63+
break S
64+
}
65+
rc, err := Regex(schema.LabelToColumn(matcher.Name), matcher)
5766
if err != nil {
58-
return nil, err
67+
return nil, fmt.Errorf("unable to construct regex matcher: %w", err)
5968
}
60-
r = append(r, Regex(schema.LabelToColumn(matcher.Name), res))
69+
c = rc
6170
case labels.MatchNotRegexp:
62-
res, err := labels.NewFastRegexMatcher(matcher.Value)
71+
inverted, err := matcher.Inverse()
72+
if err != nil {
73+
return nil, fmt.Errorf("unable to invert matcher: %w", err)
74+
}
75+
if set := inverted.SetMatches(); len(set) == 1 {
76+
c = Not(Equal(schema.LabelToColumn(matcher.Name), parquet.ValueOf(set[0])))
77+
break S
78+
}
79+
rc, err := Regex(schema.LabelToColumn(matcher.Name), inverted)
6380
if err != nil {
64-
return nil, err
81+
return nil, fmt.Errorf("unable to construct regex matcher: %w", err)
6582
}
66-
r = append(r, Not(Regex(schema.LabelToColumn(matcher.Name), res)))
83+
c = Not(rc)
6784
default:
6885
return nil, fmt.Errorf("unsupported matcher type %s", matcher.Type)
6986
}
87+
r = append(r, c)
7088
}
7189
return r, nil
90+
7291
}
7392

7493
// Initialize prepares the given constraints for use with the specified parquet file.
@@ -382,6 +401,7 @@ func (ec *equalConstraint) filter(ctx context.Context, rgIdx int, primary bool,
382401
res = append(res, RowRange{pfrom + int64(off), int64(count)})
383402
}
384403
}
404+
parquet.Release(pg)
385405
}
386406

387407
if len(res) == 0 {
@@ -431,15 +451,26 @@ func (ec *equalConstraint) skipByBloomfilter(cc parquet.ColumnChunk) (bool, erro
431451
return !ok, nil
432452
}
433453

434-
func Regex(path string, r *labels.FastRegexMatcher) Constraint {
435-
return &regexConstraint{pth: path, cache: make(map[parquet.Value]bool), r: r}
454+
func Regex(path string, r *labels.Matcher) (Constraint, error) {
455+
if r.Type != labels.MatchRegexp {
456+
return nil, fmt.Errorf("unsupported matcher type: %s", r.Type)
457+
}
458+
return &regexConstraint{pth: path, cache: make(map[parquet.Value]bool), r: r}, nil
436459
}
437460

438461
type regexConstraint struct {
462+
f storage.ParquetFileView
439463
pth string
440464
cache map[parquet.Value]bool
441-
f storage.ParquetFileView
442-
r *labels.FastRegexMatcher
465+
466+
// if it's a "set" or "prefix" regex
467+
// for a set, these are the min and max values of the set; for a prefix, minv is the prefix and maxv is the prefix followed by 16 bytes of 0xff
468+
minv parquet.Value
469+
maxv parquet.Value
470+
471+
r *labels.Matcher
472+
473+
comp func(l, r parquet.Value) int
443474
}
444475

445476
func (rc *regexConstraint) String() string {
@@ -465,13 +496,6 @@ func (rc *regexConstraint) filter(ctx context.Context, rgIdx int, primary bool,
465496
}
466497
cc := rg.ColumnChunks()[col.ColumnIndex]
467498

468-
pgs, err := rc.f.GetPages(ctx, cc, 0, 0)
469-
if err != nil {
470-
return nil, errors.Wrap(err, "failed to get pages")
471-
}
472-
473-
defer func() { _ = pgs.Close() }()
474-
475499
oidx, err := cc.OffsetIndex()
476500
if err != nil {
477501
return nil, fmt.Errorf("unable to read offset index: %w", err)
@@ -480,11 +504,13 @@ func (rc *regexConstraint) filter(ctx context.Context, rgIdx int, primary bool,
480504
if err != nil {
481505
return nil, fmt.Errorf("unable to read column index: %w", err)
482506
}
483-
var (
484-
symbols = new(symbolTable)
485-
res = make([]RowRange, 0)
486-
)
507+
res := make([]RowRange, 0)
508+
509+
readPgs := make([]pageToRead, 0, 10)
510+
487511
for i := 0; i < cidx.NumPages(); i++ {
512+
poff, pcsz := uint64(oidx.Offset(i)), oidx.CompressedPageSize(i)
513+
488514
// If page does not intersect from, to; we can immediately discard it
489515
pfrom := oidx.FirstRowIndex(i)
490516
pcount := rg.NumRows() - pfrom
@@ -505,9 +531,56 @@ func (rc *regexConstraint) filter(ctx context.Context, rgIdx int, primary bool,
505531
}
506532
continue
507533
}
508-
// TODO: use setmatches / prefix for statistics
534+
// If we have a special regular expression that works with statistics, we can use them to skip.
535+
// This works for, e.g., 'pod_name=~"thanos-.*"' or 'status_code=~"403|404"'
536+
minv, maxv := cidx.MinValue(i), cidx.MaxValue(i)
537+
if !rc.minv.IsNull() && !rc.maxv.IsNull() {
538+
if !rc.matches(parquet.ValueOf("")) && !maxv.IsNull() && rc.comp(rc.minv, maxv) > 0 {
539+
if cidx.IsDescending() {
540+
break
541+
}
542+
continue
543+
}
544+
if !rc.matches(parquet.ValueOf("")) && !minv.IsNull() && rc.comp(rc.maxv, minv) < 0 {
545+
if cidx.IsAscending() {
546+
break
547+
}
548+
continue
549+
}
550+
}
509551

510552
// We cannot discard the page through statistics but we might need to read it to see if it has the value
553+
readPgs = append(readPgs, pageToRead{pfrom: pfrom, pto: pto, idx: i, off: int(poff), csz: int(pcsz)})
554+
}
555+
556+
// No pages need to be read
557+
if len(readPgs) == 0 {
558+
return intersectRowRanges(simplify(res), rr), nil
559+
}
560+
561+
dictOff, dictSz := rc.f.DictionaryPageBounds(rgIdx, col.ColumnIndex)
562+
563+
minOffset := uint64(readPgs[0].off)
564+
maxOffset := readPgs[len(readPgs)-1].off + readPgs[len(readPgs)-1].csz
565+
566+
// If the gap between the first page and the dictionary page is less than PagePartitioningMaxGapSize,
567+
// we include the dictionary page in the same single read
568+
if int(minOffset-(dictOff+dictSz)) < rc.f.PagePartitioningMaxGapSize() {
569+
minOffset = dictOff
570+
}
571+
572+
pgs, err := rc.f.GetPages(ctx, cc, int64(minOffset), int64(maxOffset))
573+
if err != nil {
574+
return nil, err
575+
}
576+
577+
defer func() { _ = pgs.Close() }()
578+
579+
symbols := new(symbolTable)
580+
for _, p := range readPgs {
581+
pfrom := p.pfrom
582+
pto := p.pto
583+
511584
if err := pgs.SeekToRow(pfrom); err != nil {
512585
return nil, fmt.Errorf("unable to seek to row: %w", err)
513586
}
@@ -539,7 +612,9 @@ func (rc *regexConstraint) filter(ctx context.Context, rgIdx int, primary bool,
539612
if count != 0 {
540613
res = append(res, RowRange{pfrom + int64(off), int64(count)})
541614
}
615+
parquet.Release(pg)
542616
}
617+
543618
if len(res) == 0 {
544619
return nil, nil
545620
}
@@ -556,7 +631,26 @@ func (rc *regexConstraint) init(f storage.ParquetFileView) error {
556631
return fmt.Errorf("schema: cannot search value of kind %s in column of kind %s", stringKind, c.Node.Type().Kind())
557632
}
558633
rc.cache = make(map[parquet.Value]bool)
634+
rc.comp = c.Node.Type().Compare
635+
636+
// if applicable compute the minv and maxv of the implied set of matches
637+
rc.minv = parquet.NullValue()
638+
rc.maxv = parquet.NullValue()
639+
if len(rc.r.SetMatches()) > 0 {
640+
sm := make([]parquet.Value, len(rc.r.SetMatches()))
641+
for i, m := range rc.r.SetMatches() {
642+
sm[i] = parquet.ValueOf(m)
643+
}
644+
rc.minv = slices.MinFunc(sm, rc.comp)
645+
rc.maxv = slices.MaxFunc(sm, rc.comp)
646+
} else if len(rc.r.Prefix()) > 0 {
647+
rc.minv = parquet.ValueOf(rc.r.Prefix())
648+
// 16 is the default prefix length, maybe we should read the actual value from somewhere?
649+
rc.maxv = parquet.ValueOf(append([]byte(rc.r.Prefix()), bytes.Repeat([]byte{0xff}, 16)...))
650+
}
651+
559652
return nil
653+
560654
}
561655

562656
func (rc *regexConstraint) path() string {
@@ -566,7 +660,7 @@ func (rc *regexConstraint) path() string {
566660
func (rc *regexConstraint) matches(v parquet.Value) bool {
567661
accept, seen := rc.cache[v]
568662
if !seen {
569-
accept = rc.r.MatchString(util.YoloString(v.ByteArray()))
663+
accept = rc.r.Matches(util.YoloString(v.ByteArray()))
570664
rc.cache[v] = accept
571665
}
572666
return accept

search/constraint_test.go

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,22 @@ func buildFile[T any](t testing.TB, rows []T) storage.ParquetShard {
6262
return shard
6363
}
6464

65-
func mustNewFastRegexMatcher(t testing.TB, s string) *labels.FastRegexMatcher {
66-
res, err := labels.NewFastRegexMatcher(s)
65+
func mustNewMatcher(t testing.TB, s string) *labels.Matcher {
66+
res, err := labels.NewMatcher(labels.MatchRegexp, "doesntmatter", s)
6767
if err != nil {
6868
t.Fatalf("unable to build fast regex matcher: %s", err)
6969
}
7070
return res
7171
}
7272

73+
func mustRegexConstraint(t testing.TB, col string, m *labels.Matcher) Constraint {
74+
res, err := Regex(col, m)
75+
if err != nil {
76+
t.Fatalf("unable to build regex constraint: %s", err)
77+
}
78+
return res
79+
}
80+
7381
func BenchmarkConstraints(b *testing.B) {
7482
type s struct {
7583
A string `parquet:",optional,dict"`
@@ -112,14 +120,14 @@ func BenchmarkConstraints(b *testing.B) {
112120
c: []Constraint{
113121
Equal("A", parquet.ValueOf(rows[0].A)),
114122
Equal("B", parquet.ValueOf(rows[0].B)),
115-
Regex("Random", mustNewFastRegexMatcher(b, rows[0].Random)),
123+
mustRegexConstraint(b, "Random", mustNewMatcher(b, rows[0].Random)),
116124
},
117125
},
118126
{
119127
c: []Constraint{
120128
Equal("A", parquet.ValueOf(rows[len(rows)-1].A)),
121129
Equal("B", parquet.ValueOf(rows[len(rows)-1].B)),
122-
Regex("Random", mustNewFastRegexMatcher(b, rows[len(rows)-1].Random)),
130+
mustRegexConstraint(b, "Random", mustNewMatcher(b, rows[len(rows)-1].Random)),
123131
},
124132
},
125133
}
@@ -161,7 +169,7 @@ func TestContextCancelled(t *testing.T) {
161169

162170
for _, c := range []Constraint{
163171
Equal("A", parquet.ValueOf(rows[len(rows)-1].A)),
164-
Regex("A", mustNewFastRegexMatcher(t, rows[len(rows)-1].A)),
172+
mustRegexConstraint(t, "A", mustNewMatcher(t, rows[len(rows)-1].A)),
165173
Not(Equal("A", parquet.ValueOf(rows[len(rows)-1].A))),
166174
} {
167175
if err := Initialize(shard.LabelsFile(), c); err != nil {
@@ -258,7 +266,7 @@ func TestFilter(t *testing.T) {
258266
},
259267
{
260268
constraints: []Constraint{
261-
Regex("C", mustNewFastRegexMatcher(t, "a|c|d")),
269+
mustRegexConstraint(t, "C", mustNewMatcher(t, "a|c|d")),
262270
},
263271
expect: []RowRange{
264272
{From: 0, Count: 1},
@@ -368,7 +376,7 @@ func TestFilter(t *testing.T) {
368376
expectations: []expectation{
369377
{
370378
constraints: []Constraint{
371-
Regex("C", mustNewFastRegexMatcher(t, "f.*")),
379+
mustRegexConstraint(t, "C", mustNewMatcher(t, "f.*")),
372380
},
373381
expect: []RowRange{
374382
{From: 0, Count: 1},
@@ -377,7 +385,7 @@ func TestFilter(t *testing.T) {
377385
},
378386
{
379387
constraints: []Constraint{
380-
Regex("C", mustNewFastRegexMatcher(t, "b.*")),
388+
mustRegexConstraint(t, "C", mustNewMatcher(t, "b.*")),
381389
},
382390
expect: []RowRange{
383391
{From: 1, Count: 1},
@@ -386,7 +394,7 @@ func TestFilter(t *testing.T) {
386394
},
387395
{
388396
constraints: []Constraint{
389-
Regex("C", mustNewFastRegexMatcher(t, "f.*|b.*")),
397+
mustRegexConstraint(t, "C", mustNewMatcher(t, "f.*|b.*")),
390398
},
391399
expect: []RowRange{
392400
{From: 0, Count: 4},
@@ -440,14 +448,14 @@ func TestFilter(t *testing.T) {
440448
{
441449
constraints: []Constraint{
442450
Equal("A", parquet.ValueOf("1")),
443-
Regex("None", mustNewFastRegexMatcher(t, "f.*|b.*")),
451+
mustRegexConstraint(t, "None", mustNewMatcher(t, "f.*|b.*")),
444452
},
445453
expect: []RowRange{},
446454
},
447455
{
448456
constraints: []Constraint{
449457
Equal("A", parquet.ValueOf("1")),
450-
Regex("None", mustNewFastRegexMatcher(t, "f.*|b.*|")),
458+
mustRegexConstraint(t, "None", mustNewMatcher(t, "f.*|b.*|")),
451459
},
452460
expect: []RowRange{
453461
{From: 0, Count: 2},

0 commit comments

Comments
 (0)