From b8d46b0ca3dc0c9b3eef64dd3ccd754571aa55ac Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Mon, 1 Sep 2025 17:20:58 +0300 Subject: [PATCH 1/2] perf: use `bytes.Index` for substring search --- pattern/pattern.go | 8 +- pattern/substring.go | 62 +++---------- pattern/substring_test.go | 178 +++++++++++++++++++++++--------------- 3 files changed, 121 insertions(+), 127 deletions(-) diff --git a/pattern/pattern.go b/pattern/pattern.go index 270e22ce..0dca70f1 100644 --- a/pattern/pattern.go +++ b/pattern/pattern.go @@ -75,7 +75,7 @@ type wildcardSearch struct { baseSearch prefix []byte suffix []byte - middle []*substring + middle [][]byte middleLen int narrowed bool } @@ -96,9 +96,9 @@ func newWildcardSearch(base baseSearch, token *parser.Literal) *wildcardSearch { // all of the rest can be an asterix or a middle for i := 1; i < len(terms)-1; i++ { if terms[i].Kind == parser.TermText { - term := newSubstringPattern([]byte(terms[i].Data)) - s.middle = append(s.middle, term) - s.middleLen += len(terms[i].Data) + val := util.StringToByteUnsafe(terms[i].Data) + s.middle = append(s.middle, val) + s.middleLen += len(val) } } return s diff --git a/pattern/substring.go b/pattern/substring.go index d4a1d4a5..6c0f7ac4 100644 --- a/pattern/substring.go +++ b/pattern/substring.go @@ -1,59 +1,17 @@ package pattern -/* - Finds substrings in a string in a fast (linear way) - Uses prefix function (kinda KMP algorithm) - For fast usage precalc prefFunc (via calcPrefFunc) for every substring - Then, call 'findSequence' and it will try to found all substrings in O(len(string)) (linear time!) -*/ +import ( + "bytes" +) -type substring struct { - val []byte - prefFunc []int32 -} - -func newSubstringPattern(str []byte) *substring { - s := substring{val: str, prefFunc: make([]int32, len(str))} - s.calcPrefFunc() - return &s -} - -func (s *substring) calcPrefFunc() { - curPrefFunc := int32(0) - for i, b := range s.val[1:] { - for curPrefFunc > 0 && b != s.val[curPrefFunc] { - curPrefFunc = s.prefFunc[curPrefFunc-1] - } - if b == s.val[curPrefFunc] { - curPrefFunc++ - } - s.prefFunc[i+1] = curPrefFunc - } -} - -func findSubstring(s []byte, to *substring) int { - curPrefFunc := int32(0) - for i, b := range s { - for curPrefFunc > 0 && b != to.val[curPrefFunc] { - curPrefFunc = to.prefFunc[curPrefFunc-1] - } - if b == to.val[curPrefFunc] { - curPrefFunc++ - } - if curPrefFunc == int32(len(to.val)) { - return i + 1 - } - } - return -1 -} - -func findSequence(s []byte, to []*substring) int { - for cur := 0; cur < len(to); cur++ { - end := findSubstring(s, to[cur]) - if end == -1 { +func findSequence(haystack []byte, needles [][]byte) int { + for cur := range needles { + val := needles[cur] + start := bytes.Index(haystack, needles[cur]) + if start == -1 { return cur } - s = s[end:] + haystack = haystack[start+len(val):] } - return len(to) + return len(needles) } diff --git a/pattern/substring_test.go b/pattern/substring_test.go index e61fc72c..5033af41 100644 --- a/pattern/substring_test.go +++ b/pattern/substring_test.go @@ -1,99 +1,135 @@ package pattern import ( + "math/rand" + "strconv" + "strings" "testing" "github.com/stretchr/testify/assert" ) -func TestPrefixFunction(t *testing.T) { +func testFindSequence(a *assert.Assertions, cnt int, needles []string, haystack string) { + var needlesB [][]byte + for _, needle := range needles { + needlesB = append(needlesB, []byte(needle)) + } + res := findSequence([]byte(haystack), needlesB) + a.Equal(cnt, res, "wrong total number of matches") +} + +func TestFindSequence(t *testing.T) { a := assert.New(t) - var str *substring - str = newSubstringPattern([]byte("aabaaab")) - a.Equal([]int32{0, 1, 0, 1, 2, 2, 3}, str.prefFunc, "wrong prefix function") + testFindSequence(a, 2, []string{"abra", "ada"}, "abracadabra") + testFindSequence(a, 2, []string{"aba", "aba"}, "abacaba") + testFindSequence(a, 2, []string{"aba", "caba"}, "abacaba") + testFindSequence(a, 1, []string{"abacaba"}, "abacaba") + testFindSequence(a, 0, []string{"abacaba"}, "aba") + testFindSequence(a, 1, []string{"aba"}, "abacaba") + testFindSequence(a, 0, []string{"dad"}, "abacaba") + testFindSequence(a, 1, []string{"aba", "dad"}, "abacaba") + testFindSequence(a, 0, []string{"dad", "aba"}, "abacaba") - str = newSubstringPattern([]byte("abacaba")) - a.Equal([]int32{0, 0, 1, 0, 1, 2, 3}, str.prefFunc, "wrong prefix function") + testFindSequence(a, 2, []string{"needle", "haystack"}, "can you find a needle in a haystack?") + testFindSequence(a, 2, []string{"k8s_pod", "_prod"}, "\"k8s_pod\":{\"main_prod\"}") - str = newSubstringPattern([]byte("abracadabra")) - a.Equal([]int32{0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 4}, str.prefFunc, "wrong prefix function") + testFindSequence(a, 2, []string{"!13", "37#"}, "woah!13@37#test") - str = newSubstringPattern([]byte("abacdadba")) - a.Equal([]int32{0, 0, 1, 0, 0, 1, 0, 0, 1}, str.prefFunc, "wrong prefix function") + testFindSequence(a, 1, []string{"abc"}, strings.Repeat("ab", 1024)+"c") +} - str = newSubstringPattern([]byte("!@#\"123{}();'!@#")) - a.Equal([]int32{1, 2, 3}, str.prefFunc[len(str.prefFunc)-3:], "wrong prefix function") +func BenchmarkFindSequence_Deterministic(b *testing.B) { + type testCase struct { + haystack []byte + needles [][]byte + } - str = newSubstringPattern([]byte("template#find template in templates text")) - a.Equal(int32(8), str.prefFunc[21], "wrong prefix function") - a.Equal(int32(8), str.prefFunc[33], "wrong prefix function") - a.Equal(int32(0), str.prefFunc[34], "wrong prefix function") -} + type namedTestCase struct { + name string + cases []testCase + } -func testSubstring(a *assert.Assertions, cnt int, substr, text string) { - subs := newSubstringPattern([]byte(substr)) - str := []byte(text) - total := 0 - for { - to := findSubstring(str, subs) - if to == -1 { - break - } - total++ - a.Equal(string(subs.val), string(str[to-len(subs.val):to]), "substring doesn't match") - str = str[to:] + testCases := []namedTestCase{ + { + name: "regular-cases", + cases: []testCase{ + {bb("Hello, world!"), [][]byte{bb("orl")}}, + {bb("some-k8s-service"), [][]byte{bb("k8s")}}, + }, + }, + { + name: "corner-cases", + cases: []testCase{ + {bb(strings.Repeat("ab", 32) + "c"), [][]byte{bb("abc")}}, + {bb(strings.Repeat("ab", 64) + "c"), [][]byte{bb("abc")}}, + {bb(strings.Repeat("ab", 1024) + "c"), [][]byte{bb("abc")}}, + {bb(strings.Repeat("ab", 16384) + "c"), [][]byte{bb("abc")}}, + }, + }, } - a.Equal(cnt, total, "wrong total number of matches") -} -func testSequence(a *assert.Assertions, cnt int, substr []string, text string) { - subs := make([]*substring, len(substr)) - for i, s := range substr { - subs[i] = newSubstringPattern([]byte(s)) + for _, tc := range testCases { + for i, c := range tc.cases { + b.Run(tc.name+"-"+strconv.Itoa(i), func(b *testing.B) { + for b.Loop() { + findSequence([]byte(c.haystack), c.needles) + } + }) + } } - res := findSequence([]byte(text), subs) - a.Equal(cnt, res, "wrong total number of matches") } -func TestSubstring(t *testing.T) { - a := assert.New(t) +func BenchmarkFindSequence_Random(b *testing.B) { + sizes := []struct { + name string + haystackSize int + needleSize int + needleCount int + }{ + {"tiny", 64, 3, 2}, + {"small", 256, 10, 3}, + {"medium", 1024, 50, 5}, + {"large", 16384, 200, 10}, + {"extra-large", 1048576, 1024, 100}, + } - testSubstring(a, 2, "aba", "abacaba") - testSubstring(a, 0, "abc", "abacaba") - testSubstring(a, 1, "abacaba", "abacaba") - testSubstring(a, 0, "abacaba", "aba") - testSubstring(a, 0, "longtext", "a") - testSubstring(a, 4, "a", "abacaba") - testSubstring(a, 0, "d", "abacaba") - testSubstring(a, 1, "aca", "abacaba") - testSubstring(a, 3, "aab", "aabaaabaab") - testSubstring(a, 2, "aa", "aaaaa") // actually there are 4, but for our purposes we want this behaviour - testSubstring(a, 1, "abaab", "abaabaab") // actually there are 2 - - testSubstring(a, 1, "needle", "can you find a needle in a haystack?") - testSubstring(a, 1, "haystack", "can you find a needle in a haystack?") - testSubstring(a, 0, "elephant", "can you find a needle in a haystack?") - - testSubstring(a, 1, "@", "symbols@test") - testSubstring(a, 1, "!1337#", "woah!1337#test") + for _, size := range sizes { + b.Run(size.name, func(b *testing.B) { + haystack, needles := generateTestData( + size.haystackSize, size.needleSize, size.needleCount, 256, + ) + b.ResetTimer() + for b.Loop() { + findSequence(haystack, needles) + b.SetBytes(int64(len(haystack))) + } + }) + } } -func TestSequence(t *testing.T) { - a := assert.New(t) +func generateTestData(haystackSize, needleSize, needleCount, charset int) ([]byte, [][]byte) { + haystack := generateRandomBytes(haystackSize, charset) - testSequence(a, 2, []string{"abra", "ada"}, "abracadabra") - testSequence(a, 2, []string{"aba", "aba"}, "abacaba") - testSequence(a, 2, []string{"aba", "caba"}, "abacaba") - testSequence(a, 1, []string{"abacaba"}, "abacaba") - testSequence(a, 0, []string{"abacaba"}, "aba") - testSequence(a, 1, []string{"aba"}, "abacaba") - testSequence(a, 0, []string{"dad"}, "abacaba") - testSequence(a, 1, []string{"aba", "dad"}, "abacaba") - testSequence(a, 0, []string{"dad", "aba"}, "abacaba") + needles := make([][]byte, needleCount) + for i := range needleCount { + pattern := generateRandomBytes(needleSize, charset) + pos := rand.Intn(len(haystack) - needleSize) + copy(haystack[pos:], pattern) + needles[i] = pattern + } + + return haystack, needles +} - testSequence(a, 2, []string{"needle", "haystack"}, "can you find a needle in a haystack?") - testSequence(a, 2, []string{"k8s_pod", "_prod"}, "\"k8s_pod\":{\"main_prod\"}") +func generateRandomBytes(size, charset int) []byte { + b := make([]byte, size) + for i := range b { + b[i] = byte(rand.Intn(charset)) + } + return b +} - testSequence(a, 2, []string{"!13", "37#"}, "woah!13@37#test") +func bb(s string) []byte { + return []byte(s) } From fe6da01fad24d639cde47c71a1562bd63849196c Mon Sep 17 00:00:00 2001 From: Daniil Porokhnin Date: Wed, 10 Sep 2025 11:46:31 +0300 Subject: [PATCH 2/2] chore: remove substring file --- pattern/pattern.go | 11 ++++ pattern/pattern_test.go | 126 +++++++++++++++++++++++++++++++++++ pattern/substring.go | 17 ----- pattern/substring_test.go | 135 -------------------------------------- 4 files changed, 137 insertions(+), 152 deletions(-) delete mode 100644 pattern/substring.go delete mode 100644 pattern/substring_test.go diff --git a/pattern/pattern.go b/pattern/pattern.go index 0dca70f1..182dd989 100644 --- a/pattern/pattern.go +++ b/pattern/pattern.go @@ -151,6 +151,17 @@ func (s *wildcardSearch) checkMiddle(val []byte) bool { return findSequence(val[len(s.prefix):len(val)-len(s.suffix)], s.middle) == len(s.middle) } +func findSequence(haystack []byte, needles [][]byte) int { + for cur, val := range needles { + start := bytes.Index(haystack, val) + if start == -1 { + return cur + } + haystack = haystack[start+len(val):] + } + return len(needles) +} + func (s *wildcardSearch) check(val []byte) bool { return s.checkPrefix(val) && s.checkSuffix(val) && s.checkMiddle(val) } diff --git a/pattern/pattern_test.go b/pattern/pattern_test.go index 362eb614..2c9a5702 100644 --- a/pattern/pattern_test.go +++ b/pattern/pattern_test.go @@ -6,6 +6,7 @@ import ( "math/rand" "sort" "strconv" + "strings" "testing" "github.com/stretchr/testify/assert" @@ -482,3 +483,128 @@ func TestPatternIPRange(t *testing.T) { testAll(t, tp, tests) } + +func testFindSequence(a *assert.Assertions, cnt int, needles []string, haystack string) { + var needlesB [][]byte + for _, needle := range needles { + needlesB = append(needlesB, []byte(needle)) + } + res := findSequence([]byte(haystack), needlesB) + a.Equal(cnt, res, "wrong total number of matches") +} + +func TestFindSequence(t *testing.T) { + a := assert.New(t) + + testFindSequence(a, 2, []string{"abra", "ada"}, "abracadabra") + testFindSequence(a, 2, []string{"aba", "aba"}, "abacaba") + testFindSequence(a, 2, []string{"aba", "caba"}, "abacaba") + testFindSequence(a, 1, []string{"abacaba"}, "abacaba") + testFindSequence(a, 0, []string{"abacaba"}, "aba") + testFindSequence(a, 1, []string{"aba"}, "abacaba") + testFindSequence(a, 0, []string{"dad"}, "abacaba") + testFindSequence(a, 1, []string{"aba", "dad"}, "abacaba") + testFindSequence(a, 0, []string{"dad", "aba"}, "abacaba") + + testFindSequence(a, 2, []string{"needle", "haystack"}, "can you find a needle in a haystack?") + testFindSequence(a, 2, []string{"k8s_pod", "_prod"}, "\"k8s_pod\":{\"main_prod\"}") + + testFindSequence(a, 2, []string{"!13", "37#"}, "woah!13@37#test") + + testFindSequence(a, 1, []string{"abc"}, strings.Repeat("ab", 1024)+"c") +} + +func BenchmarkFindSequence_Deterministic(b *testing.B) { + type testCase struct { + haystack []byte + needles [][]byte + } + + type namedTestCase struct { + name string + cases []testCase + } + + testCases := []namedTestCase{ + { + name: "regular-cases", + cases: []testCase{ + {bb("Hello, world!"), [][]byte{bb("orl")}}, + {bb("some-k8s-service"), [][]byte{bb("k8s")}}, + }, + }, + { + name: "corner-cases", + cases: []testCase{ + {bb(strings.Repeat("ab", 32) + "c"), [][]byte{bb("abc")}}, + {bb(strings.Repeat("ab", 64) + "c"), [][]byte{bb("abc")}}, + {bb(strings.Repeat("ab", 1024) + "c"), [][]byte{bb("abc")}}, + {bb(strings.Repeat("ab", 16384) + "c"), [][]byte{bb("abc")}}, + }, + }, + } + + for _, tc := range testCases { + for i, c := range tc.cases { + b.Run(tc.name+"-"+strconv.Itoa(i), func(b *testing.B) { + for b.Loop() { + findSequence([]byte(c.haystack), c.needles) + } + }) + } + } +} + +func BenchmarkFindSequence_Random(b *testing.B) { + sizes := []struct { + name string + haystackSize int + needleSize int + needleCount int + }{ + {"tiny", 64, 3, 2}, + {"small", 256, 10, 3}, + {"medium", 1024, 50, 5}, + {"large", 16384, 200, 10}, + {"extra-large", 1048576, 1024, 100}, + } + + for _, size := range sizes { + b.Run(size.name, func(b *testing.B) { + haystack, needles := generateTestData( + size.haystackSize, size.needleSize, size.needleCount, 256, + ) + b.ResetTimer() + for b.Loop() { + findSequence(haystack, needles) + b.SetBytes(int64(len(haystack))) + } + }) + } +} + +func generateTestData(haystackSize, needleSize, needleCount, charset int) ([]byte, [][]byte) { + haystack := generateRandomBytes(haystackSize, charset) + + needles := make([][]byte, needleCount) + for i := range needleCount { + pattern := generateRandomBytes(needleSize, charset) + pos := rand.Intn(len(haystack) - needleSize) + copy(haystack[pos:], pattern) + needles[i] = pattern + } + + return haystack, needles +} + +func generateRandomBytes(size, charset int) []byte { + b := make([]byte, size) + for i := range b { + b[i] = byte(rand.Intn(charset)) + } + return b +} + +func bb(s string) []byte { + return []byte(s) +} diff --git a/pattern/substring.go b/pattern/substring.go deleted file mode 100644 index 6c0f7ac4..00000000 --- a/pattern/substring.go +++ /dev/null @@ -1,17 +0,0 @@ -package pattern - -import ( - "bytes" -) - -func findSequence(haystack []byte, needles [][]byte) int { - for cur := range needles { - val := needles[cur] - start := bytes.Index(haystack, needles[cur]) - if start == -1 { - return cur - } - haystack = haystack[start+len(val):] - } - return len(needles) -} diff --git a/pattern/substring_test.go b/pattern/substring_test.go deleted file mode 100644 index 5033af41..00000000 --- a/pattern/substring_test.go +++ /dev/null @@ -1,135 +0,0 @@ -package pattern - -import ( - "math/rand" - "strconv" - "strings" - "testing" - - "github.com/stretchr/testify/assert" -) - -func testFindSequence(a *assert.Assertions, cnt int, needles []string, haystack string) { - var needlesB [][]byte - for _, needle := range needles { - needlesB = append(needlesB, []byte(needle)) - } - res := findSequence([]byte(haystack), needlesB) - a.Equal(cnt, res, "wrong total number of matches") -} - -func TestFindSequence(t *testing.T) { - a := assert.New(t) - - testFindSequence(a, 2, []string{"abra", "ada"}, "abracadabra") - testFindSequence(a, 2, []string{"aba", "aba"}, "abacaba") - testFindSequence(a, 2, []string{"aba", "caba"}, "abacaba") - testFindSequence(a, 1, []string{"abacaba"}, "abacaba") - testFindSequence(a, 0, []string{"abacaba"}, "aba") - testFindSequence(a, 1, []string{"aba"}, "abacaba") - testFindSequence(a, 0, []string{"dad"}, "abacaba") - testFindSequence(a, 1, []string{"aba", "dad"}, "abacaba") - testFindSequence(a, 0, []string{"dad", "aba"}, "abacaba") - - testFindSequence(a, 2, []string{"needle", "haystack"}, "can you find a needle in a haystack?") - testFindSequence(a, 2, []string{"k8s_pod", "_prod"}, "\"k8s_pod\":{\"main_prod\"}") - - testFindSequence(a, 2, []string{"!13", "37#"}, "woah!13@37#test") - - testFindSequence(a, 1, []string{"abc"}, strings.Repeat("ab", 1024)+"c") -} - -func BenchmarkFindSequence_Deterministic(b *testing.B) { - type testCase struct { - haystack []byte - needles [][]byte - } - - type namedTestCase struct { - name string - cases []testCase - } - - testCases := []namedTestCase{ - { - name: "regular-cases", - cases: []testCase{ - {bb("Hello, world!"), [][]byte{bb("orl")}}, - {bb("some-k8s-service"), [][]byte{bb("k8s")}}, - }, - }, - { - name: "corner-cases", - cases: []testCase{ - {bb(strings.Repeat("ab", 32) + "c"), [][]byte{bb("abc")}}, - {bb(strings.Repeat("ab", 64) + "c"), [][]byte{bb("abc")}}, - {bb(strings.Repeat("ab", 1024) + "c"), [][]byte{bb("abc")}}, - {bb(strings.Repeat("ab", 16384) + "c"), [][]byte{bb("abc")}}, - }, - }, - } - - for _, tc := range testCases { - for i, c := range tc.cases { - b.Run(tc.name+"-"+strconv.Itoa(i), func(b *testing.B) { - for b.Loop() { - findSequence([]byte(c.haystack), c.needles) - } - }) - } - } -} - -func BenchmarkFindSequence_Random(b *testing.B) { - sizes := []struct { - name string - haystackSize int - needleSize int - needleCount int - }{ - {"tiny", 64, 3, 2}, - {"small", 256, 10, 3}, - {"medium", 1024, 50, 5}, - {"large", 16384, 200, 10}, - {"extra-large", 1048576, 1024, 100}, - } - - for _, size := range sizes { - b.Run(size.name, func(b *testing.B) { - haystack, needles := generateTestData( - size.haystackSize, size.needleSize, size.needleCount, 256, - ) - b.ResetTimer() - for b.Loop() { - findSequence(haystack, needles) - b.SetBytes(int64(len(haystack))) - } - }) - } -} - -func generateTestData(haystackSize, needleSize, needleCount, charset int) ([]byte, [][]byte) { - haystack := generateRandomBytes(haystackSize, charset) - - needles := make([][]byte, needleCount) - for i := range needleCount { - pattern := generateRandomBytes(needleSize, charset) - pos := rand.Intn(len(haystack) - needleSize) - copy(haystack[pos:], pattern) - needles[i] = pattern - } - - return haystack, needles -} - -func generateRandomBytes(size, charset int) []byte { - b := make([]byte, size) - for i := range b { - b[i] = byte(rand.Intn(charset)) - } - return b -} - -func bb(s string) []byte { - return []byte(s) -}