perf: use bytes.Index for substring search

dkharms · dkharms · commit abb9bb536499 · 2025-09-02T20:10:38.000+03:00
diff --git a/pattern/pattern.go b/pattern/pattern.go
@@ -75,7 +75,7 @@ type wildcardSearch struct {
 	baseSearch
 	prefix    []byte
 	suffix    []byte
-	middle    []*substring
+	middle    [][]byte
 	middleLen int
 	narrowed  bool
 }
@@ -96,9 +96,9 @@ func newWildcardSearch(base baseSearch, token *parser.Literal) *wildcardSearch {
 	// all of the rest can be an asterix or a middle
 	for i := 1; i < len(terms)-1; i++ {
 		if terms[i].Kind == parser.TermText {
-			term := newSubstringPattern([]byte(terms[i].Data))
-			s.middle = append(s.middle, term)
-			s.middleLen += len(terms[i].Data)
+			val := util.StringToByteUnsafe(terms[i].Data)
+			s.middle = append(s.middle, val)
+			s.middleLen += len(val)
 		}
 	}
 	return s
diff --git a/pattern/substring.go b/pattern/substring.go
@@ -1,59 +1,17 @@
 package pattern
 
-/*
- Finds substrings in a string in a fast (linear way)
- Uses prefix function (kinda KMP algorithm)
- For fast usage precalc prefFunc (via calcPrefFunc) for every substring
- Then, call 'findSequence' and it will try to found all substrings in O(len(string)) (linear time!)
-*/
+import (
+	"bytes"
+)
 
-type substring struct {
-	val      []byte
-	prefFunc []int32
-}
-
-func newSubstringPattern(str []byte) *substring {
-	s := substring{val: str, prefFunc: make([]int32, len(str))}
-	s.calcPrefFunc()
-	return &s
-}
-
-func (s *substring) calcPrefFunc() {
-	curPrefFunc := int32(0)
-	for i, b := range s.val[1:] {
-		for curPrefFunc > 0 && b != s.val[curPrefFunc] {
-			curPrefFunc = s.prefFunc[curPrefFunc-1]
-		}
-		if b == s.val[curPrefFunc] {
-			curPrefFunc++
-		}
-		s.prefFunc[i+1] = curPrefFunc
-	}
-}
-
-func findSubstring(s []byte, to *substring) int {
-	curPrefFunc := int32(0)
-	for i, b := range s {
-		for curPrefFunc > 0 && b != to.val[curPrefFunc] {
-			curPrefFunc = to.prefFunc[curPrefFunc-1]
-		}
-		if b == to.val[curPrefFunc] {
-			curPrefFunc++
-		}
-		if curPrefFunc == int32(len(to.val)) {
-			return i + 1
-		}
-	}
-	return -1
-}
-
-func findSequence(s []byte, to []*substring) int {
-	for cur := 0; cur < len(to); cur++ {
-		end := findSubstring(s, to[cur])
-		if end == -1 {
+func findSequence(haystack []byte, needles [][]byte) int {
+	for cur := range needles {
+		val := needles[cur]
+		start := bytes.Index(haystack, needles[cur])
+		if start == -1 {
 			return cur
 		}
-		s = s[end:]
+		haystack = haystack[start+len(val):]
 	}
-	return len(to)
+	return len(needles)
 }
diff --git a/pattern/substring_test.go b/pattern/substring_test.go
@@ -1,99 +1,135 @@
 package pattern
 
 import (
+	"math/rand"
+	"strconv"
+	"strings"
 	"testing"
 
 	"github.com/stretchr/testify/assert"
 )
 
-func TestPrefixFunction(t *testing.T) {
+func testFindSequence(a *assert.Assertions, cnt int, needles []string, haystack string) {
+	var needlesB [][]byte
+	for _, needle := range needles {
+		needlesB = append(needlesB, []byte(needle))
+	}
+	res := findSequence([]byte(haystack), needlesB)
+	a.Equal(cnt, res, "wrong total number of matches")
+}
+
+func TestFindSequence(t *testing.T) {
 	a := assert.New(t)
-	var str *substring
 
-	str = newSubstringPattern([]byte("aabaaab"))
-	a.Equal([]int32{0, 1, 0, 1, 2, 2, 3}, str.prefFunc, "wrong prefix function")
+	testFindSequence(a, 2, []string{"abra", "ada"}, "abracadabra")
+	testFindSequence(a, 2, []string{"aba", "aba"}, "abacaba")
+	testFindSequence(a, 2, []string{"aba", "caba"}, "abacaba")
+	testFindSequence(a, 1, []string{"abacaba"}, "abacaba")
+	testFindSequence(a, 0, []string{"abacaba"}, "aba")
+	testFindSequence(a, 1, []string{"aba"}, "abacaba")
+	testFindSequence(a, 0, []string{"dad"}, "abacaba")
+	testFindSequence(a, 1, []string{"aba", "dad"}, "abacaba")
+	testFindSequence(a, 0, []string{"dad", "aba"}, "abacaba")
 
-	str = newSubstringPattern([]byte("abacaba"))
-	a.Equal([]int32{0, 0, 1, 0, 1, 2, 3}, str.prefFunc, "wrong prefix function")
+	testFindSequence(a, 2, []string{"needle", "haystack"}, "can you find a needle in a haystack?")
+	testFindSequence(a, 2, []string{"k8s_pod", "_prod"}, "\"k8s_pod\":{\"main_prod\"}")
 
-	str = newSubstringPattern([]byte("abracadabra"))
-	a.Equal([]int32{0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 4}, str.prefFunc, "wrong prefix function")
+	testFindSequence(a, 2, []string{"!13", "37#"}, "woah!13@37#test")
 
-	str = newSubstringPattern([]byte("abacdadba"))
-	a.Equal([]int32{0, 0, 1, 0, 0, 1, 0, 0, 1}, str.prefFunc, "wrong prefix function")
+	testFindSequence(a, 1, []string{"abc"}, strings.Repeat("ab", 1024)+"c")
+}
 
-	str = newSubstringPattern([]byte("!@#\"123{}();'!@#"))
-	a.Equal([]int32{1, 2, 3}, str.prefFunc[len(str.prefFunc)-3:], "wrong prefix function")
+func BenchmarkFindSequence_Deterministic(b *testing.B) {
+	type testCase struct {
+		haystack []byte
+		needles  [][]byte
+	}
 
-	str = newSubstringPattern([]byte("template#find template in templates text"))
-	a.Equal(int32(8), str.prefFunc[21], "wrong prefix function")
-	a.Equal(int32(8), str.prefFunc[33], "wrong prefix function")
-	a.Equal(int32(0), str.prefFunc[34], "wrong prefix function")
-}
+	type namedTestCase struct {
+		name  string
+		cases []testCase
+	}
 
-func testSubstring(a *assert.Assertions, cnt int, substr, text string) {
-	subs := newSubstringPattern([]byte(substr))
-	str := []byte(text)
-	total := 0
-	for {
-		to := findSubstring(str, subs)
-		if to == -1 {
-			break
-		}
-		total++
-		a.Equal(string(subs.val), string(str[to-len(subs.val):to]), "substring doesn't match")
-		str = str[to:]
+	testCases := []namedTestCase{
+		{
+			name: "regular-cases",
+			cases: []testCase{
+				{bb("Hello, world!"), [][]byte{bb("orl")}},
+				{bb("some-k8s-service"), [][]byte{bb("k8s")}},
+			},
+		},
+		{
+			name: "corner-cases",
+			cases: []testCase{
+				{bb(strings.Repeat("ab", 32) + "c"), [][]byte{bb("abc")}},
+				{bb(strings.Repeat("ab", 64) + "c"), [][]byte{bb("abc")}},
+				{bb(strings.Repeat("ab", 1024) + "c"), [][]byte{bb("abc")}},
+				{bb(strings.Repeat("ab", 16384) + "c"), [][]byte{bb("abc")}},
+			},
+		},
 	}
-	a.Equal(cnt, total, "wrong total number of matches")
-}
 
-func testSequence(a *assert.Assertions, cnt int, substr []string, text string) {
-	subs := make([]*substring, len(substr))
-	for i, s := range substr {
-		subs[i] = newSubstringPattern([]byte(s))
+	for _, tc := range testCases {
+		for i, c := range tc.cases {
+			b.Run(tc.name+"-"+strconv.Itoa(i), func(b *testing.B) {
+				for b.Loop() {
+					findSequence([]byte(c.haystack), c.needles)
+				}
+			})
+		}
 	}
-	res := findSequence([]byte(text), subs)
-	a.Equal(cnt, res, "wrong total number of matches")
 }
 
-func TestSubstring(t *testing.T) {
-	a := assert.New(t)
+func BenchmarkFindSequence_Random(b *testing.B) {
+	sizes := []struct {
+		name         string
+		haystackSize int
+		needleSize   int
+		needleCount  int
+	}{
+		{"tiny", 64, 3, 2},
+		{"small", 256, 10, 3},
+		{"medium", 1024, 50, 5},
+		{"large", 16384, 200, 10},
+		{"extra-large", 1048576, 1024, 100},
+	}
 
-	testSubstring(a, 2, "aba", "abacaba")
-	testSubstring(a, 0, "abc", "abacaba")
-	testSubstring(a, 1, "abacaba", "abacaba")
-	testSubstring(a, 0, "abacaba", "aba")
-	testSubstring(a, 0, "longtext", "a")
-	testSubstring(a, 4, "a", "abacaba")
-	testSubstring(a, 0, "d", "abacaba")
-	testSubstring(a, 1, "aca", "abacaba")
-	testSubstring(a, 3, "aab", "aabaaabaab")
-	testSubstring(a, 2, "aa", "aaaaa")       // actually there are 4, but for our purposes we want this behaviour
-	testSubstring(a, 1, "abaab", "abaabaab") // actually there are 2
-
-	testSubstring(a, 1, "needle", "can you find a needle in a haystack?")
-	testSubstring(a, 1, "haystack", "can you find a needle in a haystack?")
-	testSubstring(a, 0, "elephant", "can you find a needle in a haystack?")
-
-	testSubstring(a, 1, "@", "symbols@test")
-	testSubstring(a, 1, "!1337#", "woah!1337#test")
+	for _, size := range sizes {
+		b.Run(size.name, func(b *testing.B) {
+			haystack, needles := generateTestData(
+				size.haystackSize, size.needleSize, size.needleCount, 256,
+			)
+			b.ResetTimer()
+			for b.Loop() {
+				findSequence(haystack, needles)
+				b.SetBytes(int64(len(haystack)))
+			}
+		})
+	}
 }
 
-func TestSequence(t *testing.T) {
-	a := assert.New(t)
+func generateTestData(haystackSize, needleSize, needleCount, charset int) ([]byte, [][]byte) {
+	haystack := generateRandomBytes(haystackSize, charset)
 
-	testSequence(a, 2, []string{"abra", "ada"}, "abracadabra")
-	testSequence(a, 2, []string{"aba", "aba"}, "abacaba")
-	testSequence(a, 2, []string{"aba", "caba"}, "abacaba")
-	testSequence(a, 1, []string{"abacaba"}, "abacaba")
-	testSequence(a, 0, []string{"abacaba"}, "aba")
-	testSequence(a, 1, []string{"aba"}, "abacaba")
-	testSequence(a, 0, []string{"dad"}, "abacaba")
-	testSequence(a, 1, []string{"aba", "dad"}, "abacaba")
-	testSequence(a, 0, []string{"dad", "aba"}, "abacaba")
+	needles := make([][]byte, needleCount)
+	for i := range needleCount {
+		pattern := generateRandomBytes(needleSize, charset)
+		pos := rand.Intn(len(haystack) - needleSize)
+		copy(haystack[pos:], pattern)
+		needles[i] = pattern
+	}
+
+	return haystack, needles
+}
 
-	testSequence(a, 2, []string{"needle", "haystack"}, "can you find a needle in a haystack?")
-	testSequence(a, 2, []string{"k8s_pod", "_prod"}, "\"k8s_pod\":{\"main_prod\"}")
+func generateRandomBytes(size, charset int) []byte {
+	b := make([]byte, size)
+	for i := range b {
+		b[i] = byte(rand.Intn(charset))
+	}
+	return b
+}
 
-	testSequence(a, 2, []string{"!13", "37#"}, "woah!13@37#test")
+func bb(s string) []byte {
+	return []byte(s)
 }