Skip to content

Commit abb9bb5

Browse files
committed
perf: use bytes.Index for substring search
1 parent 3b00e02 commit abb9bb5

File tree

3 files changed

+121
-127
lines changed

3 files changed

+121
-127
lines changed

pattern/pattern.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ type wildcardSearch struct {
7575
baseSearch
7676
prefix []byte
7777
suffix []byte
78-
middle []*substring
78+
middle [][]byte
7979
middleLen int
8080
narrowed bool
8181
}
@@ -96,9 +96,9 @@ func newWildcardSearch(base baseSearch, token *parser.Literal) *wildcardSearch {
9696
// all of the rest can be an asterisk or a middle
9797
for i := 1; i < len(terms)-1; i++ {
9898
if terms[i].Kind == parser.TermText {
99-
term := newSubstringPattern([]byte(terms[i].Data))
100-
s.middle = append(s.middle, term)
101-
s.middleLen += len(terms[i].Data)
99+
val := util.StringToByteUnsafe(terms[i].Data)
100+
s.middle = append(s.middle, val)
101+
s.middleLen += len(val)
102102
}
103103
}
104104
return s

pattern/substring.go

Lines changed: 10 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,17 @@
11
package pattern
22

3-
/*
4-
Finds substrings in a string in a fast (linear way)
5-
Uses prefix function (kinda KMP algorithm)
6-
For fast usage precalc prefFunc (via calcPrefFunc) for every substring
7-
Then, call 'findSequence' and it will try to found all substrings in O(len(string)) (linear time!)
8-
*/
3+
import (
4+
"bytes"
5+
)
96

10-
type substring struct {
11-
val []byte
12-
prefFunc []int32
13-
}
14-
15-
func newSubstringPattern(str []byte) *substring {
16-
s := substring{val: str, prefFunc: make([]int32, len(str))}
17-
s.calcPrefFunc()
18-
return &s
19-
}
20-
21-
func (s *substring) calcPrefFunc() {
22-
curPrefFunc := int32(0)
23-
for i, b := range s.val[1:] {
24-
for curPrefFunc > 0 && b != s.val[curPrefFunc] {
25-
curPrefFunc = s.prefFunc[curPrefFunc-1]
26-
}
27-
if b == s.val[curPrefFunc] {
28-
curPrefFunc++
29-
}
30-
s.prefFunc[i+1] = curPrefFunc
31-
}
32-
}
33-
34-
func findSubstring(s []byte, to *substring) int {
35-
curPrefFunc := int32(0)
36-
for i, b := range s {
37-
for curPrefFunc > 0 && b != to.val[curPrefFunc] {
38-
curPrefFunc = to.prefFunc[curPrefFunc-1]
39-
}
40-
if b == to.val[curPrefFunc] {
41-
curPrefFunc++
42-
}
43-
if curPrefFunc == int32(len(to.val)) {
44-
return i + 1
45-
}
46-
}
47-
return -1
48-
}
49-
50-
func findSequence(s []byte, to []*substring) int {
51-
for cur := 0; cur < len(to); cur++ {
52-
end := findSubstring(s, to[cur])
53-
if end == -1 {
7+
func findSequence(haystack []byte, needles [][]byte) int {
8+
for cur := range needles {
9+
val := needles[cur]
10+
start := bytes.Index(haystack, needles[cur])
11+
if start == -1 {
5412
return cur
5513
}
56-
s = s[end:]
14+
haystack = haystack[start+len(val):]
5715
}
58-
return len(to)
16+
return len(needles)
5917
}

pattern/substring_test.go

Lines changed: 107 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -1,99 +1,135 @@
11
package pattern
22

33
import (
4+
"math/rand"
5+
"strconv"
6+
"strings"
47
"testing"
58

69
"github.com/stretchr/testify/assert"
710
)
811

9-
func TestPrefixFunction(t *testing.T) {
12+
func testFindSequence(a *assert.Assertions, cnt int, needles []string, haystack string) {
13+
var needlesB [][]byte
14+
for _, needle := range needles {
15+
needlesB = append(needlesB, []byte(needle))
16+
}
17+
res := findSequence([]byte(haystack), needlesB)
18+
a.Equal(cnt, res, "wrong total number of matches")
19+
}
20+
21+
func TestFindSequence(t *testing.T) {
1022
a := assert.New(t)
11-
var str *substring
1223

13-
str = newSubstringPattern([]byte("aabaaab"))
14-
a.Equal([]int32{0, 1, 0, 1, 2, 2, 3}, str.prefFunc, "wrong prefix function")
24+
testFindSequence(a, 2, []string{"abra", "ada"}, "abracadabra")
25+
testFindSequence(a, 2, []string{"aba", "aba"}, "abacaba")
26+
testFindSequence(a, 2, []string{"aba", "caba"}, "abacaba")
27+
testFindSequence(a, 1, []string{"abacaba"}, "abacaba")
28+
testFindSequence(a, 0, []string{"abacaba"}, "aba")
29+
testFindSequence(a, 1, []string{"aba"}, "abacaba")
30+
testFindSequence(a, 0, []string{"dad"}, "abacaba")
31+
testFindSequence(a, 1, []string{"aba", "dad"}, "abacaba")
32+
testFindSequence(a, 0, []string{"dad", "aba"}, "abacaba")
1533

16-
str = newSubstringPattern([]byte("abacaba"))
17-
a.Equal([]int32{0, 0, 1, 0, 1, 2, 3}, str.prefFunc, "wrong prefix function")
34+
testFindSequence(a, 2, []string{"needle", "haystack"}, "can you find a needle in a haystack?")
35+
testFindSequence(a, 2, []string{"k8s_pod", "_prod"}, "\"k8s_pod\":{\"main_prod\"}")
1836

19-
str = newSubstringPattern([]byte("abracadabra"))
20-
a.Equal([]int32{0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 4}, str.prefFunc, "wrong prefix function")
37+
testFindSequence(a, 2, []string{"!13", "37#"}, "woah!13@37#test")
2138

22-
str = newSubstringPattern([]byte("abacdadba"))
23-
a.Equal([]int32{0, 0, 1, 0, 0, 1, 0, 0, 1}, str.prefFunc, "wrong prefix function")
39+
testFindSequence(a, 1, []string{"abc"}, strings.Repeat("ab", 1024)+"c")
40+
}
2441

25-
str = newSubstringPattern([]byte("!@#\"123{}();'!@#"))
26-
a.Equal([]int32{1, 2, 3}, str.prefFunc[len(str.prefFunc)-3:], "wrong prefix function")
42+
func BenchmarkFindSequence_Deterministic(b *testing.B) {
43+
type testCase struct {
44+
haystack []byte
45+
needles [][]byte
46+
}
2747

28-
str = newSubstringPattern([]byte("template#find template in templates text"))
29-
a.Equal(int32(8), str.prefFunc[21], "wrong prefix function")
30-
a.Equal(int32(8), str.prefFunc[33], "wrong prefix function")
31-
a.Equal(int32(0), str.prefFunc[34], "wrong prefix function")
32-
}
48+
type namedTestCase struct {
49+
name string
50+
cases []testCase
51+
}
3352

34-
func testSubstring(a *assert.Assertions, cnt int, substr, text string) {
35-
subs := newSubstringPattern([]byte(substr))
36-
str := []byte(text)
37-
total := 0
38-
for {
39-
to := findSubstring(str, subs)
40-
if to == -1 {
41-
break
42-
}
43-
total++
44-
a.Equal(string(subs.val), string(str[to-len(subs.val):to]), "substring doesn't match")
45-
str = str[to:]
53+
testCases := []namedTestCase{
54+
{
55+
name: "regular-cases",
56+
cases: []testCase{
57+
{bb("Hello, world!"), [][]byte{bb("orl")}},
58+
{bb("some-k8s-service"), [][]byte{bb("k8s")}},
59+
},
60+
},
61+
{
62+
name: "corner-cases",
63+
cases: []testCase{
64+
{bb(strings.Repeat("ab", 32) + "c"), [][]byte{bb("abc")}},
65+
{bb(strings.Repeat("ab", 64) + "c"), [][]byte{bb("abc")}},
66+
{bb(strings.Repeat("ab", 1024) + "c"), [][]byte{bb("abc")}},
67+
{bb(strings.Repeat("ab", 16384) + "c"), [][]byte{bb("abc")}},
68+
},
69+
},
4670
}
47-
a.Equal(cnt, total, "wrong total number of matches")
48-
}
4971

50-
func testSequence(a *assert.Assertions, cnt int, substr []string, text string) {
51-
subs := make([]*substring, len(substr))
52-
for i, s := range substr {
53-
subs[i] = newSubstringPattern([]byte(s))
72+
for _, tc := range testCases {
73+
for i, c := range tc.cases {
74+
b.Run(tc.name+"-"+strconv.Itoa(i), func(b *testing.B) {
75+
for b.Loop() {
76+
findSequence([]byte(c.haystack), c.needles)
77+
}
78+
})
79+
}
5480
}
55-
res := findSequence([]byte(text), subs)
56-
a.Equal(cnt, res, "wrong total number of matches")
5781
}
5882

59-
func TestSubstring(t *testing.T) {
60-
a := assert.New(t)
83+
func BenchmarkFindSequence_Random(b *testing.B) {
84+
sizes := []struct {
85+
name string
86+
haystackSize int
87+
needleSize int
88+
needleCount int
89+
}{
90+
{"tiny", 64, 3, 2},
91+
{"small", 256, 10, 3},
92+
{"medium", 1024, 50, 5},
93+
{"large", 16384, 200, 10},
94+
{"extra-large", 1048576, 1024, 100},
95+
}
6196

62-
testSubstring(a, 2, "aba", "abacaba")
63-
testSubstring(a, 0, "abc", "abacaba")
64-
testSubstring(a, 1, "abacaba", "abacaba")
65-
testSubstring(a, 0, "abacaba", "aba")
66-
testSubstring(a, 0, "longtext", "a")
67-
testSubstring(a, 4, "a", "abacaba")
68-
testSubstring(a, 0, "d", "abacaba")
69-
testSubstring(a, 1, "aca", "abacaba")
70-
testSubstring(a, 3, "aab", "aabaaabaab")
71-
testSubstring(a, 2, "aa", "aaaaa") // actually there are 4, but for our purposes we want this behaviour
72-
testSubstring(a, 1, "abaab", "abaabaab") // actually there are 2
73-
74-
testSubstring(a, 1, "needle", "can you find a needle in a haystack?")
75-
testSubstring(a, 1, "haystack", "can you find a needle in a haystack?")
76-
testSubstring(a, 0, "elephant", "can you find a needle in a haystack?")
77-
78-
testSubstring(a, 1, "@", "symbols@test")
79-
testSubstring(a, 1, "!1337#", "woah!1337#test")
97+
for _, size := range sizes {
98+
b.Run(size.name, func(b *testing.B) {
99+
haystack, needles := generateTestData(
100+
size.haystackSize, size.needleSize, size.needleCount, 256,
101+
)
102+
b.ResetTimer()
103+
for b.Loop() {
104+
findSequence(haystack, needles)
105+
b.SetBytes(int64(len(haystack)))
106+
}
107+
})
108+
}
80109
}
81110

82-
func TestSequence(t *testing.T) {
83-
a := assert.New(t)
111+
func generateTestData(haystackSize, needleSize, needleCount, charset int) ([]byte, [][]byte) {
112+
haystack := generateRandomBytes(haystackSize, charset)
84113

85-
testSequence(a, 2, []string{"abra", "ada"}, "abracadabra")
86-
testSequence(a, 2, []string{"aba", "aba"}, "abacaba")
87-
testSequence(a, 2, []string{"aba", "caba"}, "abacaba")
88-
testSequence(a, 1, []string{"abacaba"}, "abacaba")
89-
testSequence(a, 0, []string{"abacaba"}, "aba")
90-
testSequence(a, 1, []string{"aba"}, "abacaba")
91-
testSequence(a, 0, []string{"dad"}, "abacaba")
92-
testSequence(a, 1, []string{"aba", "dad"}, "abacaba")
93-
testSequence(a, 0, []string{"dad", "aba"}, "abacaba")
114+
needles := make([][]byte, needleCount)
115+
for i := range needleCount {
116+
pattern := generateRandomBytes(needleSize, charset)
117+
pos := rand.Intn(len(haystack) - needleSize)
118+
copy(haystack[pos:], pattern)
119+
needles[i] = pattern
120+
}
121+
122+
return haystack, needles
123+
}
94124

95-
testSequence(a, 2, []string{"needle", "haystack"}, "can you find a needle in a haystack?")
96-
testSequence(a, 2, []string{"k8s_pod", "_prod"}, "\"k8s_pod\":{\"main_prod\"}")
125+
func generateRandomBytes(size, charset int) []byte {
126+
b := make([]byte, size)
127+
for i := range b {
128+
b[i] = byte(rand.Intn(charset))
129+
}
130+
return b
131+
}
97132

98-
testSequence(a, 2, []string{"!13", "37#"}, "woah!13@37#test")
133+
func bb(s string) []byte {
134+
return []byte(s)
99135
}

0 commit comments

Comments
 (0)