Improve find performance

Signed-off-by: Daniel <845765@qq.com>
This commit is contained in:
Daniel
2026-01-28 10:44:42 +08:00
parent 34e3261bce
commit 1aa84bd749

View File

@@ -17,8 +17,8 @@
package search package search
import ( import (
"bufio"
"bytes" "bytes"
"io"
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
@@ -47,13 +47,20 @@ func FindAllMatches(root string, targets []string) []Match {
return nil return nil
} }
var searchBytes [][]byte // 构建基于首字节的模式索引,并计算最长模式长度
patternIndex := make(map[byte][][]byte)
var maxLen int
for _, t := range targets { for _, t := range targets {
if t != "" { if t == "" {
searchBytes = append(searchBytes, []byte(t)) continue
} }
b := []byte(t)
if len(b) > maxLen {
maxLen = len(b)
}
patternIndex[b[0]] = append(patternIndex[b[0]], b)
} }
if len(searchBytes) == 0 { if len(patternIndex) == 0 {
return nil return nil
} }
@@ -78,7 +85,7 @@ func FindAllMatches(root string, targets []string) []Match {
go func() { go func() {
defer wg.Done() defer wg.Done()
for p := range jobs { for p := range jobs {
hits := scanFileForTargets(p, searchBytes) hits := scanFileForTargets(p, patternIndex, maxLen)
if len(hits) > 0 { if len(hits) > 0 {
for _, t := range hits { for _, t := range hits {
results <- Match{Path: p, Target: t} results <- Match{Path: p, Target: t}
@@ -102,47 +109,73 @@ func FindAllMatches(root string, targets []string) []Match {
return matches return matches
} }
// scanFileForTargets 在文件中搜索所有目标,返回去重后的命中目标字符串列表 // scanFileForTargets 在文件中流式搜索所有目标(基于首字节索引),返回去重后的命中目标字符串列表
func scanFileForTargets(path string, targets [][]byte) []string { func scanFileForTargets(path string, patternIndex map[byte][][]byte, maxLen int) []string {
f, err := os.Open(path) f, err := os.Open(path)
if err != nil { if err != nil {
return nil return nil
} }
defer f.Close() defer f.Close()
reader := bufio.NewReaderSize(f, 1024*1024) // 1MB 缓冲 // 构建字节位图,加速首字节检测
var bitmap [256]bool
for b := range patternIndex {
bitmap[b] = true
}
found := make(map[string]struct{}) found := make(map[string]struct{})
remaining := len(targets) buf := make([]byte, 64<<10) // 64KB
// 保留上一次块末尾的重叠数据以支持跨块匹配
var tail []byte
for { for {
line, err := reader.ReadSlice('\n') n, err := f.Read(buf)
if len(line) > 0 { if n > 0 {
for _, t := range targets { // data = tail + buf[:n]
ts := string(t) data := make([]byte, len(tail)+n)
if _, ok := found[ts]; ok { copy(data, tail)
continue copy(data[len(tail):], buf[:n])
// 扫描 data,查找任意候选首字节位置
i := 0
for i < len(data) {
// 快速跳过非候选字节
for i < len(data) && !bitmap[data[i]] {
i++
} }
if bytes.Contains(line, t) { if i >= len(data) {
found[ts] = struct{}{} break
remaining-- }
if remaining == 0 { b := data[i]
// 找到所有目标,提前返回 // 对应首字节的所有模式进行校验
res := make([]string, 0, len(found)) for _, pat := range patternIndex[b] {
for k := range found { pl := len(pat)
res = append(res, k) // 如果剩余字节不足以完全匹配,则交由下一轮(通过 tail 保证)
if i+pl <= len(data) {
if bytes.Equal(pat, data[i:i+pl]) {
found[string(pat)] = struct{}{}
} }
return res
} }
} }
i++
}
// 保留最后 maxLen-1 字节作为下一块的 tail,避免超长内存分配
if maxLen <= 1 {
tail = nil
} else {
if len(data) >= maxLen-1 {
tail = append(tail[:0], data[len(data)-(maxLen-1):]...)
} else {
tail = append(tail[:0], data...)
}
} }
} }
if err != nil { if err != nil {
if err == bufio.ErrBufferFull { if err == io.EOF {
for err == bufio.ErrBufferFull { break
_, err = reader.ReadSlice('\n')
}
continue
} }
// 读取出错,返回已有结果
break break
} }
} }