Files
cloudreve/pkg/searcher/indexer/chunker.go
2026-02-11 16:05:09 +08:00

102 lines
2.3 KiB
Go

package indexer
import "strings"
// defaultMaxBytes is the chunk size limit, in bytes, that ChunkText falls back
// to when the caller passes a non-positive maxBytes.
const defaultMaxBytes = 2000
// ChunkText splits text into chunks of approximately maxBytes bytes each.
//
// Windows line endings (\r\n) are normalised to \n first, so documents that
// use them are still recognised as having paragraph breaks. The text is then
// split on blank lines (\n\n). Small paragraphs are combined, re-joined with
// "\n\n", until adding another one would push the chunk past maxBytes. A
// paragraph that is larger than maxBytes on its own is passed to splitByBytes
// to be broken at word boundaries.
//
// A non-positive maxBytes selects defaultMaxBytes. Empty or whitespace-only
// input returns nil.
func ChunkText(text string, maxBytes int) []string {
	if maxBytes <= 0 {
		maxBytes = defaultMaxBytes
	}

	// A CRLF document contains "\r\n\r\n" rather than "\n\n" between
	// paragraphs; without normalising it would be seen as one giant paragraph.
	text = strings.TrimSpace(strings.ReplaceAll(text, "\r\n", "\n"))
	if text == "" {
		return nil
	}

	const sep = "\n\n"
	var (
		chunks       []string
		pending      []string // paragraphs collected for the chunk being built
		pendingBytes int      // size of pending once joined with sep
	)
	// flush emits the paragraphs collected so far as one chunk.
	flush := func() {
		if len(pending) == 0 {
			return
		}
		chunks = append(chunks, strings.Join(pending, sep))
		pending = nil
		pendingBytes = 0
	}

	for _, para := range strings.Split(text, sep) {
		para = strings.TrimSpace(para)
		if para == "" {
			continue
		}

		// An oversized paragraph never shares a chunk: emit what we have,
		// then emit the paragraph's own word-boundary pieces.
		if len(para) > maxBytes {
			flush()
			chunks = append(chunks, splitByBytes(para, maxBytes)...)
			continue
		}

		// Start a new chunk if this paragraph plus its separator would not fit.
		if len(pending) > 0 && pendingBytes+len(sep)+len(para) > maxBytes {
			flush()
		}
		if len(pending) > 0 {
			pendingBytes += len(sep)
		}
		pending = append(pending, para)
		pendingBytes += len(para)
	}

	flush()
	return chunks
}
// splitByBytes splits text into chunks at word boundaries, each at most
// maxBytes bytes.
//
// Words are the whitespace-separated fields of text and are re-joined with a
// single space, so runs of whitespace (including newlines) are collapsed. A
// word that is longer than maxBytes on its own (for example unspaced CJK
// text, a long URL, or an encoded blob) is cut into pieces on rune boundaries
// so the size limit still holds and no multi-byte character is torn apart.
// The only chunk that can exceed maxBytes is a single rune that is itself
// wider than the limit; it is kept whole.
//
// maxBytes is expected to be positive. It returns nil when text contains no
// words.
func splitByBytes(text string, maxBytes int) []string {
	var (
		chunks       []string
		current      []string // words collected for the chunk being built
		currentBytes int      // size of current once joined with spaces
	)
	// flush emits the words collected so far as one chunk.
	flush := func() {
		if len(current) == 0 {
			return
		}
		chunks = append(chunks, strings.Join(current, " "))
		current = nil
		currentBytes = 0
	}

	for _, w := range strings.Fields(text) {
		// Hard-split a word that cannot fit in any chunk. Leading pieces are
		// emitted directly; the remainder falls through and is packed together
		// with the words that follow.
		for len(w) > maxBytes {
			cut := runeCut(w, maxBytes)
			if cut == len(w) {
				break // a single rune wider than the limit cannot be split
			}
			flush()
			chunks = append(chunks, w[:cut])
			w = w[cut:]
		}

		need := len(w)
		if currentBytes > 0 {
			need++ // joining space
		}
		if currentBytes > 0 && currentBytes+need > maxBytes {
			flush()
			need = len(w)
		}
		current = append(current, w)
		currentBytes += need
	}

	flush()
	return chunks
}

// runeCut returns the byte offset at which to cut s so that s[:cut] is as long
// as possible without exceeding limit bytes and without splitting a rune. If
// even the first rune is wider than limit, the offset just past that rune is
// returned so that callers always make progress.
func runeCut(s string, limit int) int {
	if len(s) <= limit {
		return len(s)
	}
	cut := 0
	for i := range s { // i is the starting byte offset of each rune
		if i == 0 {
			continue
		}
		if i > limit {
			if cut == 0 {
				cut = i // first rune alone exceeds limit; keep it whole
			}
			return cut
		}
		cut = i
	}
	if cut == 0 {
		return len(s) // s is a single rune
	}
	return cut
}