Files
cloudreve/pkg/searcher/indexer/chunker_test.go
2026-02-11 16:05:09 +08:00

85 lines
2.3 KiB
Go

package indexer
import (
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
// TestChunkText_Empty checks that ChunkText yields a nil result both for an
// empty string and for a string that holds only a space.
func TestChunkText_Empty(t *testing.T) {
	for _, input := range []string{"", " "} {
		assert.Nil(t, ChunkText(input, 500))
	}
}
// TestChunkText_SingleSmallParagraph checks that a short input well below the
// byte limit comes back unchanged as the only chunk.
func TestChunkText_SingleSmallParagraph(t *testing.T) {
	const input = "Hello world"

	want := []string{input}
	got := ChunkText(input, 500)

	assert.Equal(t, want, got)
}
// TestChunkText_MultipleParagraphsCombined checks that several short
// paragraphs whose combined size is below the limit end up in a single chunk
// that contains every paragraph.
func TestChunkText_MultipleParagraphsCombined(t *testing.T) {
	paragraphs := []string{
		"First paragraph.",
		"Second paragraph.",
		"Third paragraph.",
	}
	text := strings.Join(paragraphs, "\n\n")

	chunks := ChunkText(text, 500)

	// assert.Len does not stop the test on failure; bail out explicitly so an
	// unexpected empty result is reported as a failure instead of panicking
	// on chunks[0].
	if !assert.Len(t, chunks, 1) {
		return
	}
	for _, p := range paragraphs {
		assert.Contains(t, chunks[0], p)
	}
}
// TestChunkText_SplitOnParagraphBoundary checks that two paragraphs which
// cannot share one 500-byte chunk are emitted as two chunks.
func TestChunkText_SplitOnParagraphBoundary(t *testing.T) {
	// "abcde " repeated 50 times is 300 bytes; trimming drops the trailing
	// space, leaving a paragraph of roughly 300 bytes.
	paragraph := strings.TrimSpace(strings.Repeat("abcde ", 50))
	input := strings.Join([]string{paragraph, paragraph}, "\n\n")

	result := ChunkText(input, 500)

	assert.Len(t, result, 2)
}
// TestChunkText_LargeParagraphSplit checks that a single paragraph larger
// than the limit is broken into three chunks, with the first two no larger
// than the limit and the last one non-empty.
func TestChunkText_LargeParagraphSplit(t *testing.T) {
	// 240 four-byte words joined by 239 single spaces:
	// 240*4 + 239 = 1199 bytes in one paragraph.
	words := make([]string, 240)
	for i := range words {
		words[i] = "word"
	}
	text := strings.Join(words, " ")

	chunks := ChunkText(text, 500)

	// assert.Len does not stop the test on failure; bail out explicitly so a
	// wrong chunk count is reported as a failure instead of an index panic.
	if !assert.Len(t, chunks, 3) {
		return
	}
	assert.LessOrEqual(t, len(chunks[0]), 500)
	assert.LessOrEqual(t, len(chunks[1]), 500)
	assert.Greater(t, len(chunks[2]), 0)
}
// TestChunkText_DefaultMaxBytes checks that passing 0 as the byte limit still
// returns a short input as a single chunk (presumably ChunkText substitutes a
// default limit in that case; confirm against its implementation).
func TestChunkText_DefaultMaxBytes(t *testing.T) {
	const input = "hello"

	got := ChunkText(input, 0)

	assert.Equal(t, []string{input}, got)
}
// TestChunkText_EmptyParagraphsIgnored checks that a run of blank lines
// between two paragraphs collapses to a single "\n\n" separator in the
// resulting chunk.
func TestChunkText_EmptyParagraphsIgnored(t *testing.T) {
	text := "First.\n\n\n\n\n\nSecond."

	chunks := ChunkText(text, 500)

	// Compare the whole slice: this checks both the chunk count and the
	// content, and cannot panic on an empty result the way indexing
	// chunks[0] after a non-fatal length assertion would.
	assert.Equal(t, []string{"First.\n\nSecond."}, chunks)
}
// TestChunkText_ParagraphKeptWhole checks that a paragraph smaller than the
// limit is returned intact as the only chunk.
func TestChunkText_ParagraphKeptWhole(t *testing.T) {
	// A paragraph under the limit should not be split.
	para := strings.Repeat("x", 400)

	chunks := ChunkText(para, 500)

	// Compare the whole slice: this checks both the chunk count and the
	// content, and cannot panic on an empty result the way indexing
	// chunks[0] after a non-fatal length assertion would.
	assert.Equal(t, []string{para}, chunks)
}
// TestChunkText_JoinerAccountedInLimit checks that the "\n\n" joiner counts
// toward the byte limit: two paragraphs that each fit on their own but exceed
// the limit once joined must be emitted as separate chunks.
func TestChunkText_JoinerAccountedInLimit(t *testing.T) {
	p1 := strings.Repeat("a", 250)
	p2 := strings.Repeat("b", 250)
	text := p1 + "\n\n" + p2

	chunks := ChunkText(text, 500)

	// 250 + 2 + 250 = 502 > 500, so the paragraphs must not be merged.
	// Compare the whole slice: this checks count, order and content, and
	// cannot panic on a short result the way indexing chunks[0]/chunks[1]
	// after a non-fatal length assertion would.
	assert.Equal(t, []string{p1, p2}, chunks)
}