Files
cloudreve/pkg/searcher/extractor/tika.go
2026-02-11 16:05:09 +08:00

83 lines
2.1 KiB
Go

package extractor
import (
"context"
"fmt"
"io"
"strings"
"github.com/cloudreve/Cloudreve/v4/pkg/logging"
"github.com/cloudreve/Cloudreve/v4/pkg/request"
"github.com/cloudreve/Cloudreve/v4/pkg/setting"
)
// TikaExtractor extracts plain text from documents by sending them to an
// Apache Tika server over HTTP.
type TikaExtractor struct {
	// Limits captured from the extractor configuration at construction time.
	exts        []string // file extensions reported by Exts
	maxFileSize int64    // size limit reported by MaxFileSize

	// Collaborators supplied by the caller.
	client   request.Client   // issues the request to the Tika endpoint
	settings setting.Provider // source of the Tika settings read on each Extract call
	l        logging.Logger   // logger handed in by the constructor
}
// NewTikaExtractor builds a TikaExtractor around the given request client,
// settings provider and logger.
//
// The supported extensions and the maximum file size are taken from cfg when
// the extractor is created. The extension slice is stored as-is (not copied),
// so it shares its backing array with cfg.Exts. cfg must not be nil.
func NewTikaExtractor(client request.Client, settings setting.Provider, l logging.Logger, cfg *setting.FTSTikaExtractorSetting) *TikaExtractor {
	extractor := &TikaExtractor{
		exts:        cfg.Exts,
		maxFileSize: cfg.MaxFileSize,
	}
	extractor.client = client
	extractor.settings = settings
	extractor.l = l
	return extractor
}
// Exts reports the file extensions this extractor was configured with.
// The returned slice is the extractor's own slice, not a copy.
func (t *TikaExtractor) Exts() []string {
	supported := t.exts
	return supported
}
// MaxFileSize reports the configured upper bound on the size of files that
// should be submitted for text extraction.
func (t *TikaExtractor) MaxFileSize() int64 {
	limit := t.maxFileSize
	return limit
}
// Extract uploads the document read from reader to the Tika server and
// returns the extracted plain text with leading and trailing whitespace
// removed.
//
// The Tika settings (endpoint and response size limit) are read from the
// settings provider on every call. The document is sent as a PUT to
// "<endpoint>/tika" with "Accept: text/plain". The request is bound to ctx so
// that cancelling ctx, or hitting its deadline, aborts a slow extraction
// instead of leaving the caller blocked on Tika.
//
// At most MaxResponseSize bytes of the response body are read (10 MiB when
// the setting is zero or negative); any text beyond that limit is dropped
// without an error.
//
// An error is returned when no endpoint is configured, when the request
// fails, when Tika answers with a status other than 200, or when the response
// body cannot be read.
func (t *TikaExtractor) Extract(ctx context.Context, reader io.Reader) (string, error) {
	tikaCfg := t.settings.FTSTikaExtractor(ctx)
	if tikaCfg.Endpoint == "" {
		return "", fmt.Errorf("tika endpoint not configured")
	}

	// Tolerate a trailing slash in the configured endpoint.
	endpoint := strings.TrimRight(tikaCfg.Endpoint, "/") + "/tika"

	resp := t.client.Request(
		"PUT",
		endpoint,
		reader,
		// Propagate cancellation and deadlines to the HTTP call.
		request.WithContext(ctx),
		request.WithHeader(map[string][]string{
			"Accept": {"text/plain"},
		}),
	)
	if resp.Err != nil {
		return "", fmt.Errorf("tika request failed: %w", resp.Err)
	}
	defer resp.Response.Body.Close()

	if resp.Response.StatusCode != 200 {
		return "", fmt.Errorf("tika returned status %d", resp.Response.StatusCode)
	}

	// Cap how much of the response is buffered in memory.
	maxSize := tikaCfg.MaxResponseSize
	if maxSize <= 0 {
		maxSize = 10 * 1024 * 1024 // default 10 MiB
	}

	body, err := io.ReadAll(io.LimitReader(resp.Response.Body, maxSize))
	if err != nil {
		return "", fmt.Errorf("failed to read tika response: %w", err)
	}
	return strings.TrimSpace(string(body)), nil
}