From 1ae7b2bf71bf8cd19f866d47b3a4446989550ea6 Mon Sep 17 00:00:00 2001 From: rishikanthc Date: Tue, 30 Dec 2025 12:03:45 -0800 Subject: [PATCH] feat: add Voxtral-mini transcription support - Add VoxtralAdapter using transformers library with direct model loading - Add Python transcription script with apply_transcription_request() method - Register Voxtral adapter in main.go with dedicated environment - Add UI configuration in TranscriptionConfigDialog with warning banner - Support multilingual transcription without word-level timestamps - Auto GPU/CPU detection, no device parameter needed - Graceful degradation for missing timestamp features Voxtral provides high-quality text-only transcription but does not support word-level timestamps. UI warns users that synchronized playback and seek features won't be available. --- cmd/server/main.go | 5 + .../adapters/py/voxtral/pyproject.toml | 35 ++ .../adapters/py/voxtral/voxtral_transcribe.py | 174 +++++++++ .../transcription/adapters/voxtral_adapter.go | 339 ++++++++++++++++++ .../TranscriptionConfigDialog.tsx | 57 +++ 5 files changed, 610 insertions(+) create mode 100644 internal/transcription/adapters/py/voxtral/pyproject.toml create mode 100644 internal/transcription/adapters/py/voxtral/voxtral_transcribe.py create mode 100644 internal/transcription/adapters/voxtral_adapter.go diff --git a/cmd/server/main.go b/cmd/server/main.go index e7a80223..3ca15160 100644 --- a/cmd/server/main.go +++ b/cmd/server/main.go @@ -227,6 +227,9 @@ func registerAdapters(cfg *config.Config) { // Dedicated environment path for PyAnnote (to avoid dependency conflicts) pyannoteEnvPath := filepath.Join(cfg.WhisperXEnv, "pyannote") + // Dedicated environment path for Voxtral (Mistral AI model) + voxtralEnvPath := filepath.Join(cfg.WhisperXEnv, "voxtral") + // Register transcription adapters registry.RegisterTranscriptionAdapter("whisperx", adapters.NewWhisperXAdapter(cfg.WhisperXEnv)) @@ -234,6 +237,8 @@ func registerAdapters(cfg *config.Config) { adapters.NewParakeetAdapter(nvidiaEnvPath)) registry.RegisterTranscriptionAdapter("canary", adapters.NewCanaryAdapter(nvidiaEnvPath)) // Shares with Parakeet + registry.RegisterTranscriptionAdapter("voxtral", + adapters.NewVoxtralAdapter(voxtralEnvPath)) registry.RegisterTranscriptionAdapter("openai_whisper", adapters.NewOpenAIAdapter(cfg.OpenAIAPIKey)) diff --git a/internal/transcription/adapters/py/voxtral/pyproject.toml b/internal/transcription/adapters/py/voxtral/pyproject.toml new file mode 100644 index 00000000..b3866294 --- /dev/null +++ b/internal/transcription/adapters/py/voxtral/pyproject.toml @@ -0,0 +1,35 @@ +[project] +name = "voxtral-transcription" +version = "0.1.0" +description = "Audio transcription using Mistral Voxtral-mini model" +requires-python = ">=3.11" +dependencies = [ + "transformers>=4.45.0", + "torch", + "torchaudio", + "accelerate", + "librosa", + "soundfile", +] + +[tool.uv.sources] +torch = [ + { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, + { index = "pytorch-cpu", marker = "platform_machine != 'x86_64' and sys_platform != 'darwin'" }, + { index = "pytorch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] +torchaudio = [ + { index = "pytorch-cpu", marker = "sys_platform == 'darwin'" }, + { index = "pytorch-cpu", marker = "platform_machine != 'x86_64' and sys_platform != 'darwin'" }, + { index = "pytorch", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, +] + +[[tool.uv.index]] +name = "pytorch" +url = "https://download.pytorch.org/whl/cu126" +explicit = true + +[[tool.uv.index]] +name = "pytorch-cpu" +url = "https://download.pytorch.org/whl/cpu" +explicit = true diff --git a/internal/transcription/adapters/py/voxtral/voxtral_transcribe.py b/internal/transcription/adapters/py/voxtral/voxtral_transcribe.py new file mode 100644 index 00000000..d3bd9ce0 --- /dev/null +++ b/internal/transcription/adapters/py/voxtral/voxtral_transcribe.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +""" +Voxtral-mini transcription script for Scriberr +Transcribes audio using Mistral's Voxtral-mini model +""" + +import argparse +import json +import sys +import torch +from pathlib import Path +from transformers import VoxtralForConditionalGeneration, AutoProcessor + + +def transcribe_audio( + audio_path: str, + output_path: str, + language: str = "en", + model_id: str = "mistralai/Voxtral-mini", + device: str = "auto", + max_new_tokens: int = 500, +) -> dict: + """ + Transcribe audio using Voxtral-mini model. + + Args: + audio_path: Path to input audio file + output_path: Path to output JSON file + language: Language code (e.g., 'en', 'es', 'fr') + model_id: HuggingFace model ID + device: Device to use ('cpu', 'cuda', or 'auto') + max_new_tokens: Maximum number of tokens to generate + + Returns: + Dictionary containing transcription results + """ + # Determine device + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + + print(f"Loading Voxtral model on {device}...", file=sys.stderr) + + # Load processor and model + processor = AutoProcessor.from_pretrained(model_id) + + # Use appropriate dtype based on device + dtype = torch.bfloat16 if device == "cuda" else torch.float32 + + model = VoxtralForConditionalGeneration.from_pretrained( + model_id, + torch_dtype=dtype, + device_map=device, + ) + + print(f"Model loaded successfully", file=sys.stderr) + print(f"Processing audio: {audio_path}", file=sys.stderr) + + # Prepare transcription request using the proper method + inputs = processor.apply_transcription_request( + language=language, + audio=audio_path, + model_id=model_id + ) + + # Move inputs to device with correct dtype + inputs = inputs.to(device, dtype=dtype) + + print(f"Generating transcription...", file=sys.stderr) + + # Generate transcription + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=max_new_tokens, + ) + + # Decode only the newly generated tokens (skip the input prompt) + decoded_outputs = processor.batch_decode( + outputs[:, inputs.input_ids.shape[1]:], + skip_special_tokens=True + ) + + transcription_text = decoded_outputs[0] + + print(f"Transcription completed ({len(transcription_text)} chars)", file=sys.stderr) + + # Prepare output in Scriberr format + # Note: Voxtral doesn't provide word-level timestamps, so we create a single segment + result = { + "text": transcription_text, + "segments": [ + { + "id": 0, + "start": 0.0, + "end": 0.0, # Duration unknown without audio analysis + "text": transcription_text, + "words": [] # Voxtral doesn't provide word-level timestamps + } + ], + "language": language, + "model": model_id, + "has_word_timestamps": False, # Important: Voxtral doesn't support timestamps + } + + # Write output + output_file = Path(output_path) + with output_file.open('w', encoding='utf-8') as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + print(f"Results written to {output_path}", file=sys.stderr) + + return result + + +def main(): + parser = argparse.ArgumentParser( + description="Transcribe audio using Voxtral-mini model" + ) + parser.add_argument( + "audio_path", + type=str, + help="Path to input audio file" + ) + parser.add_argument( + "output_path", + type=str, + help="Path to output JSON file" + ) + parser.add_argument( + "--language", + type=str, + default="en", + help="Language code (default: en)" + ) + parser.add_argument( + "--model-id", + type=str, + default="mistralai/Voxtral-mini", + help="HuggingFace model ID (default: mistralai/Voxtral-mini)" + ) + parser.add_argument( + "--device", + type=str, + default="auto", + choices=["cpu", "cuda", "auto"], + help="Device to use (default: auto)" + ) + parser.add_argument( + "--max-new-tokens", + type=int, + default=500, + help="Maximum number of tokens to generate (default: 500)" + ) + + args = parser.parse_args() + + try: + transcribe_audio( + audio_path=args.audio_path, + output_path=args.output_path, + language=args.language, + model_id=args.model_id, + device=args.device, + max_new_tokens=args.max_new_tokens, + ) + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + import traceback + traceback.print_exc(file=sys.stderr) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/internal/transcription/adapters/voxtral_adapter.go b/internal/transcription/adapters/voxtral_adapter.go new file mode 100644 index 00000000..9eac0a72 --- /dev/null +++ b/internal/transcription/adapters/voxtral_adapter.go @@ -0,0 +1,339 @@ +package adapters + +import ( + "context" + "embed" + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "scriberr/internal/transcription/interfaces" + "scriberr/pkg/logger" +) + +//go:embed py/voxtral/* +var voxtralScripts embed.FS + +// VoxtralAdapter implements the TranscriptionAdapter interface for Mistral Voxtral-mini +type VoxtralAdapter struct { + *BaseAdapter + envPath string +} + +// NewVoxtralAdapter creates a new Voxtral adapter +func NewVoxtralAdapter(envPath string) *VoxtralAdapter { + capabilities := interfaces.ModelCapabilities{ + ModelID: "voxtral", + ModelFamily: "mistral_voxtral", + DisplayName: "Mistral Voxtral-mini", + Description: "Mistral's multilingual audio transcription model", + Version: "1.0.0", + SupportedLanguages: []string{ + "en", "es", "fr", "de", "it", "pt", "nl", "pl", "ru", "zh", "ja", "ko", + // Voxtral supports many languages + }, + SupportedFormats: []string{"wav", "mp3", "flac", "m4a", "ogg"}, + RequiresGPU: false, // Can run on CPU but GPU recommended + MemoryRequirement: 4096, // 4GB recommended + Features: map[string]bool{ + "timestamps": false, // Voxtral doesn't provide word-level timestamps + "word_level": false, + "multilingual": true, + "high_quality": true, + "fast_inference": true, + "transformers_based": true, + }, + Metadata: map[string]string{ + "engine": "mistral_ai", + "framework": "transformers", + "license": "Apache-2.0", + "model_id": "mistralai/Voxtral-mini", + "no_word_timestamps": "true", // Important metadata for frontend + }, + } + + schema := []interfaces.ParameterSchema{ + // Language selection + { + Name: "language", + Type: "string", + Required: false, + Default: "en", + Options: []string{"en", "es", "fr", "de", "it", "pt", "nl", "pl", "ru", "zh", "ja", "ko"}, + Description: "Language of the audio", + Group: "basic", + }, + + // Generation settings + { + Name: "max_new_tokens", + Type: "int", + Required: false, + Default: 500, + Min: &[]float64{100}[0], + Max: &[]float64{2000}[0], + Description: "Maximum number of tokens to generate", + Group: "advanced", + }, + } + + baseAdapter := NewBaseAdapter("voxtral", envPath, capabilities, schema) + + adapter := &VoxtralAdapter{ + BaseAdapter: baseAdapter, + envPath: envPath, + } + + return adapter +} + +// GetSupportedModels returns the available Voxtral models +func (v *VoxtralAdapter) GetSupportedModels() []string { + return []string{"mistralai/Voxtral-mini"} +} + +// PrepareEnvironment sets up the Voxtral environment +func (v *VoxtralAdapter) PrepareEnvironment(ctx context.Context) error { + logger.Info("Preparing Voxtral environment", "env_path", v.envPath) + + // Copy transcription script + if err := v.copyTranscriptionScript(); err != nil { + return fmt.Errorf("failed to copy transcription script: %w", err) + } + + // Check if environment is already ready + if CheckEnvironmentReady(v.envPath, "from transformers import VoxtralForConditionalGeneration") { + logger.Info("Voxtral environment already ready") + v.initialized = true + return nil + } + + // Setup environment + if err := v.setupVoxtralEnvironment(); err != nil { + return fmt.Errorf("failed to setup Voxtral environment: %w", err) + } + + v.initialized = true + logger.Info("Voxtral environment prepared successfully") + return nil +} + +// setupVoxtralEnvironment creates the Python environment for Voxtral +func (v *VoxtralAdapter) setupVoxtralEnvironment() error { + if err := os.MkdirAll(v.envPath, 0755); err != nil { + return fmt.Errorf("failed to create voxtral directory: %w", err) + } + + // Read pyproject.toml + pyprojectContent, err := voxtralScripts.ReadFile("py/voxtral/pyproject.toml") + if err != nil { + return fmt.Errorf("failed to read embedded pyproject.toml: %w", err) + } + + // Replace the hardcoded PyTorch URL with the dynamic one based on environment + contentStr := strings.Replace( + string(pyprojectContent), + "https://download.pytorch.org/whl/cu126", + GetPyTorchWheelURL(), + 1, + ) + + pyprojectPath := filepath.Join(v.envPath, "pyproject.toml") + if err := os.WriteFile(pyprojectPath, []byte(contentStr), 0644); err != nil { + return fmt.Errorf("failed to write pyproject.toml: %w", err) + } + + // Run uv sync + logger.Info("Installing Voxtral dependencies") + cmd := exec.Command("uv", "sync", "--native-tls") + cmd.Dir = v.envPath + out, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("uv sync failed: %w: %s", err, strings.TrimSpace(string(out))) + } + + return nil +} + +// copyTranscriptionScript creates the Python script for Voxtral transcription +func (v *VoxtralAdapter) copyTranscriptionScript() error { + // Ensure directory exists before writing script + if err := os.MkdirAll(v.envPath, 0755); err != nil { + return fmt.Errorf("failed to create directory: %w", err) + } + + scriptContent, err := voxtralScripts.ReadFile("py/voxtral/voxtral_transcribe.py") + if err != nil { + return fmt.Errorf("failed to read embedded voxtral_transcribe.py: %w", err) + } + + scriptPath := filepath.Join(v.envPath, "voxtral_transcribe.py") + if err := os.WriteFile(scriptPath, scriptContent, 0755); err != nil { + return fmt.Errorf("failed to write transcription script: %w", err) + } + + return nil +} + +// Transcribe processes audio using Voxtral +func (v *VoxtralAdapter) Transcribe(ctx context.Context, input interfaces.AudioInput, params map[string]interface{}, procCtx interfaces.ProcessingContext) (*interfaces.TranscriptResult, error) { + startTime := time.Now() + v.LogProcessingStart(input, procCtx) + defer func() { + v.LogProcessingEnd(procCtx, time.Since(startTime), nil) + }() + + // Validate input + if err := v.ValidateAudioInput(input); err != nil { + return nil, fmt.Errorf("invalid audio input: %w", err) + } + + // Validate parameters + if err := v.ValidateParameters(params); err != nil { + return nil, fmt.Errorf("invalid parameters: %w", err) + } + + // Create temporary directory + tempDir, err := v.CreateTempDirectory(procCtx) + if err != nil { + return nil, fmt.Errorf("failed to create temp directory: %w", err) + } + defer v.CleanupTempDirectory(tempDir) + + // Build command arguments + args, err := v.buildVoxtralArgs(input, params, tempDir) + if err != nil { + return nil, fmt.Errorf("failed to build command: %w", err) + } + + // Execute Voxtral + cmd := exec.CommandContext(ctx, "uv", args...) + cmd.Env = append(os.Environ(), "PYTHONUNBUFFERED=1") + + // Setup log file + logFile, err := os.OpenFile(filepath.Join(procCtx.OutputDirectory, "transcription.log"), os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) + if err != nil { + logger.Warn("Failed to create log file", "error", err) + } else { + defer logFile.Close() + cmd.Stdout = logFile + cmd.Stderr = logFile + } + + logger.Info("Executing Voxtral command", "args", strings.Join(args, " ")) + + if err := cmd.Run(); err != nil { + if ctx.Err() == context.Canceled { + return nil, fmt.Errorf("transcription was cancelled") + } + + // Read tail of log file for context + logPath := filepath.Join(procCtx.OutputDirectory, "transcription.log") + logTail, readErr := v.ReadLogTail(logPath, 2048) + if readErr != nil { + logger.Warn("Failed to read log tail", "error", readErr) + } + + logger.Error("Voxtral execution failed", "error", err) + return nil, fmt.Errorf("Voxtral execution failed: %w\nLogs:\n%s", err, logTail) + } + + // Parse result + result, err := v.parseResult(tempDir) + if err != nil { + return nil, fmt.Errorf("failed to parse result: %w", err) + } + + result.ProcessingTime = time.Since(startTime) + result.ModelUsed = "mistralai/Voxtral-mini" + + logger.Info("Voxtral transcription completed", + "text_length", len(result.Text), + "processing_time", result.ProcessingTime) + + return result, nil +} + +// buildVoxtralArgs builds the command arguments for Voxtral +func (v *VoxtralAdapter) buildVoxtralArgs(input interfaces.AudioInput, params map[string]interface{}, tempDir string) ([]string, error) { + outputFile := filepath.Join(tempDir, "result.json") + + scriptPath := filepath.Join(v.envPath, "voxtral_transcribe.py") + args := []string{ + "run", "--native-tls", "--project", v.envPath, "python", scriptPath, + input.FilePath, + outputFile, + } + + // Add language + if language := v.GetStringParameter(params, "language"); language != "" { + args = append(args, "--language", language) + } + + // Device auto-detection (like Parakeet/Canary) - no device parameter needed + // Python script will auto-detect and use GPU if available + + // Add max tokens + if maxTokens := v.GetIntParameter(params, "max_new_tokens"); maxTokens > 0 { + args = append(args, "--max-new-tokens", fmt.Sprintf("%d", maxTokens)) + } + + return args, nil +} + +// parseResult parses the Voxtral output +func (v *VoxtralAdapter) parseResult(tempDir string) (*interfaces.TranscriptResult, error) { + resultFile := filepath.Join(tempDir, "result.json") + + data, err := os.ReadFile(resultFile) + if err != nil { + return nil, fmt.Errorf("failed to read result file: %w", err) + } + + var voxtralResult struct { + Text string `json:"text"` + Language string `json:"language"` + Model string `json:"model"` + HasWordTimestamps bool `json:"has_word_timestamps"` + Segments []struct { + Start float64 `json:"start"` + End float64 `json:"end"` + Text string `json:"text"` + } `json:"segments"` + } + + if err := json.Unmarshal(data, &voxtralResult); err != nil { + return nil, fmt.Errorf("failed to parse JSON result: %w", err) + } + + // Convert to standard format + // Note: Voxtral doesn't provide word-level timestamps, so we create segments without words + result := &interfaces.TranscriptResult{ + Text: voxtralResult.Text, + Language: voxtralResult.Language, + Segments: make([]interfaces.TranscriptSegment, len(voxtralResult.Segments)), + } + + for i, seg := range voxtralResult.Segments { + result.Segments[i] = interfaces.TranscriptSegment{ + Start: seg.Start, + End: seg.End, + Text: seg.Text, + } + } + + return result, nil +} + +// GetEstimatedProcessingTime provides Voxtral-specific time estimation +func (v *VoxtralAdapter) GetEstimatedProcessingTime(input interfaces.AudioInput) time.Duration { + // Voxtral is relatively fast + baseTime := v.BaseAdapter.GetEstimatedProcessingTime(input) + + // Voxtral typically processes at about 10-20% of audio duration + return time.Duration(float64(baseTime) * 0.15) +} diff --git a/web/frontend/src/components/transcription/TranscriptionConfigDialog.tsx b/web/frontend/src/components/transcription/TranscriptionConfigDialog.tsx index d15b2623..7c95cb0a 100644 --- a/web/frontend/src/components/transcription/TranscriptionConfigDialog.tsx +++ b/web/frontend/src/components/transcription/TranscriptionConfigDialog.tsx @@ -383,6 +383,9 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog NVIDIA Canary + + Mistral Voxtral + OpenAI @@ -433,6 +436,13 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog onValidate={validateAPIKey} /> )} + + {params.model_family === "mistral_voxtral" && ( + + )} {/* Footer */} @@ -993,3 +1003,50 @@ function OpenAIConfig({ ); } + +function VoxtralConfig({ params, updateParam }: ConfigProps) { + return ( +
+ {/* Voxtral Warning Banner */} + + Voxtral does not support word-level timestamps. Synchronized playback, audio seeking, and timestamp-based features won't be available. + + +
+ + + +
+ + {/* Advanced Settings */} + + + + Advanced Settings + + + + updateParam('max_line_width', parseInt(e.target.value) || 500)} + className={inputClassName} + /> + + + + +
+ ); +}