diff --git a/web/frontend/package-lock.json b/web/frontend/package-lock.json index 1539b292..f6bfb36c 100644 --- a/web/frontend/package-lock.json +++ b/web/frontend/package-lock.json @@ -8,6 +8,7 @@ "name": "frontend", "version": "0.0.0", "dependencies": { + "@radix-ui/react-accordion": "^1.2.12", "@radix-ui/react-alert-dialog": "^1.1.15", "@radix-ui/react-checkbox": "^1.3.3", "@radix-ui/react-dialog": "^1.1.15", @@ -2471,6 +2472,37 @@ "integrity": "sha512-JTF99U/6XIjCBo0wqkU5sK10glYe27MRRsfwoiq5zzOEZLHU3A3KCMa5X/azekYRCJ0HlwI0crAXS/5dEHTzDg==", "license": "MIT" }, + "node_modules/@radix-ui/react-accordion": { + "version": "1.2.12", + "resolved": "https://registry.npmjs.org/@radix-ui/react-accordion/-/react-accordion-1.2.12.tgz", + "integrity": "sha512-T4nygeh9YE9dLRPhAHSeOZi7HBXo+0kYIPJXayZfvWOWA0+n3dESrZbjfDPUABkUNym6Hd+f2IR113To8D2GPA==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-collapsible": "1.1.12", + "@radix-ui/react-collection": "1.1.7", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-direction": "1.1.1", + "@radix-ui/react-id": "1.1.1", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-use-controllable-state": "1.2.2" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-alert-dialog": { "version": "1.1.15", "resolved": "https://registry.npmjs.org/@radix-ui/react-alert-dialog/-/react-alert-dialog-1.1.15.tgz", @@ -2552,6 +2584,36 @@ } } }, + "node_modules/@radix-ui/react-collapsible": { + "version": "1.1.12", + "resolved": "https://registry.npmjs.org/@radix-ui/react-collapsible/-/react-collapsible-1.1.12.tgz", + "integrity": "sha512-Uu+mSh4agx2ib1uIGPP4/CKNULyajb3p92LsVXmH2EHVMTfZWpll88XJ0j4W0z3f8NK1eYl1+Mf/szHPmcHzyA==", + "license": "MIT", + "dependencies": { + "@radix-ui/primitive": "1.1.3", + "@radix-ui/react-compose-refs": "1.1.2", + "@radix-ui/react-context": "1.1.2", + "@radix-ui/react-id": "1.1.1", + "@radix-ui/react-presence": "1.1.5", + "@radix-ui/react-primitive": "2.1.3", + "@radix-ui/react-use-controllable-state": "1.2.2", + "@radix-ui/react-use-layout-effect": "1.1.1" + }, + "peerDependencies": { + "@types/react": "*", + "@types/react-dom": "*", + "react": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc", + "react-dom": "^16.8 || ^17.0 || ^18.0 || ^19.0 || ^19.0.0-rc" + }, + "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, + "@types/react-dom": { + "optional": true + } + } + }, "node_modules/@radix-ui/react-collection": { "version": "1.1.7", "resolved": "https://registry.npmjs.org/@radix-ui/react-collection/-/react-collection-1.1.7.tgz", diff --git a/web/frontend/package.json b/web/frontend/package.json index 052f862c..cbb0b633 100644 --- a/web/frontend/package.json +++ b/web/frontend/package.json @@ -10,6 +10,7 @@ "preview": "vite preview" }, "dependencies": { + "@radix-ui/react-accordion": "^1.2.12", "@radix-ui/react-alert-dialog": "^1.1.15", "@radix-ui/react-checkbox": "^1.3.3", "@radix-ui/react-dialog": "^1.1.15", diff --git a/web/frontend/src/components/TranscriptionConfigDialog.tsx b/web/frontend/src/components/TranscriptionConfigDialog.tsx index 3c78ce1f..e456b418 100644 --- a/web/frontend/src/components/TranscriptionConfigDialog.tsx +++ b/web/frontend/src/components/TranscriptionConfigDialog.tsx @@ -1,2030 +1,5 @@ -import { useState, useEffect, memo } from "react"; -import { - Dialog, - DialogContent, - DialogDescription, - DialogFooter, - DialogHeader, - DialogTitle, -} from "@/components/ui/dialog"; -import { - Tabs, - TabsContent, - TabsList, - TabsTrigger, -} from "@/components/ui/tabs"; -import { Button } from "@/components/ui/button"; -import { Input } from "@/components/ui/input"; -import { Label } from "@/components/ui/label"; -import { Switch } from "@/components/ui/switch"; -import { Slider } from "@/components/ui/slider"; -import { - Select, - SelectContent, - SelectItem, - SelectTrigger, - SelectValue, -} from "@/components/ui/select"; -import { Textarea } from "@/components/ui/textarea"; -import { Separator } from "@/components/ui/separator"; -import { HoverCard, HoverCardContent, HoverCardTrigger } from "@/components/ui/hover-card"; -import { Info, Check, XCircle, Loader2 } from "lucide-react"; -import { useAuth } from "@/features/auth/hooks/useAuth"; - -export interface WhisperXParams { - // Model family (whisper or nvidia) - model_family: string; - - // Model parameters - model: string; - model_cache_only: boolean; - model_dir?: string; - - // Device and computation - device: string; - device_index: number; - batch_size: number; - compute_type: string; - threads: number; - - // Output settings - output_format: string; - verbose: boolean; - - // Task and language - task: string; - language?: string; - - // Alignment settings - align_model?: string; - interpolate_method: string; - no_align: boolean; - return_char_alignments: boolean; - - // VAD settings - vad_method: string; - vad_onset: number; - vad_offset: number; - chunk_size: number; - - // Diarization settings - diarize: boolean; - min_speakers?: number; - max_speakers?: number; - diarize_model: string; - speaker_embeddings: boolean; - - // Transcription quality settings - temperature: number; - best_of: number; - beam_size: number; - patience: number; - length_penalty: number; - suppress_tokens?: string; - suppress_numerals: boolean; - initial_prompt?: string; - condition_on_previous_text: boolean; - fp16: boolean; - temperature_increment_on_fallback: number; - compression_ratio_threshold: number; - logprob_threshold: number; - no_speech_threshold: number; - - // Output formatting - max_line_width?: number; - max_line_count?: number; - highlight_words: boolean; - segment_resolution: string; - - // Token and progress - hf_token?: string; - print_progress: boolean; - - // NVIDIA Parakeet-specific parameters for long-form audio - attention_context_left: number; - attention_context_right: number; - - // Multi-track transcription settings - is_multi_track_enabled: boolean; - - // OpenAI settings - api_key?: string; -} - -// Parameter descriptions for hover cards -const PARAM_DESCRIPTIONS = { - model: "Size of the Whisper model to use. Larger models are more accurate but slower and require more memory.", - language: "Source language of the audio. Leave as auto-detect for automatic language detection.", - task: "Whether to transcribe the audio or translate it to English.", - device: "Processing device: CPU (slower, universal), GPU (faster, requires CUDA), or AUTO (automatic selection).", - compute_type: "Precision type: Float16 (faster, less memory), Float32 (more accurate), Int8 (fastest, least accurate).", - batch_size: "Number of audio segments processed simultaneously. Higher values are faster but use more memory.", - diarize: "Enable speaker diarization to identify and separate different speakers in the audio.", - min_speakers: "Minimum number of speakers expected in the audio (leave empty for automatic detection).", - max_speakers: "Maximum number of speakers expected in the audio (leave empty for automatic detection).", - diarize_model: "Choose diarization model: Pyannote models require a HuggingFace token and support unlimited speakers. NVIDIA Sortformer is optimized for 4-speaker scenarios and doesn't require a token.", - temperature: "Controls randomness in output. 0 = deterministic, higher values = more creative but less accurate.", - beam_size: "Number of beams for beam search decoding. Higher values improve quality but are slower.", - best_of: "Number of candidate sequences when sampling. Higher values improve quality but are slower.", - patience: "Patience factor for beam search. Higher values wait longer for better sequences.", - length_penalty: "Penalty applied to longer sequences. >1 favors longer, <1 favors shorter sequences.", - initial_prompt: "Optional text to provide context for the first transcription window.", - suppress_numerals: "Suppress numeric symbols and currency symbols during transcription sampling.", - condition_on_previous_text: "Use previous transcription output as context for next segment (may cause repetition loops).", - vad_method: "Voice Activity Detection method: Pyannote (more accurate) or Silero (faster).", - vad_onset: "Sensitivity threshold for detecting speech start. Lower = more sensitive to quiet speech.", - vad_offset: "Sensitivity threshold for detecting speech end. Lower = continues longer into silence.", - chunk_size: "Duration in seconds for merging adjacent speech segments detected by VAD.", - compression_ratio_threshold: "Fail transcription if text compression ratio exceeds this value (indicates repetitive output).", - logprob_threshold: "Fail transcription if average log probability is below this value (indicates low confidence).", - no_speech_threshold: "Consider segment as silence if no-speech probability exceeds this value.", - suppress_tokens: "Comma-separated token IDs to suppress during generation (e.g., -1 for default special tokens).", - no_align: "Skip phoneme-level alignment for faster processing but less precise word timestamps.", - return_char_alignments: "Include character-level timing alignments in the output (increases processing time).", - fp16: "Use 16-bit floating point precision for faster inference with slightly reduced accuracy.", - output_format: "File format(s) to generate: SRT (subtitles), VTT (web), TXT (plain text), JSON (structured), TSV (tabular), or All.", - segment_resolution: "How to break up transcription: Sentence (natural breaks) or Chunk (fixed VAD segments).", - max_line_width: "Maximum characters per line in subtitle formats before text wrapping.", - max_line_count: "Maximum number of lines per subtitle segment.", - highlight_words: "Add word-level timing highlights in SRT/VTT formats (underlines words as spoken).", - verbose: "Show detailed progress and debug messages during transcription.", - print_progress: "Display processing progress information in the console output.", - hf_token: "Hugging Face API token required for accessing private or gated models.", - is_multi_track_enabled: "Enable multi-track transcription mode for processing individual speaker tracks. When enabled, diarization is automatically disabled as each track represents a single speaker.", - align_model: "Custom alignment model to use (e.g., KBLab/wav2vec2-large-voxrex-swedish).\nThe model format must be WhisperX compatible!\nLeave empty to use default." -}; - -interface TranscriptionConfigDialogProps { - open: boolean; - onOpenChange: (open: boolean) => void; - onStartTranscription: (params: WhisperXParams & { profileName?: string; profileDescription?: string }) => void; - loading?: boolean; - isProfileMode?: boolean; - initialParams?: WhisperXParams; - initialName?: string; - initialDescription?: string; - isMultiTrack?: boolean; - title?: string; -} - -const DEFAULT_PARAMS: WhisperXParams = { - model_family: "whisper", - model: "small", - model_cache_only: false, - device: "cpu", - device_index: 0, - batch_size: 8, - compute_type: "float32", - threads: 0, - output_format: "all", - verbose: true, - task: "transcribe", - interpolate_method: "nearest", - no_align: false, - return_char_alignments: false, - vad_method: "pyannote", - vad_onset: 0.5, - vad_offset: 0.363, - chunk_size: 30, - diarize: false, - diarize_model: "pyannote", - speaker_embeddings: false, - temperature: 0, - best_of: 5, - beam_size: 5, - patience: 1.0, - length_penalty: 1.0, - suppress_numerals: false, - condition_on_previous_text: false, - fp16: true, - temperature_increment_on_fallback: 0.2, - compression_ratio_threshold: 2.4, - logprob_threshold: -1.0, - no_speech_threshold: 0.6, - highlight_words: false, - segment_resolution: "sentence", - print_progress: false, - attention_context_left: 256, - attention_context_right: 256, - is_multi_track_enabled: false, - api_key: "", -}; - -const WHISPER_MODELS = [ - "tiny", "tiny.en", - "base", "base.en", - "small", "small.en", - "medium", "medium.en", - "large", "large-v1", "large-v2", "large-v3" -]; - -const LANGUAGES = [ - { value: "auto", label: "Auto-detect" }, - { value: "en", label: "English" }, - { value: "zh", label: "Chinese" }, - { value: "de", label: "German" }, - { value: "es", label: "Spanish" }, - { value: "ru", label: "Russian" }, - { value: "ko", label: "Korean" }, - { value: "fr", label: "French" }, - { value: "ja", label: "Japanese" }, - { value: "pt", label: "Portuguese" }, - { value: "tr", label: "Turkish" }, - { value: "pl", label: "Polish" }, - { value: "ca", label: "Catalan" }, - { value: "nl", label: "Dutch" }, - { value: "ar", label: "Arabic" }, - { value: "sv", label: "Swedish" }, - { value: "it", label: "Italian" }, - { value: "id", label: "Indonesian" }, - { value: "hi", label: "Hindi" }, - { value: "fi", label: "Finnish" }, - { value: "vi", label: "Vietnamese" }, - { value: "he", label: "Hebrew" }, - { value: "uk", label: "Ukrainian" }, - { value: "el", label: "Greek" }, - { value: "ms", label: "Malay" }, - { value: "cs", label: "Czech" }, - { value: "ro", label: "Romanian" }, - { value: "da", label: "Danish" }, - { value: "hu", label: "Hungarian" }, - { value: "ta", label: "Tamil" }, - { value: "no", label: "Norwegian" }, - { value: "th", label: "Thai" }, - { value: "ur", label: "Urdu" }, - { value: "hr", label: "Croatian" }, - { value: "bg", label: "Bulgarian" }, - { value: "lt", label: "Lithuanian" }, - { value: "la", label: "Latin" }, - { value: "mi", label: "Maori" }, - { value: "ml", label: "Malayalam" }, - { value: "cy", label: "Welsh" }, - { value: "sk", label: "Slovak" }, - { value: "te", label: "Telugu" }, - { value: "fa", label: "Persian" }, - { value: "lv", label: "Latvian" }, - { value: "bn", label: "Bengali" }, - { value: "sr", label: "Serbian" }, - { value: "az", label: "Azerbaijani" }, - { value: "sl", label: "Slovenian" }, - { value: "kn", label: "Kannada" }, - { value: "et", label: "Estonian" }, - { value: "mk", label: "Macedonian" }, - { value: "br", label: "Breton" }, - { value: "eu", label: "Basque" }, - { value: "is", label: "Icelandic" }, - { value: "hy", label: "Armenian" }, - { value: "ne", label: "Nepali" }, - { value: "mn", label: "Mongolian" }, - { value: "bs", label: "Bosnian" }, - { value: "kk", label: "Kazakh" }, - { value: "sq", label: "Albanian" }, - { value: "sw", label: "Swahili" }, - { value: "gl", label: "Galician" }, - { value: "mr", label: "Marathi" }, - { value: "pa", label: "Punjabi" }, - { value: "si", label: "Sinhala" }, - { value: "km", label: "Khmer" }, - { value: "sn", label: "Shona" }, - { value: "yo", label: "Yoruba" }, - { value: "so", label: "Somali" }, - { value: "af", label: "Afrikaans" }, - { value: "oc", label: "Occitan" }, - { value: "ka", label: "Georgian" }, - { value: "be", label: "Belarusian" }, - { value: "tg", label: "Tajik" }, - { value: "sd", label: "Sindhi" }, - { value: "gu", label: "Gujarati" }, - { value: "am", label: "Amharic" }, - { value: "yi", label: "Yiddish" }, - { value: "lo", label: "Lao" }, - { value: "uz", label: "Uzbek" }, - { value: "fo", label: "Faroese" }, - { value: "ht", label: "Haitian Creole" }, - { value: "ps", label: "Pashto" }, - { value: "tk", label: "Turkmen" }, - { value: "nn", label: "Nynorsk" }, - { value: "mt", label: "Maltese" }, - { value: "sa", label: "Sanskrit" }, - { value: "lb", label: "Luxembourgish" }, - { value: "my", label: "Myanmar" }, - { value: "bo", label: "Tibetan" }, - { value: "tl", label: "Tagalog" }, - { value: "mg", label: "Malagasy" }, - { value: "as", label: "Assamese" }, - { value: "tt", label: "Tatar" }, - { value: "haw", label: "Hawaiian" }, - { value: "ln", label: "Lingala" }, - { value: "ha", label: "Hausa" }, - { value: "ba", label: "Bashkir" }, - { value: "jw", label: "Javanese" }, - { value: "su", label: "Sundanese" }, -]; - -const CANARY_LANGUAGES = [ - { value: "bg", label: "Bulgarian" }, - { value: "hr", label: "Croatian" }, - { value: "cs", label: "Czech" }, - { value: "da", label: "Danish" }, - { value: "nl", label: "Dutch" }, - { value: "en", label: "English" }, - { value: "et", label: "Estonian" }, - { value: "fi", label: "Finnish" }, - { value: "fr", label: "French" }, - { value: "de", label: "German" }, - { value: "el", label: "Greek" }, - { value: "hu", label: "Hungarian" }, - { value: "it", label: "Italian" }, - { value: "lv", label: "Latvian" }, - { value: "lt", label: "Lithuanian" }, - { value: "mt", label: "Maltese" }, - { value: "pl", label: "Polish" }, - { value: "pt", label: "Portuguese" }, - { value: "ro", label: "Romanian" }, - { value: "sk", label: "Slovak" }, - { value: "sl", label: "Slovenian" }, - { value: "es", label: "Spanish" }, - { value: "sv", label: "Swedish" }, - { value: "ru", label: "Russian" }, - { value: "uk", label: "Ukrainian" }, -]; - -export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog({ - open, - onOpenChange, - onStartTranscription, - loading = false, - isProfileMode = false, - initialParams, - initialName = "", - initialDescription = "", - isMultiTrack = false, - title, -}: TranscriptionConfigDialogProps) { - const [params, setParams] = useState(DEFAULT_PARAMS); - const [profileName, setProfileName] = useState(""); - const [profileDescription, setProfileDescription] = useState(""); - - // OpenAI Validation State - const [isValidating, setIsValidating] = useState(false); - const [validationStatus, setValidationStatus] = useState<'idle' | 'valid' | 'invalid'>('idle'); - const [validationMessage, setValidationMessage] = useState(""); - const { getAuthHeaders } = useAuth(); - const [availableModels, setAvailableModels] = useState(["whisper-1"]); - - const validateAPIKey = async () => { - // Allow validation with empty key (will use server default) - - setIsValidating(true); - setValidationStatus('idle'); - setValidationMessage(""); - - try { - const response = await fetch('/api/v1/config/openai/validate', { - method: 'POST', - headers: { - 'Content-Type': 'application/json', - ...getAuthHeaders(), - }, - body: JSON.stringify({ api_key: params.api_key }), - }); - - const data = await response.json(); - - if (response.ok && data.valid) { - setValidationStatus('valid'); - setAvailableModels(data.models || ["whisper-1"]); - setValidationMessage("API key validated successfully"); - } else { - setValidationStatus('invalid'); - setValidationMessage(data.error || "Invalid API key"); - setAvailableModels(["whisper-1"]); - } - } catch (error) { - setValidationStatus('invalid'); - setValidationMessage("Failed to validate API key"); - console.error("Validation error:", error); - } finally { - setIsValidating(false); - } - }; - - // Reset to defaults or initial values when dialog opens - useEffect(() => { - if (open) { - const baseParams = initialParams || DEFAULT_PARAMS; - // Auto-set multi-track and diarization based on file type - setParams({ - ...baseParams, - is_multi_track_enabled: isMultiTrack, - diarize: isMultiTrack ? false : baseParams.diarize - }); - setProfileName(initialName); - setProfileDescription(initialDescription); - } - }, [open, initialParams, initialName, initialDescription, isMultiTrack]); - - const updateParam = ( - key: K, - value: WhisperXParams[K] - ) => { - setParams(prev => { - const newParams = { ...prev, [key]: value }; - - // If switching to Whisper family, ensure diarize_model is pyannote - if (key === 'model_family' && value === 'whisper') { - newParams.diarize_model = 'pyannote'; - } - - return newParams; - }); - }; - - const handleStartTranscription = () => { - if (isProfileMode) { - onStartTranscription({ ...params, profileName, profileDescription }); - } else { - onStartTranscription(params); - } - }; - - return ( - - - - - {title || (isProfileMode - ? (initialName ? `Edit "${initialName}"` : "New Transcription Profile") - : "Transcription Configuration") - } - - - {isProfileMode - ? (initialName ? "Update your transcription profile settings." : "Create a new profile to save and reuse your transcription settings.") - : "Configure WhisperX parameters for your transcription. Advanced settings allow fine-tuning quality and performance." - } - - - -
- {isProfileMode && ( -
-
- - setProfileName(e.target.value)} - placeholder="Enter a name for this profile..." - className="bg-[var(--bg-main)] border-[var(--border-subtle)] text-[var(--text-primary)] focus:ring-[var(--brand-light)] focus:border-[var(--brand-solid)] rounded-[var(--radius-btn)]" - required - /> -
-
- -