feat: implement OpenAI transcription support

2026-06-28 14:55:46 +00:00 · 2025-12-01 12:37:41 -08:00
parent b62a944988
commit f3266b31e5
7 changed files with 438 additions and 0 deletions
--- a/cmd/server/main.go
+++ b/cmd/server/main.go
@@ -215,6 +215,8 @@ func registerAdapters(cfg *config.Config) {
 		adapters.NewParakeetAdapter(nvidiaEnvPath))
 	registry.RegisterTranscriptionAdapter("canary",
 		adapters.NewCanaryAdapter(nvidiaEnvPath)) // Shares with Parakeet
+	registry.RegisterTranscriptionAdapter("openai_whisper",
+		adapters.NewOpenAIAdapter(cfg.OpenAIAPIKey))

 	// Register diarization adapters
 	registry.RegisterDiarizationAdapter("pyannote",
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -33,6 +33,9 @@ type Config struct {
 	// Python/WhisperX configuration
 	UVPath      string
 	WhisperXEnv string
+
+	// OpenAI configuration
+	OpenAIAPIKey string
 }

 // Load loads configuration from environment variables and .env file
@@ -51,6 +54,7 @@ func Load() *Config {
 		TranscriptsDir: getEnv("TRANSCRIPTS_DIR", "data/transcripts"),
 		UVPath:         findUVPath(),
 		WhisperXEnv:    getEnv("WHISPERX_ENV", "data/whisperx-env"),
+		OpenAIAPIKey:   getEnv("OPENAI_API_KEY", ""),
 	}
 }

--- a/internal/models/transcription.go
+++ b/internal/models/transcription.go
@@ -124,6 +124,9 @@ type WhisperXParams struct {

 	// Webhook settings
 	CallbackURL *string `json:"callback_url,omitempty" gorm:"type:text"`
+
+	// OpenAI settings
+	APIKey *string `json:"api_key,omitempty" gorm:"type:text"`
 }

 // BeforeCreate sets the ID if not already set
--- a/internal/transcription/adapters/openai_adapter.go
+++ b/internal/transcription/adapters/openai_adapter.go
@@ -0,0 +1,261 @@
+package adapters
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"os"
+	"path/filepath"
+	"time"
+
+	"scriberr/internal/transcription/interfaces"
+)
+
+// OpenAIAdapter implements the TranscriptionAdapter interface for OpenAI API
+// (cloud transcription through the OpenAI audio transcriptions endpoint).
+type OpenAIAdapter struct {
+	// BaseAdapter is embedded, so its methods and fields are promoted onto
+	// OpenAIAdapter.
+	*BaseAdapter
+	// apiKey is the default key passed to NewOpenAIAdapter. Transcribe uses it
+	// unless a non-empty "api_key" parameter is supplied; it may be empty.
+	apiKey string
+}
+
+// NewOpenAIAdapter builds the adapter for OpenAI's hosted Whisper API.
+//
+// apiKey is the default key used by Transcribe; it may be empty, in which case
+// a key has to be supplied per request through the "api_key" parameter.
+func NewOpenAIAdapter(apiKey string) *OpenAIAdapter {
+	// Temperature bounds; the schema stores them by pointer.
+	minTemperature, maxTemperature := 0.0, 1.0
+
+	// Parameters accepted by this adapter, in display order.
+	paramSchema := []interfaces.ParameterSchema{
+		{
+			Name:        "api_key",
+			Type:        "string",
+			Required:    false, // the adapter-level key is the fallback
+			Description: "OpenAI API Key (overrides system default)",
+			Group:       "authentication",
+		},
+		{
+			Name:        "model",
+			Type:        "string",
+			Required:    false,
+			Default:     "whisper-1",
+			Options:     []string{"whisper-1"},
+			Description: "ID of the model to use",
+			Group:       "basic",
+		},
+		{
+			Name:        "language",
+			Type:        "string",
+			Required:    false,
+			Description: "Language of the input audio (ISO-639-1)",
+			Group:       "basic",
+		},
+		{
+			Name:        "prompt",
+			Type:        "string",
+			Required:    false,
+			Description: "Optional text to guide the model's style or continue a previous audio segment",
+			Group:       "advanced",
+		},
+		{
+			Name:        "temperature",
+			Type:        "float",
+			Required:    false,
+			Default:     0.0,
+			Min:         &minTemperature,
+			Max:         &maxTemperature,
+			Description: "Sampling temperature",
+			Group:       "quality",
+		},
+	}
+
+	// Static description of what the cloud model offers.
+	caps := interfaces.ModelCapabilities{
+		ModelID:     "openai_whisper",
+		ModelFamily: "openai",
+		DisplayName: "OpenAI Whisper API",
+		Description: "Cloud-based transcription using OpenAI's Whisper model",
+		Version:     "v1",
+		SupportedLanguages: []string{
+			"af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr",
+			"cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el",
+			"he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko",
+			"lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl",
+			"pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl",
+			"ta", "th", "tr", "uk", "ur", "vi", "cy",
+		},
+		SupportedFormats:  []string{"flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"},
+		RequiresGPU:       false,
+		MemoryRequirement: 0, // nothing runs locally
+		Features: map[string]bool{
+			"timestamps":         true,  // Transcribe requests verbose_json, which carries segments
+			"word_level":         false, // Transcribe only requests segment granularity
+			"diarization":        false, // no speaker labels are produced by this adapter
+			"translation":        true,
+			"language_detection": true,
+			"vad":                true, // advertised as implicit in the service
+		},
+		Metadata: map[string]string{
+			"provider": "openai",
+			"api_url":  "https://api.openai.com/v1/audio/transcriptions",
+		},
+	}
+
+	return &OpenAIAdapter{
+		BaseAdapter: NewBaseAdapter("openai_whisper", "", caps, paramSchema),
+		apiKey:      apiKey,
+	}
+}
+
+// GetSupportedModels lists the OpenAI model IDs this adapter accepts.
+// A fresh slice is returned on every call.
+func (a *OpenAIAdapter) GetSupportedModels() []string {
+	models := []string{"whisper-1"}
+	return models
+}
+
+// PrepareEnvironment has nothing to install for a cloud adapter: it only marks
+// the adapter as initialized and always returns nil. ctx is not used.
+func (a *OpenAIAdapter) PrepareEnvironment(ctx context.Context) error {
+	a.BaseAdapter.initialized = true
+	return nil
+}
+
+// Transcribe sends the audio file at input.FilePath to the OpenAI
+// transcription endpoint and maps the verbose_json reply onto a
+// TranscriptResult carrying segment-level timestamps.
+//
+// The API key comes from params["api_key"] when that is a non-empty string and
+// otherwise from the key given to NewOpenAIAdapter; an error is returned when
+// both are empty. The request is bound to ctx and to a 10 minute client
+// timeout. A non-200 status is returned as an error that includes a bounded
+// prefix of the response body.
+//
+// The results are named so that the deferred LogProcessingEnd call receives
+// the error actually returned instead of always logging success.
+func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioInput, params map[string]interface{}, procCtx interfaces.ProcessingContext) (result *interfaces.TranscriptResult, err error) {
+	startTime := time.Now()
+	a.LogProcessingStart(input, procCtx)
+	defer func() {
+		// err is the named result, so it holds whatever this call returns.
+		a.LogProcessingEnd(procCtx, time.Since(startTime), err)
+	}()
+
+	// Validate input
+	if err := a.ValidateAudioInput(input); err != nil {
+		return nil, fmt.Errorf("invalid audio input: %w", err)
+	}
+
+	// Resolve the API key: a per-request key wins over the adapter default.
+	apiKey := a.apiKey
+	if key, ok := params["api_key"].(string); ok && key != "" {
+		apiKey = key
+	}
+	if apiKey == "" {
+		return nil, fmt.Errorf("OpenAI API key is required but not provided")
+	}
+
+	// Build the multipart body. The whole upload is buffered in memory before
+	// the request is sent.
+	body := &bytes.Buffer{}
+	writer := multipart.NewWriter(body)
+
+	file, err := os.Open(input.FilePath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open audio file: %w", err)
+	}
+	defer file.Close()
+
+	part, err := writer.CreateFormFile("file", filepath.Base(input.FilePath))
+	if err != nil {
+		return nil, fmt.Errorf("failed to create form file: %w", err)
+	}
+	if _, err := io.Copy(part, file); err != nil {
+		return nil, fmt.Errorf("failed to copy file content: %w", err)
+	}
+
+	model := a.GetStringParameter(params, "model")
+	if model == "" {
+		model = "whisper-1"
+	}
+
+	// Collect the form fields in the order they are written, then write them
+	// in one place so every WriteField error is checked.
+	fields := [][2]string{
+		{"model", model},
+		{"response_format", "verbose_json"},
+		{"timestamp_granularities[]", "segment"}, // request segment timestamps
+	}
+	if lang := a.GetStringParameter(params, "language"); lang != "" {
+		fields = append(fields, [2]string{"language", lang})
+	}
+	if prompt := a.GetStringParameter(params, "prompt"); prompt != "" {
+		fields = append(fields, [2]string{"prompt", prompt})
+	}
+	temp := a.GetFloatParameter(params, "temperature")
+	fields = append(fields, [2]string{"temperature", fmt.Sprintf("%.2f", temp)})
+
+	for _, field := range fields {
+		if err := writer.WriteField(field[0], field[1]); err != nil {
+			return nil, fmt.Errorf("failed to write form field %q: %w", field[0], err)
+		}
+	}
+
+	// Close writes the terminating boundary; the body is incomplete without it.
+	if err := writer.Close(); err != nil {
+		return nil, fmt.Errorf("failed to close multipart writer: %w", err)
+	}
+
+	// Create request
+	req, err := http.NewRequestWithContext(ctx, http.MethodPost, "https://api.openai.com/v1/audio/transcriptions", body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+	req.Header.Set("Authorization", "Bearer "+apiKey)
+
+	// Execute request
+	client := &http.Client{
+		Timeout: 10 * time.Minute, // Generous timeout for large files
+	}
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("request failed: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		// Best effort: the body is only used to enrich the error message, so a
+		// read failure is ignored and the size is capped at 64 KiB.
+		respBody, _ := io.ReadAll(io.LimitReader(resp.Body, 64<<10))
+		return nil, fmt.Errorf("OpenAI API error (status %d): %s", resp.StatusCode, string(respBody))
+	}
+
+	// Parse response (verbose_json shape)
+	var openAIResponse struct {
+		Task     string  `json:"task"`
+		Language string  `json:"language"`
+		Duration float64 `json:"duration"`
+		Text     string  `json:"text"`
+		Segments []struct {
+			ID               int     `json:"id"`
+			Seek             int     `json:"seek"`
+			Start            float64 `json:"start"`
+			End              float64 `json:"end"`
+			Text             string  `json:"text"`
+			Tokens           []int   `json:"tokens"`
+			Temperature      float64 `json:"temperature"`
+			AvgLogprob       float64 `json:"avg_logprob"`
+			CompressionRatio float64 `json:"compression_ratio"`
+			NoSpeechProb     float64 `json:"no_speech_prob"`
+		} `json:"segments"`
+	}
+	if err := json.NewDecoder(resp.Body).Decode(&openAIResponse); err != nil {
+		return nil, fmt.Errorf("failed to decode response: %w", err)
+	}
+
+	// NOTE(review): CreateDefaultMetadata is not visible from here. Hand it a
+	// copy without the API key so the secret cannot end up in stored metadata
+	// if that helper copies parameter values.
+	metaParams := make(map[string]interface{}, len(params))
+	for name, value := range params {
+		if name != "api_key" {
+			metaParams[name] = value
+		}
+	}
+
+	// Convert to TranscriptResult
+	result = &interfaces.TranscriptResult{
+		Language:       openAIResponse.Language,
+		Text:           openAIResponse.Text,
+		Segments:       make([]interfaces.TranscriptSegment, len(openAIResponse.Segments)),
+		ProcessingTime: time.Since(startTime),
+		ModelUsed:      model,
+		Metadata:       a.CreateDefaultMetadata(metaParams),
+	}
+	for i, seg := range openAIResponse.Segments {
+		result.Segments[i] = interfaces.TranscriptSegment{
+			Start: seg.Start,
+			End:   seg.End,
+			Text:  seg.Text,
+		}
+	}
+
+	// Only segment granularity is requested above, so WordSegments is left
+	// empty.
+
+	return result, nil
+}
+
+// GetEstimatedProcessingTime provides OpenAI-specific time estimation
+func (a *OpenAIAdapter) GetEstimatedProcessingTime(input interfaces.AudioInput) time.Duration {
+	// Cloud transcription is generally faster, approx 10-20% of audio duration
+	audioDuration := input.Duration
+	if audioDuration == 0 {
+		return 30 * time.Second // Fallback
+	}
+	return time.Duration(float64(audioDuration) * 0.15)
+}
--- a/internal/transcription/unified_service.go
+++ b/internal/transcription/unified_service.go
@@ -337,6 +337,8 @@ func (u *UnifiedTranscriptionService) selectModels(params models.WhisperXParams)
 		transcriptionModelID = "canary"
 	case "whisper":
 		transcriptionModelID = "whisperx"
+	case "openai":
+		transcriptionModelID = "openai_whisper"
 	default:
 		transcriptionModelID = "whisperx" // Default fallback
 	}
@@ -510,12 +512,36 @@ func (u *UnifiedTranscriptionService) convertParametersForModel(params models.Wh
 		return u.convertToPyannoteParams(params)
 	case "sortformer":
 		return u.convertToSortformerParams(params)
+	case "openai_whisper":
+		return u.convertToOpenAIParams(params)
 	default:
 		// Fallback to legacy conversion
 		return u.parametersToMap(params)
 	}
 }

+// convertToOpenAIParams builds the parameter map handed to the OpenAI adapter.
+// "model" and "temperature" are always set; "language", "prompt" and "api_key"
+// are added only when the corresponding optional field is present (and, for
+// the API key, non-empty).
+func (u *UnifiedTranscriptionService) convertToOpenAIParams(params models.WhisperXParams) map[string]interface{} {
+	out := make(map[string]interface{}, 5)
+	out["model"] = params.Model
+	out["temperature"] = params.Temperature
+
+	if lang := params.Language; lang != nil {
+		out["language"] = *lang
+	}
+	if prompt := params.InitialPrompt; prompt != nil {
+		out["prompt"] = *prompt
+	}
+	// A per-request key (e.g. entered in the UI) is forwarded so the adapter
+	// can use it in place of its default.
+	if key := params.APIKey; key != nil && *key != "" {
+		out["api_key"] = *key
+	}
+
+	return out
+}
+
 // convertToParakeetParams converts to Parakeet-specific parameters
 func (u *UnifiedTranscriptionService) convertToParakeetParams(params models.WhisperXParams) map[string]interface{} {
 	return map[string]interface{}{
--- a/BIN
+++ b/BIN
--- a/web/frontend/src/components/TranscriptionConfigDialog.tsx
+++ b/web/frontend/src/components/TranscriptionConfigDialog.tsx
@@ -105,6 +105,9 @@ export interface WhisperXParams {

  // Multi-track transcription settings
  is_multi_track_enabled: boolean;
+
+  // OpenAI settings
+  api_key?: string;
 }

 // Parameter descriptions for hover cards
@@ -202,6 +205,7 @@ const DEFAULT_PARAMS: WhisperXParams = {
  attention_context_left: 256,
  attention_context_right: 256,
  is_multi_track_enabled: false,
+  api_key: "",
 };

 const WHISPER_MODELS = [
@@ -469,6 +473,9 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
                <SelectItem value="nvidia_canary" className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
                  NVIDIA Canary
                </SelectItem>
+                <SelectItem value="openai" className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
+                  OpenAI API
+                </SelectItem>
              </SelectContent>
            </Select>
          </div>
@@ -933,6 +940,141 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
              </div>
            )}
          </div>
+        ) : params.model_family === "openai" ? (
+          <div className="space-y-6">
+            <div className="p-4 border border-blue-200 dark:border-blue-700 rounded-lg bg-blue-50 dark:bg-blue-900/20">
+              <div className="flex items-center gap-2">
+                <Info className="h-4 w-4 text-blue-600 dark:text-blue-400" />
+                <span className="text-sm font-medium text-blue-800 dark:text-blue-200">Cloud Transcription</span>
+              </div>
+              <p className="text-sm text-blue-700 dark:text-blue-300 mt-1">
+                Audio will be sent to OpenAI servers for processing.
+              </p>
+            </div>
+
+            {/* API Key */}
+            <div className="space-y-2">
+              <div className="flex items-center gap-2">
+                <Label htmlFor="openai_api_key" className="text-carbon-700 dark:text-carbon-300 font-medium">
+                  OpenAI API Key
+                </Label>
+                <HoverCard>
+                  <HoverCardTrigger asChild>
+                    <Info className="h-4 w-4 text-carbon-400 cursor-help" />
+                  </HoverCardTrigger>
+                  <HoverCardContent className="w-80 bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
+                    <p className="text-sm text-carbon-700 dark:text-carbon-300">
+                      Your OpenAI API key. If not provided, the server-configured key will be used (if any).
+                    </p>
+                  </HoverCardContent>
+                </HoverCard>
+              </div>
+              <Input
+                id="openai_api_key"
+                type="password"
+                value={params.api_key || ""}
+                onChange={(e) => updateParam('api_key', e.target.value)}
+                placeholder="sk-..."
+                className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100"
+              />
+            </div>
+
+            {/* Model Selection */}
+            <div className="space-y-2">
+              <Label htmlFor="openai_model" className="text-carbon-700 dark:text-carbon-300 font-medium">
+                Model
+              </Label>
+              <Select
+                value={params.model || "whisper-1"}
+                onValueChange={(value) => updateParam('model', value)}
+              >
+                <SelectTrigger className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100">
+                  <SelectValue />
+                </SelectTrigger>
+                <SelectContent className="bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
+                  <SelectItem value="whisper-1" className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
+                    whisper-1
+                  </SelectItem>
+                </SelectContent>
+              </Select>
+            </div>
+
+            {/* Language Selection */}
+            <div className="space-y-2">
+              <Label htmlFor="openai_language" className="text-carbon-700 dark:text-carbon-300 font-medium">
+                Language
+              </Label>
+              <Select
+                value={params.language || "auto"}
+                onValueChange={(value) => updateParam('language', value === "auto" ? undefined : value)}
+              >
+                <SelectTrigger className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100">
+                  <SelectValue placeholder="Auto-detect" />
+                </SelectTrigger>
+                <SelectContent className="bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700 max-h-60">
+                  {LANGUAGES.map((lang) => (
+                    <SelectItem key={lang.value} value={lang.value} className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
+                      {lang.label}
+                    </SelectItem>
+                  ))}
+                </SelectContent>
+              </Select>
+            </div>
+
+            {/* Temperature */}
+            <div className="space-y-2">
+              <div className="flex items-center gap-2">
+                <Label htmlFor="openai_temperature" className="text-carbon-700 dark:text-carbon-300">
+                  Temperature
+                </Label>
+                <HoverCard>
+                  <HoverCardTrigger asChild>
+                    <Info className="h-4 w-4 text-carbon-400 cursor-help" />
+                  </HoverCardTrigger>
+                  <HoverCardContent className="w-80 bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
+                    <p className="text-sm text-carbon-700 dark:text-carbon-300">{PARAM_DESCRIPTIONS.temperature}</p>
+                  </HoverCardContent>
+                </HoverCard>
+              </div>
+              <div className="flex items-center gap-4">
+                <Slider
+                  value={[params.temperature]}
+                  onValueChange={(value) => updateParam('temperature', value[0])}
+                  max={1}
+                  step={0.1}
+                  className="flex-1"
+                />
+                <span className="w-12 text-right text-sm text-carbon-600 dark:text-carbon-400">
+                  {params.temperature}
+                </span>
+              </div>
+            </div>
+
+            {/* Initial Prompt */}
+            <div className="space-y-2">
+              <div className="flex items-center gap-2">
+                <Label htmlFor="openai_prompt" className="text-carbon-700 dark:text-carbon-300">
+                  Initial Prompt
+                </Label>
+                <HoverCard>
+                  <HoverCardTrigger asChild>
+                    <Info className="h-4 w-4 text-carbon-400 cursor-help" />
+                  </HoverCardTrigger>
+                  <HoverCardContent className="w-80 bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
+                    <p className="text-sm text-carbon-700 dark:text-carbon-300">{PARAM_DESCRIPTIONS.initial_prompt}</p>
+                  </HoverCardContent>
+                </HoverCard>
+              </div>
+              <Textarea
+                id="openai_prompt"
+                value={params.initial_prompt || ""}
+                onChange={(e) => updateParam('initial_prompt', e.target.value || undefined)}
+                placeholder="Optional text to guide the model's style..."
+                className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100 resize-none"
+                rows={3}
+              />
+            </div>
+          </div>
        ) : (
          <Tabs defaultValue="basic" className="w-full">
            <TabsList className={`grid w-full items-center h-auto bg-carbon-100 dark:bg-carbon-800 p-1 rounded-lg ${isMultiTrack ? 'grid-cols-3' : 'grid-cols-4'}`}>