feat: add gpt-4o support, fix response formats, and add UI warnings for timestamp limitations

2026-03-03 03:57:01 +00:00 · 2025-12-01 13:58:12 -08:00
parent be85b3d286
commit c8f17ea2f9
3 changed files with 62 additions and 16 deletions
--- a/internal/api/openai_handler.go
+++ b/internal/api/openai_handler.go
@@ -4,6 +4,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"net/http"
+	"strings"
 	"time"

 	"github.com/gin-gonic/gin"
@@ -92,21 +93,25 @@ func (h *Handler) ValidateOpenAIKey(c *gin.Context) {
 		return
 	}

-	// Filter for whisper models
-	var whisperModels []string
+	// Filter for whisper and gpt-4o transcription models
+	var availableModels []string
 	for _, model := range modelList.Data {
-		if model.ID == "whisper-1" || (len(model.ID) > 7 && model.ID[:7] == "whisper") {
-			whisperModels = append(whisperModels, model.ID)
+		isWhisper := model.ID == "whisper-1" || (len(model.ID) > 7 && model.ID[:7] == "whisper")
+		isGPT4oAudio := (len(model.ID) > 6 && model.ID[:6] == "gpt-4o") &&
+			(strings.Contains(model.ID, "transcribe") || strings.Contains(model.ID, "audio"))
+
+		if isWhisper || isGPT4oAudio {
+			availableModels = append(availableModels, model.ID)
 		}
 	}

-	// If no whisper models found (unlikely), default to whisper-1
-	if len(whisperModels) == 0 {
-		whisperModels = []string{"whisper-1"}
+	// If no models found (unlikely), default to whisper-1
+	if len(availableModels) == 0 {
+		availableModels = []string{"whisper-1"}
 	}

 	c.JSON(http.StatusOK, gin.H{
 		"valid":  true,
-		"models": whisperModels,
+		"models": availableModels,
 	})
 }
--- a/internal/transcription/adapters/openai_adapter.go
+++ b/internal/transcription/adapters/openai_adapter.go
@@ -10,6 +10,7 @@ import (
 	"net/http"
 	"os"
 	"path/filepath"
+	"strings"
 	"time"

 	"scriberr/internal/transcription/interfaces"
@@ -184,9 +185,22 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn
 	}
 	writeLog("Model: %s", model)
 	_ = writer.WriteField("model", model)
-	_ = writer.WriteField("response_format", "verbose_json")
-	_ = writer.WriteField("timestamp_granularities[]", "word")    // Request word timestamps
-	_ = writer.WriteField("timestamp_granularities[]", "segment") // Request segment timestamps
+
+	if strings.HasPrefix(model, "gpt-4o") {
+		if strings.Contains(model, "diarize") {
+			_ = writer.WriteField("response_format", "diarized_json")
+		} else {
+			_ = writer.WriteField("response_format", "json")
+		}
+		// gpt-4o models don't support timestamp_granularities with these formats
+	} else {
+		_ = writer.WriteField("response_format", "verbose_json")
+		// timestamp_granularities is only supported for whisper-1
+		if model == "whisper-1" {
+			_ = writer.WriteField("timestamp_granularities[]", "word")    // Request word timestamps
+			_ = writer.WriteField("timestamp_granularities[]", "segment") // Request segment timestamps
+		}
+	}

 	if lang := a.GetStringParameter(params, "language"); lang != "" {
 		writeLog("Language: %s", lang)
@@ -280,11 +294,22 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn
 		Metadata:       a.CreateDefaultMetadata(params),
 	}

-	for i, seg := range openAIResponse.Segments {
-		result.Segments[i] = interfaces.TranscriptSegment{
-			Start: seg.Start,
-			End:   seg.End,
-			Text:  seg.Text,
+	if len(openAIResponse.Segments) > 0 {
+		for i, seg := range openAIResponse.Segments {
+			result.Segments[i] = interfaces.TranscriptSegment{
+				Start: seg.Start,
+				End:   seg.End,
+				Text:  seg.Text,
+			}
+		}
+	} else if openAIResponse.Text != "" {
+		// If no segments are returned (e.g. standard json format), create one segment with the whole text
+		result.Segments = []interfaces.TranscriptSegment{
+			{
+				Start: 0,
+				End:   openAIResponse.Duration,
+				Text:  openAIResponse.Text,
+			},
 		}
 	}

--- a/web/frontend/src/components/TranscriptionConfigDialog.tsx
+++ b/web/frontend/src/components/TranscriptionConfigDialog.tsx
@@ -1074,6 +1074,22 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
              </Select>
            </div>

+            {params.model && params.model !== "whisper-1" && (
+              <div className="p-4 border border-orange-200 dark:border-orange-700 rounded-lg bg-orange-50 dark:bg-orange-900/20">
+                <div className="flex items-start gap-3">
+                  <div className="text-orange-500 dark:text-orange-400 mt-0.5">⚠️</div>
+                  <div>
+                    <h4 className="text-sm font-medium text-orange-800 dark:text-orange-200 mb-1">
+                      Limited Synchronization
+                    </h4>
+                    <p className="text-sm text-orange-700 dark:text-orange-300">
+                      Word-level timestamps are only supported by the <strong>whisper-1</strong> model. Synchronized playback will not be available for this model.
+                    </p>
+                  </div>
+                </div>
+              </div>
+            )}
+
            {/* Language Selection */}
            <div className="space-y-2">
              <Label htmlFor="openai_language" className="text-carbon-700 dark:text-carbon-300 font-medium">