feat: add gpt-4o support, fix response formats, and add UI warnings for timestamp limitations

This commit is contained in:
rishikanthc
2025-12-01 13:58:12 -08:00
parent be85b3d286
commit c8f17ea2f9
3 changed files with 62 additions and 16 deletions

View File

@@ -4,6 +4,7 @@ import (
"encoding/json"
"fmt"
"net/http"
"strings"
"time"
"github.com/gin-gonic/gin"
@@ -92,21 +93,25 @@ func (h *Handler) ValidateOpenAIKey(c *gin.Context) {
return
}
// Filter for whisper models
var whisperModels []string
// Filter for whisper and gpt-4o transcription models
var availableModels []string
for _, model := range modelList.Data {
if model.ID == "whisper-1" || (len(model.ID) > 7 && model.ID[:7] == "whisper") {
whisperModels = append(whisperModels, model.ID)
isWhisper := model.ID == "whisper-1" || (len(model.ID) > 7 && model.ID[:7] == "whisper")
isGPT4oAudio := (len(model.ID) > 6 && model.ID[:6] == "gpt-4o") &&
(strings.Contains(model.ID, "transcribe") || strings.Contains(model.ID, "audio"))
if isWhisper || isGPT4oAudio {
availableModels = append(availableModels, model.ID)
}
}
// If no whisper models found (unlikely), default to whisper-1
if len(whisperModels) == 0 {
whisperModels = []string{"whisper-1"}
// If no models found (unlikely), default to whisper-1
if len(availableModels) == 0 {
availableModels = []string{"whisper-1"}
}
c.JSON(http.StatusOK, gin.H{
"valid": true,
"models": whisperModels,
"models": availableModels,
})
}

View File

@@ -10,6 +10,7 @@ import (
"net/http"
"os"
"path/filepath"
"strings"
"time"
"scriberr/internal/transcription/interfaces"
@@ -184,9 +185,22 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn
}
writeLog("Model: %s", model)
_ = writer.WriteField("model", model)
_ = writer.WriteField("response_format", "verbose_json")
_ = writer.WriteField("timestamp_granularities[]", "word") // Request word timestamps
_ = writer.WriteField("timestamp_granularities[]", "segment") // Request segment timestamps
if strings.HasPrefix(model, "gpt-4o") {
if strings.Contains(model, "diarize") {
_ = writer.WriteField("response_format", "diarized_json")
} else {
_ = writer.WriteField("response_format", "json")
}
// gpt-4o models don't support timestamp_granularities with these formats
} else {
_ = writer.WriteField("response_format", "verbose_json")
// timestamp_granularities is only supported for whisper-1
if model == "whisper-1" {
_ = writer.WriteField("timestamp_granularities[]", "word") // Request word timestamps
_ = writer.WriteField("timestamp_granularities[]", "segment") // Request segment timestamps
}
}
if lang := a.GetStringParameter(params, "language"); lang != "" {
writeLog("Language: %s", lang)
@@ -280,11 +294,22 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn
Metadata: a.CreateDefaultMetadata(params),
}
for i, seg := range openAIResponse.Segments {
result.Segments[i] = interfaces.TranscriptSegment{
Start: seg.Start,
End: seg.End,
Text: seg.Text,
if len(openAIResponse.Segments) > 0 {
for i, seg := range openAIResponse.Segments {
result.Segments[i] = interfaces.TranscriptSegment{
Start: seg.Start,
End: seg.End,
Text: seg.Text,
}
}
} else if openAIResponse.Text != "" {
// If no segments were returned (e.g. with the plain "json" response format), create a single segment containing the whole text
result.Segments = []interfaces.TranscriptSegment{
{
Start: 0,
End: openAIResponse.Duration,
Text: openAIResponse.Text,
},
}
}

View File

@@ -1074,6 +1074,22 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
</Select>
</div>
{params.model && params.model !== "whisper-1" && (
<div className="p-4 border border-orange-200 dark:border-orange-700 rounded-lg bg-orange-50 dark:bg-orange-900/20">
<div className="flex items-start gap-3">
<div className="text-orange-500 dark:text-orange-400 mt-0.5"></div>
<div>
<h4 className="text-sm font-medium text-orange-800 dark:text-orange-200 mb-1">
Limited Synchronization
</h4>
<p className="text-sm text-orange-700 dark:text-orange-300">
Word-level timestamps are only supported by the <strong>whisper-1</strong> model. Synchronized playback will not be available for this model.
</p>
</div>
</div>
</div>
)}
{/* Language Selection */}
<div className="space-y-2">
<Label htmlFor="openai_language" className="text-carbon-700 dark:text-carbon-300 font-medium">