mirror of
https://github.com/rishikanthc/Scriberr.git
synced 2026-03-03 03:57:01 +00:00
feat: add gpt-4o support, fix response formats, and add ui warnings for timestamp limitations
This commit is contained in:
@@ -4,6 +4,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
@@ -92,21 +93,25 @@ func (h *Handler) ValidateOpenAIKey(c *gin.Context) {
|
||||
return
|
||||
}
|
||||
|
||||
// Filter for whisper models
|
||||
var whisperModels []string
|
||||
// Filter for whisper and gpt-4o transcription models
|
||||
var availableModels []string
|
||||
for _, model := range modelList.Data {
|
||||
if model.ID == "whisper-1" || (len(model.ID) > 7 && model.ID[:7] == "whisper") {
|
||||
whisperModels = append(whisperModels, model.ID)
|
||||
isWhisper := model.ID == "whisper-1" || (len(model.ID) > 7 && model.ID[:7] == "whisper")
|
||||
isGPT4oAudio := (len(model.ID) > 6 && model.ID[:6] == "gpt-4o") &&
|
||||
(strings.Contains(model.ID, "transcribe") || strings.Contains(model.ID, "audio"))
|
||||
|
||||
if isWhisper || isGPT4oAudio {
|
||||
availableModels = append(availableModels, model.ID)
|
||||
}
|
||||
}
|
||||
|
||||
// If no whisper models found (unlikely), default to whisper-1
|
||||
if len(whisperModels) == 0 {
|
||||
whisperModels = []string{"whisper-1"}
|
||||
// If no models found (unlikely), default to whisper-1
|
||||
if len(availableModels) == 0 {
|
||||
availableModels = []string{"whisper-1"}
|
||||
}
|
||||
|
||||
c.JSON(http.StatusOK, gin.H{
|
||||
"valid": true,
|
||||
"models": whisperModels,
|
||||
"models": availableModels,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"scriberr/internal/transcription/interfaces"
|
||||
@@ -184,9 +185,22 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn
|
||||
}
|
||||
writeLog("Model: %s", model)
|
||||
_ = writer.WriteField("model", model)
|
||||
_ = writer.WriteField("response_format", "verbose_json")
|
||||
_ = writer.WriteField("timestamp_granularities[]", "word") // Request word timestamps
|
||||
_ = writer.WriteField("timestamp_granularities[]", "segment") // Request segment timestamps
|
||||
|
||||
if strings.HasPrefix(model, "gpt-4o") {
|
||||
if strings.Contains(model, "diarize") {
|
||||
_ = writer.WriteField("response_format", "diarized_json")
|
||||
} else {
|
||||
_ = writer.WriteField("response_format", "json")
|
||||
}
|
||||
// gpt-4o models don't support timestamp_granularities with these formats
|
||||
} else {
|
||||
_ = writer.WriteField("response_format", "verbose_json")
|
||||
// timestamp_granularities is only supported for whisper-1
|
||||
if model == "whisper-1" {
|
||||
_ = writer.WriteField("timestamp_granularities[]", "word") // Request word timestamps
|
||||
_ = writer.WriteField("timestamp_granularities[]", "segment") // Request segment timestamps
|
||||
}
|
||||
}
|
||||
|
||||
if lang := a.GetStringParameter(params, "language"); lang != "" {
|
||||
writeLog("Language: %s", lang)
|
||||
@@ -280,11 +294,22 @@ func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn
|
||||
Metadata: a.CreateDefaultMetadata(params),
|
||||
}
|
||||
|
||||
for i, seg := range openAIResponse.Segments {
|
||||
result.Segments[i] = interfaces.TranscriptSegment{
|
||||
Start: seg.Start,
|
||||
End: seg.End,
|
||||
Text: seg.Text,
|
||||
if len(openAIResponse.Segments) > 0 {
|
||||
for i, seg := range openAIResponse.Segments {
|
||||
result.Segments[i] = interfaces.TranscriptSegment{
|
||||
Start: seg.Start,
|
||||
End: seg.End,
|
||||
Text: seg.Text,
|
||||
}
|
||||
}
|
||||
} else if openAIResponse.Text != "" {
|
||||
// If no segments returned (e.g. standard json format), create one segment with the whole text
|
||||
result.Segments = []interfaces.TranscriptSegment{
|
||||
{
|
||||
Start: 0,
|
||||
End: openAIResponse.Duration,
|
||||
Text: openAIResponse.Text,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1074,6 +1074,22 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
|
||||
</Select>
|
||||
</div>
|
||||
|
||||
{params.model && params.model !== "whisper-1" && (
|
||||
<div className="p-4 border border-orange-200 dark:border-orange-700 rounded-lg bg-orange-50 dark:bg-orange-900/20">
|
||||
<div className="flex items-start gap-3">
|
||||
<div className="text-orange-500 dark:text-orange-400 mt-0.5">⚠️</div>
|
||||
<div>
|
||||
<h4 className="text-sm font-medium text-orange-800 dark:text-orange-200 mb-1">
|
||||
Limited Synchronization
|
||||
</h4>
|
||||
<p className="text-sm text-orange-700 dark:text-orange-300">
|
||||
Word-level timestamps are only supported by the <strong>whisper-1</strong> model. Synchronized playback will not be available for this model.
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Language Selection */}
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="openai_language" className="text-carbon-700 dark:text-carbon-300 font-medium">
|
||||
|
||||
Reference in New Issue
Block a user