mirror of
https://github.com/rishikanthc/Scriberr.git
synced 2026-06-28 14:55:46 +00:00
feat: implement OpenAI transcription support
This commit is contained in:
committed by
Rishikanth Chandrasekaran
parent
b62a944988
commit
f3266b31e5
@@ -215,6 +215,8 @@ func registerAdapters(cfg *config.Config) {
|
||||
adapters.NewParakeetAdapter(nvidiaEnvPath))
|
||||
registry.RegisterTranscriptionAdapter("canary",
|
||||
adapters.NewCanaryAdapter(nvidiaEnvPath)) // Shares with Parakeet
|
||||
registry.RegisterTranscriptionAdapter("openai_whisper",
|
||||
adapters.NewOpenAIAdapter(cfg.OpenAIAPIKey))
|
||||
|
||||
// Register diarization adapters
|
||||
registry.RegisterDiarizationAdapter("pyannote",
|
||||
|
||||
@@ -33,6 +33,9 @@ type Config struct {
|
||||
// Python/WhisperX configuration
|
||||
UVPath string
|
||||
WhisperXEnv string
|
||||
|
||||
// OpenAI configuration
|
||||
OpenAIAPIKey string
|
||||
}
|
||||
|
||||
// Load loads configuration from environment variables and .env file
|
||||
@@ -51,6 +54,7 @@ func Load() *Config {
|
||||
TranscriptsDir: getEnv("TRANSCRIPTS_DIR", "data/transcripts"),
|
||||
UVPath: findUVPath(),
|
||||
WhisperXEnv: getEnv("WHISPERX_ENV", "data/whisperx-env"),
|
||||
OpenAIAPIKey: getEnv("OPENAI_API_KEY", ""),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -124,6 +124,9 @@ type WhisperXParams struct {
|
||||
|
||||
// Webhook settings
|
||||
CallbackURL *string `json:"callback_url,omitempty" gorm:"type:text"`
|
||||
|
||||
// OpenAI settings
|
||||
APIKey *string `json:"api_key,omitempty" gorm:"type:text"`
|
||||
}
|
||||
|
||||
// BeforeCreate sets the ID if not already set
|
||||
|
||||
261
internal/transcription/adapters/openai_adapter.go
Normal file
261
internal/transcription/adapters/openai_adapter.go
Normal file
@@ -0,0 +1,261 @@
|
||||
package adapters
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"mime/multipart"
|
||||
"net/http"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"time"
|
||||
|
||||
"scriberr/internal/transcription/interfaces"
|
||||
)
|
||||
|
||||
// OpenAIAdapter implements the TranscriptionAdapter interface for OpenAI API
|
||||
type OpenAIAdapter struct {
|
||||
*BaseAdapter
|
||||
apiKey string
|
||||
}
|
||||
|
||||
// NewOpenAIAdapter creates a new OpenAI adapter
|
||||
func NewOpenAIAdapter(apiKey string) *OpenAIAdapter {
|
||||
capabilities := interfaces.ModelCapabilities{
|
||||
ModelID: "openai_whisper",
|
||||
ModelFamily: "openai",
|
||||
DisplayName: "OpenAI Whisper API",
|
||||
Description: "Cloud-based transcription using OpenAI's Whisper model",
|
||||
Version: "v1",
|
||||
SupportedLanguages: []string{
|
||||
"af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", "lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl", "ta", "th", "tr", "uk", "ur", "vi", "cy",
|
||||
},
|
||||
SupportedFormats: []string{"flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"},
|
||||
RequiresGPU: false,
|
||||
MemoryRequirement: 0, // Cloud-based
|
||||
Features: map[string]bool{
|
||||
"timestamps": true, // Verbose JSON response includes segments
|
||||
"word_level": false, // Not supported by standard API yet (unless using verbose_json with timestamp_granularities which is beta)
|
||||
"diarization": false, // Not supported by OpenAI API
|
||||
"translation": true,
|
||||
"language_detection": true,
|
||||
"vad": true, // Implicit
|
||||
},
|
||||
Metadata: map[string]string{
|
||||
"provider": "openai",
|
||||
"api_url": "https://api.openai.com/v1/audio/transcriptions",
|
||||
},
|
||||
}
|
||||
|
||||
schema := []interfaces.ParameterSchema{
|
||||
{
|
||||
Name: "api_key",
|
||||
Type: "string",
|
||||
Required: false, // Can be provided in config
|
||||
Description: "OpenAI API Key (overrides system default)",
|
||||
Group: "authentication",
|
||||
},
|
||||
{
|
||||
Name: "model",
|
||||
Type: "string",
|
||||
Required: false,
|
||||
Default: "whisper-1",
|
||||
Options: []string{"whisper-1"},
|
||||
Description: "ID of the model to use",
|
||||
Group: "basic",
|
||||
},
|
||||
{
|
||||
Name: "language",
|
||||
Type: "string",
|
||||
Required: false,
|
||||
Description: "Language of the input audio (ISO-639-1)",
|
||||
Group: "basic",
|
||||
},
|
||||
{
|
||||
Name: "prompt",
|
||||
Type: "string",
|
||||
Required: false,
|
||||
Description: "Optional text to guide the model's style or continue a previous audio segment",
|
||||
Group: "advanced",
|
||||
},
|
||||
{
|
||||
Name: "temperature",
|
||||
Type: "float",
|
||||
Required: false,
|
||||
Default: 0.0,
|
||||
Min: &[]float64{0.0}[0],
|
||||
Max: &[]float64{1.0}[0],
|
||||
Description: "Sampling temperature",
|
||||
Group: "quality",
|
||||
},
|
||||
}
|
||||
|
||||
baseAdapter := NewBaseAdapter("openai_whisper", "", capabilities, schema)
|
||||
|
||||
return &OpenAIAdapter{
|
||||
BaseAdapter: baseAdapter,
|
||||
apiKey: apiKey,
|
||||
}
|
||||
}
|
||||
|
||||
// GetSupportedModels returns the list of OpenAI models supported
|
||||
func (a *OpenAIAdapter) GetSupportedModels() []string {
|
||||
return []string{"whisper-1"}
|
||||
}
|
||||
|
||||
// PrepareEnvironment is a no-op for cloud adapters
|
||||
func (a *OpenAIAdapter) PrepareEnvironment(ctx context.Context) error {
|
||||
a.initialized = true
|
||||
return nil
|
||||
}
|
||||
|
||||
// Transcribe processes audio using OpenAI API
|
||||
func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioInput, params map[string]interface{}, procCtx interfaces.ProcessingContext) (*interfaces.TranscriptResult, error) {
|
||||
startTime := time.Now()
|
||||
a.LogProcessingStart(input, procCtx)
|
||||
defer func() {
|
||||
a.LogProcessingEnd(procCtx, time.Since(startTime), nil)
|
||||
}()
|
||||
|
||||
// Validate input
|
||||
if err := a.ValidateAudioInput(input); err != nil {
|
||||
return nil, fmt.Errorf("invalid audio input: %w", err)
|
||||
}
|
||||
|
||||
// Get API Key
|
||||
apiKey := a.apiKey
|
||||
if key, ok := params["api_key"].(string); ok && key != "" {
|
||||
apiKey = key
|
||||
}
|
||||
|
||||
if apiKey == "" {
|
||||
return nil, fmt.Errorf("OpenAI API key is required but not provided")
|
||||
}
|
||||
|
||||
// Prepare request body
|
||||
body := &bytes.Buffer{}
|
||||
writer := multipart.NewWriter(body)
|
||||
|
||||
// Add file
|
||||
file, err := os.Open(input.FilePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to open audio file: %w", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
part, err := writer.CreateFormFile("file", filepath.Base(input.FilePath))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create form file: %w", err)
|
||||
}
|
||||
if _, err := io.Copy(part, file); err != nil {
|
||||
return nil, fmt.Errorf("failed to copy file content: %w", err)
|
||||
}
|
||||
|
||||
// Add parameters
|
||||
model := a.GetStringParameter(params, "model")
|
||||
if model == "" {
|
||||
model = "whisper-1"
|
||||
}
|
||||
_ = writer.WriteField("model", model)
|
||||
_ = writer.WriteField("response_format", "verbose_json")
|
||||
_ = writer.WriteField("timestamp_granularities[]", "segment") // Request segment timestamps
|
||||
|
||||
if lang := a.GetStringParameter(params, "language"); lang != "" {
|
||||
_ = writer.WriteField("language", lang)
|
||||
}
|
||||
|
||||
if prompt := a.GetStringParameter(params, "prompt"); prompt != "" {
|
||||
_ = writer.WriteField("prompt", prompt)
|
||||
}
|
||||
|
||||
temp := a.GetFloatParameter(params, "temperature")
|
||||
_ = writer.WriteField("temperature", fmt.Sprintf("%.2f", temp))
|
||||
|
||||
if err := writer.Close(); err != nil {
|
||||
return nil, fmt.Errorf("failed to close multipart writer: %w", err)
|
||||
}
|
||||
|
||||
// Create request
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", "https://api.openai.com/v1/audio/transcriptions", body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", writer.FormDataContentType())
|
||||
req.Header.Set("Authorization", "Bearer "+apiKey)
|
||||
|
||||
// Execute request
|
||||
client := &http.Client{
|
||||
Timeout: 10 * time.Minute, // Generous timeout for large files
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
respBody, _ := io.ReadAll(resp.Body)
|
||||
return nil, fmt.Errorf("OpenAI API error (status %d): %s", resp.StatusCode, string(respBody))
|
||||
}
|
||||
|
||||
// Parse response
|
||||
var openAIResponse struct {
|
||||
Task string `json:"task"`
|
||||
Language string `json:"language"`
|
||||
Duration float64 `json:"duration"`
|
||||
Text string `json:"text"`
|
||||
Segments []struct {
|
||||
ID int `json:"id"`
|
||||
Seek int `json:"seek"`
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Text string `json:"text"`
|
||||
Tokens []int `json:"tokens"`
|
||||
Temperature float64 `json:"temperature"`
|
||||
AvgLogprob float64 `json:"avg_logprob"`
|
||||
CompressionRatio float64 `json:"compression_ratio"`
|
||||
NoSpeechProb float64 `json:"no_speech_prob"`
|
||||
} `json:"segments"`
|
||||
}
|
||||
|
||||
if err := json.NewDecoder(resp.Body).Decode(&openAIResponse); err != nil {
|
||||
return nil, fmt.Errorf("failed to decode response: %w", err)
|
||||
}
|
||||
|
||||
// Convert to TranscriptResult
|
||||
result := &interfaces.TranscriptResult{
|
||||
Language: openAIResponse.Language,
|
||||
Text: openAIResponse.Text,
|
||||
Segments: make([]interfaces.TranscriptSegment, len(openAIResponse.Segments)),
|
||||
ProcessingTime: time.Since(startTime),
|
||||
ModelUsed: model,
|
||||
Metadata: a.CreateDefaultMetadata(params),
|
||||
}
|
||||
|
||||
for i, seg := range openAIResponse.Segments {
|
||||
result.Segments[i] = interfaces.TranscriptSegment{
|
||||
Start: seg.Start,
|
||||
End: seg.End,
|
||||
Text: seg.Text,
|
||||
}
|
||||
}
|
||||
|
||||
// OpenAI doesn't provide word-level timestamps in standard verbose_json without extra beta flags
|
||||
// For now, we'll leave WordSegments empty or implement a basic splitter if needed.
|
||||
// Given the requirements, segment-level is sufficient for now.
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// GetEstimatedProcessingTime provides OpenAI-specific time estimation
|
||||
func (a *OpenAIAdapter) GetEstimatedProcessingTime(input interfaces.AudioInput) time.Duration {
|
||||
// Cloud transcription is generally faster, approx 10-20% of audio duration
|
||||
audioDuration := input.Duration
|
||||
if audioDuration == 0 {
|
||||
return 30 * time.Second // Fallback
|
||||
}
|
||||
return time.Duration(float64(audioDuration) * 0.15)
|
||||
}
|
||||
@@ -337,6 +337,8 @@ func (u *UnifiedTranscriptionService) selectModels(params models.WhisperXParams)
|
||||
transcriptionModelID = "canary"
|
||||
case "whisper":
|
||||
transcriptionModelID = "whisperx"
|
||||
case "openai":
|
||||
transcriptionModelID = "openai_whisper"
|
||||
default:
|
||||
transcriptionModelID = "whisperx" // Default fallback
|
||||
}
|
||||
@@ -510,12 +512,36 @@ func (u *UnifiedTranscriptionService) convertParametersForModel(params models.Wh
|
||||
return u.convertToPyannoteParams(params)
|
||||
case "sortformer":
|
||||
return u.convertToSortformerParams(params)
|
||||
case "openai_whisper":
|
||||
return u.convertToOpenAIParams(params)
|
||||
default:
|
||||
// Fallback to legacy conversion
|
||||
return u.parametersToMap(params)
|
||||
}
|
||||
}
|
||||
|
||||
// convertToOpenAIParams converts to OpenAI-specific parameters
|
||||
func (u *UnifiedTranscriptionService) convertToOpenAIParams(params models.WhisperXParams) map[string]interface{} {
|
||||
paramMap := map[string]interface{}{
|
||||
"model": params.Model,
|
||||
"temperature": params.Temperature,
|
||||
}
|
||||
|
||||
if params.Language != nil {
|
||||
paramMap["language"] = *params.Language
|
||||
}
|
||||
if params.InitialPrompt != nil {
|
||||
paramMap["prompt"] = *params.InitialPrompt
|
||||
}
|
||||
|
||||
// Add API key if provided in params (e.g. from UI override)
|
||||
if params.APIKey != nil && *params.APIKey != "" {
|
||||
paramMap["api_key"] = *params.APIKey
|
||||
}
|
||||
|
||||
return paramMap
|
||||
}
|
||||
|
||||
// convertToParakeetParams converts to Parakeet-specific parameters
|
||||
func (u *UnifiedTranscriptionService) convertToParakeetParams(params models.WhisperXParams) map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
|
||||
@@ -105,6 +105,9 @@ export interface WhisperXParams {
|
||||
|
||||
// Multi-track transcription settings
|
||||
is_multi_track_enabled: boolean;
|
||||
|
||||
// OpenAI settings
|
||||
api_key?: string;
|
||||
}
|
||||
|
||||
// Parameter descriptions for hover cards
|
||||
@@ -202,6 +205,7 @@ const DEFAULT_PARAMS: WhisperXParams = {
|
||||
attention_context_left: 256,
|
||||
attention_context_right: 256,
|
||||
is_multi_track_enabled: false,
|
||||
api_key: "",
|
||||
};
|
||||
|
||||
const WHISPER_MODELS = [
|
||||
@@ -469,6 +473,9 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
|
||||
<SelectItem value="nvidia_canary" className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
|
||||
NVIDIA Canary
|
||||
</SelectItem>
|
||||
<SelectItem value="openai" className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
|
||||
OpenAI API
|
||||
</SelectItem>
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
@@ -933,6 +940,141 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
) : params.model_family === "openai" ? (
|
||||
<div className="space-y-6">
|
||||
<div className="p-4 border border-blue-200 dark:border-blue-700 rounded-lg bg-blue-50 dark:bg-blue-900/20">
|
||||
<div className="flex items-center gap-2">
|
||||
<Info className="h-4 w-4 text-blue-600 dark:text-blue-400" />
|
||||
<span className="text-sm font-medium text-blue-800 dark:text-blue-200">Cloud Transcription</span>
|
||||
</div>
|
||||
<p className="text-sm text-blue-700 dark:text-blue-300 mt-1">
|
||||
Audio will be sent to OpenAI servers for processing.
|
||||
</p>
|
||||
</div>
|
||||
|
||||
{/* API Key */}
|
||||
<div className="space-y-2">
|
||||
<div className="flex items-center gap-2">
|
||||
<Label htmlFor="openai_api_key" className="text-carbon-700 dark:text-carbon-300 font-medium">
|
||||
OpenAI API Key
|
||||
</Label>
|
||||
<HoverCard>
|
||||
<HoverCardTrigger asChild>
|
||||
<Info className="h-4 w-4 text-carbon-400 cursor-help" />
|
||||
</HoverCardTrigger>
|
||||
<HoverCardContent className="w-80 bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
|
||||
<p className="text-sm text-carbon-700 dark:text-carbon-300">
|
||||
Your OpenAI API key. If not provided, the server-configured key will be used (if any).
|
||||
</p>
|
||||
</HoverCardContent>
|
||||
</HoverCard>
|
||||
</div>
|
||||
<Input
|
||||
id="openai_api_key"
|
||||
type="password"
|
||||
value={params.api_key || ""}
|
||||
onChange={(e) => updateParam('api_key', e.target.value)}
|
||||
placeholder="sk-..."
|
||||
className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100"
|
||||
/>
|
||||
</div>
|
||||
|
||||
{/* Model Selection */}
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="openai_model" className="text-carbon-700 dark:text-carbon-300 font-medium">
|
||||
Model
|
||||
</Label>
|
||||
<Select
|
||||
value={params.model || "whisper-1"}
|
||||
onValueChange={(value) => updateParam('model', value)}
|
||||
>
|
||||
<SelectTrigger className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100">
|
||||
<SelectValue />
|
||||
</SelectTrigger>
|
||||
<SelectContent className="bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
|
||||
<SelectItem value="whisper-1" className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
|
||||
whisper-1
|
||||
</SelectItem>
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
|
||||
{/* Language Selection */}
|
||||
<div className="space-y-2">
|
||||
<Label htmlFor="openai_language" className="text-carbon-700 dark:text-carbon-300 font-medium">
|
||||
Language
|
||||
</Label>
|
||||
<Select
|
||||
value={params.language || "auto"}
|
||||
onValueChange={(value) => updateParam('language', value === "auto" ? undefined : value)}
|
||||
>
|
||||
<SelectTrigger className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100">
|
||||
<SelectValue placeholder="Auto-detect" />
|
||||
</SelectTrigger>
|
||||
<SelectContent className="bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700 max-h-60">
|
||||
{LANGUAGES.map((lang) => (
|
||||
<SelectItem key={lang.value} value={lang.value} className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
|
||||
{lang.label}
|
||||
</SelectItem>
|
||||
))}
|
||||
</SelectContent>
|
||||
</Select>
|
||||
</div>
|
||||
|
||||
{/* Temperature */}
|
||||
<div className="space-y-2">
|
||||
<div className="flex items-center gap-2">
|
||||
<Label htmlFor="openai_temperature" className="text-carbon-700 dark:text-carbon-300">
|
||||
Temperature
|
||||
</Label>
|
||||
<HoverCard>
|
||||
<HoverCardTrigger asChild>
|
||||
<Info className="h-4 w-4 text-carbon-400 cursor-help" />
|
||||
</HoverCardTrigger>
|
||||
<HoverCardContent className="w-80 bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
|
||||
<p className="text-sm text-carbon-700 dark:text-carbon-300">{PARAM_DESCRIPTIONS.temperature}</p>
|
||||
</HoverCardContent>
|
||||
</HoverCard>
|
||||
</div>
|
||||
<div className="flex items-center gap-4">
|
||||
<Slider
|
||||
value={[params.temperature]}
|
||||
onValueChange={(value) => updateParam('temperature', value[0])}
|
||||
max={1}
|
||||
step={0.1}
|
||||
className="flex-1"
|
||||
/>
|
||||
<span className="w-12 text-right text-sm text-carbon-600 dark:text-carbon-400">
|
||||
{params.temperature}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{/* Initial Prompt */}
|
||||
<div className="space-y-2">
|
||||
<div className="flex items-center gap-2">
|
||||
<Label htmlFor="openai_prompt" className="text-carbon-700 dark:text-carbon-300">
|
||||
Initial Prompt
|
||||
</Label>
|
||||
<HoverCard>
|
||||
<HoverCardTrigger asChild>
|
||||
<Info className="h-4 w-4 text-carbon-400 cursor-help" />
|
||||
</HoverCardTrigger>
|
||||
<HoverCardContent className="w-80 bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
|
||||
<p className="text-sm text-carbon-700 dark:text-carbon-300">{PARAM_DESCRIPTIONS.initial_prompt}</p>
|
||||
</HoverCardContent>
|
||||
</HoverCard>
|
||||
</div>
|
||||
<Textarea
|
||||
id="openai_prompt"
|
||||
value={params.initial_prompt || ""}
|
||||
onChange={(e) => updateParam('initial_prompt', e.target.value || undefined)}
|
||||
placeholder="Optional text to guide the model's style..."
|
||||
className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100 resize-none"
|
||||
rows={3}
|
||||
/>
|
||||
</div>
|
||||
</div>
|
||||
) : (
|
||||
<Tabs defaultValue="basic" className="w-full">
|
||||
<TabsList className={`grid w-full items-center h-auto bg-carbon-100 dark:bg-carbon-800 p-1 rounded-lg ${isMultiTrack ? 'grid-cols-3' : 'grid-cols-4'}`}>
|
||||
|
||||
Reference in New Issue
Block a user