feat: implement OpenAI transcription support

This commit is contained in:
rishikanthc
2025-12-01 12:37:41 -08:00
committed by Rishikanth Chandrasekaran
parent b62a944988
commit f3266b31e5
7 changed files with 438 additions and 0 deletions

View File

@@ -215,6 +215,8 @@ func registerAdapters(cfg *config.Config) {
adapters.NewParakeetAdapter(nvidiaEnvPath))
registry.RegisterTranscriptionAdapter("canary",
adapters.NewCanaryAdapter(nvidiaEnvPath)) // Shares with Parakeet
registry.RegisterTranscriptionAdapter("openai_whisper",
adapters.NewOpenAIAdapter(cfg.OpenAIAPIKey))
// Register diarization adapters
registry.RegisterDiarizationAdapter("pyannote",

View File

@@ -33,6 +33,9 @@ type Config struct {
// Python/WhisperX configuration
UVPath string
WhisperXEnv string
// OpenAI configuration
OpenAIAPIKey string
}
// Load loads configuration from environment variables and .env file
@@ -51,6 +54,7 @@ func Load() *Config {
TranscriptsDir: getEnv("TRANSCRIPTS_DIR", "data/transcripts"),
UVPath: findUVPath(),
WhisperXEnv: getEnv("WHISPERX_ENV", "data/whisperx-env"),
OpenAIAPIKey: getEnv("OPENAI_API_KEY", ""),
}
}

View File

@@ -124,6 +124,9 @@ type WhisperXParams struct {
// Webhook settings
CallbackURL *string `json:"callback_url,omitempty" gorm:"type:text"`
// OpenAI settings
APIKey *string `json:"api_key,omitempty" gorm:"type:text"`
}
// BeforeCreate sets the ID if not already set

View File

@@ -0,0 +1,261 @@
package adapters
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"mime/multipart"
"net/http"
"os"
"path/filepath"
"time"
"scriberr/internal/transcription/interfaces"
)
// OpenAIAdapter implements the TranscriptionAdapter interface for OpenAI API.
// It is a cloud adapter: the only state it adds on top of the embedded
// BaseAdapter is the API key it was constructed with.
type OpenAIAdapter struct {
// Embedded base adapter supplying the shared adapter helpers.
*BaseAdapter
// apiKey is the key passed to NewOpenAIAdapter. Transcribe uses it unless the
// request parameters carry a non-empty "api_key" of their own.
apiKey string
}
// NewOpenAIAdapter builds the adapter for OpenAI's hosted transcription API.
//
// apiKey is the default key used for requests; it may be empty, in which case
// a key has to be supplied per request through the "api_key" parameter. The
// adapter is registered under the model ID "openai_whisper" with an empty
// model path, since nothing is stored locally.
func NewOpenAIAdapter(apiKey string) *OpenAIAdapter {
	// Bounds for the temperature parameter; the schema stores them by pointer.
	tempMin, tempMax := 0.0, 1.0

	languages := []string{
		"af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da", "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is", "id", "it", "ja", "kn", "kk", "ko", "lv", "lt", "mk", "ms", "mr", "mi", "ne", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", "sv", "tl", "ta", "th", "tr", "uk", "ur", "vi", "cy",
	}

	caps := interfaces.ModelCapabilities{
		ModelID:            "openai_whisper",
		ModelFamily:        "openai",
		DisplayName:        "OpenAI Whisper API",
		Description:        "Cloud-based transcription using OpenAI's Whisper model",
		Version:            "v1",
		SupportedLanguages: languages,
		SupportedFormats:   []string{"flac", "mp3", "mp4", "mpeg", "mpga", "m4a", "ogg", "wav", "webm"},
		RequiresGPU:        false,
		MemoryRequirement:  0, // nothing runs locally
		Features: map[string]bool{
			"timestamps":         true,  // segment timestamps come back in the verbose JSON response
			"word_level":         false, // would need the beta word-level timestamp_granularities option
			"diarization":        false, // not offered by this adapter
			"translation":        true,
			"language_detection": true,
			"vad":                true, // implicit
		},
		Metadata: map[string]string{
			"provider": "openai",
			"api_url":  "https://api.openai.com/v1/audio/transcriptions",
		},
	}

	paramSchema := []interfaces.ParameterSchema{
		{
			Name:        "api_key",
			Type:        "string",
			Required:    false, // the configured key is used when this is absent
			Description: "OpenAI API Key (overrides system default)",
			Group:       "authentication",
		},
		{
			Name:        "model",
			Type:        "string",
			Required:    false,
			Default:     "whisper-1",
			Options:     []string{"whisper-1"},
			Description: "ID of the model to use",
			Group:       "basic",
		},
		{
			Name:        "language",
			Type:        "string",
			Required:    false,
			Description: "Language of the input audio (ISO-639-1)",
			Group:       "basic",
		},
		{
			Name:        "prompt",
			Type:        "string",
			Required:    false,
			Description: "Optional text to guide the model's style or continue a previous audio segment",
			Group:       "advanced",
		},
		{
			Name:        "temperature",
			Type:        "float",
			Required:    false,
			Default:     0.0,
			Min:         &tempMin,
			Max:         &tempMax,
			Description: "Sampling temperature",
			Group:       "quality",
		},
	}

	return &OpenAIAdapter{
		BaseAdapter: NewBaseAdapter("openai_whisper", "", caps, paramSchema),
		apiKey:      apiKey,
	}
}
// GetSupportedModels returns the OpenAI model IDs advertised by this adapter.
// A fresh slice is returned on every call.
func (a *OpenAIAdapter) GetSupportedModels() []string {
	supported := make([]string, 0, 1)
	supported = append(supported, "whisper-1")
	return supported
}
// PrepareEnvironment has nothing to install for a cloud adapter; it only flags
// the adapter as initialized and always returns nil. ctx is not used.
func (a *OpenAIAdapter) PrepareEnvironment(ctx context.Context) error {
	// initialized is promoted from the embedded BaseAdapter.
	a.BaseAdapter.initialized = true
	return nil
}
// Transcribe uploads the audio file at input.FilePath to the OpenAI
// transcription endpoint and converts the verbose_json response into a
// TranscriptResult carrying segment-level timestamps.
//
// Recognized params:
//   - "api_key": overrides the adapter's configured key when it is a non-empty
//     string. An error is returned if neither key is available.
//   - "model": model ID; defaults to "whisper-1" when empty.
//   - "language", "prompt": forwarded only when non-empty.
//   - "temperature": always forwarded, formatted with two decimals.
//
// The request is bound to ctx and additionally capped by a 10 minute client
// timeout. Any non-200 response is returned as an error that includes the
// (size-limited) response body. Word-level timestamps are not requested, so
// the result's word segments are left empty.
func (a *OpenAIAdapter) Transcribe(ctx context.Context, input interfaces.AudioInput, params map[string]interface{}, procCtx interfaces.ProcessingContext) (result *interfaces.TranscriptResult, err error) {
	const (
		endpoint = "https://api.openai.com/v1/audio/transcriptions"
		// Upper bound on how much of an error response is kept for the message.
		maxErrorBody = 64 << 10
	)

	startTime := time.Now()
	a.LogProcessingStart(input, procCtx)
	defer func() {
		// err is the named result, so the log entry reflects the real outcome
		// instead of unconditionally reporting success.
		a.LogProcessingEnd(procCtx, time.Since(startTime), err)
	}()

	// Validate input
	if err := a.ValidateAudioInput(input); err != nil {
		return nil, fmt.Errorf("invalid audio input: %w", err)
	}

	// Resolve the API key: a per-request key wins over the configured one.
	apiKey := a.apiKey
	if key, ok := params["api_key"].(string); ok && key != "" {
		apiKey = key
	}
	if apiKey == "" {
		return nil, fmt.Errorf("OpenAI API key is required but not provided")
	}

	// Collect the text fields in the order they are written to the form.
	model := a.GetStringParameter(params, "model")
	if model == "" {
		model = "whisper-1"
	}
	fields := [][2]string{
		{"model", model},
		{"response_format", "verbose_json"},
		{"timestamp_granularities[]", "segment"}, // request segment timestamps
	}
	if lang := a.GetStringParameter(params, "language"); lang != "" {
		fields = append(fields, [2]string{"language", lang})
	}
	if prompt := a.GetStringParameter(params, "prompt"); prompt != "" {
		fields = append(fields, [2]string{"prompt", prompt})
	}
	temp := a.GetFloatParameter(params, "temperature")
	fields = append(fields, [2]string{"temperature", fmt.Sprintf("%.2f", temp)})

	body, contentType, err := openAIMultipartBody(input.FilePath, fields)
	if err != nil {
		return nil, err
	}

	// Create request
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, body)
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", contentType)
	req.Header.Set("Authorization", "Bearer "+apiKey)

	// Execute request
	client := &http.Client{
		Timeout: 10 * time.Minute, // generous timeout for large files
	}
	resp, err := client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("request failed: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body only enriches the message, so a read error is
		// deliberately ignored and the read is bounded.
		respBody, _ := io.ReadAll(io.LimitReader(resp.Body, maxErrorBody))
		return nil, fmt.Errorf("OpenAI API error (status %d): %s", resp.StatusCode, string(respBody))
	}

	// Parse response
	var openAIResponse struct {
		Task     string  `json:"task"`
		Language string  `json:"language"`
		Duration float64 `json:"duration"`
		Text     string  `json:"text"`
		Segments []struct {
			ID               int     `json:"id"`
			Seek             int     `json:"seek"`
			Start            float64 `json:"start"`
			End              float64 `json:"end"`
			Text             string  `json:"text"`
			Tokens           []int   `json:"tokens"`
			Temperature      float64 `json:"temperature"`
			AvgLogprob       float64 `json:"avg_logprob"`
			CompressionRatio float64 `json:"compression_ratio"`
			NoSpeechProb     float64 `json:"no_speech_prob"`
		} `json:"segments"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&openAIResponse); err != nil {
		return nil, fmt.Errorf("failed to decode response: %w", err)
	}

	// Convert segments one-to-one; only start, end and text are carried over.
	segments := make([]interfaces.TranscriptSegment, len(openAIResponse.Segments))
	for i, seg := range openAIResponse.Segments {
		segments[i] = interfaces.TranscriptSegment{
			Start: seg.Start,
			End:   seg.End,
			Text:  seg.Text,
		}
	}

	// NOTE(review): CreateDefaultMetadata is defined outside this view. In case
	// it records the parameters it is given, hand it a copy without the API key
	// so the secret cannot end up in stored result metadata.
	metaParams := make(map[string]interface{}, len(params))
	for k, v := range params {
		if k != "api_key" {
			metaParams[k] = v
		}
	}

	return &interfaces.TranscriptResult{
		Language:       openAIResponse.Language,
		Text:           openAIResponse.Text,
		Segments:       segments,
		ProcessingTime: time.Since(startTime),
		ModelUsed:      model,
		Metadata:       a.CreateDefaultMetadata(metaParams),
	}, nil
}

// openAIMultipartBody builds the multipart/form-data payload for a
// transcription request: the file at filePath under the "file" field, followed
// by the given name/value text fields in order. It returns the encoded body and
// the matching Content-Type header value. The file is closed before returning.
func openAIMultipartBody(filePath string, fields [][2]string) (*bytes.Buffer, string, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, "", fmt.Errorf("failed to open audio file: %w", err)
	}
	defer file.Close()

	body := &bytes.Buffer{}
	writer := multipart.NewWriter(body)

	part, err := writer.CreateFormFile("file", filepath.Base(filePath))
	if err != nil {
		return nil, "", fmt.Errorf("failed to create form file: %w", err)
	}
	if _, err := io.Copy(part, file); err != nil {
		return nil, "", fmt.Errorf("failed to copy file content: %w", err)
	}

	for _, field := range fields {
		if err := writer.WriteField(field[0], field[1]); err != nil {
			return nil, "", fmt.Errorf("failed to write form field %q: %w", field[0], err)
		}
	}

	// Close writes the trailing boundary; the body is incomplete without it.
	if err := writer.Close(); err != nil {
		return nil, "", fmt.Errorf("failed to close multipart writer: %w", err)
	}
	return body, writer.FormDataContentType(), nil
}
// GetEstimatedProcessingTime estimates how long a cloud transcription will
// take: 15% of the audio duration, or a flat 30 seconds when the duration is
// unknown (zero).
func (a *OpenAIAdapter) GetEstimatedProcessingTime(input interfaces.AudioInput) time.Duration {
	const (
		fallbackEstimate = 30 * time.Second
		durationFraction = 0.15
	)

	if input.Duration != 0 {
		return time.Duration(float64(input.Duration) * durationFraction)
	}
	return fallbackEstimate
}

View File

@@ -337,6 +337,8 @@ func (u *UnifiedTranscriptionService) selectModels(params models.WhisperXParams)
transcriptionModelID = "canary"
case "whisper":
transcriptionModelID = "whisperx"
case "openai":
transcriptionModelID = "openai_whisper"
default:
transcriptionModelID = "whisperx" // Default fallback
}
@@ -510,12 +512,36 @@ func (u *UnifiedTranscriptionService) convertParametersForModel(params models.Wh
return u.convertToPyannoteParams(params)
case "sortformer":
return u.convertToSortformerParams(params)
case "openai_whisper":
return u.convertToOpenAIParams(params)
default:
// Fallback to legacy conversion
return u.parametersToMap(params)
}
}
// convertToOpenAIParams maps the generic transcription parameters onto the
// keys the OpenAI adapter reads. "model" and "temperature" are always set;
// "language" and "prompt" are set when the corresponding pointer is non-nil;
// "api_key" is set only when a non-empty key was supplied (e.g. a UI override).
func (u *UnifiedTranscriptionService) convertToOpenAIParams(params models.WhisperXParams) map[string]interface{} {
	converted := make(map[string]interface{}, 5)
	converted["model"] = params.Model
	converted["temperature"] = params.Temperature

	if lang := params.Language; lang != nil {
		converted["language"] = *lang
	}
	if prompt := params.InitialPrompt; prompt != nil {
		converted["prompt"] = *prompt
	}
	if key := params.APIKey; key != nil {
		if *key != "" {
			converted["api_key"] = *key
		}
	}

	return converted
}
// convertToParakeetParams converts to Parakeet-specific parameters
func (u *UnifiedTranscriptionService) convertToParakeetParams(params models.WhisperXParams) map[string]interface{} {
return map[string]interface{}{

BIN
server Executable file

Binary file not shown.

View File

@@ -105,6 +105,9 @@ export interface WhisperXParams {
// Multi-track transcription settings
is_multi_track_enabled: boolean;
// OpenAI settings
api_key?: string;
}
// Parameter descriptions for hover cards
@@ -202,6 +205,7 @@ const DEFAULT_PARAMS: WhisperXParams = {
attention_context_left: 256,
attention_context_right: 256,
is_multi_track_enabled: false,
api_key: "",
};
const WHISPER_MODELS = [
@@ -469,6 +473,9 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
<SelectItem value="nvidia_canary" className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
NVIDIA Canary
</SelectItem>
<SelectItem value="openai" className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
OpenAI API
</SelectItem>
</SelectContent>
</Select>
</div>
@@ -933,6 +940,141 @@ export const TranscriptionConfigDialog = memo(function TranscriptionConfigDialog
</div>
)}
</div>
) : params.model_family === "openai" ? (
<div className="space-y-6">
<div className="p-4 border border-blue-200 dark:border-blue-700 rounded-lg bg-blue-50 dark:bg-blue-900/20">
<div className="flex items-center gap-2">
<Info className="h-4 w-4 text-blue-600 dark:text-blue-400" />
<span className="text-sm font-medium text-blue-800 dark:text-blue-200">Cloud Transcription</span>
</div>
<p className="text-sm text-blue-700 dark:text-blue-300 mt-1">
Audio will be sent to OpenAI servers for processing.
</p>
</div>
{/* API Key */}
<div className="space-y-2">
<div className="flex items-center gap-2">
<Label htmlFor="openai_api_key" className="text-carbon-700 dark:text-carbon-300 font-medium">
OpenAI API Key
</Label>
<HoverCard>
<HoverCardTrigger asChild>
<Info className="h-4 w-4 text-carbon-400 cursor-help" />
</HoverCardTrigger>
<HoverCardContent className="w-80 bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
<p className="text-sm text-carbon-700 dark:text-carbon-300">
Your OpenAI API key. If not provided, the server-configured key will be used (if any).
</p>
</HoverCardContent>
</HoverCard>
</div>
<Input
id="openai_api_key"
type="password"
value={params.api_key || ""}
onChange={(e) => updateParam('api_key', e.target.value)}
placeholder="sk-..."
className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100"
/>
</div>
{/* Model Selection */}
<div className="space-y-2">
<Label htmlFor="openai_model" className="text-carbon-700 dark:text-carbon-300 font-medium">
Model
</Label>
<Select
value={params.model || "whisper-1"}
onValueChange={(value) => updateParam('model', value)}
>
<SelectTrigger className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100">
<SelectValue />
</SelectTrigger>
<SelectContent className="bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
<SelectItem value="whisper-1" className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
whisper-1
</SelectItem>
</SelectContent>
</Select>
</div>
{/* Language Selection */}
<div className="space-y-2">
<Label htmlFor="openai_language" className="text-carbon-700 dark:text-carbon-300 font-medium">
Language
</Label>
<Select
value={params.language || "auto"}
onValueChange={(value) => updateParam('language', value === "auto" ? undefined : value)}
>
<SelectTrigger className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100">
<SelectValue placeholder="Auto-detect" />
</SelectTrigger>
<SelectContent className="bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700 max-h-60">
{LANGUAGES.map((lang) => (
<SelectItem key={lang.value} value={lang.value} className="text-carbon-900 dark:text-carbon-100 focus:bg-carbon-100 dark:focus:bg-carbon-700">
{lang.label}
</SelectItem>
))}
</SelectContent>
</Select>
</div>
{/* Temperature */}
<div className="space-y-2">
<div className="flex items-center gap-2">
<Label htmlFor="openai_temperature" className="text-carbon-700 dark:text-carbon-300">
Temperature
</Label>
<HoverCard>
<HoverCardTrigger asChild>
<Info className="h-4 w-4 text-carbon-400 cursor-help" />
</HoverCardTrigger>
<HoverCardContent className="w-80 bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
<p className="text-sm text-carbon-700 dark:text-carbon-300">{PARAM_DESCRIPTIONS.temperature}</p>
</HoverCardContent>
</HoverCard>
</div>
<div className="flex items-center gap-4">
<Slider
value={[params.temperature]}
onValueChange={(value) => updateParam('temperature', value[0])}
max={1}
step={0.1}
className="flex-1"
/>
<span className="w-12 text-right text-sm text-carbon-600 dark:text-carbon-400">
{params.temperature}
</span>
</div>
</div>
{/* Initial Prompt */}
<div className="space-y-2">
<div className="flex items-center gap-2">
<Label htmlFor="openai_prompt" className="text-carbon-700 dark:text-carbon-300">
Initial Prompt
</Label>
<HoverCard>
<HoverCardTrigger asChild>
<Info className="h-4 w-4 text-carbon-400 cursor-help" />
</HoverCardTrigger>
<HoverCardContent className="w-80 bg-white dark:bg-carbon-800 border-carbon-200 dark:border-carbon-700">
<p className="text-sm text-carbon-700 dark:text-carbon-300">{PARAM_DESCRIPTIONS.initial_prompt}</p>
</HoverCardContent>
</HoverCard>
</div>
<Textarea
id="openai_prompt"
value={params.initial_prompt || ""}
onChange={(e) => updateParam('initial_prompt', e.target.value || undefined)}
placeholder="Optional text to guide the model's style..."
className="bg-white dark:bg-carbon-800 border-carbon-300 dark:border-carbon-600 text-carbon-900 dark:text-carbon-100 resize-none"
rows={3}
/>
</div>
</div>
) : (
<Tabs defaultValue="basic" className="w-full">
<TabsList className={`grid w-full items-center h-auto bg-carbon-100 dark:bg-carbon-800 p-1 rounded-lg ${isMultiTrack ? 'grid-cols-3' : 'grid-cols-4'}`}>