fix database mismatch for word-level timestamps for multi-track audio

This commit is contained in:
rishikanthc
2025-09-13 09:22:19 -07:00
parent eebdc83906
commit 9da6a71ecb
7 changed files with 50 additions and 100 deletions

View File

@@ -574,7 +574,7 @@ func (c *CanaryAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn
logger.Info("Canary transcription completed",
"segments", len(result.Segments),
"words", len(result.Words),
"words", len(result.WordSegments),
"processing_time", result.ProcessingTime,
"task", c.GetStringParameter(params, "task"))
@@ -665,7 +665,7 @@ func (c *CanaryAdapter) parseResult(tempDir string, input interfaces.AudioInput,
Text: canaryResult.Transcription,
Language: resultLanguage,
Segments: make([]interfaces.TranscriptSegment, len(canaryResult.SegmentTimestamps)),
Words: make([]interfaces.TranscriptWord, len(canaryResult.WordTimestamps)),
WordSegments: make([]interfaces.TranscriptWord, len(canaryResult.WordTimestamps)),
Confidence: 0.0, // Default confidence
}
@@ -681,7 +681,7 @@ func (c *CanaryAdapter) parseResult(tempDir string, input interfaces.AudioInput,
// Convert words
for i, word := range canaryResult.WordTimestamps {
result.Words[i] = interfaces.TranscriptWord{
result.WordSegments[i] = interfaces.TranscriptWord{
Start: word.Start,
End: word.End,
Word: word.Word,

View File

@@ -512,7 +512,7 @@ func (p *ParakeetAdapter) Transcribe(ctx context.Context, input interfaces.Audio
logger.Info("Parakeet transcription completed",
"segments", len(result.Segments),
"words", len(result.Words),
"words", len(result.WordSegments),
"processing_time", result.ProcessingTime)
return result, nil
@@ -581,7 +581,7 @@ func (p *ParakeetAdapter) parseResult(tempDir string, input interfaces.AudioInpu
Text: parakeetResult.Transcription,
Language: parakeetResult.Language,
Segments: make([]interfaces.TranscriptSegment, len(parakeetResult.SegmentTimestamps)),
Words: make([]interfaces.TranscriptWord, len(parakeetResult.WordTimestamps)),
WordSegments: make([]interfaces.TranscriptWord, len(parakeetResult.WordTimestamps)),
Confidence: 0.0, // Default confidence
}
@@ -596,7 +596,7 @@ func (p *ParakeetAdapter) parseResult(tempDir string, input interfaces.AudioInpu
// Convert words
for i, word := range parakeetResult.WordTimestamps {
result.Words[i] = interfaces.TranscriptWord{
result.WordSegments[i] = interfaces.TranscriptWord{
Start: word.Start,
End: word.End,
Word: word.Word,

View File

@@ -428,7 +428,7 @@ func (w *WhisperXAdapter) Transcribe(ctx context.Context, input interfaces.Audio
logger.Info("WhisperX transcription completed",
"segments", len(result.Segments),
"words", len(result.Words),
"words", len(result.WordSegments),
"processing_time", result.ProcessingTime)
return result, nil
@@ -552,7 +552,7 @@ func (w *WhisperXAdapter) parseResult(outputDir string, input interfaces.AudioIn
result := &interfaces.TranscriptResult{
Language: whisperxResult.Language,
Segments: make([]interfaces.TranscriptSegment, len(whisperxResult.Segments)),
Words: make([]interfaces.TranscriptWord, len(whisperxResult.Word)),
WordSegments: make([]interfaces.TranscriptWord, len(whisperxResult.Word)),
Confidence: 0.0, // WhisperX doesn't provide overall confidence
}
@@ -570,7 +570,7 @@ func (w *WhisperXAdapter) parseResult(outputDir string, input interfaces.AudioIn
// Convert words
for i, word := range whisperxResult.Word {
result.Words[i] = interfaces.TranscriptWord{
result.WordSegments[i] = interfaces.TranscriptWord{
Start: word.Start,
End: word.End,
Word: word.Word,

View File

@@ -78,7 +78,7 @@ type TranscriptResult struct {
Text string `json:"text"`
Language string `json:"language"`
Segments []TranscriptSegment `json:"segments"`
Words []TranscriptWord `json:"words,omitempty"`
WordSegments []TranscriptWord `json:"word_segments,omitempty"`
Confidence float64 `json:"confidence"`
ProcessingTime time.Duration `json:"processing_time"`
ModelUsed string `json:"model_used"`

View File

@@ -331,7 +331,7 @@ func (mt *MultiTrackTranscriber) transcribeIndividualTrack(ctx context.Context,
logger.Info("Successfully transcribed track",
"track_name", trackFile.FileName,
"model_family", trackParams.ModelFamily,
"word_count", len(result.Words),
"word_count", len(result.WordSegments),
"segment_count", len(result.Segments))
return result, nil
@@ -434,10 +434,10 @@ func (mt *MultiTrackTranscriber) mergeTrackTranscripts(trackTranscripts []TrackT
logger.Info("Collecting words from track",
"speaker", speaker,
"offset", offset,
"word_count", len(trackTranscript.Result.Words))
"word_count", len(trackTranscript.Result.WordSegments))
// Collect words with offset adjustment and speaker assignment
for _, word := range trackTranscript.Result.Words {
for _, word := range trackTranscript.Result.WordSegments {
adjustedWord := interfaces.Word{
Start: word.Start + offset,
End: word.End + offset,
@@ -502,10 +502,10 @@ func (mt *MultiTrackTranscriber) mergeTrackTranscripts(trackTranscripts []TrackT
}
mergedResult := &interfaces.TranscriptResult{
Segments: speakerTurns,
Words: allWords,
Language: language,
Text: mergedText.String(),
Segments: speakerTurns,
WordSegments: allWords,
Language: language,
Text: mergedText.String(),
}
logger.Info("Sort-and-group merging completed successfully",
@@ -655,7 +655,7 @@ func (mt *MultiTrackTranscriber) logIndividualTranscript(fileName string, result
"offset", offset,
"language", result.Language,
"total_segments", len(result.Segments),
"total_words", len(result.Words))
"total_words", len(result.WordSegments))
// Log segment-level data
logger.Info("--- SEGMENTS (Original Timestamps) ---", "file", fileName)
@@ -687,7 +687,7 @@ func (mt *MultiTrackTranscriber) logIndividualTranscript(fileName string, result
// Log word-level data (original timestamps)
logger.Info("--- WORDS (Original Timestamps) ---", "file", fileName)
for i, word := range result.Words {
for i, word := range result.WordSegments {
logger.Debug("Word",
"file", fileName,
"index", i+1,
@@ -700,7 +700,7 @@ func (mt *MultiTrackTranscriber) logIndividualTranscript(fileName string, result
// Log word-level data with offset applied
logger.Info("--- WORDS (With Offset Applied) ---", "file", fileName, "offset", offset)
for i, word := range result.Words {
for i, word := range result.WordSegments {
adjustedStart := word.Start + offset
adjustedEnd := word.End + offset
logger.Info("Adjusted Word",

View File

@@ -702,12 +702,12 @@ func (u *UnifiedTranscriptionService) mergeDiarizationWithTranscription(transcri
}
// Also assign speakers to words if available
if len(transcript.Words) > 0 {
mergedTranscript.Words = make([]interfaces.TranscriptWord, len(transcript.Words))
copy(mergedTranscript.Words, transcript.Words)
for i := range mergedTranscript.Words {
word := &mergedTranscript.Words[i]
if len(transcript.WordSegments) > 0 {
mergedTranscript.WordSegments = make([]interfaces.TranscriptWord, len(transcript.WordSegments))
copy(mergedTranscript.WordSegments, transcript.WordSegments)
for i := range mergedTranscript.WordSegments {
word := &mergedTranscript.WordSegments[i]
bestSpeaker := u.findBestSpeakerForSegment(word.Start, word.End, diarization.Segments)
if bestSpeaker != "" {
word.Speaker = &bestSpeaker
@@ -757,81 +757,10 @@ func (u *UnifiedTranscriptionService) saveTranscriptionResults(jobID string, res
return nil
}
// convertTranscriptResultToJSON converts the interface result to the expected JSON format
// convertTranscriptResultToJSON converts the interface result to JSON format
func (u *UnifiedTranscriptionService) convertTranscriptResultToJSON(result *interfaces.TranscriptResult) (string, error) {
// Convert to the format expected by the existing database schema
legacyFormat := struct {
Segments []struct {
Start float64 `json:"start"`
End float64 `json:"end"`
Text string `json:"text"`
Speaker *string `json:"speaker,omitempty"`
} `json:"segments"`
Word []struct {
Start float64 `json:"start"`
End float64 `json:"end"`
Word string `json:"word"`
Score float64 `json:"score"`
Speaker *string `json:"speaker,omitempty"`
} `json:"word_segments,omitempty"`
Language string `json:"language"`
Text string `json:"text"`
}{
Language: result.Language,
Text: result.Text,
}
// Convert segments
legacyFormat.Segments = make([]struct {
Start float64 `json:"start"`
End float64 `json:"end"`
Text string `json:"text"`
Speaker *string `json:"speaker,omitempty"`
}, len(result.Segments))
for i, seg := range result.Segments {
legacyFormat.Segments[i] = struct {
Start float64 `json:"start"`
End float64 `json:"end"`
Text string `json:"text"`
Speaker *string `json:"speaker,omitempty"`
}{
Start: seg.Start,
End: seg.End,
Text: seg.Text,
Speaker: seg.Speaker,
}
}
// Convert words
if len(result.Words) > 0 {
legacyFormat.Word = make([]struct {
Start float64 `json:"start"`
End float64 `json:"end"`
Word string `json:"word"`
Score float64 `json:"score"`
Speaker *string `json:"speaker,omitempty"`
}, len(result.Words))
for i, word := range result.Words {
legacyFormat.Word[i] = struct {
Start float64 `json:"start"`
End float64 `json:"end"`
Word string `json:"word"`
Score float64 `json:"score"`
Speaker *string `json:"speaker,omitempty"`
}{
Start: word.Start,
End: word.End,
Word: word.Word,
Score: word.Score,
Speaker: word.Speaker,
}
}
}
// Convert to JSON string
jsonBytes, err := json.Marshal(legacyFormat)
// Now that the struct fields match the JSON field names, we can directly marshal
jsonBytes, err := json.Marshal(result)
if err != nil {
return "", err
}

View File

@@ -70,6 +70,27 @@
"description": "Upload a video file, extract audio from it using ffmpeg, and create a transcription job",
"tag": "transcription"
},
{
"method": "POST",
"path": "/api/v1/transcription/upload-multitrack",
"summary": "Upload multi-track audio files",
"description": "Upload multiple audio files with an .aup file for multi-track transcription",
"tag": "transcription"
},
{
"method": "GET",
"path": "/api/v1/transcription/{id}/merge-status",
"summary": "Get multi-track merge status",
"description": "Get the current merge status for a multi-track job",
"tag": "transcription"
},
{
"method": "GET",
"path": "/api/v1/transcription/{id}/track-progress",
"summary": "Get multi-track job progress",
"description": "Get real-time progress information for individual tracks in a multi-track job",
"tag": "transcription"
},
{
"method": "POST",
"path": "/api/v1/transcription/submit",