mirror of
https://github.com/rishikanthc/Scriberr.git
synced 2026-07-01 08:15:46 +00:00
fix database mismatch for word-level timestamps for multi-track audio
This commit is contained in:
@@ -574,7 +574,7 @@ func (c *CanaryAdapter) Transcribe(ctx context.Context, input interfaces.AudioIn
|
||||
|
||||
logger.Info("Canary transcription completed",
|
||||
"segments", len(result.Segments),
|
||||
"words", len(result.Words),
|
||||
"words", len(result.WordSegments),
|
||||
"processing_time", result.ProcessingTime,
|
||||
"task", c.GetStringParameter(params, "task"))
|
||||
|
||||
@@ -665,7 +665,7 @@ func (c *CanaryAdapter) parseResult(tempDir string, input interfaces.AudioInput,
|
||||
Text: canaryResult.Transcription,
|
||||
Language: resultLanguage,
|
||||
Segments: make([]interfaces.TranscriptSegment, len(canaryResult.SegmentTimestamps)),
|
||||
Words: make([]interfaces.TranscriptWord, len(canaryResult.WordTimestamps)),
|
||||
WordSegments: make([]interfaces.TranscriptWord, len(canaryResult.WordTimestamps)),
|
||||
Confidence: 0.0, // Default confidence
|
||||
}
|
||||
|
||||
@@ -681,7 +681,7 @@ func (c *CanaryAdapter) parseResult(tempDir string, input interfaces.AudioInput,
|
||||
|
||||
// Convert words
|
||||
for i, word := range canaryResult.WordTimestamps {
|
||||
result.Words[i] = interfaces.TranscriptWord{
|
||||
result.WordSegments[i] = interfaces.TranscriptWord{
|
||||
Start: word.Start,
|
||||
End: word.End,
|
||||
Word: word.Word,
|
||||
|
||||
@@ -512,7 +512,7 @@ func (p *ParakeetAdapter) Transcribe(ctx context.Context, input interfaces.Audio
|
||||
|
||||
logger.Info("Parakeet transcription completed",
|
||||
"segments", len(result.Segments),
|
||||
"words", len(result.Words),
|
||||
"words", len(result.WordSegments),
|
||||
"processing_time", result.ProcessingTime)
|
||||
|
||||
return result, nil
|
||||
@@ -581,7 +581,7 @@ func (p *ParakeetAdapter) parseResult(tempDir string, input interfaces.AudioInpu
|
||||
Text: parakeetResult.Transcription,
|
||||
Language: parakeetResult.Language,
|
||||
Segments: make([]interfaces.TranscriptSegment, len(parakeetResult.SegmentTimestamps)),
|
||||
Words: make([]interfaces.TranscriptWord, len(parakeetResult.WordTimestamps)),
|
||||
WordSegments: make([]interfaces.TranscriptWord, len(parakeetResult.WordTimestamps)),
|
||||
Confidence: 0.0, // Default confidence
|
||||
}
|
||||
|
||||
@@ -596,7 +596,7 @@ func (p *ParakeetAdapter) parseResult(tempDir string, input interfaces.AudioInpu
|
||||
|
||||
// Convert words
|
||||
for i, word := range parakeetResult.WordTimestamps {
|
||||
result.Words[i] = interfaces.TranscriptWord{
|
||||
result.WordSegments[i] = interfaces.TranscriptWord{
|
||||
Start: word.Start,
|
||||
End: word.End,
|
||||
Word: word.Word,
|
||||
|
||||
@@ -428,7 +428,7 @@ func (w *WhisperXAdapter) Transcribe(ctx context.Context, input interfaces.Audio
|
||||
|
||||
logger.Info("WhisperX transcription completed",
|
||||
"segments", len(result.Segments),
|
||||
"words", len(result.Words),
|
||||
"words", len(result.WordSegments),
|
||||
"processing_time", result.ProcessingTime)
|
||||
|
||||
return result, nil
|
||||
@@ -552,7 +552,7 @@ func (w *WhisperXAdapter) parseResult(outputDir string, input interfaces.AudioIn
|
||||
result := &interfaces.TranscriptResult{
|
||||
Language: whisperxResult.Language,
|
||||
Segments: make([]interfaces.TranscriptSegment, len(whisperxResult.Segments)),
|
||||
Words: make([]interfaces.TranscriptWord, len(whisperxResult.Word)),
|
||||
WordSegments: make([]interfaces.TranscriptWord, len(whisperxResult.Word)),
|
||||
Confidence: 0.0, // WhisperX doesn't provide overall confidence
|
||||
}
|
||||
|
||||
@@ -570,7 +570,7 @@ func (w *WhisperXAdapter) parseResult(outputDir string, input interfaces.AudioIn
|
||||
|
||||
// Convert words
|
||||
for i, word := range whisperxResult.Word {
|
||||
result.Words[i] = interfaces.TranscriptWord{
|
||||
result.WordSegments[i] = interfaces.TranscriptWord{
|
||||
Start: word.Start,
|
||||
End: word.End,
|
||||
Word: word.Word,
|
||||
|
||||
@@ -78,7 +78,7 @@ type TranscriptResult struct {
|
||||
Text string `json:"text"`
|
||||
Language string `json:"language"`
|
||||
Segments []TranscriptSegment `json:"segments"`
|
||||
Words []TranscriptWord `json:"words,omitempty"`
|
||||
WordSegments []TranscriptWord `json:"word_segments,omitempty"`
|
||||
Confidence float64 `json:"confidence"`
|
||||
ProcessingTime time.Duration `json:"processing_time"`
|
||||
ModelUsed string `json:"model_used"`
|
||||
|
||||
@@ -331,7 +331,7 @@ func (mt *MultiTrackTranscriber) transcribeIndividualTrack(ctx context.Context,
|
||||
logger.Info("Successfully transcribed track",
|
||||
"track_name", trackFile.FileName,
|
||||
"model_family", trackParams.ModelFamily,
|
||||
"word_count", len(result.Words),
|
||||
"word_count", len(result.WordSegments),
|
||||
"segment_count", len(result.Segments))
|
||||
|
||||
return result, nil
|
||||
@@ -434,10 +434,10 @@ func (mt *MultiTrackTranscriber) mergeTrackTranscripts(trackTranscripts []TrackT
|
||||
logger.Info("Collecting words from track",
|
||||
"speaker", speaker,
|
||||
"offset", offset,
|
||||
"word_count", len(trackTranscript.Result.Words))
|
||||
"word_count", len(trackTranscript.Result.WordSegments))
|
||||
|
||||
// Collect words with offset adjustment and speaker assignment
|
||||
for _, word := range trackTranscript.Result.Words {
|
||||
for _, word := range trackTranscript.Result.WordSegments {
|
||||
adjustedWord := interfaces.Word{
|
||||
Start: word.Start + offset,
|
||||
End: word.End + offset,
|
||||
@@ -502,10 +502,10 @@ func (mt *MultiTrackTranscriber) mergeTrackTranscripts(trackTranscripts []TrackT
|
||||
}
|
||||
|
||||
mergedResult := &interfaces.TranscriptResult{
|
||||
Segments: speakerTurns,
|
||||
Words: allWords,
|
||||
Language: language,
|
||||
Text: mergedText.String(),
|
||||
Segments: speakerTurns,
|
||||
WordSegments: allWords,
|
||||
Language: language,
|
||||
Text: mergedText.String(),
|
||||
}
|
||||
|
||||
logger.Info("Sort-and-group merging completed successfully",
|
||||
@@ -655,7 +655,7 @@ func (mt *MultiTrackTranscriber) logIndividualTranscript(fileName string, result
|
||||
"offset", offset,
|
||||
"language", result.Language,
|
||||
"total_segments", len(result.Segments),
|
||||
"total_words", len(result.Words))
|
||||
"total_words", len(result.WordSegments))
|
||||
|
||||
// Log segment-level data
|
||||
logger.Info("--- SEGMENTS (Original Timestamps) ---", "file", fileName)
|
||||
@@ -687,7 +687,7 @@ func (mt *MultiTrackTranscriber) logIndividualTranscript(fileName string, result
|
||||
|
||||
// Log word-level data (original timestamps)
|
||||
logger.Info("--- WORDS (Original Timestamps) ---", "file", fileName)
|
||||
for i, word := range result.Words {
|
||||
for i, word := range result.WordSegments {
|
||||
logger.Debug("Word",
|
||||
"file", fileName,
|
||||
"index", i+1,
|
||||
@@ -700,7 +700,7 @@ func (mt *MultiTrackTranscriber) logIndividualTranscript(fileName string, result
|
||||
|
||||
// Log word-level data with offset applied
|
||||
logger.Info("--- WORDS (With Offset Applied) ---", "file", fileName, "offset", offset)
|
||||
for i, word := range result.Words {
|
||||
for i, word := range result.WordSegments {
|
||||
adjustedStart := word.Start + offset
|
||||
adjustedEnd := word.End + offset
|
||||
logger.Info("Adjusted Word",
|
||||
|
||||
@@ -702,12 +702,12 @@ func (u *UnifiedTranscriptionService) mergeDiarizationWithTranscription(transcri
|
||||
}
|
||||
|
||||
// Also assign speakers to words if available
|
||||
if len(transcript.Words) > 0 {
|
||||
mergedTranscript.Words = make([]interfaces.TranscriptWord, len(transcript.Words))
|
||||
copy(mergedTranscript.Words, transcript.Words)
|
||||
|
||||
for i := range mergedTranscript.Words {
|
||||
word := &mergedTranscript.Words[i]
|
||||
if len(transcript.WordSegments) > 0 {
|
||||
mergedTranscript.WordSegments = make([]interfaces.TranscriptWord, len(transcript.WordSegments))
|
||||
copy(mergedTranscript.WordSegments, transcript.WordSegments)
|
||||
|
||||
for i := range mergedTranscript.WordSegments {
|
||||
word := &mergedTranscript.WordSegments[i]
|
||||
bestSpeaker := u.findBestSpeakerForSegment(word.Start, word.End, diarization.Segments)
|
||||
if bestSpeaker != "" {
|
||||
word.Speaker = &bestSpeaker
|
||||
@@ -757,81 +757,10 @@ func (u *UnifiedTranscriptionService) saveTranscriptionResults(jobID string, res
|
||||
return nil
|
||||
}
|
||||
|
||||
// convertTranscriptResultToJSON converts the interface result to the expected JSON format
|
||||
// convertTranscriptResultToJSON converts the interface result to JSON format
|
||||
func (u *UnifiedTranscriptionService) convertTranscriptResultToJSON(result *interfaces.TranscriptResult) (string, error) {
|
||||
// Convert to the format expected by the existing database schema
|
||||
legacyFormat := struct {
|
||||
Segments []struct {
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Text string `json:"text"`
|
||||
Speaker *string `json:"speaker,omitempty"`
|
||||
} `json:"segments"`
|
||||
Word []struct {
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Word string `json:"word"`
|
||||
Score float64 `json:"score"`
|
||||
Speaker *string `json:"speaker,omitempty"`
|
||||
} `json:"word_segments,omitempty"`
|
||||
Language string `json:"language"`
|
||||
Text string `json:"text"`
|
||||
}{
|
||||
Language: result.Language,
|
||||
Text: result.Text,
|
||||
}
|
||||
|
||||
// Convert segments
|
||||
legacyFormat.Segments = make([]struct {
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Text string `json:"text"`
|
||||
Speaker *string `json:"speaker,omitempty"`
|
||||
}, len(result.Segments))
|
||||
|
||||
for i, seg := range result.Segments {
|
||||
legacyFormat.Segments[i] = struct {
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Text string `json:"text"`
|
||||
Speaker *string `json:"speaker,omitempty"`
|
||||
}{
|
||||
Start: seg.Start,
|
||||
End: seg.End,
|
||||
Text: seg.Text,
|
||||
Speaker: seg.Speaker,
|
||||
}
|
||||
}
|
||||
|
||||
// Convert words
|
||||
if len(result.Words) > 0 {
|
||||
legacyFormat.Word = make([]struct {
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Word string `json:"word"`
|
||||
Score float64 `json:"score"`
|
||||
Speaker *string `json:"speaker,omitempty"`
|
||||
}, len(result.Words))
|
||||
|
||||
for i, word := range result.Words {
|
||||
legacyFormat.Word[i] = struct {
|
||||
Start float64 `json:"start"`
|
||||
End float64 `json:"end"`
|
||||
Word string `json:"word"`
|
||||
Score float64 `json:"score"`
|
||||
Speaker *string `json:"speaker,omitempty"`
|
||||
}{
|
||||
Start: word.Start,
|
||||
End: word.End,
|
||||
Word: word.Word,
|
||||
Score: word.Score,
|
||||
Speaker: word.Speaker,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Convert to JSON string
|
||||
jsonBytes, err := json.Marshal(legacyFormat)
|
||||
// Now that the struct fields match the JSON field names, we can directly marshal
|
||||
jsonBytes, err := json.Marshal(result)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
@@ -70,6 +70,27 @@
|
||||
"description": "Upload a video file, extract audio from it using ffmpeg, and create a transcription job",
|
||||
"tag": "transcription"
|
||||
},
|
||||
{
|
||||
"method": "POST",
|
||||
"path": "/api/v1/transcription/upload-multitrack",
|
||||
"summary": "Upload multi-track audio files",
|
||||
"description": "Upload multiple audio files with an .aup file for multi-track transcription",
|
||||
"tag": "transcription"
|
||||
},
|
||||
{
|
||||
"method": "GET",
|
||||
"path": "/api/v1/transcription/{id}/merge-status",
|
||||
"summary": "Get multi-track merge status",
|
||||
"description": "Get the current merge status for a multi-track job",
|
||||
"tag": "transcription"
|
||||
},
|
||||
{
|
||||
"method": "GET",
|
||||
"path": "/api/v1/transcription/{id}/track-progress",
|
||||
"summary": "Get multi-track job progress",
|
||||
"description": "Get real-time progress information for individual tracks in a multi-track job",
|
||||
"tag": "transcription"
|
||||
},
|
||||
{
|
||||
"method": "POST",
|
||||
"path": "/api/v1/transcription/submit",
|
||||
|
||||
Reference in New Issue
Block a user