Add test suite for python adapter scripts

2026-06-28 14:55:46 +00:00 · 2025-12-26 17:12:54 -08:00
parent 50dd4130ff
commit 7471a2a1b6
7 changed files with 412 additions and 0 deletions
--- a/internal/transcription/adapters/py/README.md
+++ b/internal/transcription/adapters/py/README.md
@@ -0,0 +1,30 @@
+# Python Adapters Testing
+
+This directory contains the Python adapter scripts for various transcription and diarization models used by Scriberr.
+
+## Running Tests
+
+The tests are located in the `tests/` subdirectory of each adapter folder (e.g., `nvidia/tests/`, `pyannote/tests/`). These tests verify that the Python scripts can be executed and produce the expected output.
+
+To run the tests, you need `uv` installed and the `parakeet` environment set up (which serves as a shared environment for the NVIDIA adapter tests). The PyAnnote tests use the separate `pyannote` environment.
+
+### Prerequisites
+
+1.  Ensure you have `uv` installed.
+2.  Ensure the `parakeet` and `pyannote` environments are set up within `data/whisperx-env/`. This is typically handled by the application startup.
+3.  Ensure you have the test data available (e.g., `tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav`).
+
+### Running Tests with pytest
+
+```bash
+# Run all NVIDIA adapter tests
+uv run --with pytest --project data/whisperx-env/parakeet pytest internal/transcription/adapters/py/nvidia/tests
+
+# Run PyAnnote adapter tests
+uv run --with pytest --project data/whisperx-env/pyannote pytest internal/transcription/adapters/py/pyannote/tests
+```
+
+### Troubleshooting
+
+*   **Audio file not found**: Ensure `tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav` exists.
+*   **Environment not found**: Ensure `data/whisperx-env/parakeet` and `data/whisperx-env/pyannote` exist and are valid virtual environments. This may not be the case if Scriberr hasn't been run yet.
--- a/internal/transcription/adapters/py/nvidia/tests/test_canary_transcribe.py
+++ b/internal/transcription/adapters/py/nvidia/tests/test_canary_transcribe.py
@@ -0,0 +1,83 @@
+"""Tests for canary_transcribe.py"""
+import json
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+
+import pytest
+
+# Paths, resolved so they agree with project_root computed inside the test
+SCRIPT_DIR = Path(__file__).resolve().parent.parent
+TEST_DATA_DIR = Path(__file__).resolve().parents[6] / "tests/data"
+AUDIO_FILE = TEST_DATA_DIR / "AMI-Corpus-IB4002.Mix-Headset-clip.wav"
+
+def test_canary_transcription_output():
+    """Run canary_transcribe.py on the sample clip via `uv run` and check the JSON it writes."""
+
+    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"
+
+    # Locate project root and paths
+    project_root = Path(__file__).resolve().parents[6]
+    env_path = project_root / "data/whisperx-env/parakeet" # Canary uses the same env as Parakeet in this setup
+    script_path = SCRIPT_DIR / "canary_transcribe.py"
+
+    assert env_path.exists(), f"Environment not found at: {env_path}"
+
+    # Reserve a temp path for the script's output; delete=False so it outlives the with-block
+    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
+        output_file = tmp_file.name
+
+    try:
+        cmd = [
+            "uv", "run",
+            "--project", str(env_path),
+            "python", str(script_path),
+            str(AUDIO_FILE),
+            "--timestamps",
+            "--output", output_file
+        ]
+
+        print(f"Running command: {' '.join(cmd)}")
+
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            cwd=project_root
+        )
+
+        if result.returncode != 0:
+            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")
+
+        # Verify output file exists and is valid JSON
+        assert os.path.exists(output_file), "Output file was not created"
+
+        with open(output_file, 'r') as f:
+            data = json.load(f)
+
+        # Assertions based on the provided sample output
+        assert data["source_language"] == "en"
+        assert data["target_language"] == "en"
+        assert data["task"] == "transcribe"
+        assert data["model"] == "canary-1b-v2"
+
+        # Canary output text check
+        assert "Most of us" in data["transcription"]
+        assert "desktop computers" in data["transcription"]
+
+        assert "word_timestamps" in data
+        assert len(data["word_timestamps"]) > 0
+        # Check first word
+        first_word = data["word_timestamps"][0]
+        assert first_word["word"] == "Most"
+        # Allow for small floating point differences
+        assert abs(first_word["start"] - 0.0) < 0.1
+
+        assert "segment_timestamps" in data
+        assert len(data["segment_timestamps"]) > 0
+
+    finally:
+        # Cleanup: the temp file was created with delete=False, so remove it ourselves
+        if os.path.exists(output_file):
+            os.remove(output_file)
--- a/internal/transcription/adapters/py/nvidia/tests/test_parakeet_transcribe.py
+++ b/internal/transcription/adapters/py/nvidia/tests/test_parakeet_transcribe.py
@@ -0,0 +1,89 @@
+"""Tests for parakeet_transcribe.py"""
+import json
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+
+import pytest
+
+# Paths, resolved so they agree with project_root computed inside the test
+SCRIPT_DIR = Path(__file__).resolve().parent.parent
+TEST_DATA_DIR = Path(__file__).resolve().parents[6] / "tests/data"
+AUDIO_FILE = TEST_DATA_DIR / "AMI-Corpus-IB4002.Mix-Headset-clip.wav"
+
+def test_parakeet_transcription_output():
+    """Run parakeet_transcribe.py on the sample clip via `uv run` and check the JSON it writes."""
+
+    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"
+
+    # Construct command
+    # uv run --project data/whisperx-env/parakeet python internal/transcription/adapters/py/nvidia/parakeet_transcribe.py ...
+
+    # Locate project root (Scriberr directory)
+    # This file is in internal/transcription/adapters/py/nvidia/tests/
+    project_root = Path(__file__).resolve().parents[6]
+    env_path = project_root / "data/whisperx-env/parakeet"
+    script_path = SCRIPT_DIR / "parakeet_transcribe.py"
+
+    assert env_path.exists(), f"Environment not found at: {env_path}"
+
+    # Reserve a temp path for the script's output; delete=False so it outlives the with-block
+    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
+        output_file = tmp_file.name
+
+    try:
+        cmd = [
+            "uv", "run",
+            "--project", str(env_path),
+            "python", str(script_path),
+            str(AUDIO_FILE),
+            "--timestamps",
+            "--context-left", "256",
+            "--context-right", "256",
+            "--output", output_file
+        ]
+
+        print(f"Running command: {' '.join(cmd)}")
+
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            cwd=project_root # Run from project root to ensure paths are correct if relative
+        )
+
+        if result.returncode != 0:
+            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")
+
+        # Verify output file exists and is valid JSON
+        assert os.path.exists(output_file), "Output file was not created"
+
+        with open(output_file, 'r') as f:
+            data = json.load(f)
+
+        # Assertions based on the provided sample output
+        assert data["language"] == "en"
+        assert data["model"] == "parakeet-tdt-0.6b-v3"
+        assert "transcription" in data
+        assert "First of all, have desktop computers" in data["transcription"]
+        assert "reading room" in data["transcription"]
+
+        assert "word_timestamps" in data
+        assert len(data["word_timestamps"]) > 0
+        # Check first word
+        first_word = data["word_timestamps"][0]
+        assert first_word["word"] == "First"
+        # Allow for small floating point differences
+        assert abs(first_word["start"] - 0.4) < 0.1
+
+        assert "segment_timestamps" in data
+        assert len(data["segment_timestamps"]) > 0
+
+        assert data["context"]["left"] == 256
+        assert data["context"]["right"] == 256
+
+    finally:
+        # Cleanup: the temp file was created with delete=False, so remove it ourselves
+        if os.path.exists(output_file):
+            os.remove(output_file)
--- a/internal/transcription/adapters/py/nvidia/tests/test_parakeet_transcribe_buffered.py
+++ b/internal/transcription/adapters/py/nvidia/tests/test_parakeet_transcribe_buffered.py
@@ -0,0 +1,76 @@
+"""Tests for parakeet_transcribe_buffered.py"""
+import pytest
+import subprocess
+import json
+import os
+import tempfile
+from pathlib import Path
+
+# Paths, resolved so they agree with project_root computed inside the test
+SCRIPT_DIR = Path(__file__).resolve().parent.parent
+TEST_DATA_DIR = Path(__file__).resolve().parents[6] / "tests/data"
+AUDIO_FILE = TEST_DATA_DIR / "AMI-Corpus-IB4002.Mix-Headset-clip.wav"
+
+def test_parakeet_buffered_transcription_output():
+    """Run parakeet_transcribe_buffered.py on the sample clip via `uv run` and check the JSON it writes."""
+
+    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"
+
+    # Locate project root and paths
+    project_root = Path(__file__).resolve().parents[6]
+    env_path = project_root / "data/whisperx-env/parakeet"
+    script_path = SCRIPT_DIR / "parakeet_transcribe_buffered.py"
+
+    assert env_path.exists(), f"Environment not found at: {env_path}"
+
+    # Reserve a temp path for the script's output; delete=False so it outlives the with-block
+    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
+        output_file = tmp_file.name
+
+    try:
+        cmd = [
+            "uv", "run",
+            "--project", str(env_path),
+            "python", str(script_path),
+            str(AUDIO_FILE),
+            "--output", output_file,
+            # Use a small chunk length to force buffering behavior on our test file which is 19 sec long
+            "--chunk-len", "10"
+        ]
+
+        print(f"Running command: {' '.join(cmd)}")
+
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            cwd=project_root
+        )
+
+        if result.returncode != 0:
+            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")
+
+        # Verify output file exists and is valid JSON
+        assert os.path.exists(output_file), "Output file was not created"
+
+        with open(output_file, 'r') as f:
+            data = json.load(f)
+
+        # Metadata checks; the transcript text itself is only required to be non-empty
+        assert data["language"] == "en"
+        assert data["model"] == "parakeet-tdt-0.6b-v3"
+        assert data.get("buffered") is True
+        assert "transcription" in data
+        assert len(data["transcription"]) > 0
+
+        # Check that we have timestamps
+        assert "word_timestamps" in data
+        assert len(data["word_timestamps"]) > 0
+
+        assert "segment_timestamps" in data
+        assert len(data["segment_timestamps"]) > 0
+
+    finally:
+        # Cleanup: the temp file was created with delete=False, so remove it ourselves
+        if os.path.exists(output_file):
+            os.remove(output_file)
--- a/internal/transcription/adapters/py/nvidia/tests/test_sortformer_diarize.py
+++ b/internal/transcription/adapters/py/nvidia/tests/test_sortformer_diarize.py
@@ -0,0 +1,87 @@
+"""Tests for sortformer_diarize.py"""
+import pytest
+import subprocess
+import json
+import os
+import tempfile
+from pathlib import Path
+
+# Paths, resolved so they agree with project_root computed inside the test
+SCRIPT_DIR = Path(__file__).resolve().parent.parent
+TEST_DATA_DIR = Path(__file__).resolve().parents[6] / "tests/data"
+AUDIO_FILE = TEST_DATA_DIR / "AMI-Corpus-IB4002.Mix-Headset-clip.wav"
+
+def test_sortformer_diarization_output():
+    """Run sortformer_diarize.py on the sample clip via `uv run` and check the JSON it writes."""
+
+    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"
+
+    # Locate project root and paths
+    project_root = Path(__file__).resolve().parents[6]
+    env_path = project_root / "data/whisperx-env/parakeet"
+    script_path = SCRIPT_DIR / "sortformer_diarize.py"
+
+    assert env_path.exists(), f"Environment not found at: {env_path}"
+
+    # Reserve a temp path for the script's output; delete=False so it outlives the with-block
+    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
+        output_file = tmp_file.name
+
+    try:
+        cmd = [
+            "uv", "run",
+            "--project", str(env_path),
+            "python", str(script_path),
+            str(AUDIO_FILE),
+            output_file
+        ]
+
+        print(f"Running command: {' '.join(cmd)}")
+
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            cwd=project_root
+        )
+
+        if result.returncode != 0:
+            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")
+
+        # Verify output file exists and is valid JSON
+        assert os.path.exists(output_file), "Output file was not created"
+
+        with open(output_file, 'r') as f:
+            data = json.load(f)
+
+        # Assertions based on the provided sample output
+        assert data["model"] == "nvidia/diar_streaming_sortformer_4spk-v2"
+        assert "segments" in data
+        assert len(data["segments"]) > 0
+
+        # Check speakers
+        assert "speakers" in data
+        assert len(data["speakers"]) == 4 # Based on sample output which found 4 speakers
+        assert "speaker_0" in data["speakers"]
+        assert "speaker_1" in data["speakers"]
+
+        # Check first segment
+        first_segment = data["segments"][0]
+        assert "start" in first_segment
+        assert "end" in first_segment
+        assert "speaker" in first_segment
+        assert first_segment["start"] == 0.0
+        assert first_segment["speaker"] == "speaker_0"
+
+        assert data["speaker_count"] == 4
+
+        assert "total_segments" in data
+        assert data["total_segments"] > 15
+
+        assert "total_duration" in data
+        assert data["total_duration"] > 17
+
+    finally:
+        # Cleanup: the temp file was created with delete=False, so remove it ourselves
+        if os.path.exists(output_file):
+            os.remove(output_file)
--- a/internal/transcription/adapters/py/pyannote/tests/test_pyannote_diarize.py
+++ b/internal/transcription/adapters/py/pyannote/tests/test_pyannote_diarize.py
@@ -0,0 +1,47 @@
+"""Tests for pyannote_diarize.py"""
+import pytest
+import subprocess
+from pathlib import Path
+
+# Paths, resolved because SCRIPT_PATH is passed to a subprocess that runs with a different cwd
+SCRIPT_DIR = Path(__file__).resolve().parent.parent
+SCRIPT_PATH = SCRIPT_DIR / "pyannote_diarize.py"
+
+# TODO: Add proper diarization testing once a dummy HF token or mock pipeline is available.
+# uv run --project data/whisperx-env/pyannote/ python internal/transcription/adapters/py/pyannote/pyannote_diarize.py --output=/tmp/pyan.json --hf-token $HF_TOKEN tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav
+def test_pyannote_diarize_exists():
+    """Verify pyannote_diarize.py exists."""
+    assert SCRIPT_PATH.exists(), "pyannote_diarize.py should exist"
+
+
+def test_pyannote_diarize_help():
+    """Verify pyannote_diarize.py --help exits cleanly and lists the expected options."""
+
+    # Locate project root (Scriberr directory)
+    # This file is in internal/transcription/adapters/py/pyannote/tests/
+    project_root = Path(__file__).resolve().parents[6]
+    env_path = project_root / "data/whisperx-env/pyannote"
+
+    assert env_path.exists(), f"Environment not found at: {env_path}"
+
+    # Only --help is exercised here; a real diarization run needs an HF token
+    # (see the TODO above), which the test environment does not provide.
+    cmd = [
+        "uv", "run",
+        "--project", str(env_path),
+        "python", str(SCRIPT_PATH),
+        "--help"
+    ]
+
+    print(f"Running command: {' '.join(cmd)}")
+    result = subprocess.run(
+        cmd,
+        capture_output=True,
+        text=True,
+        cwd=project_root
+    )
+
+    assert result.returncode == 0, f"--help failed:\n{result.stderr}\nStdout:\n{result.stdout}"
+    assert "usage: pyannote_diarize.py" in result.stdout
+    assert "--hf-token" in result.stdout
+    assert "--model" in result.stdout
--- a/tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav
+++ b/tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav