Add test suite for python adapter scripts

This commit is contained in:
Paul Irish
2025-12-26 17:12:54 -08:00
committed by Rishikanth Chandrasekaran
parent 50dd4130ff
commit 7471a2a1b6
7 changed files with 412 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
# Python Adapters Testing
This directory contains the Python adapter scripts for various transcription and diarization models used by Scriberr.
## Running Tests
The tests are located in the `tests/` subdirectory of each adapter folder (e.g., `nvidia/tests/`, `pyannote/tests/`). These tests verify that the Python scripts can be executed and produce the expected output.
To run the tests, you need `uv` installed and the adapter environments set up: the `parakeet` environment (shared by all NVIDIA adapter tests) and the `pyannote` environment (used by the PyAnnote adapter tests).
### Prerequisites
1. Ensure you have `uv` installed.
2. Ensure the `parakeet` and `pyannote` environments are set up within `data/whisperx-env/`. This is typically handled by the application startup.
3. Ensure you have the test data available (e.g., `tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav`).
### Running Tests with pytest
```bash
# Run all NVIDIA adapter tests
uv run --with pytest --project data/whisperx-env/parakeet pytest internal/transcription/adapters/py/nvidia/tests
# Run PyAnnote adapter tests
uv run --with pytest --project data/whisperx-env/pyannote pytest internal/transcription/adapters/py/pyannote/tests
```
### Troubleshooting
* **Audio file not found**: Ensure `tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav` exists.
* **Environment not found**: Ensure `data/whisperx-env/parakeet` and `data/whisperx-env/pyannote` exist and are valid virtual environments. They may be missing if Scriberr hasn't been run yet.

View File

@@ -0,0 +1,83 @@
"""Tests for canary_transcribe.py"""
import pytest
import subprocess
import json
import os
from pathlib import Path
# Filesystem locations used by the test in this module.
# SCRIPT_DIR is the directory two levels above this file (where the adapter scripts live).
SCRIPT_DIR = Path(__file__).parent.parent
# Five further levels up from SCRIPT_DIR (seven above this file), then into tests/data.
TEST_DATA_DIR = SCRIPT_DIR.parent.parent.parent.parent.parent.joinpath("tests/data")
AUDIO_FILE = TEST_DATA_DIR.joinpath("AMI-Corpus-IB4002.Mix-Headset-clip.wav")
import tempfile
def test_canary_transcription_output():
    """Run canary_transcribe.py on the sample clip and check its JSON output.

    The script is launched via ``uv run`` against the ``parakeet``
    environment with ``--timestamps`` and ``--output``. The JSON written to
    the output path is then checked for the expected metadata, transcript
    fragments, and non-empty word/segment timestamp lists.
    """
    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"

    # Locate project root and paths
    project_root = Path(__file__).resolve().parents[6]
    env_path = project_root / "data/whisperx-env/parakeet"  # Canary uses the same env as Parakeet in this setup
    script_path = SCRIPT_DIR / "canary_transcribe.py"

    assert env_path.exists(), f"Environment not found at: {env_path}"

    # Reserve a temporary path for the script's output. delete=False keeps the
    # (empty) file on disk after the handle is closed; it is removed in `finally`.
    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
        output_file = tmp_file.name

    try:
        cmd = [
            "uv", "run",
            "--project", str(env_path),
            "python", str(script_path),
            str(AUDIO_FILE),
            "--timestamps",
            "--output", output_file
        ]

        print(f"Running command: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=project_root
        )

        if result.returncode != 0:
            # Report both streams so a failure can be diagnosed from the test log.
            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")

        # Verify output file exists and is valid JSON.
        # The temp file was already created above, so existence alone proves
        # nothing; also require that the script actually wrote something.
        assert os.path.exists(output_file), "Output file was not created"
        assert os.path.getsize(output_file) > 0, "Output file is empty; the script wrote no output"

        with open(output_file, 'r') as f:
            data = json.load(f)

        # Assertions based on the provided sample output
        assert data["source_language"] == "en"
        assert data["target_language"] == "en"
        assert data["task"] == "transcribe"
        assert data["model"] == "canary-1b-v2"

        # Canary output text check
        assert "Most of us" in data["transcription"]
        assert "desktop computers" in data["transcription"]

        assert "word_timestamps" in data
        assert len(data["word_timestamps"]) > 0

        # Check first word
        first_word = data["word_timestamps"][0]
        assert first_word["word"] == "Most"
        # Allow for small floating point differences
        assert abs(first_word["start"] - 0.0) < 0.1

        assert "segment_timestamps" in data
        assert len(data["segment_timestamps"]) > 0

    finally:
        # Cleanup
        if os.path.exists(output_file):
            os.remove(output_file)

View File

@@ -0,0 +1,89 @@
"""Tests for parakeet_transcribe.py"""
import pytest
import subprocess
import json
import os
from pathlib import Path
# Filesystem locations used by the test in this module.
# SCRIPT_DIR is the directory two levels above this file (where the adapter scripts live).
SCRIPT_DIR = Path(__file__).parent.parent
# Five further levels up from SCRIPT_DIR (seven above this file), then into tests/data.
TEST_DATA_DIR = SCRIPT_DIR.parent.parent.parent.parent.parent.joinpath("tests/data")
AUDIO_FILE = TEST_DATA_DIR.joinpath("AMI-Corpus-IB4002.Mix-Headset-clip.wav")
import tempfile
def test_parakeet_transcription_output():
    """Run parakeet_transcribe.py on the sample clip and check its JSON output.

    The script is launched via ``uv run`` against the ``parakeet``
    environment with ``--timestamps`` and left/right context of 256. The JSON
    written to the output path is then checked for the expected metadata,
    transcript fragments, timestamp lists, and echoed context settings.
    """
    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"

    # Construct command
    # uv run --project data/whisperx-env/parakeet python internal/transcription/adapters/py/nvidia/parakeet_transcribe.py ...

    # Locate project root (Scriberr directory)
    # This file is in internal/transcription/adapters/py/nvidia/tests/
    project_root = Path(__file__).resolve().parents[6]
    env_path = project_root / "data/whisperx-env/parakeet"
    script_path = SCRIPT_DIR / "parakeet_transcribe.py"

    assert env_path.exists(), f"Environment not found at: {env_path}"

    # Reserve a temporary path for the script's output. delete=False keeps the
    # (empty) file on disk after the handle is closed; it is removed in `finally`.
    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
        output_file = tmp_file.name

    try:
        cmd = [
            "uv", "run",
            "--project", str(env_path),
            "python", str(script_path),
            str(AUDIO_FILE),
            "--timestamps",
            "--context-left", "256",
            "--context-right", "256",
            "--output", output_file
        ]

        print(f"Running command: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=project_root  # Run from project root to ensure paths are correct if relative
        )

        if result.returncode != 0:
            # Report both streams so a failure can be diagnosed from the test log.
            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")

        # Verify output file exists and is valid JSON.
        # The temp file was already created above, so existence alone proves
        # nothing; also require that the script actually wrote something.
        assert os.path.exists(output_file), "Output file was not created"
        assert os.path.getsize(output_file) > 0, "Output file is empty; the script wrote no output"

        with open(output_file, 'r') as f:
            data = json.load(f)

        # Assertions based on the provided sample output
        assert data["language"] == "en"
        assert data["model"] == "parakeet-tdt-0.6b-v3"
        assert "transcription" in data
        assert "First of all, have desktop computers" in data["transcription"]
        assert "reading room" in data["transcription"]

        assert "word_timestamps" in data
        assert len(data["word_timestamps"]) > 0

        # Check first word
        first_word = data["word_timestamps"][0]
        assert first_word["word"] == "First"
        # Allow for small floating point differences
        assert abs(first_word["start"] - 0.4) < 0.1

        assert "segment_timestamps" in data
        assert len(data["segment_timestamps"]) > 0

        assert data["context"]["left"] == 256
        assert data["context"]["right"] == 256

    finally:
        # Cleanup
        if os.path.exists(output_file):
            os.remove(output_file)

View File

@@ -0,0 +1,76 @@
"""Tests for parakeet_transcribe_buffered.py"""
import pytest
import subprocess
import json
import os
import tempfile
from pathlib import Path
# Filesystem locations used by the test in this module.
# SCRIPT_DIR is the directory two levels above this file (where the adapter scripts live).
SCRIPT_DIR = Path(__file__).parent.parent
# Five further levels up from SCRIPT_DIR (seven above this file), then into tests/data.
TEST_DATA_DIR = SCRIPT_DIR.parent.parent.parent.parent.parent.joinpath("tests/data")
AUDIO_FILE = TEST_DATA_DIR.joinpath("AMI-Corpus-IB4002.Mix-Headset-clip.wav")
def test_parakeet_buffered_transcription_output():
    """Run parakeet_transcribe_buffered.py on the sample clip and check its JSON output.

    The script is launched via ``uv run`` against the ``parakeet``
    environment with ``--chunk-len 10``. The JSON written to the output path
    is then checked for the expected metadata, the ``buffered`` flag, a
    non-empty transcription, and non-empty word/segment timestamp lists.
    """
    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"

    # Locate project root and paths
    project_root = Path(__file__).resolve().parents[6]
    env_path = project_root / "data/whisperx-env/parakeet"
    script_path = SCRIPT_DIR / "parakeet_transcribe_buffered.py"

    assert env_path.exists(), f"Environment not found at: {env_path}"

    # Reserve a temporary path for the script's output. delete=False keeps the
    # (empty) file on disk after the handle is closed; it is removed in `finally`.
    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
        output_file = tmp_file.name

    try:
        cmd = [
            "uv", "run",
            "--project", str(env_path),
            "python", str(script_path),
            str(AUDIO_FILE),
            "--output", output_file,
            # Use a small chunk length to force buffering behavior on our test file which is 19 sec long
            "--chunk-len", "10"
        ]

        print(f"Running command: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=project_root
        )

        if result.returncode != 0:
            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")

        # Verify output file exists and is valid JSON.
        # The temp file was already created above, so existence alone proves
        # nothing; also require that the script actually wrote something.
        assert os.path.exists(output_file), "Output file was not created"
        assert os.path.getsize(output_file) > 0, "Output file is empty; the script wrote no output"

        with open(output_file, 'r') as f:
            data = json.load(f)

        # Assertions
        assert data["language"] == "en"
        assert data["model"] == "parakeet-tdt-0.6b-v3"
        assert data.get("buffered") is True

        assert "transcription" in data
        assert len(data["transcription"]) > 0

        # Check that we have timestamps
        assert "word_timestamps" in data
        assert len(data["word_timestamps"]) > 0

        assert "segment_timestamps" in data
        assert len(data["segment_timestamps"]) > 0

    finally:
        # Cleanup
        if os.path.exists(output_file):
            os.remove(output_file)

View File

@@ -0,0 +1,87 @@
"""Tests for sortformer_diarize.py"""
import pytest
import subprocess
import json
import os
import tempfile
from pathlib import Path
# Filesystem locations used by the test in this module.
# SCRIPT_DIR is the directory two levels above this file (where the adapter scripts live).
SCRIPT_DIR = Path(__file__).parent.parent
# Five further levels up from SCRIPT_DIR (seven above this file), then into tests/data.
TEST_DATA_DIR = SCRIPT_DIR.parent.parent.parent.parent.parent.joinpath("tests/data")
AUDIO_FILE = TEST_DATA_DIR.joinpath("AMI-Corpus-IB4002.Mix-Headset-clip.wav")
def test_sortformer_diarization_output():
    """Run sortformer_diarize.py on the sample clip and check its JSON output.

    The script is launched via ``uv run`` against the ``parakeet``
    environment with the audio path and output path as positional arguments.
    The JSON written to the output path is then checked for the expected
    model name, speaker set, first-segment fields, and summary counters.
    """
    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"

    # Locate project root and paths
    project_root = Path(__file__).resolve().parents[6]
    env_path = project_root / "data/whisperx-env/parakeet"
    script_path = SCRIPT_DIR / "sortformer_diarize.py"

    assert env_path.exists(), f"Environment not found at: {env_path}"

    # Reserve a temporary path for the script's output. delete=False keeps the
    # (empty) file on disk after the handle is closed; it is removed in `finally`.
    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as tmp_file:
        output_file = tmp_file.name

    try:
        cmd = [
            "uv", "run",
            "--project", str(env_path),
            "python", str(script_path),
            str(AUDIO_FILE),
            output_file
        ]

        print(f"Running command: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=project_root
        )

        if result.returncode != 0:
            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")

        # Verify output file exists and is valid JSON.
        # The temp file was already created above, so existence alone proves
        # nothing; also require that the script actually wrote something.
        assert os.path.exists(output_file), "Output file was not created"
        assert os.path.getsize(output_file) > 0, "Output file is empty; the script wrote no output"

        with open(output_file, 'r') as f:
            data = json.load(f)

        # Assertions based on the provided sample output
        assert data["model"] == "nvidia/diar_streaming_sortformer_4spk-v2"
        assert "segments" in data
        assert len(data["segments"]) > 0

        # Check speakers
        assert "speakers" in data
        assert len(data["speakers"]) == 4  # Based on sample output which found 4 speakers
        assert "speaker_0" in data["speakers"]
        assert "speaker_1" in data["speakers"]

        # Check first segment
        first_segment = data["segments"][0]
        assert "start" in first_segment
        assert "end" in first_segment
        assert "speaker" in first_segment
        assert first_segment["start"] == 0.0
        assert first_segment["speaker"] == "speaker_0"

        assert data["speaker_count"] == 4
        assert "total_segments" in data
        assert data["total_segments"] > 15
        assert "total_duration" in data
        assert data["total_duration"] > 17

    finally:
        # Cleanup
        if os.path.exists(output_file):
            os.remove(output_file)

View File

@@ -0,0 +1,47 @@
"""Tests for pyannote_diarize.py"""
import pytest
import subprocess
from pathlib import Path
# Filesystem locations used by the tests in this module.
# SCRIPT_DIR is the directory two levels above this file.
SCRIPT_DIR = Path(__file__).parent.parent
SCRIPT_PATH = SCRIPT_DIR.joinpath("pyannote_diarize.py")
# TODO: Add proper diarization testing once a dummy HF token or mock pipeline is available.
# uv run --project data/whisperx-env/pyannote/ python internal/transcription/adapters/py/pyannote/pyannote_diarize.py --output=/tmp/pyan.json --hf-token $HF_TOKEN tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav
def test_pyannote_diarize_exists():
    """Check that the pyannote_diarize.py script is present at SCRIPT_PATH."""
    script_present = SCRIPT_PATH.exists()
    assert script_present, "pyannote_diarize.py should exist"
def test_pyannote_diarize_help():
    """Verify pyannote_diarize.py --help works.

    The script is launched via ``uv run`` against the ``pyannote``
    environment with ``--help``. The test requires a zero exit code and that
    the usage line and the ``--hf-token`` / ``--model`` options appear on
    stdout.
    """
    # Locate project root (Scriberr directory)
    # This file is in internal/transcription/adapters/py/pyannote/tests/
    project_root = Path(__file__).resolve().parents[6]
    env_path = project_root / "data/whisperx-env/pyannote"

    assert env_path.exists(), f"Environment not found at: {env_path}"

    cmd = [
        "uv", "run",
        "--project", str(env_path),
        "python", str(SCRIPT_PATH),
        "--help"
    ]

    print(f"Running command: {' '.join(cmd)}")

    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        cwd=project_root
    )

    # Include both streams in the failure message so a broken environment or
    # import error can be diagnosed from the test log.
    assert result.returncode == 0, (
        f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}"
    )
    assert "usage: pyannote_diarize.py" in result.stdout
    assert "--hf-token" in result.stdout
    assert "--model" in result.stdout

Binary file not shown.