mirror of
https://github.com/rishikanthc/Scriberr.git
synced 2026-06-28 14:55:46 +00:00
Add test suite for python adapter scripts
This commit is contained in:
committed by
Rishikanth Chandrasekaran
parent
50dd4130ff
commit
7471a2a1b6
30
internal/transcription/adapters/py/README.md
Normal file
30
internal/transcription/adapters/py/README.md
Normal file
@@ -0,0 +1,30 @@
|
||||
# Python Adapters Testing
|
||||
|
||||
This directory contains the Python adapter scripts for various transcription and diarization models used by Scriberr.
|
||||
|
||||
## Running Tests
|
||||
|
||||
The tests are located in the `tests/` subdirectory of each adapter folder (e.g., `nvidia/tests/`, `pyannote/tests/`). These tests verify that the Python scripts can be executed and produce the expected output.
|
||||
|
||||
To run the tests, you need `uv` installed and the adapter environments set up: the `parakeet` environment is shared by all NVIDIA adapter tests, and the PyAnnote tests use the separate `pyannote` environment.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
1. Ensure you have `uv` installed.
|
||||
2. Ensure the `parakeet` and `pyannote` environments are set up within `data/whisperx-env/`. This is typically handled by the application startup.
|
||||
3. Ensure you have the test data available (e.g., `tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav`).
|
||||
|
||||
### Running Tests with pytest
|
||||
|
||||
```bash
|
||||
# Run all NVIDIA adapter tests
|
||||
uv run --with pytest --project data/whisperx-env/parakeet pytest internal/transcription/adapters/py/nvidia/tests
|
||||
|
||||
# Run PyAnnote adapter tests
|
||||
uv run --with pytest --project data/whisperx-env/pyannote pytest internal/transcription/adapters/py/pyannote/tests
|
||||
```
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
* **Audio file not found**: Ensure `tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav` exists.
|
||||
* **Environment not found**: Ensure `data/whisperx-env/parakeet` and `data/whisperx-env/pyannote` exist and are valid virtual environments. They may be missing if Scriberr hasn't been run yet.
|
||||
@@ -0,0 +1,83 @@
|
||||
"""Tests for canary_transcribe.py"""
|
||||
import pytest
|
||||
import subprocess
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Paths
# Adapter directory holding the scripts under test (one level above this tests/ folder).
SCRIPT_DIR = Path(__file__).parent.parent
# Five more levels up from the adapter directory, then into the shared tests/data fixtures.
TEST_DATA_DIR = SCRIPT_DIR.parent.parent.parent.parent.parent / "tests/data"
# Sample clip that is passed to the adapter script.
AUDIO_FILE = TEST_DATA_DIR.joinpath("AMI-Corpus-IB4002.Mix-Headset-clip.wav")
|
||||
|
||||
import tempfile
|
||||
|
||||
def test_canary_transcription_output():
    """Verify Canary transcription output matches expected results.

    Runs ``canary_transcribe.py`` via ``uv run`` in the ``parakeet``
    environment against the sample audio clip and checks the JSON written to
    ``--output``: language/task/model metadata, known phrases in the
    transcription, and non-empty word and segment timestamps.
    """
    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"

    # Locate project root and paths
    project_root = Path(__file__).resolve().parents[6]
    env_path = project_root / "data/whisperx-env/parakeet"  # Canary uses the same env as Parakeet in this setup
    script_path = SCRIPT_DIR / "canary_transcribe.py"

    assert env_path.exists(), f"Environment not found at: {env_path}"

    # Point the script at a path inside a fresh temporary directory instead of
    # a pre-created temp file: the file then exists only if the script wrote
    # it, so the existence assertion below is meaningful. The directory is
    # removed automatically, even when an assertion fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_file = os.path.join(tmp_dir, "canary_output.json")

        cmd = [
            "uv", "run",
            "--project", str(env_path),
            "python", str(script_path),
            str(AUDIO_FILE),
            "--timestamps",
            "--output", output_file,
        ]

        print(f"Running command: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=project_root,
        )

        if result.returncode != 0:
            # Include stdout as well as stderr so the failure is diagnosable.
            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")

        # Verify output file exists and is valid JSON
        assert os.path.exists(output_file), "Output file was not created"

        with open(output_file, 'r') as f:
            data = json.load(f)

    # Assertions based on the provided sample output
    assert data["source_language"] == "en"
    assert data["target_language"] == "en"
    assert data["task"] == "transcribe"
    assert data["model"] == "canary-1b-v2"

    # Canary output text check
    assert "Most of us" in data["transcription"]
    assert "desktop computers" in data["transcription"]

    assert "word_timestamps" in data
    assert len(data["word_timestamps"]) > 0
    # Check first word
    first_word = data["word_timestamps"][0]
    assert first_word["word"] == "Most"
    # Allow for small floating point differences
    assert abs(first_word["start"] - 0.0) < 0.1

    assert "segment_timestamps" in data
    assert len(data["segment_timestamps"]) > 0
|
||||
@@ -0,0 +1,89 @@
|
||||
"""Tests for parakeet_transcribe.py"""
|
||||
import pytest
|
||||
import subprocess
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
# Paths
# Adapter directory holding the scripts under test (one level above this tests/ folder).
SCRIPT_DIR = Path(__file__).parent.parent
# Five more levels up from the adapter directory, then into the shared tests/data fixtures.
TEST_DATA_DIR = SCRIPT_DIR.parent.parent.parent.parent.parent / "tests/data"
# Sample clip that is passed to the adapter script.
AUDIO_FILE = TEST_DATA_DIR.joinpath("AMI-Corpus-IB4002.Mix-Headset-clip.wav")
|
||||
|
||||
import tempfile
|
||||
|
||||
def test_parakeet_transcription_output():
    """Verify Parakeet transcription output matches expected results.

    Runs ``parakeet_transcribe.py`` via ``uv run`` in the ``parakeet``
    environment with timestamps and a 256/256 context window, then checks the
    JSON written to ``--output``: language/model metadata, known phrases in
    the transcription, word and segment timestamps, and the echoed context.
    """
    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"

    # Construct command
    # uv run --project data/whisperx-env/parakeet python internal/transcription/adapters/py/nvidia/parakeet_transcribe.py ...

    # Locate project root (Scriberr directory)
    # This file is in internal/transcription/adapters/py/nvidia/tests/
    project_root = Path(__file__).resolve().parents[6]
    env_path = project_root / "data/whisperx-env/parakeet"
    script_path = SCRIPT_DIR / "parakeet_transcribe.py"

    assert env_path.exists(), f"Environment not found at: {env_path}"

    # Point the script at a path inside a fresh temporary directory instead of
    # a pre-created temp file: the file then exists only if the script wrote
    # it, so the existence assertion below is meaningful. The directory is
    # removed automatically, even when an assertion fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_file = os.path.join(tmp_dir, "parakeet_output.json")

        cmd = [
            "uv", "run",
            "--project", str(env_path),
            "python", str(script_path),
            str(AUDIO_FILE),
            "--timestamps",
            "--context-left", "256",
            "--context-right", "256",
            "--output", output_file,
        ]

        print(f"Running command: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=project_root,  # Run from project root to ensure paths are correct if relative
        )

        if result.returncode != 0:
            # Include stdout as well as stderr so the failure is diagnosable.
            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")

        # Verify output file exists and is valid JSON
        assert os.path.exists(output_file), "Output file was not created"

        with open(output_file, 'r') as f:
            data = json.load(f)

    # Assertions based on the provided sample output
    assert data["language"] == "en"
    assert data["model"] == "parakeet-tdt-0.6b-v3"
    assert "transcription" in data
    assert "First of all, have desktop computers" in data["transcription"]
    assert "reading room" in data["transcription"]

    assert "word_timestamps" in data
    assert len(data["word_timestamps"]) > 0
    # Check first word
    first_word = data["word_timestamps"][0]
    assert first_word["word"] == "First"
    # Allow for small floating point differences
    assert abs(first_word["start"] - 0.4) < 0.1

    assert "segment_timestamps" in data
    assert len(data["segment_timestamps"]) > 0

    assert data["context"]["left"] == 256
    assert data["context"]["right"] == 256
|
||||
@@ -0,0 +1,76 @@
|
||||
"""Tests for parakeet_transcribe_buffered.py"""
|
||||
import pytest
|
||||
import subprocess
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Paths
# Adapter directory holding the scripts under test (one level above this tests/ folder).
SCRIPT_DIR = Path(__file__).parent.parent
# Five more levels up from the adapter directory, then into the shared tests/data fixtures.
TEST_DATA_DIR = SCRIPT_DIR.parent.parent.parent.parent.parent / "tests/data"
# Sample clip that is passed to the adapter script.
AUDIO_FILE = TEST_DATA_DIR.joinpath("AMI-Corpus-IB4002.Mix-Headset-clip.wav")
|
||||
|
||||
def test_parakeet_buffered_transcription_output():
    """Verify Parakeet buffered transcription output matches expected results.

    Runs ``parakeet_transcribe_buffered.py`` via ``uv run`` in the
    ``parakeet`` environment with a 10-second chunk length, then checks the
    JSON written to ``--output``: language/model metadata, the ``buffered``
    flag, a non-empty transcription, and non-empty word/segment timestamps.
    """
    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"

    # Locate project root and paths
    project_root = Path(__file__).resolve().parents[6]
    env_path = project_root / "data/whisperx-env/parakeet"
    script_path = SCRIPT_DIR / "parakeet_transcribe_buffered.py"

    assert env_path.exists(), f"Environment not found at: {env_path}"

    # Point the script at a path inside a fresh temporary directory instead of
    # a pre-created temp file: the file then exists only if the script wrote
    # it, so the existence assertion below is meaningful. The directory is
    # removed automatically, even when an assertion fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_file = os.path.join(tmp_dir, "parakeet_buffered_output.json")

        cmd = [
            "uv", "run",
            "--project", str(env_path),
            "python", str(script_path),
            str(AUDIO_FILE),
            "--output", output_file,
            # Use a small chunk length to force buffering behavior on our test file which is 19 sec long
            "--chunk-len", "10",
        ]

        print(f"Running command: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=project_root,
        )

        if result.returncode != 0:
            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")

        # Verify output file exists and is valid JSON
        assert os.path.exists(output_file), "Output file was not created"

        with open(output_file, 'r') as f:
            data = json.load(f)

    # Assertions
    assert data["language"] == "en"
    assert data["model"] == "parakeet-tdt-0.6b-v3"
    assert data.get("buffered") is True
    assert "transcription" in data
    assert len(data["transcription"]) > 0

    # Check that we have timestamps
    assert "word_timestamps" in data
    assert len(data["word_timestamps"]) > 0

    assert "segment_timestamps" in data
    assert len(data["segment_timestamps"]) > 0
|
||||
@@ -0,0 +1,87 @@
|
||||
"""Tests for sortformer_diarize.py"""
|
||||
import pytest
|
||||
import subprocess
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Paths
# Adapter directory holding the scripts under test (one level above this tests/ folder).
SCRIPT_DIR = Path(__file__).parent.parent
# Five more levels up from the adapter directory, then into the shared tests/data fixtures.
TEST_DATA_DIR = SCRIPT_DIR.parent.parent.parent.parent.parent / "tests/data"
# Sample clip that is passed to the adapter script.
AUDIO_FILE = TEST_DATA_DIR.joinpath("AMI-Corpus-IB4002.Mix-Headset-clip.wav")
|
||||
|
||||
def test_sortformer_diarization_output():
    """Verify Sortformer diarization output matches expected results.

    Runs ``sortformer_diarize.py`` via ``uv run`` in the ``parakeet``
    environment, passing the audio clip and the output path positionally,
    then checks the JSON it writes: model name, segments, the four expected
    speakers, the first segment's fields, and the summary totals.
    """
    assert AUDIO_FILE.exists(), f"Audio file not found: {AUDIO_FILE}"

    # Locate project root and paths
    project_root = Path(__file__).resolve().parents[6]
    env_path = project_root / "data/whisperx-env/parakeet"
    script_path = SCRIPT_DIR / "sortformer_diarize.py"

    assert env_path.exists(), f"Environment not found at: {env_path}"

    # Point the script at a path inside a fresh temporary directory instead of
    # a pre-created temp file: the file then exists only if the script wrote
    # it, so the existence assertion below is meaningful. The directory is
    # removed automatically, even when an assertion fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_file = os.path.join(tmp_dir, "sortformer_output.json")

        cmd = [
            "uv", "run",
            "--project", str(env_path),
            "python", str(script_path),
            str(AUDIO_FILE),
            output_file,
        ]

        print(f"Running command: {' '.join(cmd)}")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=project_root,
        )

        if result.returncode != 0:
            pytest.fail(f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}")

        # Verify output file exists and is valid JSON
        assert os.path.exists(output_file), "Output file was not created"

        with open(output_file, 'r') as f:
            data = json.load(f)

    # Assertions based on the provided sample output
    assert data["model"] == "nvidia/diar_streaming_sortformer_4spk-v2"
    assert "segments" in data
    assert len(data["segments"]) > 0

    # Check speakers
    assert "speakers" in data
    assert len(data["speakers"]) == 4  # Based on sample output which found 4 speakers
    assert "speaker_0" in data["speakers"]
    assert "speaker_1" in data["speakers"]

    # Check first segment
    first_segment = data["segments"][0]
    assert "start" in first_segment
    assert "end" in first_segment
    assert "speaker" in first_segment
    assert first_segment["start"] == 0.0
    assert first_segment["speaker"] == "speaker_0"

    assert data["speaker_count"] == 4

    assert "total_segments" in data
    assert data["total_segments"] > 15

    assert "total_duration" in data
    assert data["total_duration"] > 17
|
||||
@@ -0,0 +1,47 @@
|
||||
"""Tests for pyannote_diarize.py"""
|
||||
import pytest
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
# Paths
# Adapter directory: one level above the tests/ folder this file lives in.
SCRIPT_DIR = Path(__file__).parent.parent
# Script exercised by the tests in this module.
SCRIPT_PATH = SCRIPT_DIR.joinpath("pyannote_diarize.py")
|
||||
|
||||
# TODO: Add proper diarization testing once a dummy HF token or mock pipeline is available.
|
||||
# uv run --project data/whisperx-env/pyannote/ python internal/transcription/adapters/py/pyannote/pyannote_diarize.py --output=/tmp/pyan.json --hf-token $HF_TOKEN tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav
|
||||
def test_pyannote_diarize_exists():
    """Check that the pyannote_diarize.py adapter script is present on disk."""
    script_present = SCRIPT_PATH.exists()
    assert script_present, "pyannote_diarize.py should exist"
|
||||
|
||||
|
||||
def test_pyannote_diarize_help():
    """Verify pyannote_diarize.py --help works.

    Runs the script with ``--help`` via ``uv run`` in the ``pyannote``
    environment and checks that it exits successfully and that the usage
    text mentions the ``--hf-token`` and ``--model`` options.
    """
    # Locate project root (Scriberr directory)
    # This file is in internal/transcription/adapters/py/pyannote/tests/
    project_root = Path(__file__).resolve().parents[6]
    env_path = project_root / "data/whisperx-env/pyannote"

    assert env_path.exists(), f"Environment not found at: {env_path}"

    cmd = [
        "uv", "run",
        "--project", str(env_path),
        "python", str(SCRIPT_PATH),
        "--help",
    ]

    print(f"Running command: {' '.join(cmd)}")
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        cwd=project_root,
    )

    # Surface the subprocess output on failure; a bare returncode assertion
    # would hide why the script could not start.
    assert result.returncode == 0, (
        f"Script failed with error:\n{result.stderr}\nStdout:\n{result.stdout}"
    )
    assert "usage: pyannote_diarize.py" in result.stdout
    assert "--hf-token" in result.stdout
    assert "--model" in result.stdout
|
||||
BIN
tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav
Normal file
BIN
tests/data/AMI-Corpus-IB4002.Mix-Headset-clip.wav
Normal file
Binary file not shown.
Reference in New Issue
Block a user