Files
romm/backend/tests/tasks/test_recompute_save_content_hashes.py
nendo edb5d15420 Fix save-sync hash drift, archival save leak, and dedupe scoping
Cleanup pass on save-sync addressing three independent failure modes
that interact in production data: content_hash drift between client
and server, null-slot archival saves leaking into sync flows, and
content-hash dedupe collapsing legitimately-distinct slots.

Bug fixes
- compute_content_hash dispatched on zipfile.is_zipfile(relative_path),
  which silently returned False whenever the process's CWD wasn't
  ASSETS_BASE_PATH. Every zip save fell through to the raw-MD5 branch,
  persisting hashes that disagreed with clients computing the intended
  per-entry zip-hash. Resolve to a full path before the dispatch.
- _build_negotiate_plan, sync_push_pull_task, and sync_watcher all
  treated null-slot saves as sync-eligible. Null-slot saves represent
  web-UI / archival uploads; including them in negotiate plans matched
  them against device pushes by filename and overwrote archival data.
  Filter null-slot saves at all three call sites.
- get_save_by_content_hash matched on (rom_id, user_id, content_hash)
  only, so identical bytes uploaded to different slots collapsed into
  one record. Scope the lookup by slot when provided so clone-save-
  to-new-slot creates a distinct row per slot.
- get_save_by_filename matched on (rom_id, user_id, file_name) only.
  When two uploads to different slots happened in the same wall-clock
  second (the datetime tag is per-second), the second upload UPDATED
  the first record's slot instead of creating a distinct row. Scope
  the filename lookup by slot too.

One-shot recovery
- New recompute_save_content_hashes manual task walks every Save row,
  recomputes via the fixed dispatch, and updates rows whose values
  differ. Idempotent; safe to re-run.
- Backend startup runs a COUNT(content_hash IS NULL) query and, if
  any rows exist, enqueues the recompute task on the low-priority
  RQ queue. The API process moves on; the worker handles the
  recompute out-of-band. Subsequent restarts find zero NULL hashes
  and skip. Admins can also trigger the task manually.

Test infrastructure
- Added tests/_zipfile_shim.reload_zipfile() mirroring the pattern
  from utils/zip_cache.py for the same zipfile-inflate64 + CPython
  3.13.5 incompatibility. Test fixtures that build ZIPs call it
  immediately before opening the archive.
2026-05-29 17:00:01 +09:00

404 lines
14 KiB
Python

"""Tests for RecomputeSaveContentHashesTask.
Background lives in commit 7996c1293: every zip save's content_hash was
incorrectly stored as the raw MD5 of the zip wrapper instead of the
per-entry zip-hash. This task walks every Save row, recomputes via the
fixed dispatch, and rewrites stale values. These tests pin the
update / unchanged / missing-fs accounting.
ZIP_STORED is used everywhere so the wrapper bytes are deterministic and
the pinned per-entry digests stay valid.
"""
import hashlib
import zipfile
from pathlib import Path
import pytest
from tests._zipfile_shim import reload_zipfile
from handler.database import db_save_handler
from handler.filesystem import fs_asset_handler
from models.assets import Save
from models.platform import Platform
from models.rom import Rom
from models.user import User
from tasks.manual.recompute_save_content_hashes import (
RecomputeSaveContentHashesTask,
recompute_save_content_hashes_task,
)
FIXTURE_A_PINNED_HASH = "b3636b49ca5c3d807adee33e75d410ca"
def _write_fixture_a_zip(target: Path) -> bytes:
"""Write the shared Fixture A zip to target; return the raw bytes."""
target.parent.mkdir(parents=True, exist_ok=True)
reload_zipfile()
with zipfile.ZipFile(target, "w", zipfile.ZIP_STORED) as zf:
zf.writestr("save.bin", b"\x42" * 256)
return target.read_bytes()
@pytest.fixture
def isolated_assets_dir(tmp_path, monkeypatch):
"""Redirect fs_asset_handler to tmp_path so writes/lookups stay scoped."""
new_base = Path(tmp_path).resolve()
monkeypatch.setattr(fs_asset_handler, "base_path", new_base)
return new_base
class TestRecomputeSaveContentHashesTask:
@pytest.fixture
def task(self) -> RecomputeSaveContentHashesTask:
return RecomputeSaveContentHashesTask()
def test_module_singleton_exists(self):
assert recompute_save_content_hashes_task is not None
assert isinstance(
recompute_save_content_hashes_task, RecomputeSaveContentHashesTask
)
def test_init(self, task: RecomputeSaveContentHashesTask):
assert task.title == "Recompute save content hashes"
assert task.manual_run is True
assert task.cron_string is None
async def test_correct_hash_is_unchanged(
self,
task: RecomputeSaveContentHashesTask,
isolated_assets_dir: Path,
admin_user: User,
rom: Rom,
platform: Platform,
):
"""A save whose stored hash already matches must not be rewritten."""
rel_dir = f"{platform.fs_slug}/saves/test_emulator"
file_name = "correct.zip"
zip_path = isolated_assets_dir / rel_dir / file_name
_write_fixture_a_zip(zip_path)
save = db_save_handler.add_save(
Save(
rom_id=rom.id,
user_id=admin_user.id,
file_name=file_name,
file_name_no_tags="correct",
file_name_no_ext="correct",
file_extension="zip",
emulator="test_emulator",
slot="autosave",
file_path=rel_dir,
file_size_bytes=zip_path.stat().st_size,
content_hash=FIXTURE_A_PINNED_HASH,
)
)
stats = await task.run()
assert stats["saves_scanned"] == 1
assert stats["saves_unchanged"] == 1
assert stats["saves_updated"] == 0
assert stats["saves_missing_fs"] == 0
assert stats["errors"] == 0
refreshed = db_save_handler.get_save(user_id=admin_user.id, id=save.id)
assert refreshed is not None
assert refreshed.content_hash == FIXTURE_A_PINNED_HASH
async def test_stale_raw_md5_is_rewritten_to_zip_hash(
self,
task: RecomputeSaveContentHashesTask,
isolated_assets_dir: Path,
admin_user: User,
rom: Rom,
platform: Platform,
):
"""A zip save whose stored hash is the raw wrapper MD5 (the pre-fix
value) must be rewritten to the per-entry zip-hash. This is the exact
recovery the task exists for."""
rel_dir = f"{platform.fs_slug}/saves/test_emulator"
file_name = "stale.zip"
zip_path = isolated_assets_dir / rel_dir / file_name
raw_bytes = _write_fixture_a_zip(zip_path)
# Pre-fix value: raw MD5 of the zip wrapper, not the per-entry digest.
stale_raw_md5 = hashlib.md5(raw_bytes, usedforsecurity=False).hexdigest()
assert stale_raw_md5 != FIXTURE_A_PINNED_HASH
save = db_save_handler.add_save(
Save(
rom_id=rom.id,
user_id=admin_user.id,
file_name=file_name,
file_name_no_tags="stale",
file_name_no_ext="stale",
file_extension="zip",
emulator="test_emulator",
slot="autosave",
file_path=rel_dir,
file_size_bytes=zip_path.stat().st_size,
content_hash=stale_raw_md5,
)
)
stats = await task.run()
assert stats["saves_scanned"] == 1
assert stats["saves_updated"] == 1
assert stats["saves_unchanged"] == 0
assert stats["saves_missing_fs"] == 0
assert stats["errors"] == 0
refreshed = db_save_handler.get_save(user_id=admin_user.id, id=save.id)
assert refreshed is not None
assert refreshed.content_hash == FIXTURE_A_PINNED_HASH
async def test_raw_md5_non_zip_correct_hash_is_unchanged(
self,
task: RecomputeSaveContentHashesTask,
isolated_assets_dir: Path,
admin_user: User,
rom: Rom,
platform: Platform,
):
"""A non-zip save (e.g. raw .sav) whose stored hash already matches
the raw MD5 of its bytes must not be rewritten. This exercises the
_compute_file_hash dispatch path that the zip-only suite missed.
"""
rel_dir = f"{platform.fs_slug}/saves/test_emulator"
file_name = "raw_correct.sav"
sav_path = isolated_assets_dir / rel_dir / file_name
sav_path.parent.mkdir(parents=True, exist_ok=True)
sav_bytes = b"raw save bytes, definitely not a zip"
sav_path.write_bytes(sav_bytes)
# Sanity: this file must NOT look like a zip to the dispatch.
assert not zipfile.is_zipfile(sav_path)
raw_md5 = hashlib.md5(sav_bytes, usedforsecurity=False).hexdigest()
save = db_save_handler.add_save(
Save(
rom_id=rom.id,
user_id=admin_user.id,
file_name=file_name,
file_name_no_tags="raw_correct",
file_name_no_ext="raw_correct",
file_extension="sav",
emulator="test_emulator",
slot="autosave",
file_path=rel_dir,
file_size_bytes=sav_path.stat().st_size,
content_hash=raw_md5,
)
)
stats = await task.run()
assert stats["saves_scanned"] == 1
assert stats["saves_unchanged"] == 1
assert stats["saves_updated"] == 0
assert stats["saves_missing_fs"] == 0
assert stats["errors"] == 0
refreshed = db_save_handler.get_save(user_id=admin_user.id, id=save.id)
assert refreshed is not None
assert refreshed.content_hash == raw_md5
async def test_raw_md5_non_zip_stale_hash_is_rewritten(
self,
task: RecomputeSaveContentHashesTask,
isolated_assets_dir: Path,
admin_user: User,
rom: Rom,
platform: Platform,
):
"""A non-zip save whose stored hash is wrong must be rewritten to the
raw MD5 of its bytes. This pins that the dispatch correctly routes
non-zip files to _compute_file_hash and not _compute_zip_hash.
"""
rel_dir = f"{platform.fs_slug}/saves/test_emulator"
file_name = "raw_stale.sav"
sav_path = isolated_assets_dir / rel_dir / file_name
sav_path.parent.mkdir(parents=True, exist_ok=True)
sav_bytes = b"\x00\x01\x02arbitrary bytes\xff\xfe"
sav_path.write_bytes(sav_bytes)
assert not zipfile.is_zipfile(sav_path)
raw_md5 = hashlib.md5(sav_bytes, usedforsecurity=False).hexdigest()
stale_hash = "0" * 32
assert stale_hash != raw_md5
save = db_save_handler.add_save(
Save(
rom_id=rom.id,
user_id=admin_user.id,
file_name=file_name,
file_name_no_tags="raw_stale",
file_name_no_ext="raw_stale",
file_extension="sav",
emulator="test_emulator",
slot="autosave",
file_path=rel_dir,
file_size_bytes=sav_path.stat().st_size,
content_hash=stale_hash,
)
)
stats = await task.run()
assert stats["saves_scanned"] == 1
assert stats["saves_updated"] == 1
assert stats["saves_unchanged"] == 0
assert stats["saves_missing_fs"] == 0
assert stats["errors"] == 0
refreshed = db_save_handler.get_save(user_id=admin_user.id, id=save.id)
assert refreshed is not None
assert refreshed.content_hash == raw_md5
async def test_raw_md5_non_zip_routes_through_compute_file_hash(
self,
task: RecomputeSaveContentHashesTask,
isolated_assets_dir: Path,
admin_user: User,
rom: Rom,
platform: Platform,
):
"""Direct dispatch check: for a non-zip file, compute_content_hash
must call _compute_file_hash, not _compute_zip_hash. Spying on both
nails down the routing in a way the hash-equality assertions don't.
"""
from unittest.mock import patch
from handler.filesystem import fs_asset_handler
rel_dir = f"{platform.fs_slug}/saves/test_emulator"
file_name = "raw_spy.sav"
sav_path = isolated_assets_dir / rel_dir / file_name
sav_path.parent.mkdir(parents=True, exist_ok=True)
sav_path.write_bytes(b"spy on me")
assert not zipfile.is_zipfile(sav_path)
db_save_handler.add_save(
Save(
rom_id=rom.id,
user_id=admin_user.id,
file_name=file_name,
file_name_no_tags="raw_spy",
file_name_no_ext="raw_spy",
file_extension="sav",
emulator="test_emulator",
slot="autosave",
file_path=rel_dir,
file_size_bytes=sav_path.stat().st_size,
content_hash="anything",
)
)
original_file_hash = fs_asset_handler._compute_file_hash
original_zip_hash = fs_asset_handler._compute_zip_hash
with patch.object(
fs_asset_handler,
"_compute_file_hash",
wraps=original_file_hash,
) as spy_file, patch.object(
fs_asset_handler,
"_compute_zip_hash",
wraps=original_zip_hash,
) as spy_zip:
await task.run()
assert spy_file.call_count == 1
assert spy_zip.call_count == 0
async def test_missing_file_is_counted_but_unchanged(
self,
task: RecomputeSaveContentHashesTask,
isolated_assets_dir: Path,
admin_user: User,
rom: Rom,
platform: Platform,
):
"""A DB row whose backing file is gone must be tallied as missing_fs
and the stored hash must be left alone (no overwrite-to-None)."""
rel_dir = f"{platform.fs_slug}/saves/test_emulator"
file_name = "ghost.zip"
original_hash = "deadbeefdeadbeefdeadbeefdeadbeef"
save = db_save_handler.add_save(
Save(
rom_id=rom.id,
user_id=admin_user.id,
file_name=file_name,
file_name_no_tags="ghost",
file_name_no_ext="ghost",
file_extension="zip",
emulator="test_emulator",
slot="autosave",
file_path=rel_dir,
file_size_bytes=1,
content_hash=original_hash,
)
)
# Sanity: ensure the file really isn't on disk.
assert not (isolated_assets_dir / rel_dir / file_name).exists()
stats = await task.run()
assert stats["saves_scanned"] == 1
assert stats["saves_missing_fs"] == 1
assert stats["saves_updated"] == 0
assert stats["saves_unchanged"] == 0
assert stats["errors"] == 0
refreshed = db_save_handler.get_save(user_id=admin_user.id, id=save.id)
assert refreshed is not None
assert refreshed.content_hash == original_hash
async def test_update_save_failure_increments_errors(
self,
task: RecomputeSaveContentHashesTask,
isolated_assets_dir: Path,
admin_user: User,
rom: Rom,
platform: Platform,
mocker,
):
"""A row whose hash genuinely needs rewriting but whose update_save
call raises should land in the errors bucket, not silently succeed."""
rel_dir = f"{platform.fs_slug}/saves/test_emulator"
file_name = "fixture_a.zip"
_write_fixture_a_zip(isolated_assets_dir / rel_dir / file_name)
db_save_handler.add_save(
Save(
rom_id=rom.id,
user_id=admin_user.id,
file_name=file_name,
file_name_no_tags="fixture_a",
file_name_no_ext="fixture_a",
file_extension="zip",
emulator="test_emulator",
slot="autosave",
file_path=rel_dir,
file_size_bytes=1,
content_hash="00000000000000000000000000000000",
)
)
mocker.patch.object(
db_save_handler,
"update_save",
side_effect=RuntimeError("simulated DB failure"),
)
stats = await task.run()
assert stats["saves_scanned"] == 1
assert stats["errors"] == 1
assert stats["saves_updated"] == 0
assert stats["saves_unchanged"] == 0
assert stats["saves_missing_fs"] == 0