Files
romm/backend/handler/database/saves_handler.py
nendo edb5d15420 Fix save-sync hash drift, archival save leak, and dedupe scoping
Cleanup pass on save-sync addressing three independent failure modes
that interact in production data: content_hash drift between client
and server, null-slot archival saves leaking into sync flows, and
content-hash dedupe collapsing legitimately-distinct slots.

Bug fixes
- compute_content_hash dispatched on zipfile.is_zipfile(relative_path),
  which silently returned False whenever the process's CWD wasn't
  ASSETS_BASE_PATH. Every zip save fell through to the raw-MD5 branch,
  persisting hashes that disagreed with clients computing the intended
  per-entry zip-hash. Resolve to a full path before the dispatch.
- _build_negotiate_plan, sync_push_pull_task, and sync_watcher all
  treated null-slot saves as sync-eligible. Null-slot saves represent
  web-UI / archival uploads; including them in negotiate plans matched
  them against device pushes by filename and overwrote archival data.
  Filter null-slot saves at all three call sites.
- get_save_by_content_hash matched on (rom_id, user_id, content_hash)
  only, so identical bytes uploaded to different slots collapsed into
  one record. Scope the lookup by slot when provided so clone-save-
  to-new-slot creates a distinct row per slot.
- get_save_by_filename matched on (rom_id, user_id, file_name) only.
  When two uploads to different slots happened in the same wall-clock
  second (the datetime tag is per-second), the second upload UPDATED
  the first record's slot instead of creating a distinct row. Scope
  the filename lookup by slot too.

One-shot recovery
- New recompute_save_content_hashes manual task walks every Save row,
  recomputes via the fixed dispatch, and updates rows whose values
  differ. Idempotent; safe to re-run.
- Backend startup counts Save rows WHERE content_hash IS NULL and, if
  any rows exist, enqueues the recompute task on the low-priority
  RQ queue. The API process moves on; the worker handles the
  recompute out-of-band. Subsequent restarts find zero NULL hashes
  and skip. Admins can also trigger the task manually.

Test infrastructure
- Added tests/_zipfile_shim.reload_zipfile() mirroring the pattern
  from utils/zip_cache.py for the same zipfile-inflate64 + CPython
  3.13.5 incompatibility. Test fixtures that build ZIPs call it
  immediately before opening the archive.
2026-05-29 17:00:01 +09:00

206 lines
5.9 KiB
Python

from collections.abc import Sequence
from typing import Literal
from sqlalchemy import and_, asc, delete, desc, func, select, update
from sqlalchemy.orm import QueryableAttribute, Session, load_only
from decorators.database import begin_session
from models.assets import Save
from models.rom import Rom
from .base_handler import DBBaseHandler
class DBSavesHandler(DBBaseHandler):
    """Query and mutation helpers for ``Save`` rows.

    Every method is wrapped in ``@begin_session``; the ``session`` parameter
    defaults to ``None`` and is presumably supplied by that decorator
    (NOTE(review): confirm against ``decorators.database``).
    """

    @begin_session
    def add_save(
        self,
        save: Save,
        session: Session = None,  # type: ignore
    ) -> Save:
        """Merge ``save`` into the session and return the merged instance."""
        merged = session.merge(save)
        return merged

    @begin_session
    def get_save(
        self,
        user_id: int,
        id: int,
        session: Session = None,  # type: ignore
    ) -> Save | None:
        """Return the save with primary key ``id`` owned by ``user_id``, or None."""
        stmt = select(Save).where(Save.user_id == user_id, Save.id == id)
        return session.scalar(stmt.limit(1))

    @begin_session
    def get_save_by_filename(
        self,
        user_id: int,
        rom_id: int,
        file_name: str,
        slot: str | None = None,
        session: Session = None,  # type: ignore
    ) -> Save | None:
        """Return one save matching (rom, user, file name), or None.

        When ``slot`` is given the match is additionally restricted to that
        slot; when it is None no slot condition is applied at all.
        """
        stmt = select(Save).where(
            Save.rom_id == rom_id,
            Save.user_id == user_id,
            Save.file_name == file_name,
        )
        if slot is not None:
            stmt = stmt.where(Save.slot == slot)
        stmt = stmt.limit(1)
        return session.scalars(stmt).first()

    @begin_session
    def get_save_by_content_hash(
        self,
        user_id: int,
        rom_id: int,
        content_hash: str,
        slot: str | None = None,
        session: Session = None,  # type: ignore
    ) -> Save | None:
        """Return one save matching (rom, user, content hash), or None.

        When ``slot`` is given the match is additionally restricted to that
        slot; when it is None no slot condition is applied at all.
        """
        stmt = select(Save).where(
            Save.rom_id == rom_id,
            Save.user_id == user_id,
            Save.content_hash == content_hash,
        )
        if slot is not None:
            stmt = stmt.where(Save.slot == slot)
        stmt = stmt.limit(1)
        return session.scalar(stmt)

    @begin_session
    def get_saves(
        self,
        user_id: int,
        rom_id: int | None = None,
        platform_id: int | None = None,
        slot: str | None = None,
        order_by: Literal["updated_at", "created_at"] | None = None,
        order_dir: Literal["asc", "desc"] = "desc",
        only_fields: Sequence[QueryableAttribute] | None = None,
        session: Session = None,  # type: ignore
    ) -> Sequence[Save]:
        """List a user's saves, optionally narrowed, sorted and column-limited.

        ``rom_id`` and ``platform_id`` are applied only when truthy; ``slot``
        is applied whenever it is not None. ``order_by`` names the ``Save``
        attribute to sort on, ascending only if ``order_dir`` is ``"asc"``.
        ``only_fields`` restricts the loaded columns via ``load_only``.
        """
        stmt = select(Save).where(Save.user_id == user_id)

        if rom_id:
            stmt = stmt.where(Save.rom_id == rom_id)

        if platform_id:
            # The platform lives on the ROM, so the filter needs a join.
            stmt = stmt.join(Rom, Save.rom_id == Rom.id)
            stmt = stmt.where(Rom.platform_id == platform_id)

        if slot is not None:
            stmt = stmt.where(Save.slot == slot)

        if order_by:
            sort_column = getattr(Save, order_by)
            if order_dir == "asc":
                stmt = stmt.order_by(asc(sort_column))
            else:
                stmt = stmt.order_by(desc(sort_column))

        if only_fields:
            stmt = stmt.options(load_only(*only_fields))

        return session.scalars(stmt).all()

    @begin_session
    def update_save(
        self,
        id: int,
        data: dict,
        session: Session = None,  # type: ignore
    ) -> Save:
        """Apply ``data`` as column values to the save ``id`` and return the row.

        The trailing ``.one()`` raises if no row (or more than one) has that id.
        """
        stmt = update(Save).where(Save.id == id).values(**data)
        session.execute(stmt.execution_options(synchronize_session="evaluate"))

        refreshed = session.query(Save).filter_by(id=id)
        return refreshed.one()

    @begin_session
    def delete_save(
        self,
        id: int,
        session: Session = None,  # type: ignore
    ) -> None:
        """Delete the save row whose primary key is ``id``."""
        stmt = delete(Save).where(Save.id == id)
        session.execute(stmt.execution_options(synchronize_session="evaluate"))

    @begin_session
    def mark_missing_saves(
        self,
        rom_id: int,
        user_id: int,
        saves_to_keep: list[str],
        session: Session = None,  # type: ignore
    ) -> Sequence[Save]:
        """Flag saves not listed in ``saves_to_keep`` as missing from the filesystem.

        Affects the saves of (``rom_id``, ``user_id``) whose ``file_name`` is
        not in ``saves_to_keep``. Returns those rows as selected *before* the
        UPDATE statement is issued.
        """
        # One predicate shared by the SELECT and the UPDATE so both target
        # exactly the same rows.
        is_missing = and_(
            Save.rom_id == rom_id,
            Save.user_id == user_id,
            Save.file_name.not_in(saves_to_keep),
        )

        missing_saves = session.scalars(select(Save).filter(is_missing)).all()

        flag_stmt = (
            update(Save)
            .where(is_missing)
            .values(**{"missing_from_fs": True})
            .execution_options(synchronize_session="evaluate")
        )
        session.execute(flag_stmt)

        return missing_saves

    @begin_session
    def get_saves_summary(
        self,
        user_id: int,
        rom_id: int,
        session: Session = None,  # type: ignore
    ) -> dict:
        """Summarise a user's saves for one ROM, grouped by slot.

        Returns ``{"total_count": int, "slots": [...]}`` where each slot entry
        holds the slot key (possibly None), the number of saves in it, and its
        most recently updated save.
        """
        stmt = (
            select(Save)
            .where(Save.user_id == user_id, Save.rom_id == rom_id)
            .order_by(desc(Save.updated_at))
        )
        saves = session.scalars(stmt).all()

        per_slot: dict[str | None, dict] = {}
        for save in saves:
            # Rows arrive newest-first, so the first save seen for a slot is
            # that slot's latest one.
            entry = per_slot.setdefault(
                save.slot, {"slot": save.slot, "count": 0, "latest": save}
            )
            entry["count"] += 1

        return {
            "total_count": len(saves),
            "slots": list(per_slot.values()),
        }

    @begin_session
    def count_saves_missing_content_hash(
        self,
        session: Session = None,  # type: ignore
    ) -> int:
        """Count Save rows whose content_hash is NULL.

        Used at startup to decide whether the one-shot recompute task needs
        to be enqueued. A None scalar result is reported as 0.
        """
        stmt = select(func.count(Save.id)).where(Save.content_hash.is_(None))
        missing = session.scalar(stmt)
        return missing or 0

    @begin_session
    def get_all_saves(
        self,
        session: Session = None,  # type: ignore
    ) -> Sequence[Save]:
        """Return every Save row across all users, ordered by ascending id.

        Used by the recompute_save_content_hashes maintenance task.
        """
        stmt = select(Save).order_by(asc(Save.id))
        return session.scalars(stmt).all()