Files
romm/backend/handler/filesystem/assets_handler.py
nendo edb5d15420 Fix save-sync hash drift, archival save leak, and dedupe scoping
Cleanup pass on save-sync addressing three independent failure modes
that interact in production data: content_hash drift between client
and server, null-slot archival saves leaking into sync flows, and
content-hash dedupe collapsing legitimately-distinct slots.

Bug fixes
- compute_content_hash dispatched on zipfile.is_zipfile(relative_path),
  which silently returned False whenever the process's CWD wasn't
  ASSETS_BASE_PATH. Every zip save fell through to the raw-MD5 branch,
  persisting hashes that disagreed with clients computing the intended
  per-entry zip-hash. Resolve to a full path before the dispatch.
- _build_negotiate_plan, sync_push_pull_task, and sync_watcher all
  treated null-slot saves as sync-eligible. Null-slot saves represent
  web-UI / archival uploads; including them in negotiate plans matched
  them against device pushes by filename and overwrote archival data.
  Filter null-slot saves at all three call sites.
- get_save_by_content_hash matched on (rom_id, user_id, content_hash)
  only, so identical bytes uploaded to different slots collapsed into
  one record. Scope the lookup by slot when provided so clone-save-
  to-new-slot creates a distinct row per slot.
- get_save_by_filename matched on (rom_id, user_id, file_name) only.
  When two uploads to different slots happened in the same wall-clock
  second (the datetime tag is per-second), the second upload UPDATED
  the first record's slot instead of creating a distinct row. Scope
  the filename lookup by slot too.

One-shot recovery
- New recompute_save_content_hashes manual task walks every Save row,
  recomputes via the fixed dispatch, and updates rows whose values
  differ. Idempotent; safe to re-run.
- Backend startup runs a COUNT(content_hash IS NULL) query and, if
  any rows exist, enqueues the recompute task on the low-priority
  RQ queue. The API process moves on; the worker handles the
  recompute out-of-band. Subsequent restarts find zero NULL hashes
  and skip. Admins can also trigger the task manually.

Test infrastructure
- Added tests/_zipfile_shim.reload_zipfile() mirroring the pattern
  from utils/zip_cache.py for the same zipfile-inflate64 + CPython
  3.13.5 incompatibility. Test fixtures that build ZIPs call it
  immediately before opening the archive.
2026-05-29 17:00:01 +09:00

154 lines
5.2 KiB
Python

import hashlib
import os
import threading
import zipfile
from typing import Final
import magic
from fastapi import HTTPException, UploadFile, status
from config import ASSETS_BASE_PATH
from logger.logger import log
from models.user import User
from .base_handler import FSHandler
# Image MIME types we trust to (a) accept as avatar uploads and (b) serve
# inline from the raw asset endpoint
SAFE_IMAGE_MIME_TYPES: Final[dict[str, str]] = {
"image/png": "png",
"image/jpeg": "jpg",
"image/webp": "webp",
"image/gif": "gif",
}
# libmagic loads its database on construction (~few MB read from disk), so we
# share a single Magic instance across requests. The underlying magic_t handle
# is not thread-safe, so guard from_buffer with a lock. Endpoints that call
# this validator may execute in worker threads under sync routes.
_MIME_DETECTOR = magic.Magic(mime=True)
_MIME_DETECTOR_LOCK = threading.Lock()
def validate_image_upload(upload: UploadFile, *, label: str = "Image") -> str:
"""Validate that an uploaded file is one of the safe image types.
Sniffs the leading bytes with libmagic and returns the trusted extension
matching the detected MIME type. Raises HTTPException(400) if the file
is not a recognized PNG/JPEG/WebP/GIF, or if MIME sniffing fails.
Leaves the file cursor at 0.
"""
upload.file.seek(0)
header = upload.file.read(4096)
upload.file.seek(0)
try:
with _MIME_DETECTOR_LOCK:
detected_mime = _MIME_DETECTOR.from_buffer(header)
except magic.MagicException as exc:
log.error(f"libmagic failed to sniff uploaded {label.lower()}: {exc}")
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Could not determine {label.lower()} file type",
) from exc
safe_extension = SAFE_IMAGE_MIME_TYPES.get(detected_mime)
if not safe_extension:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=(
f"{label} must be a PNG, JPEG, WebP, or GIF image "
f"(detected {detected_mime or 'unknown type'})"
),
)
return safe_extension
class FSAssetsHandler(FSHandler):
def __init__(self) -> None:
super().__init__(base_path=ASSETS_BASE_PATH)
def user_folder_path(self, user: User):
return os.path.join("users", user.fs_safe_folder_name)
# /users/557365723a31/profile
def build_avatar_path(self, user: User):
return os.path.join(self.user_folder_path(user), "profile")
def _build_asset_file_path(
self,
user: User,
folder: str,
platform_fs_slug: str,
rom_id: int,
emulator: str | None = None,
):
user_folder_path = self.user_folder_path(user)
assets_path = os.path.join(
user_folder_path, folder, platform_fs_slug, str(rom_id)
)
if emulator:
assets_path = os.path.join(assets_path, emulator)
return assets_path
# /users/557365723a31/saves/n64/{rom.id}/mupen64plus/
def build_saves_file_path(
self,
user: User,
platform_fs_slug: str,
rom_id: int,
emulator: str | None = None,
):
return self._build_asset_file_path(
user, "saves", platform_fs_slug, rom_id, emulator
)
# /users/557365723a31/states/n64/{rom.id}/mupen64plus
def build_states_file_path(
self,
user: User,
platform_fs_slug: str,
rom_id: int,
emulator: str | None = None,
):
return self._build_asset_file_path(
user, "states", platform_fs_slug, rom_id, emulator
)
# /users/557365723a31/screenshots/{rom.id}/n64
def build_screenshots_file_path(
self, user: User, platform_fs_slug: str, rom_id: int
):
return self._build_asset_file_path(
user, "screenshots", platform_fs_slug, rom_id
)
async def _compute_file_hash(self, file_path: str) -> str:
hash_obj = hashlib.md5(usedforsecurity=False)
async with await self.stream_file(file_path=file_path) as f:
while chunk := await f.read(8192):
hash_obj.update(chunk)
return hash_obj.hexdigest()
async def _compute_zip_hash(self, zip_path: str) -> str:
with zipfile.ZipFile(f"{self.base_path}/{zip_path}", "r") as zf:
file_hashes = []
for name in sorted(zf.namelist()):
if not name.endswith("/"):
content = zf.read(name)
file_hash = hashlib.md5(content, usedforsecurity=False).hexdigest()
file_hashes.append(f"{name}:{file_hash}")
combined = "\n".join(file_hashes)
return hashlib.md5(combined.encode(), usedforsecurity=False).hexdigest()
async def compute_content_hash(self, file_path: str) -> str | None:
try:
full_path = f"{self.base_path}/{file_path}"
if zipfile.is_zipfile(full_path):
return await self._compute_zip_hash(file_path)
return await self._compute_file_hash(file_path)
except Exception as e:
log.debug(f"Failed to compute content hash for {file_path}: {e}")
return None