feat(hashing): persist per-member hashes on archive RomFile

Internal members of multi-file archives (zip/tar/7z/rar) are now hashed
individually (crc/md5/sha1) and stored in a new `archive_members` JSON
column on the archive's RomFile, alongside the existing composite hash
used for hash-database matching. Only the archive itself is surfaced as
a RomFile so full_path keeps pointing at a file that exists on disk,
which is the constraint that previously forced us to choose between
composite-only hashing and broken downloads.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Georges-Antoine Assi
2026-05-28 09:41:04 -04:00
parent 9111f70d0a
commit 207d0dc4c6
6 changed files with 101 additions and 16 deletions

View File

@@ -0,0 +1,30 @@
"""Add archive_members JSON column to rom_files
Revision ID: 0081_add_archive_members
Revises: 0080_add_chd_sha1_hash
Create Date: 2026-05-28 00:00:00.000000
"""
import sqlalchemy as sa
from alembic import op
from utils.database import CustomJSON
# Revision identifiers, used by Alembic to order this migration: it applies
# directly after 0080_add_chd_sha1_hash and declares no branch labels or
# extra dependencies.
revision = "0081_add_archive_members"
down_revision = "0080_add_chd_sha1_hash"
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Add the nullable ``archive_members`` JSON column to ``rom_files``."""
    # Nullable: the column is only populated for multi-file archives, every
    # other RomFile row keeps it unset.
    archive_members_column = sa.Column(
        "archive_members", CustomJSON(), nullable=True
    )

    with op.batch_alter_table("rom_files", schema=None) as rom_files_table:
        # if_not_exists=True requests an "IF NOT EXISTS" guard on the ADD
        # COLUMN so that re-running against a schema that already has the
        # column is tolerated (see Alembic docs for backend support).
        rom_files_table.add_column(archive_members_column, if_not_exists=True)
def downgrade() -> None:
    """Remove the ``archive_members`` column from ``rom_files``."""
    with op.batch_alter_table("rom_files", schema=None) as rom_files_table:
        # if_exists=True mirrors the guard used in upgrade(): dropping is
        # requested only when the column is actually present.
        rom_files_table.drop_column("archive_members", if_exists=True)

View File

@@ -147,6 +147,14 @@ class RomUserSchema(BaseModel):
return rom_user_schema_factory()
class ArchiveMemberSchema(TypedDict):
    # Shape of one entry in RomFileSchema.archive_members: the metadata
    # recorded for a single file stored inside a multi-file archive.
    # Keys match the dicts built while hashing archive entries.
    name: str  # member name as yielded by the archive reader
    size: int  # member size as yielded by the archive reader
    crc_hash: str  # CRC32 of the member's bytes, hex-formatted
    md5_hash: str  # MD5 hex digest of the member's bytes
    sha1_hash: str  # SHA-1 hex digest of the member's bytes
class RomFileSchema(BaseModel):
model_config = ConfigDict(from_attributes=True)
@@ -164,6 +172,7 @@ class RomFileSchema(BaseModel):
sha1_hash: str | None
ra_hash: str | None
chd_sha1_hash: str | None
archive_members: list[ArchiveMemberSchema] | None
category: RomFileCategory | None

View File

@@ -265,6 +265,7 @@ class FSRomsHandler(FSHandler):
file_hash: FileHash,
file_size_bytes: int | None = None,
last_modified: float | None = None,
archive_members: list[dict[str, Any]] | None = None,
) -> RomFile:
abs_file_path = Path(self.base_path, rom_path, file_name)
@@ -298,6 +299,7 @@ class FSRomsHandler(FSHandler):
md5_hash=file_hash["md5_hash"],
sha1_hash=file_hash["sha1_hash"],
chd_sha1_hash=file_hash["chd_sha1_hash"],
archive_members=archive_members,
)
async def get_rom_files(
@@ -435,39 +437,54 @@ class FSRomsHandler(FSHandler):
# Multi-file archive: compute a composite hash across all
# internal entries (in ASCII path order) for hash-database
# matching, while still emitting a single RomFile for the
# archive file itself. Internal members are not surfaced as
# RomFile rows — only the archive file itself exists on disk,
# so emitting per-member RomFiles would produce full_paths that
# point nowhere and break downloads.
# archive file itself. Per-member hashes are stored on that
# RomFile in `archive_members` so consumers can identify each
# internal file without us inventing RomFile rows whose
# full_path would point inside the archive and break downloads.
assert rom_md5_h is not None and rom_sha1_h is not None
def _hash_archive_entries(
crc: int, md5_h: Any, sha1_h: Any
) -> tuple[bool, int]:
found = False
for _name, _size, chunks in ARCHIVE_READERS[rom_ext](
) -> tuple[list[dict[str, Any]], int]:
members: list[dict[str, Any]] = []
for name, size, chunks in ARCHIVE_READERS[rom_ext](
rom_dir,
DEFAULT_EXCLUDED_FILES,
DEFAULT_EXCLUDED_EXTENSIONS,
):
found = True
member_crc = 0
member_md5 = hashlib.md5(usedforsecurity=False)
member_sha1 = hashlib.sha1(usedforsecurity=False)
for chunk in chunks:
crc = binascii.crc32(chunk, crc)
md5_h.update(chunk)
sha1_h.update(chunk)
return found, crc
member_crc = binascii.crc32(chunk, member_crc)
member_md5.update(chunk)
member_sha1.update(chunk)
members.append(
{
"name": name,
"size": size,
"crc_hash": crc32_to_hex(member_crc),
"md5_hash": member_md5.hexdigest(),
"sha1_hash": member_sha1.hexdigest(),
}
)
return members, crc
found, rom_crc_c = await asyncio.to_thread(
members, rom_crc_c = await asyncio.to_thread(
_hash_archive_entries, rom_crc_c, rom_md5_h, rom_sha1_h
)
if found:
if members:
rom_files.append(
self._build_rom_file(
rom=rom,
rom_path=Path(rel_roms_path),
file_name=rom.fs_name,
file_hash=_make_file_hash(rom_crc_c, rom_md5_h, rom_sha1_h),
archive_members=members,
)
)
else:
@@ -475,7 +492,7 @@ class FSRomsHandler(FSHandler):
# file's raw bytes. We avoid `_calculate_rom_hashes` here because
# it would decompress based on extension and end up hashing the
# largest internal member, not the archive itself — and would
# crash on an empty zip.
# crash on an empty zip. `archive_members` stays None.
def _hash_raw_archive(crc: int) -> int:
for chunk in read_basic_file(rom_dir):
crc = binascii.crc32(chunk, crc)

View File

@@ -77,6 +77,13 @@ class RomFile(BaseModel):
sha1_hash: Mapped[str | None] = mapped_column(String(100))
ra_hash: Mapped[str | None] = mapped_column(String(100))
chd_sha1_hash: Mapped[str | None] = mapped_column(String(100))
# For multi-file archives (zip/tar/7z/rar): per-internal-member metadata
# ({"name", "size", "crc_hash", "md5_hash", "sha1_hash"}) so hash-database
# matching and the UI can reason about individual members without needing
# RomFile rows whose full_path would point inside the archive.
archive_members: Mapped[list[dict[str, Any]] | None] = mapped_column(
CustomJSON(), default=None, nullable=True
)
category: Mapped[RomFileCategory | None] = mapped_column(
Enum(RomFileCategory), default=None
)

View File

@@ -875,9 +875,28 @@ class TestFSRomsHandler:
)
# Only one RomFile (the archive itself) is surfaced, not one per member.
# Per-member hashes are stored on `archive_members`.
assert len(parsed.rom_files) == 1
assert parsed.rom_files[0].file_name == "game.zip"
assert parsed.rom_files[0].md5_hash == parsed.md5_hash
archive_rom_file = parsed.rom_files[0]
assert archive_rom_file.file_name == "game.zip"
assert archive_rom_file.md5_hash == parsed.md5_hash
# full_path resolves to a file that actually exists on disk
assert (Path(test_handler.base_path) / archive_rom_file.full_path).is_file()
assert archive_rom_file.archive_members is not None
# ASCII-sorted ordering, and each member has the right size + hashes
assert [m["name"] for m in archive_rom_file.archive_members] == sorted(contents)
for member in archive_rom_file.archive_members:
data = contents[member["name"]]
assert member["size"] == len(data)
assert (
member["md5_hash"]
== hashlib.md5(data, usedforsecurity=False).hexdigest()
)
assert (
member["sha1_hash"]
== hashlib.sha1(data, usedforsecurity=False).hexdigest()
)
@pytest.mark.asyncio
async def test_get_rom_files_zip_ordering_invariant(
@@ -963,6 +982,7 @@ class TestFSRomsHandler:
assert parsed.sha1_hash == hashlib.sha1(junk, usedforsecurity=False).hexdigest()
assert len(parsed.rom_files) == 1
assert parsed.rom_files[0].file_name == "fake.zip"
assert parsed.rom_files[0].archive_members is None
@pytest.mark.asyncio
async def test_get_rom_files_zip_with_only_excluded_entries_falls_back(
@@ -994,6 +1014,7 @@ class TestFSRomsHandler:
)
assert len(parsed.rom_files) == 1
assert parsed.rom_files[0].file_name == "only_excluded.zip"
assert parsed.rom_files[0].archive_members is None
@pytest.mark.asyncio
async def test_get_rom_files_empty_zip_falls_back_to_raw_bytes(
@@ -1024,6 +1045,7 @@ class TestFSRomsHandler:
)
assert len(parsed.rom_files) == 1
assert parsed.rom_files[0].file_name == "empty.zip"
assert parsed.rom_files[0].archive_members is None
@pytest.mark.asyncio
async def test_get_rom_files_with_non_v5_chd_fallback_to_std_hashing(

View File

@@ -55,7 +55,7 @@ def detect_mime_type(file_path: os.PathLike[str] | str) -> str:
try:
with _MIME_DETECTOR_LOCK:
return _MIME_DETECTOR.from_file(file_path)
except magic.MagicException:
except (OSError, magic.MagicException):
return ""
@@ -252,7 +252,7 @@ def read_tar_archive_files(
to the next entry, since the underlying file is closed at that point.
"""
try:
with tarfile.open(file_path, "r") as tf:
with tarfile.open(file_path, "r:*") as tf:
members = sorted(
(m for m in tf.getmembers() if m.isfile()),
key=lambda m: m.name,