mirror of
https://github.com/rommapp/romm.git
synced 2026-06-27 22:35:57 +00:00
feat(hashing): persist per-member hashes on archive RomFile
Internal members of multi-file archives (zip/tar/7z/rar) are now hashed individually (crc/md5/sha1) and stored in a new `archive_members` JSON column on the archive's RomFile, alongside the existing composite hash used for hash-database matching. Only the archive itself is surfaced as a RomFile, so full_path keeps pointing at a file that exists on disk; that constraint is what previously forced us to choose between composite-only hashing and broken downloads. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
30
backend/alembic/versions/0081_add_archive_members.py
Normal file
30
backend/alembic/versions/0081_add_archive_members.py
Normal file
@@ -0,0 +1,30 @@
|
||||
"""Add archive_members JSON column to rom_files
|
||||
|
||||
Revision ID: 0081_add_archive_members
|
||||
Revises: 0080_add_chd_sha1_hash
|
||||
Create Date: 2026-05-28 00:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
import sqlalchemy as sa
|
||||
from alembic import op
|
||||
|
||||
from utils.database import CustomJSON
|
||||
|
||||
revision = "0081_add_archive_members"
|
||||
down_revision = "0080_add_chd_sha1_hash"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the nullable ``archive_members`` JSON column to ``rom_files``.

    The column is created through a batch operation on ``rom_files`` and is
    nullable, so existing rows need no backfill. ``if_not_exists=True`` is
    forwarded to Alembic so the add is requested only when the column is not
    already present.
    """
    with op.batch_alter_table("rom_files", schema=None) as batch_op:
        batch_op.add_column(
            sa.Column("archive_members", CustomJSON(), nullable=True),
            if_not_exists=True,
        )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the ``archive_members`` column from ``rom_files``.

    Mirrors ``upgrade``: the drop runs inside a batch operation on
    ``rom_files`` and passes ``if_exists=True`` so the drop is requested only
    when the column is actually present.
    """
    with op.batch_alter_table("rom_files", schema=None) as batch_op:
        batch_op.drop_column("archive_members", if_exists=True)
|
||||
@@ -147,6 +147,14 @@ class RomUserSchema(BaseModel):
|
||||
return rom_user_schema_factory()
|
||||
|
||||
|
||||
class ArchiveMemberSchema(TypedDict):
    """Metadata for one internal member of a multi-file archive.

    Each entry carries the member's name and size together with its
    individual CRC, MD5 and SHA-1 digests. All five keys are required.
    """

    # NOTE(review): this type is used as a field of a pydantic model; pydantic
    # documents that `typing_extensions.TypedDict` is required on Python
    # < 3.12 — confirm which TypedDict this module imports.

    # Member name as reported by the archive reader.
    name: str
    # Member size as reported by the archive reader (presumably bytes).
    size: int
    # Hex-encoded CRC32 of the member's contents.
    crc_hash: str
    # Hex-encoded MD5 digest of the member's contents.
    md5_hash: str
    # Hex-encoded SHA-1 digest of the member's contents.
    sha1_hash: str
|
||||
|
||||
|
||||
class RomFileSchema(BaseModel):
|
||||
model_config = ConfigDict(from_attributes=True)
|
||||
|
||||
@@ -164,6 +172,7 @@ class RomFileSchema(BaseModel):
|
||||
sha1_hash: str | None
|
||||
ra_hash: str | None
|
||||
chd_sha1_hash: str | None
|
||||
archive_members: list[ArchiveMemberSchema] | None
|
||||
category: RomFileCategory | None
|
||||
|
||||
|
||||
|
||||
@@ -265,6 +265,7 @@ class FSRomsHandler(FSHandler):
|
||||
file_hash: FileHash,
|
||||
file_size_bytes: int | None = None,
|
||||
last_modified: float | None = None,
|
||||
archive_members: list[dict[str, Any]] | None = None,
|
||||
) -> RomFile:
|
||||
abs_file_path = Path(self.base_path, rom_path, file_name)
|
||||
|
||||
@@ -298,6 +299,7 @@ class FSRomsHandler(FSHandler):
|
||||
md5_hash=file_hash["md5_hash"],
|
||||
sha1_hash=file_hash["sha1_hash"],
|
||||
chd_sha1_hash=file_hash["chd_sha1_hash"],
|
||||
archive_members=archive_members,
|
||||
)
|
||||
|
||||
async def get_rom_files(
|
||||
@@ -435,39 +437,54 @@ class FSRomsHandler(FSHandler):
|
||||
# Multi-file archive: compute a composite hash across all
|
||||
# internal entries (in ASCII path order) for hash-database
|
||||
# matching, while still emitting a single RomFile for the
|
||||
# archive file itself. Internal members are not surfaced as
|
||||
# RomFile rows — only the archive file itself exists on disk,
|
||||
# so emitting per-member RomFiles would produce full_paths that
|
||||
# point nowhere and break downloads.
|
||||
# archive file itself. Per-member hashes are stored on that
|
||||
# RomFile in `archive_members` so consumers can identify each
|
||||
# internal file without us inventing RomFile rows whose
|
||||
# full_path would point inside the archive and break downloads.
|
||||
assert rom_md5_h is not None and rom_sha1_h is not None
|
||||
|
||||
def _hash_archive_entries(
|
||||
crc: int, md5_h: Any, sha1_h: Any
|
||||
) -> tuple[bool, int]:
|
||||
found = False
|
||||
for _name, _size, chunks in ARCHIVE_READERS[rom_ext](
|
||||
) -> tuple[list[dict[str, Any]], int]:
|
||||
members: list[dict[str, Any]] = []
|
||||
for name, size, chunks in ARCHIVE_READERS[rom_ext](
|
||||
rom_dir,
|
||||
DEFAULT_EXCLUDED_FILES,
|
||||
DEFAULT_EXCLUDED_EXTENSIONS,
|
||||
):
|
||||
found = True
|
||||
member_crc = 0
|
||||
member_md5 = hashlib.md5(usedforsecurity=False)
|
||||
member_sha1 = hashlib.sha1(usedforsecurity=False)
|
||||
for chunk in chunks:
|
||||
crc = binascii.crc32(chunk, crc)
|
||||
md5_h.update(chunk)
|
||||
sha1_h.update(chunk)
|
||||
return found, crc
|
||||
member_crc = binascii.crc32(chunk, member_crc)
|
||||
member_md5.update(chunk)
|
||||
member_sha1.update(chunk)
|
||||
members.append(
|
||||
{
|
||||
"name": name,
|
||||
"size": size,
|
||||
"crc_hash": crc32_to_hex(member_crc),
|
||||
"md5_hash": member_md5.hexdigest(),
|
||||
"sha1_hash": member_sha1.hexdigest(),
|
||||
}
|
||||
)
|
||||
return members, crc
|
||||
|
||||
found, rom_crc_c = await asyncio.to_thread(
|
||||
members, rom_crc_c = await asyncio.to_thread(
|
||||
_hash_archive_entries, rom_crc_c, rom_md5_h, rom_sha1_h
|
||||
)
|
||||
|
||||
if found:
|
||||
if members:
|
||||
rom_files.append(
|
||||
self._build_rom_file(
|
||||
rom=rom,
|
||||
rom_path=Path(rel_roms_path),
|
||||
file_name=rom.fs_name,
|
||||
file_hash=_make_file_hash(rom_crc_c, rom_md5_h, rom_sha1_h),
|
||||
archive_members=members,
|
||||
)
|
||||
)
|
||||
else:
|
||||
@@ -475,7 +492,7 @@ class FSRomsHandler(FSHandler):
|
||||
# file's raw bytes. We avoid `_calculate_rom_hashes` here because
|
||||
# it would decompress based on extension and end up hashing the
|
||||
# largest internal member, not the archive itself — and would
|
||||
# crash on an empty zip.
|
||||
# crash on an empty zip. `archive_members` stays None.
|
||||
def _hash_raw_archive(crc: int) -> int:
|
||||
for chunk in read_basic_file(rom_dir):
|
||||
crc = binascii.crc32(chunk, crc)
|
||||
|
||||
@@ -77,6 +77,13 @@ class RomFile(BaseModel):
|
||||
sha1_hash: Mapped[str | None] = mapped_column(String(100))
|
||||
ra_hash: Mapped[str | None] = mapped_column(String(100))
|
||||
chd_sha1_hash: Mapped[str | None] = mapped_column(String(100))
|
||||
# For multi-file archives (zip/tar/7z/rar): per-internal-member metadata
|
||||
# ({"name", "size", "crc_hash", "md5_hash", "sha1_hash"}) so hash-database
|
||||
# matching and the UI can reason about individual members without needing
|
||||
# RomFile rows whose full_path would point inside the archive.
|
||||
archive_members: Mapped[list[dict[str, Any]] | None] = mapped_column(
|
||||
CustomJSON(), default=None, nullable=True
|
||||
)
|
||||
category: Mapped[RomFileCategory | None] = mapped_column(
|
||||
Enum(RomFileCategory), default=None
|
||||
)
|
||||
|
||||
@@ -875,9 +875,28 @@ class TestFSRomsHandler:
|
||||
)
|
||||
|
||||
# Only one RomFile (the archive itself) is surfaced, not one per member.
|
||||
# Per-member hashes are stored on `archive_members`.
|
||||
assert len(parsed.rom_files) == 1
|
||||
assert parsed.rom_files[0].file_name == "game.zip"
|
||||
assert parsed.rom_files[0].md5_hash == parsed.md5_hash
|
||||
archive_rom_file = parsed.rom_files[0]
|
||||
assert archive_rom_file.file_name == "game.zip"
|
||||
assert archive_rom_file.md5_hash == parsed.md5_hash
|
||||
# full_path resolves to a file that actually exists on disk
|
||||
assert (Path(test_handler.base_path) / archive_rom_file.full_path).is_file()
|
||||
|
||||
assert archive_rom_file.archive_members is not None
|
||||
# ASCII-sorted ordering, and each member has the right size + hashes
|
||||
assert [m["name"] for m in archive_rom_file.archive_members] == sorted(contents)
|
||||
for member in archive_rom_file.archive_members:
|
||||
data = contents[member["name"]]
|
||||
assert member["size"] == len(data)
|
||||
assert (
|
||||
member["md5_hash"]
|
||||
== hashlib.md5(data, usedforsecurity=False).hexdigest()
|
||||
)
|
||||
assert (
|
||||
member["sha1_hash"]
|
||||
== hashlib.sha1(data, usedforsecurity=False).hexdigest()
|
||||
)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_rom_files_zip_ordering_invariant(
|
||||
@@ -963,6 +982,7 @@ class TestFSRomsHandler:
|
||||
assert parsed.sha1_hash == hashlib.sha1(junk, usedforsecurity=False).hexdigest()
|
||||
assert len(parsed.rom_files) == 1
|
||||
assert parsed.rom_files[0].file_name == "fake.zip"
|
||||
assert parsed.rom_files[0].archive_members is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_rom_files_zip_with_only_excluded_entries_falls_back(
|
||||
@@ -994,6 +1014,7 @@ class TestFSRomsHandler:
|
||||
)
|
||||
assert len(parsed.rom_files) == 1
|
||||
assert parsed.rom_files[0].file_name == "only_excluded.zip"
|
||||
assert parsed.rom_files[0].archive_members is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_rom_files_empty_zip_falls_back_to_raw_bytes(
|
||||
@@ -1024,6 +1045,7 @@ class TestFSRomsHandler:
|
||||
)
|
||||
assert len(parsed.rom_files) == 1
|
||||
assert parsed.rom_files[0].file_name == "empty.zip"
|
||||
assert parsed.rom_files[0].archive_members is None
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_get_rom_files_with_non_v5_chd_fallback_to_std_hashing(
|
||||
|
||||
@@ -55,7 +55,7 @@ def detect_mime_type(file_path: os.PathLike[str] | str) -> str:
|
||||
try:
|
||||
with _MIME_DETECTOR_LOCK:
|
||||
return _MIME_DETECTOR.from_file(file_path)
|
||||
except magic.MagicException:
|
||||
except (OSError, magic.MagicException):
|
||||
return ""
|
||||
|
||||
|
||||
@@ -252,7 +252,7 @@ def read_tar_archive_files(
|
||||
to the next entry, since the underlying file is closed at that point.
|
||||
"""
|
||||
try:
|
||||
with tarfile.open(file_path, "r") as tf:
|
||||
with tarfile.open(file_path, "r:*") as tf:
|
||||
members = sorted(
|
||||
(m for m in tf.getmembers() if m.isfile()),
|
||||
key=lambda m: m.name,
|
||||
|
||||
Reference in New Issue
Block a user