From 6aca8fdfcf48768402209236a804d459083e6093 Mon Sep 17 00:00:00 2001 From: Alex Vanderveen Date: Tue, 7 Apr 2026 20:25:55 -0400 Subject: [PATCH 1/2] Parse Content-Type essence before validating resource downloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem _check_content_type used the full Content-Type header string (lowercased) and matched it with startswith(...) against allowed prefixes. That is mostly fine when the server sends a bare type like application/pdf. It breaks down when vendors send parameters on the same header (e.g. name="…", charset=…). In theory application/force-download; name="…" should still start with application/force-download, but in practice you can get: Leading whitespace or a UTF‑8 BOM before the type token, so the string no longer starts with your prefix even though the MIME type is correct. Confusing logs: logging only the lowercased full header is fine, but the decision should be based on the standardized MIME essence (type + subtype, no parameters), which is what other stacks use for “what is this?” So the fix is to parse the header the usual way and only then apply your allowlist. What changed _content_type_essence(header_value) Takes everything before the first ; (the essence). Strips whitespace, lowercases, strips a leading BOM (\ufeff) so odd clients/proxies don’t break the check. _check_content_type Reads the raw content-type header once. Runs startswith on the essence, not on the full header with parameters. Rejects if the essence is empty (missing or useless header). Logging uses the raw header string (or (missing header)), so operators still see exactly what the server sent. Call sites and allowed prefixes (image/, application/pdf, etc.) are unchanged; only how the string is normalized before comparison changes. Security / SSRF This does not replace URL / SSRF controls; it only makes post-fetch type checking consistent with how Content-Type is defined (essence vs parameters). You are not widening the allowlist—same prefixes, stricter handling of “empty” and clearer matching on the actual type token. Risk / regression Low: same allowed prefixes, strictly more tolerant of benign formatting (whitespace, BOM, parameters). The only stricter case is empty essence after strip (e.g. malformed header), which correctly fails the check. \\\\\\\\\\\\\\\\\\\\\\\\\\\\\ I have reviewed the proposal and these edits will handle cases where the string we match against for the content_type is cleaned up more before comparing against the allow list of content_types. I have tested this, and confirm that I do not get any errors loading PDFs for game manuals using this. Please consider this, as this should be compatible with the existing content type allowlist, and easily work with any new types added to it. --- backend/handler/filesystem/resources_handler.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/backend/handler/filesystem/resources_handler.py b/backend/handler/filesystem/resources_handler.py index 8df2c27df..23ce23be3 100644 --- a/backend/handler/filesystem/resources_handler.py +++ b/backend/handler/filesystem/resources_handler.py @@ -20,12 +20,23 @@ from utils.validation import validate_url_for_http_request from .base_handler import CoverSize, FSHandler +def _content_type_essence(header_value: str) -> str: + """Return the MIME type token (before parameters), lowercased.""" + if not header_value: + return "" + essence = header_value.split(";", 1)[0].strip().lower() + return essence.lstrip("\ufeff").strip() + + def _check_content_type( response: httpx.Response, allowed_prefixes: tuple[str, ...], label: str ) -> bool: - content_type = response.headers.get("content-type", "").lower() - if not any(content_type.startswith(p) for p in allowed_prefixes): - log.warning(f"Unexpected content type for {label}: {content_type}") + raw = response.headers.get("content-type", "") + essence = _content_type_essence(raw) + if not essence or not any(essence.startswith(p) for p in allowed_prefixes): + log.warning( + f"Unexpected content type for {label}: {raw or '(missing header)'}", + ) return False return True From f227a3145d8cb1f078998abe477be1cb353bdf23 Mon Sep 17 00:00:00 2001 From: Georges-Antoine Assi Date: Tue, 7 Apr 2026 22:32:40 -0400 Subject: [PATCH 2/2] changes from bot review --- .../handler/filesystem/resources_handler.py | 6 +- .../filesystem/test_resources_handler.py | 138 +++++++++++++++++- 2 files changed, 141 insertions(+), 3 deletions(-) diff --git a/backend/handler/filesystem/resources_handler.py b/backend/handler/filesystem/resources_handler.py index 23ce23be3..d92f2b003 100644 --- a/backend/handler/filesystem/resources_handler.py +++ b/backend/handler/filesystem/resources_handler.py @@ -24,8 +24,10 @@ def _content_type_essence(header_value: str) -> str: """Return the MIME type token (before parameters), lowercased.""" if not header_value: return "" - essence = header_value.split(";", 1)[0].strip().lower() - return essence.lstrip("\ufeff").strip() + + return ( + header_value.split(";", 1)[0].strip().lower().lstrip("\ufeff") + ) # Remove BOM if present def _check_content_type( diff --git a/backend/tests/handler/filesystem/test_resources_handler.py b/backend/tests/handler/filesystem/test_resources_handler.py index b7bdd8f2f..00224c9f0 100644 --- a/backend/tests/handler/filesystem/test_resources_handler.py +++ b/backend/tests/handler/filesystem/test_resources_handler.py @@ -2,15 +2,151 @@ import os from pathlib import Path from unittest.mock import Mock, patch +import httpx import pytest from config import RESOURCES_BASE_PATH from handler.filesystem.base_handler import CoverSize -from handler.filesystem.resources_handler import FSResourcesHandler +from handler.filesystem.resources_handler import ( + FSResourcesHandler, + _check_content_type, + _content_type_essence, +) from models.collection import Collection from models.rom import Rom +class TestContentTypeEssence: + """Tests for the _content_type_essence helper.""" + + def test_simple_mime_type(self): + assert _content_type_essence("image/png") == "image/png" + + def test_with_charset_parameter(self): + assert _content_type_essence("text/html; charset=utf-8") == "text/html" + + def test_with_multiple_parameters(self): + assert ( + _content_type_essence("text/html; charset=utf-8; boundary=something") + == "text/html" + ) + + def test_leading_whitespace(self): + assert _content_type_essence(" image/jpeg") == "image/jpeg" + + def test_trailing_whitespace(self): + assert _content_type_essence("image/jpeg ") == "image/jpeg" + + def test_whitespace_around_semicolon(self): + assert _content_type_essence("image/jpeg ; charset=utf-8") == "image/jpeg" + + def test_uppercase_normalized_to_lower(self): + assert _content_type_essence("Image/PNG") == "image/png" + + def test_mixed_case_with_params(self): + assert ( + _content_type_essence("Application/PDF; charset=utf-8") == "application/pdf" + ) + + def test_utf8_bom_prefix(self): + assert _content_type_essence("\ufeffimage/png") == "image/png" + + def test_utf8_bom_with_params(self): + assert ( + _content_type_essence("\ufeffapplication/pdf; charset=utf-8") + == "application/pdf" + ) + + def test_empty_string(self): + assert _content_type_essence("") == "" + + def test_none_like_empty(self): + """Falsy input returns empty string.""" + assert _content_type_essence("") == "" + + def test_only_whitespace(self): + assert _content_type_essence(" ") == "" + + def test_octet_stream(self): + assert ( + _content_type_essence("application/octet-stream") + == "application/octet-stream" + ) + + def test_force_download(self): + assert ( + _content_type_essence("application/force-download") + == "application/force-download" + ) + + +class TestCheckContentType: + """Tests for the _check_content_type helper.""" + + @staticmethod + def _make_response(content_type: str | None) -> httpx.Response: + headers = {} + if content_type is not None: + headers["content-type"] = content_type + return httpx.Response(200, headers=headers) + + def test_valid_image_prefix(self): + resp = self._make_response("image/png") + assert _check_content_type(resp, ("image/",), "cover") is True + + def test_valid_image_with_charset(self): + resp = self._make_response("image/jpeg; charset=utf-8") + assert _check_content_type(resp, ("image/",), "cover") is True + + def test_valid_pdf(self): + resp = self._make_response("application/pdf") + assert ( + _check_content_type( + resp, + ("application/pdf", "application/octet-stream"), + "manual", + ) + is True + ) + + def test_valid_octet_stream(self): + resp = self._make_response("application/octet-stream") + assert ( + _check_content_type( + resp, + ("application/pdf", "application/octet-stream"), + "manual", + ) + is True + ) + + def test_wrong_content_type(self): + resp = self._make_response("text/html") + assert _check_content_type(resp, ("image/",), "cover") is False + + def test_missing_header(self): + resp = self._make_response(None) + assert _check_content_type(resp, ("image/",), "cover") is False + + def test_empty_header(self): + resp = self._make_response("") + assert _check_content_type(resp, ("image/",), "cover") is False + + def test_leading_whitespace_still_matches(self): + resp = self._make_response(" image/png") + assert _check_content_type(resp, ("image/",), "cover") is True + + def test_bom_still_matches(self): + # httpx rejects non-ASCII header values, so mock the response + resp = Mock(spec=httpx.Response) + resp.headers = {"content-type": "\ufeffimage/png"} + assert _check_content_type(resp, ("image/",), "cover") is True + + def test_case_insensitive(self): + resp = self._make_response("Image/PNG") + assert _check_content_type(resp, ("image/",), "cover") is True + + class TestFSResourcesHandler: """Test suite for FSResourcesHandler class"""