add options to normalization

This commit is contained in:
Georges-Antoine Assi
2025-07-12 10:48:04 -04:00
parent c76caed145
commit d349eef42a
5 changed files with 23 additions and 14 deletions

View File

@@ -52,7 +52,7 @@ PS2_SERIAL_INDEX_KEY: Final = "romm:ps2_serial_index"
PSP_SERIAL_INDEX_KEY: Final = "romm:psp_serial_index"
LEADING_ARTICLE_PATTERN = re.compile(r"^(a|an|the)\b")
COMMA_ARTICLE_PATTERN = re.compile(r",\b(a|an|the)\b")
COMMA_ARTICLE_PATTERN = re.compile(r",\s(a|an|the)\b$")
NON_WORD_SPACE_PATTERN = re.compile(r"[^\w\s]")
MULTIPLE_SPACE_PATTERN = re.compile(r"\s+")
@@ -61,24 +61,28 @@ CHAR_REMOVAL_TABLE = str.maketrans("_'\"", " ")
# This caches results to avoid repeated normalization of the same search term
@lru_cache(maxsize=1024)
def _normalize_search_term(name: str) -> str:
def _normalize_search_term(
name: str, remove_articles: bool = True, remove_punctuation: bool = True
) -> str:
# Single translate operation
name = name.lower().translate(CHAR_REMOVAL_TABLE)
# Remove articles (combined if possible)
name = LEADING_ARTICLE_PATTERN.sub("", name)
name = COMMA_ARTICLE_PATTERN.sub("", name)
if remove_articles:
name = LEADING_ARTICLE_PATTERN.sub("", name)
name = COMMA_ARTICLE_PATTERN.sub("", name)
# Remove punctuation and normalize spaces in one step
name = NON_WORD_SPACE_PATTERN.sub("", name)
name = MULTIPLE_SPACE_PATTERN.sub(" ", name).strip()
if remove_punctuation:
name = NON_WORD_SPACE_PATTERN.sub("", name)
name = MULTIPLE_SPACE_PATTERN.sub(" ", name)
# Unicode normalization and accent removal
if any(ord(c) > 127 for c in name): # Only if non-ASCII chars present
normalized = unicodedata.normalize("NFD", name)
name = "".join(c for c in normalized if not unicodedata.combining(c))
return name
return name.strip()
class MetadataHandler:
@@ -93,8 +97,10 @@ class MetadataHandler:
def normalize_cover_url(self, url: str) -> str:
return url if not url else f"https:{url.replace('https:', '')}"
def normalize_search_term(self, name: str) -> str:
return _normalize_search_term(name)
def normalize_search_term(
self, name: str, remove_articles: bool = True, remove_punctuation: bool = True
) -> str:
return _normalize_search_term(name, remove_articles, remove_punctuation)
async def _ps2_opl_format(self, match: re.Match[str], search_term: str) -> str:
serial_code = match.group(1)

View File

@@ -507,7 +507,6 @@ class IGDBHandler(MetadataHandler):
rom = await self._search_rom(search_term, platform_igdb_id)
# IGDB search is fuzzy so no need to split the search term by special characters
if not rom:
return fallback_rom

View File

@@ -47,17 +47,18 @@ class SGDBBaseHandler(MetadataHandler):
return list(filter(None, results))
async def get_details_by_name(self, game_name: str) -> SGDBRom:
search_term = self.normalize_search_term(game_name)
search_term = self.normalize_search_term(game_name, remove_articles=False)
games = await self.sgdb_service.search_games(term=search_term)
if not games:
log.debug(f"Could not find '{search_term}' on SteamGridDB")
return SGDBRom(sgdb_id=None)
# SGDB search is fuzzy so no need to split the search term by special characters
for game in games:
game_name_lower = game["name"].lower()
game_name_normalized = self.normalize_search_term(game["name"])
game_name_normalized = self.normalize_search_term(
game["name"], remove_articles=False
)
if (
game_name_lower == search_term.lower()

View File

@@ -386,7 +386,10 @@ class SSHandler(MetadataHandler):
search_term = await self._mame_format(search_term)
fallback_rom = SSRom(ss_id=None, name=search_term)
normalized_search_term = self.normalize_search_term(search_term)
## SS API requires punctuation to match
normalized_search_term = self.normalize_search_term(
search_term, remove_punctuation=False
)
res = await self._search_rom(normalized_search_term, platform_ss_id)
# SS API doesn't handle some special characters well

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 39 KiB