From d349eef42a147e2ff8fc24b5a4e9dcb87f292687 Mon Sep 17 00:00:00 2001 From: Georges-Antoine Assi Date: Sat, 12 Jul 2025 10:48:04 -0400 Subject: [PATCH] add options to normalization --- backend/handler/metadata/base_hander.py | 24 ++++++++++++++--------- backend/handler/metadata/igdb_handler.py | 1 - backend/handler/metadata/sgdb_handler.py | 7 ++++--- backend/handler/metadata/ss_handler.py | 5 ++++- frontend/assets/scrappers/sgdb.png | Bin 16885 -> 40377 bytes 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/backend/handler/metadata/base_hander.py b/backend/handler/metadata/base_hander.py index 9fe5602b6..5c6552a36 100644 --- a/backend/handler/metadata/base_hander.py +++ b/backend/handler/metadata/base_hander.py @@ -52,7 +52,7 @@ PS2_SERIAL_INDEX_KEY: Final = "romm:ps2_serial_index" PSP_SERIAL_INDEX_KEY: Final = "romm:psp_serial_index" LEADING_ARTICLE_PATTERN = re.compile(r"^(a|an|the)\b") -COMMA_ARTICLE_PATTERN = re.compile(r",\b(a|an|the)\b") +COMMA_ARTICLE_PATTERN = re.compile(r",\s(a|an|the)\b$") NON_WORD_SPACE_PATTERN = re.compile(r"[^\w\s]") MULTIPLE_SPACE_PATTERN = re.compile(r"\s+") @@ -61,24 +61,28 @@ CHAR_REMOVAL_TABLE = str.maketrans("_'\"", " ") # This caches results to avoid repeated normalization of the same search term @lru_cache(maxsize=1024) -def _normalize_search_term(name: str) -> str: +def _normalize_search_term( + name: str, remove_articles: bool = True, remove_punctuation: bool = True +) -> str: # Single translate operation name = name.lower().translate(CHAR_REMOVAL_TABLE) # Remove articles (combined if possible) - name = LEADING_ARTICLE_PATTERN.sub("", name) - name = COMMA_ARTICLE_PATTERN.sub("", name) + if remove_articles: + name = LEADING_ARTICLE_PATTERN.sub("", name) + name = COMMA_ARTICLE_PATTERN.sub("", name) # Remove punctuation and normalize spaces in one step - name = NON_WORD_SPACE_PATTERN.sub("", name) - name = MULTIPLE_SPACE_PATTERN.sub(" ", name).strip() + if remove_punctuation: + name = NON_WORD_SPACE_PATTERN.sub("", name) + name = MULTIPLE_SPACE_PATTERN.sub(" ", name) # Unicode normalization and accent removal if any(ord(c) > 127 for c in name): # Only if non-ASCII chars present normalized = unicodedata.normalize("NFD", name) name = "".join(c for c in normalized if not unicodedata.combining(c)) - return name + return name.strip() class MetadataHandler: @@ -93,8 +97,10 @@ class MetadataHandler: def normalize_cover_url(self, url: str) -> str: return url if not url else f"https:{url.replace('https:', '')}" - def normalize_search_term(self, name: str) -> str: - return _normalize_search_term(name) + def normalize_search_term( + self, name: str, remove_articles: bool = True, remove_punctuation: bool = True + ) -> str: + return _normalize_search_term(name, remove_articles, remove_punctuation) async def _ps2_opl_format(self, match: re.Match[str], search_term: str) -> str: serial_code = match.group(1) diff --git a/backend/handler/metadata/igdb_handler.py b/backend/handler/metadata/igdb_handler.py index e8773fe3c..1e744089c 100644 --- a/backend/handler/metadata/igdb_handler.py +++ b/backend/handler/metadata/igdb_handler.py @@ -507,7 +507,6 @@ class IGDBHandler(MetadataHandler): rom = await self._search_rom(search_term, platform_igdb_id) # IGDB search is fuzzy so no need to split the search term by special characters - if not rom: return fallback_rom diff --git a/backend/handler/metadata/sgdb_handler.py b/backend/handler/metadata/sgdb_handler.py index e504b145f..775482abd 100644 --- a/backend/handler/metadata/sgdb_handler.py +++ b/backend/handler/metadata/sgdb_handler.py @@ -47,17 +47,18 @@ class SGDBBaseHandler(MetadataHandler): return list(filter(None, results)) async def get_details_by_name(self, game_name: str) -> SGDBRom: - search_term = self.normalize_search_term(game_name) + search_term = self.normalize_search_term(game_name, remove_articles=False) games = await self.sgdb_service.search_games(term=search_term) if not games: log.debug(f"Could not find '{search_term}' on SteamGridDB") return SGDBRom(sgdb_id=None) # SGDB search is fuzzy so no need to split the search term by special characters - for game in games: game_name_lower = game["name"].lower() - game_name_normalized = self.normalize_search_term(game["name"]) + game_name_normalized = self.normalize_search_term( + game["name"], remove_articles=False + ) if ( game_name_lower == search_term.lower() diff --git a/backend/handler/metadata/ss_handler.py b/backend/handler/metadata/ss_handler.py index b5dc17416..10c838e84 100644 --- a/backend/handler/metadata/ss_handler.py +++ b/backend/handler/metadata/ss_handler.py @@ -386,7 +386,10 @@ class SSHandler(MetadataHandler): search_term = await self._mame_format(search_term) fallback_rom = SSRom(ss_id=None, name=search_term) - normalized_search_term = self.normalize_search_term(search_term) + ## SS API requires punctuation to match + normalized_search_term = self.normalize_search_term( + search_term, remove_punctuation=False + ) res = await self._search_rom(normalized_search_term, platform_ss_id) # SS API doesn't handle some special characters well diff --git a/frontend/assets/scrappers/sgdb.png b/frontend/assets/scrappers/sgdb.png index 21d61c346a3e0d26cf3013403d4ebcb48b6a3c18..57c7a14c8462ec75ae0f1463ab13922e9c8d32ae 100644 GIT binary patch literal 40377 zcmeG_30zIt|EHuv@)$-+yzC_uCMnB{2!*U=?OC#AiKxaf-9k|$8fB?uCrY#^S#F9@ z$e@kV@I$C%EQOZ;IrrXkJ3W>B^yWAI-ud`&&ppd`U(Rx$O_pQKnl@H%3;<|4a)jx4 z_)iJ|O(rVK!#5kBPId$!kQ!-fGSN9JKTE$U1%Se-2Kv_V4a;ryTKxU#3X_{7&w$Wm~AJGA46&L`h3s=BKM7@1Ulbz4s%S(9K(> z$sEU}Ym))hI~bX};J7ap!>`7{0GUtbI4&*2mvk^v^JFqfl9H^!!GIq&yMWA*x_7P4 z!AQlEAxX<5$?ChLg8|a=WHL#z3c5;nY%EhX0v^k)<%0 z#fsL8>|0w3nS6f)jw9gHmSaU7SD;eX~}q~*zE`)Sy8kR)q#Fh6Q`!7xY4%Zh3p zj4Xx0dMTZ?%jXV8N}deJBuPlJMh7EHVKA5KKZU8@!N^h=j^olX{LdZCzdD7P|4Ma5 zmb;rCxOMQ5#?J-Xt9Rt=_gHplRQP|$=Ty}jO}i$Yo^mK z>afb4q*&V`nR@Jmmfkm7w~cR0)#?Vk`9^*(TtbwObN zX7_s@iO!1|5a4iNwbiqmDQBimoVK`WgQxagou*_qdA@4$y_j<;r^YUoM$@f9vFXo|mo^rf5tm%R867V`zlU@;5i`4-V^g;Q5y1eRFypi!QD3Uu$d- zzCU_%EBDpMr)oS*d1*HDqRE~hAG?>C&ZVA{7Og6ajV@hcyL4db;j&{ldfnPQt~@q2 z*2OUTRtJsA6YW05E?*ZCec{21`%S}2t)DC|s~BK+$8AwrS=4KVQ=wLOFV5)y;lqo< z_fE(pUUj-yQPKv0Td`-$KR)?5Z7zJ5Z>rgTzYZ9*dBB>pab>OG8~V)vc&P7eHfhzl zW@r>F)C{A_YZYLKAD|mw&~D4Nfed{pe(@i@@f8W$yAiw;aR6QVK!qx60TpNX^Ou)6 zy`nUcCJnFbAB6fx{K2c&dvmBc81$>@$Sny6LT!5?*KehL%eI2u9_MBu6bf=;{IvUUV=E3Z8u1{lG=0B!TWFhf}+ zRG@@!ZTc$iZt=XNB4Mcp%Nad?2qe`S-w)YH*pjIqpeA5`fez+CGzVWv5kf3}-=+f% zO5BenxZk$UAXP68B0KMN6GcSf063l=zw59+OD>?N!zt)ZYFrh-0k}{Q2WRaWJZXwQ zL0(_j(12@4NbSJAQQs=~Qm+nO$7Zf(0l67DipNu>3!HQHF@EBwE#SmCRVKzbKy3K} z5#@^nn%12#R$@Ka6wFIY0B2U`$4QI=AY*1jW=h%w0umiC6#zQDHw#3y1H=!$U`p%X zoa_@s0(lcb&x(NM@+ap0`Xp&#$o|~h>Hr{t^uc~lG*Avqf87J?_fwbpwvE?>GBi;hjs=#0oEdl8^ueF&B*4n&|2u8!M`G+x{`_b$( zrd5R&cm*qsa#z^m*rQ4_sH*rpWTg1#ns^xb>TqhnSi)@`^B4nGzJT_E&<5pa$2U^Y z7c?-r^GHkR-X9AwRBv>3vL<625VMttCO_WX5Cz7()}$f==JEmz4uDD@UImGu z8<|)UEs@gCwd3zNP4JPcLt+%JGN6q;+?m{t(2j#aCt- za_yD~nFrQY+QAmm9-R$%iwWKXQN8Lo30srs7f6)P7HzC>OnXdK+^1L0eU^dk4d-CX zQA7(aNZ)XQK7s0WWg5njVc3JCn4Q!jh>%c&7orHcC!c~stWe$1v=6HI0jIz4)Np97 zFJce9_q=h8t5A8CaO0b1yUnQ`!_OW$4o&eA)01Hf9)r{%Cp6Zcs|W9f6vSMfUIo3F zpK$NQp8y?U%^NF2bh9)*^epAM2?rnpU3`N_H5(~Pj{`hc7-2Q5Dr>q&L>1k@U=fl} z1FIrgBH)$oGlGI_9q~|4B$xZ%Rmb8+5!80xL@+L6tXa$vzB_xnT7Jh%0uSIl+X+x`_&#(<6x zn);|`(-vqYB^-O0J)#Hf7re-&IAe*~t#RUUW5(Wktf!EoJtrdb!ubmjG5Q6R1hDo= z$0y=rVau>?1{OJbZ|8QCm8!$X~?l%WPsf+)>w!qw=$ylKI75Gt>s9q#uT zduA-*DZRs>-yY^{iefc)*Wj3n)a&xvDp1}}oqf0oB#6h;GI>*2boi^s$K%aLdhr1`+7uW z%qe5scmpy0Vxk|m!_V9p?#LMPxhkF%H!%SIz%wuaPV)dRV|`kapKa1OK~Kf>cJsI8 zR+G;hI@sAuJw5OIg!M%Ok)9$%mOPn?oxUClg}5Le`5^r;V-@9AU6I!khnlgihFg?Q z!zY0`VF}C7WMd|!acLo-n3O1Lj1r&%*Z45DMfYRWfQq&7Vnj}x!MeC(I<_q8FN`<> zgIKs~6bq1vD)IL_fQ?&v`#|R@Oy+fUnC65`eX?94|+^=NW;+}_gao-08nY5FNplu>*ILWc>Dhx z+GX_frg;*x6{SqU;<6l51Mx_2G+y)Zfr&d9BeXQZ1xf5m3Ge2ZsP46^PS%L0kese! zJ5Jh|Fp0ZKT3Ckt&p4U;1G%bx4!9?Z+xVH$&xlP4Rgx6kC@C3(-i5p#Nk%<925C;l z#xwkUlKQ@rcan&ObW)Dy{k{SPxt4*NpBDRc>?CkC-q&Vou!>G_yT%Fq6chv{sU!XJ z#W}}kTYZ|37cvqmj6steplvvzE3v53fd>!{|9HG1VC+!0`@ZI>7KQI)cMX}}2n60( zp#;z+pfw05OoI$<&$)!hSXRp0*5hiCR zIw3K~c&9AvQ*a#OmO5=xo3nswf8@i;kC1m%4#M}>gh9gKKu?`G6FHE(C(t<3Rju*t zZZQ|if6JL;(%y8G+kgoh;(d{-#+2fO4_1(0@$K4=;{>w|;+{WqCHOs3HC#~2@TxQ1 zOJYi`CWMC|J2ERNFT|GE0UTMu+suJNk$F$zQ{d-H40W{F%gV(gM*-?iwdx$ zCG-VPCz;~&?-6odfmqLsTNi=W`_xnK(rO!$B7o>RCBYkYD}x~FoX~;wXqHN zF(hnU={G0eKcRyXM}{0w4m-cDby2yqLbKqdrV)C_U2~WH`Dl=rob6fP-(R(C`fgHd zpaczr^6QPU7tfX83kHGEDDhG$<`bQL64?^&D9Kp-UXqT!<9V2337C2|*j3V3C4CB> zPJ%-v%bo|~CY8|xu$r>_6{vxlhvaLhgF#sjM~3e03CY3*g|n2&$3s_hFT@LQXY_&q zNDr6~(}fZ9;M>hz?#wg-BJ(NdlHm8R9~f-yU%WU8f*b7WANuOz;G7eIav*a2ZGjU6d;vVt8yk7KF8AGKXiV#SPNMA^eI5Z4~1nk}fiLOCgT_G}<8J3n4N| z?ZbT#F&EO0;i_&O^Ag-&QrYcIC*dXoWRnp$(TUGe#{#%NAyAgjC@UZ8>*v;`MT^Us z4r-`pw=P5qHiC$s{5~2~T`~`@p}R3U;9J-m^XMC=r8T-^GH$@HeueY-@c)ey<5>a1 z7nfWS6vE8ixH{375H<15F(8JZ5n+4qh?dojD6wq_N;0B`b_A|KQCv_yblv6L+?joM zbp(3&LvYFo`GAO8U|*s!G~N*m(>G4PYj=uVVjTgFYsSS{Xea-Nm!hdkB)n zVuA>@orK&KyHwgS8en#X94&4im`LL zzG!Y<$gGFy*A;)OFokT}cSCIyV z<@4Qk)zS^itS*Y?LYNFiYN@gm8)jx1VQ;YW29c_f@auv+L~#)0ePfZDaMO6>^q3tVZa{Q&$Lv9AHAa!Eb1wuZ3ye2g(iP1fQVC*{Hgy`?m3kK?(4ub_jj_ z%!N8m#p^BmzC^z|=ygqI;7poL{rh-&a2RqE*w_Z8A$in6cZ=`{)W2TQvX5AnG#pSO zm4u)&5BG)-QsV<7hvP#B2r z>}?$j;EAr9qB!H#!?$^&YtH*SGBX0gh0$c{TB|j%Ljm|7xoHM`z3L_=2-H*N^+#L< za1S&#kW*816P6TST+k*dD{)_{MQbH=n;5%p14gQKSZ5bi-nxN)iWnfmo){7E-8Kly zp?76<*)XFIq=dTH6bRVOrI<})**)c8@}!p@UdL!~ETu93Z+G? zBvc5OE+(DiP$B}Wk)}d=Np}JlJu;*D(Lhicv2%Wc8wSd5y)s;qJza&T)n2UYGrbMc zd04(HWwp=pmIFnH(H#V)yqN~K`mBP8wR7Q@l8_>+`}%0?g(!A|O>B^is=2$mnxlW2 zLRH?xtUm^YN7yC7I3WZHPanS3OWnK9)C>ZXIqNUvc$`SSl2O#E3aH*zP2(Ib6<|^? zI`OWT`h}G6P*?Mnrr3MLWHbR_C;6j+r2imXMoq~g+5H01|DW0ez^svOR-l&|fdJ)> zVEHBvJI_>kHo{uTXIV3y9Hw~}2;}7i(@MRbUhvn82ch2FBKF%iKM(?_FofrhW&$>h zCd>wG?#SQRMknl=+5%W-UvyGH)_C0cr&_Agbalw}EX)WS3vN z1wvkK`|%dMEIWy%emlnk5qc@|x%f`LV@s6E09jrRq_cn#uc4IX2Y=&J=fj)8pJs+XaJNI zL@MvwtLr|}edOzibC@sBs8)b>VRu?^p(ocDrA^}cIvlOlDxw`Dy=$s+c|(!MV1DPlQQQ}-A$zyDWnJR)O}bV2r~;I&(j zAOZ~r`;=Klx3+xw=&z-!TC82kHr`o2RFh~?**8MO3*KFc<84Y1rjQ*iGyQe<5Vt64@@n*d4U3t#Iwb!64bB-+b2iR&7GrF3wR1n`yng^1H3ohAf9y zo=IQuT`y`_vg_m?|0&0{;(cMZqN~t-|IzvL7J42&>i&np(GBw`N@d`ZnD#gNPqI#n zQo(40uL|tUC=r>7D3e(Q`dS1RVU-&pWH2I=nlGe9++_6*Bw`VlZUH-lf6B3bbC_>b zDC1S|chP#lMx{GRhkCtoTk^{)=H}(IB5eSEM|~~w-mPhw|JUlcX_j32#r(Kdsofmz zmN=P1k)-lmQ#}a3CZYQ?_Z@SSZ^DoEl@&|`ZR1NP%)IE4QY?xMG29K$_{GQJhkf*5 z@>(RxYl5RSWpE_)a6Xx%l}5u4OmYGKu9XUjgI}t*2_SA1)C(LayOso!)DR*Suq5-+ z#zX=t(!%Kj9Nvw@V*{aE!JKX_UcrcwqtoXM^CjdL`44C2b#p*|mKMsf5`xP^do67( zw*o%f6MH;$NP7M$t7fAV#IN=XqNoEZ^|h3(=ar-P4C79a0=WJ5wpr80m)n&3&0+CH zfB(2iA`Io*u2#(xl;CQ~kT9wrcuAubT&qP<4D6 zgMK9CYc>ct1*!K)$j$pdj_G$6Lq|^EI=yu5!zS*F?oDNvz4qt_Zh$!0+~wrxx!1%? zj?HATZOg1-)KDN!u|J=E+=cxRkMf;@9(IQGcV{DWTl^C~XjlpQ2|!#(HqzYxvkCgvA6+h-$!u43qM-CYzc6v>9>#-*-C)-fA&XmDf^fXz6+2 z%BQ{Q#~KZl*H3(R^i>b#NXwo@@B^EM>%GkxXQZ5xIM-QKOb3@99A#J11q)pts;M46FdzQ~ydD_2gW={HnR?Ah6 zu6>%bXxv$~J00%#zvBPc`NVJMGUC=fFqo~j`BLfHoih)e$=mmvYrFlQer;&f;-JMw zlRvvCyPaD$OQY?!4u&(GweC0i^l{T8lMv5s=dXKYI@sFApNO6PNf_NF(_iW4>5d~> zb()rCF-kwkENy}Oh^Cf*WfX5ni9WWepr?;rvnHGE8|%y|e)*d?r&Y`tyGaL~^PjuO z^*=dye9G=)J`UFFqweoc%<~*Ix!DQLGg_}siAINST|CjyF04dlrp{K6P0vosTXuha zw!7BBkLq@r?NU~0rh{V+KEsw!Z}02=I(NfW^@Rt1?HEO6t;}|`9p7;6+K{N8D>k<% z^L|;e)m#0~lLcl`sW!2r_bu&oBkt##lI=yzR_3QK%_Rd4bB`v{YW1E`2eORoRb zPIb|^{-OK>gnSnuPT46jJA-bep{>>S8M?_liBvsFVl{U)p`OS*i8QipcvlnN0Ec^* z+|CSomU$8>t=k1vQ<9Fd-2rLs4lvn54JnDtlSsqHt4m2_oyBzul6r^2sI;^< zd=L;R?M>ca-C*a@GJC&>{C6Rd&$oBg7B-N%Q^$vyAnt!>O!_zDBD@G~o zg?^Tj$UKQOJ&A2W<^5b6ak(P8pUaj1{&pUk;9q(@w(^qi1C_@+%63?qt63p*M-Cff J8a71i_J0L#1g-!8 literal 16885 zcmeAS@N?(olHy`uVBq!ia0y~y5bj`L;7s6P28sm#*PRBW7>k44ofy`glX(f`a29w( z7BevL9R^{>D(d7_^W$r}%=*sXVx7}Jc*jLPt-|``B z@y+15mwYE3_L+Fly?u*K;WVqfNoHvs2GNyIFX-n1jo>Z`@(X5gaBye^z&)5S5Q;?|oRj=T(t0t_3^emrpC=k+e0Neic5jFblIt2ik2VAWrctMzX%-}%A| zWH2Z^0+9K)tO$J9s2A8QR-aG&M9uhPl+cwBvlB z`jCO`XY~EY+C3oV1C#=k9l83EL#}=*kgK0jP0)IniUFZl_Qrt?7$FH0_J6hAcZdTN zIioQEZ3zu@KqOs{^J8FX(%{frlkku6w`ZchmRYFcncP|6)*7!9xM<-3H_$F9K$>O( zBTni7_clg;i}j$^?0(a~TTk}(?>NW39s?g>bt>p-U}BM(dV z8g1|ml16M2X1z4p;2RtbJ|%4R(rCsREE(tiJwFCEMh3Q^Uky#a&%fsS*ZDB7$01?! zYhWqdP`p=ub&JJ~H^62^L-XI}P)Icu5ezD!7dQ^{#x7(8WpK9dXTQ18$ODXo0}A!8 zt7}>XflTNq&uBAyzysoFpX>=xgz$XYzVCRr1k&K#XzvI*N;zNw5i^=`AnoM=ivgq& zY4oycG~+;qj?u$qFq9yp8E2r2uF;G$h+_bE79GtvgE-noYn;&Wm-4I`bjAoqCj5C^XMl%j*fN(JO^T}#mjP`Lx`#8fWIg^zK$kH^L zaYi%FXvP6ng`_qLff>gTxGt09zy_B8d6kwQHf}Kjt??gS_dYl_0L+Mel9QL{p}6=U@8it=BbX=nEoc`R!-w5}6RuQ(oTa@1vVQA&2uM7GzKdx=RyxpW;VvG#e%#o|ED4lv zaQt`e`wuQb+zutfe$Q)hMJ4n{__TAXoEX