From 167dafefb1a05b70e044f916ffa592af8cef9624 Mon Sep 17 00:00:00 2001 From: Egor fureunoir Gorbunov Date: Wed, 20 Aug 2025 06:09:16 +0300 Subject: [PATCH] Features: 1) Implement language-specific adjustments for CJK and RTL/Indic languages in Elasticsearch queries; 2) Enhance query fuzziness settings based on language; 3) Refine SMART_FIELDS logic for phonetic and ngram weighting; Fixes: 1) Correct logic to handle empty or missing language codes in requests; Extra: Clean up comments and redundant whitespace in analyzers and filters definitions; --- core/elasticsearch/__init__.py | 52 ++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/core/elasticsearch/__init__.py b/core/elasticsearch/__init__.py index d88d3d6b..a514e6a5 100644 --- a/core/elasticsearch/__init__.py +++ b/core/elasticsearch/__init__.py @@ -97,18 +97,41 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str, Q("term", **{"slug": slugify(query)}), ] + lang = "" + if request and hasattr(request, "LANGUAGE_CODE") and request.LANGUAGE_CODE: + lang = request.LANGUAGE_CODE.lower() + base = lang.split("-")[0] if lang else "" + + is_cjk = base in {"ja", "zh"} + is_rtl_or_indic = base in {"ar", "hi"} + + fields_all = SMART_FIELDS[:] + + if is_cjk or is_rtl_or_indic: + fields_all = [f for f in fields_all if ".phonetic" not in f] + + if is_cjk or is_rtl_or_indic: + fields_all = [ + f.replace("name.ngram^6", "name.ngram^8").replace("title.ngram^4", "title.ngram^6") for f in fields_all + ] + + if is_cjk or is_rtl_or_indic: + fuzzy = None + else: + fuzzy = "AUTO:5,8" + text_shoulds = [ Q( "multi_match", query=query, - fields=SMART_FIELDS, - fuzziness="AUTO", + fields=fields_all, operator="and", + **({"fuzziness": fuzzy} if fuzzy else {}), ), Q( "multi_match", query=query, - fields=[f for f in SMART_FIELDS if f.endswith(".auto")], + fields=[f for f in fields_all if f.endswith(".auto")], type="bool_prefix", ), ] @@ -215,17 +238,16 @@ LANGUAGE_ANALYZER_MAP = { "nl": "dutch", "pt": "portuguese", "ro": "romanian", - "ja": "cjk_search", "zh": "cjk_search", "ar": "arabic_search", "hi": "indic_search", - "ru": "russian", "pl": "standard", "kk": "standard", } + def _lang_analyzer(lang_code: str) -> str: base = lang_code.split("-")[0].lower() return LANGUAGE_ANALYZER_MAP.get(base, "icu_query") @@ -243,36 +265,24 @@ class ActiveOnlyMixin: COMMON_ANALYSIS = { "char_filter": { - # ICU normalizer tidies up Unicode (compatibility forms etc.) "icu_nfkc_cf": {"type": "icu_normalizer", "name": "nfkc_cf"}, }, "filter": { "edge_ngram_filter": {"type": "edge_ngram", "min_gram": 1, "max_gram": 20}, "ngram_filter": {"type": "ngram", "min_gram": 2, "max_gram": 20}, - - # CJK bigramming helps ja/zh when no language plugin is present "cjk_bigram": {"type": "cjk_bigram"}, - - # ICU casefolding/diacritics for *all* scripts "icu_folding": {"type": "icu_folding"}, - - # Your existing phonetic encoder (mainly helpful for Latin languages) "double_metaphone": {"type": "phonetic", "encoder": "double_metaphone", "replace": False}, - - # Script-specific light normalizers "arabic_norm": {"type": "arabic_normalization"}, "indic_norm": {"type": "indic_normalization"}, }, "analyzer": { - # Generic query analyzer: ICU normalize+fold across scripts "icu_query": { "type": "custom", "char_filter": ["icu_nfkc_cf"], "tokenizer": "icu_tokenizer", "filter": ["lowercase", "icu_folding"], }, - - # Autocomplete (works well for all scripts thanks to icu_tokenizer) "autocomplete": { "type": "custom", "char_filter": ["icu_nfkc_cf"], @@ -285,32 +295,24 @@ COMMON_ANALYSIS = { "tokenizer": "icu_tokenizer", "filter": ["lowercase", "icu_folding"], }, - - # Content ngram for recall (again ICU-aware) "name_ngram": { "type": "custom", "char_filter": ["icu_nfkc_cf"], "tokenizer": "icu_tokenizer", "filter": ["lowercase", "icu_folding", "ngram_filter"], }, - - # Phonetic for Latin fallback "name_phonetic": { "type": "custom", "char_filter": ["icu_nfkc_cf"], "tokenizer": "icu_tokenizer", "filter": ["lowercase", "icu_folding", "double_metaphone"], }, - - # CJK search analyzer (no stemming; bigram + ICU) "cjk_search": { "type": "custom", "char_filter": ["icu_nfkc_cf"], "tokenizer": "icu_tokenizer", "filter": ["lowercase", "icu_folding", "cjk_bigram"], }, - - # Arabic & Indic light normalizations (no stemming; reliable & fast) "arabic_search": { "type": "custom", "char_filter": ["icu_nfkc_cf"],