From 167dafefb1a05b70e044f916ffa592af8cef9624 Mon Sep 17 00:00:00 2001
From: Egor fureunoir Gorbunov <contact@fureunoir.com>
Date: Wed, 20 Aug 2025 06:09:16 +0300
Subject: [PATCH] Features: 1) Implement language-specific adjustments for CJK
 and RTL/Indic languages in Elasticsearch queries; 2) Enhance query fuzziness
 settings based on language; 3) Refine SMART_FIELDS logic for phonetic and
 ngram weighting;

Fixes: 1) Correct logic to handle empty or missing language codes in requests;

Extra: Remove inline comments and redundant blank lines from the analyzer and filter definitions;
---
 core/elasticsearch/__init__.py | 52 ++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/core/elasticsearch/__init__.py b/core/elasticsearch/__init__.py
index d88d3d6b..a514e6a5 100644
--- a/core/elasticsearch/__init__.py
+++ b/core/elasticsearch/__init__.py
@@ -97,18 +97,41 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str,
             Q("term", **{"slug": slugify(query)}),
         ]
 
+        lang = ""
+        if request and hasattr(request, "LANGUAGE_CODE") and request.LANGUAGE_CODE:
+            lang = request.LANGUAGE_CODE.lower()
+        base = lang.split("-")[0] if lang else ""
+
+        is_cjk = base in {"ja", "zh"}
+        is_rtl_or_indic = base in {"ar", "hi"}
+
+        fields_all = SMART_FIELDS[:]
+
+        if is_cjk or is_rtl_or_indic:
+            fields_all = [f for f in fields_all if ".phonetic" not in f]
+
+        if is_cjk or is_rtl_or_indic:
+            fields_all = [
+                f.replace("name.ngram^6", "name.ngram^8").replace("title.ngram^4", "title.ngram^6") for f in fields_all
+            ]
+
+        if is_cjk or is_rtl_or_indic:
+            fuzzy = None
+        else:
+            fuzzy = "AUTO:5,8"
+
         text_shoulds = [
             Q(
                 "multi_match",
                 query=query,
-                fields=SMART_FIELDS,
-                fuzziness="AUTO",
+                fields=fields_all,
                 operator="and",
+                **({"fuzziness": fuzzy} if fuzzy else {}),
             ),
             Q(
                 "multi_match",
                 query=query,
-                fields=[f for f in SMART_FIELDS if f.endswith(".auto")],
+                fields=[f for f in fields_all if f.endswith(".auto")],
                 type="bool_prefix",
             ),
         ]
@@ -215,17 +238,16 @@ LANGUAGE_ANALYZER_MAP = {
     "nl": "dutch",
     "pt": "portuguese",
     "ro": "romanian",
-
     "ja": "cjk_search",
     "zh": "cjk_search",
     "ar": "arabic_search",
     "hi": "indic_search",
-
     "ru": "russian",
     "pl": "standard",
     "kk": "standard",
 }
 
+
 def _lang_analyzer(lang_code: str) -> str:
     base = lang_code.split("-")[0].lower()
     return LANGUAGE_ANALYZER_MAP.get(base, "icu_query")
@@ -243,36 +265,24 @@ class ActiveOnlyMixin:
 
 COMMON_ANALYSIS = {
     "char_filter": {
-        # ICU normalizer tidies up Unicode (compatibility forms etc.)
         "icu_nfkc_cf": {"type": "icu_normalizer", "name": "nfkc_cf"},
     },
     "filter": {
         "edge_ngram_filter": {"type": "edge_ngram", "min_gram": 1, "max_gram": 20},
         "ngram_filter": {"type": "ngram", "min_gram": 2, "max_gram": 20},
-
-        # CJK bigramming helps ja/zh when no language plugin is present
         "cjk_bigram": {"type": "cjk_bigram"},
-
-        # ICU casefolding/diacritics for *all* scripts
         "icu_folding": {"type": "icu_folding"},
-
-        # Your existing phonetic encoder (mainly helpful for Latin languages)
         "double_metaphone": {"type": "phonetic", "encoder": "double_metaphone", "replace": False},
-
-        # Script-specific light normalizers
         "arabic_norm": {"type": "arabic_normalization"},
         "indic_norm": {"type": "indic_normalization"},
     },
     "analyzer": {
-        # Generic query analyzer: ICU normalize+fold across scripts
         "icu_query": {
             "type": "custom",
             "char_filter": ["icu_nfkc_cf"],
             "tokenizer": "icu_tokenizer",
             "filter": ["lowercase", "icu_folding"],
         },
-
-        # Autocomplete (works well for all scripts thanks to icu_tokenizer)
         "autocomplete": {
             "type": "custom",
             "char_filter": ["icu_nfkc_cf"],
@@ -285,32 +295,24 @@ COMMON_ANALYSIS = {
             "tokenizer": "icu_tokenizer",
             "filter": ["lowercase", "icu_folding"],
         },
-
-        # Content ngram for recall (again ICU-aware)
         "name_ngram": {
             "type": "custom",
             "char_filter": ["icu_nfkc_cf"],
             "tokenizer": "icu_tokenizer",
             "filter": ["lowercase", "icu_folding", "ngram_filter"],
         },
-
-        # Phonetic for Latin fallback
         "name_phonetic": {
             "type": "custom",
             "char_filter": ["icu_nfkc_cf"],
             "tokenizer": "icu_tokenizer",
             "filter": ["lowercase", "icu_folding", "double_metaphone"],
         },
-
-        # CJK search analyzer (no stemming; bigram + ICU)
         "cjk_search": {
             "type": "custom",
             "char_filter": ["icu_nfkc_cf"],
             "tokenizer": "icu_tokenizer",
             "filter": ["lowercase", "icu_folding", "cjk_bigram"],
         },
-
-        # Arabic & Indic light normalizations (no stemming; reliable & fast)
         "arabic_search": {
             "type": "custom",
             "char_filter": ["icu_nfkc_cf"],