Features: 1) Add brand_name and category_name fields to ProductDocument for enhanced search indexing; 2) Introduce expanded ICU-based analyzers for multi-language support, including Arabic, Indic, and CJK; 3) Retune the SMART_FIELDS boosts (including the new brand_name and category_name subfields) and map ja, zh, ar, and hi to the new language-specific search analyzers in LANGUAGE_ANALYZER_MAP.

Fixes: 1) Rename the misspelled total_order field in ProductDocument to total_orders; 2) Replace the outdated query_lc analyzer with icu_query for consistent Unicode handling; 3) Replace per-hit get_object_or_404 lookups with batched queries to reduce database round trips. Extra: Refactor the process_query function with a clearer structure and batch-loading logic; streamline brand and category field handling across documents; update the reboot scripts to rebuild the search index automatically.
2025-08-20 05:36:55 +03:00 · 2025-08-20 05:36:55 +03:00 · 0a375ad0d1
commit 0a375ad0d1
parent 6e83562ee6
4 changed files with 195 additions and 104 deletions
--- a/core/elasticsearch/init.py
+++ b/core/elasticsearch/init.py
@ -1,6 +1,5 @@
 from django.conf import settings
 from django.http import Http404
-from django.shortcuts import get_object_or_404
 from django.utils.text import slugify
 from django.utils.translation import gettext_lazy as _
 from django_elasticsearch_dsl import fields
@ -12,25 +11,24 @@ from rest_framework.request import Request
 from core.models import Brand, Category, Product

 SMART_FIELDS = [
-    "name^6",
-    "name.ngram^5",
-    "name.phonetic",
-    "title^4",
-    "title.ngram^3",
-    "title.phonetic",
+    "name^8",
+    "name.ngram^6",
+    "name.phonetic^3",
+    "title^5",
+    "title.ngram^4",
+    "title.phonetic^2",
    "description^2",
    "description.ngram",
    "description.phonetic",
-    "brand__name^3",
-    "brand__name.ngram",
-    "brand__name.auto",
-    "category__name^2",
-    "category__name.ngram",
-    "category__name.auto",
+    "brand_name^4",
+    "brand_name.ngram^3",
+    "brand_name.auto^3",
+    "category_name^3",
+    "category_name.ngram^2",
+    "category_name.auto^2",
 ]

 functions = [
-    # product-level boosts when searching for products
    {
        "filter": Q("term", **{"_index": "products"}),
        "field_value_factor": {
@ -67,7 +65,6 @@ functions = [
            "missing": 0,
        },
    },
-    # category-level boost when searching for categories
    {
        "filter": Q("term", **{"_index": "categories"}),
        "field_value_factor": {
@ -77,7 +74,6 @@ functions = [
            "missing": 0,
        },
    },
-    # brand-level boost when searching for brands
    {
        "filter": Q("term", **{"_index": "brands"}),
        "field_value_factor": {
@ -91,34 +87,35 @@ functions = [


 def process_query(query: str = "", request: Request | None = None) -> dict[str, list[dict]] | None:
-    """
-    Perform a lenient, typo‑tolerant, multi‑index search.
-
-    * Full‑text with fuzziness for spelling mistakes
-    * `bool_prefix` for edge‑ngram autocomplete / “icontains”
-    """
    if not query:
        raise ValueError(_("no search term provided."))

    query = query.strip()
    try:
+        exact_shoulds = [
+            Q("term", **{"name.raw": query}),
+            Q("term", **{"slug": slugify(query)}),
+        ]
+
+        text_shoulds = [
+            Q(
+                "multi_match",
+                query=query,
+                fields=SMART_FIELDS,
+                fuzziness="AUTO",
+                operator="and",
+            ),
+            Q(
+                "multi_match",
+                query=query,
+                fields=[f for f in SMART_FIELDS if f.endswith(".auto")],
+                type="bool_prefix",
+            ),
+        ]
+
        query_base = Q(
            "bool",
-            should=[
-                Q(
-                    "multi_match",
-                    query=query,
-                    fields=SMART_FIELDS,
-                    fuzziness="AUTO",
-                    operator="and",
-                ),
-                Q(
-                    "multi_match",
-                    query=query,
-                    fields=[f for f in SMART_FIELDS if f.endswith(".auto")],
-                    type="bool_prefix",
-                ),
-            ],
+            should=exact_shoulds + text_shoulds,
            minimum_should_match=1,
        )

@ -127,39 +124,61 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str,
            query=query_base,
            functions=functions,
            boost_mode="multiply",
-            score_mode="first",
+            score_mode="sum",
        )

        search = Search(index=["products", "categories", "brands", "posts"]).query(function_score_query).extra(size=100)
        response = search.execute()

+        # Batch-load related image data to avoid N+1 queries
        results: dict = {"products": [], "categories": [], "brands": [], "posts": []}
+        uuids_by_index: dict[str, list] = {"products": [], "categories": [], "brands": []}
+        hit_cache: list = []
+
        for hit in response.hits:
+            hit_cache.append(hit)
+            if getattr(hit, "uuid", None):
+                uuids_by_index.setdefault(hit.meta.index, []).append(str(hit.uuid))
+
+        products_by_uuid = {}
+        brands_by_uuid = {}
+        cats_by_uuid = {}
+
+        if request:
+            if uuids_by_index.get("products"):
+                products_by_uuid = {
+                    str(p.uuid): p
+                    for p in Product.objects.filter(uuid__in=uuids_by_index["products"])
+                    .select_related("brand", "category")
+                    .prefetch_related("images")
+                }
+            if uuids_by_index.get("brands"):
+                brands_by_uuid = {str(b.uuid): b for b in Brand.objects.filter(uuid__in=uuids_by_index["brands"])}
+            if uuids_by_index.get("categories"):
+                cats_by_uuid = {str(c.uuid): c for c in Category.objects.filter(uuid__in=uuids_by_index["categories"])}
+
+        for hit in hit_cache:
            obj_uuid = getattr(hit, "uuid", None) or hit.meta.id
            obj_name = getattr(hit, "name", None) or getattr(hit, "title", None) or "N/A"
-            obj_slug = ""
-            raw_slug = getattr(hit, "slug", None)
-            if raw_slug:
-                obj_slug = raw_slug
-            elif hit.meta.index == "brands":
-                obj_slug = slugify(obj_name)
-            elif hit.meta.index == "categories":
-                obj_slug = slugify(f"{obj_name}")
+            obj_slug = getattr(hit, "slug", "") or (
+                slugify(obj_name) if hit.meta.index in {"brands", "categories"} else ""
+            )

            image_url = None
            idx = hit.meta.index
            if idx == "products" and request:
-                prod = get_object_or_404(Product, uuid=obj_uuid)
-                first = prod.images.order_by("priority").first()
-                if first and first.image:
-                    image_url = request.build_absolute_uri(first.image.url)
+                prod = products_by_uuid.get(str(obj_uuid))
+                if prod:
+                    first = prod.images.order_by("priority").first()
+                    if first and first.image:
+                        image_url = request.build_absolute_uri(first.image.url)
            elif idx == "brands" and request:
-                brand = get_object_or_404(Brand, uuid=obj_uuid)
-                if brand.small_logo:
+                brand = brands_by_uuid.get(str(obj_uuid))
+                if brand and brand.small_logo:
                    image_url = request.build_absolute_uri(brand.small_logo.url)
            elif idx == "categories" and request:
-                cat = get_object_or_404(Category, uuid=obj_uuid)
-                if cat.image:
+                cat = cats_by_uuid.get(str(obj_uuid))
+                if cat and cat.image:
                    image_url = request.build_absolute_uri(cat.image.url)

            hit_result = {
@ -175,12 +194,8 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str,
                    hit_result["total_orders_debug"] = getattr(hit, "total_orders", 0)
                    hit_result["brand_priority_debug"] = getattr(hit, "brand_priority", 0)
                    hit_result["category_priority_debug"] = getattr(hit, "category_priority", 0)
-                if idx == "brands":
+                if idx in ("brands", "categories"):
                    hit_result["priority_debug"] = getattr(hit, "priority", 0)
-                if idx == "categories":
-                    hit_result["priority_debug"] = getattr(hit, "priority", 0)
-                if idx == "posts":
-                    pass

            results[idx].append(hit_result)

@ -190,30 +205,30 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str,


 LANGUAGE_ANALYZER_MAP = {
-    "ar": "arabic",
    "cs": "czech",
    "da": "danish",
    "de": "german",
    "en": "english",
    "es": "spanish",
    "fr": "french",
-    "hi": "hindi",
    "it": "italian",
-    "ja": "standard",
-    "kk": "standard",
    "nl": "dutch",
-    "pl": "standard",
    "pt": "portuguese",
    "ro": "romanian",
+
+    "ja": "cjk_search",
+    "zh": "cjk_search",
+    "ar": "arabic_search",
+    "hi": "indic_search",
+
    "ru": "russian",
-    "zh": "standard",
+    "pl": "standard",
+    "kk": "standard",
 }

-
 def _lang_analyzer(lang_code: str) -> str:
-    """Return the best‑guess ES analyzer for an ISO language code."""
    base = lang_code.split("-")[0].lower()
-    return LANGUAGE_ANALYZER_MAP.get(base, "standard")
+    return LANGUAGE_ANALYZER_MAP.get(base, "icu_query")


 class ActiveOnlyMixin:
@ -227,33 +242,87 @@ class ActiveOnlyMixin:


 COMMON_ANALYSIS = {
+    "char_filter": {
+        # ICU normalizer tidies up Unicode (compatibility forms etc.)
+        "icu_nfkc_cf": {"type": "icu_normalizer", "name": "nfkc_cf"},
+    },
    "filter": {
        "edge_ngram_filter": {"type": "edge_ngram", "min_gram": 1, "max_gram": 20},
        "ngram_filter": {"type": "ngram", "min_gram": 2, "max_gram": 20},
-        "double_metaphone": {
-            "type": "phonetic",
-            "encoder": "double_metaphone",
-            "replace": False,
-        },
+
+        # CJK bigramming helps ja/zh when no language plugin is present
+        "cjk_bigram": {"type": "cjk_bigram"},
+
+        # ICU casefolding/diacritics for *all* scripts
+        "icu_folding": {"type": "icu_folding"},
+
+        # Existing phonetic encoder, kept as-is (mainly helpful for Latin-script languages)
+        "double_metaphone": {"type": "phonetic", "encoder": "double_metaphone", "replace": False},
+
+        # Script-specific light normalizers
+        "arabic_norm": {"type": "arabic_normalization"},
+        "indic_norm": {"type": "indic_normalization"},
    },
    "analyzer": {
+        # Generic query analyzer: ICU normalize+fold across scripts
+        "icu_query": {
+            "type": "custom",
+            "char_filter": ["icu_nfkc_cf"],
+            "tokenizer": "icu_tokenizer",
+            "filter": ["lowercase", "icu_folding"],
+        },
+
+        # Autocomplete (works well for all scripts thanks to icu_tokenizer)
        "autocomplete": {
-            "tokenizer": "standard",
-            "filter": ["lowercase", "asciifolding", "edge_ngram_filter"],
+            "type": "custom",
+            "char_filter": ["icu_nfkc_cf"],
+            "tokenizer": "icu_tokenizer",
+            "filter": ["lowercase", "icu_folding", "edge_ngram_filter"],
        },
        "autocomplete_search": {
-            "tokenizer": "standard",
-            "filter": ["lowercase", "asciifolding"],
+            "type": "custom",
+            "char_filter": ["icu_nfkc_cf"],
+            "tokenizer": "icu_tokenizer",
+            "filter": ["lowercase", "icu_folding"],
        },
+
+        # Content ngram for recall (again ICU-aware)
        "name_ngram": {
-            "tokenizer": "standard",
-            "filter": ["lowercase", "asciifolding", "ngram_filter"],
+            "type": "custom",
+            "char_filter": ["icu_nfkc_cf"],
+            "tokenizer": "icu_tokenizer",
+            "filter": ["lowercase", "icu_folding", "ngram_filter"],
        },
+
+        # Phonetic for Latin fallback
        "name_phonetic": {
-            "tokenizer": "standard",
-            "filter": ["lowercase", "asciifolding", "double_metaphone"],
+            "type": "custom",
+            "char_filter": ["icu_nfkc_cf"],
+            "tokenizer": "icu_tokenizer",
+            "filter": ["lowercase", "icu_folding", "double_metaphone"],
+        },
+
+        # CJK search analyzer (no stemming; bigram + ICU)
+        "cjk_search": {
+            "type": "custom",
+            "char_filter": ["icu_nfkc_cf"],
+            "tokenizer": "icu_tokenizer",
+            "filter": ["lowercase", "icu_folding", "cjk_bigram"],
+        },
+
+        # Arabic & Indic light normalizations (no stemming; reliable & fast)
+        "arabic_search": {
+            "type": "custom",
+            "char_filter": ["icu_nfkc_cf"],
+            "tokenizer": "icu_tokenizer",
+            "filter": ["lowercase", "icu_folding", "arabic_norm"],
+        },
+        "indic_search": {
+            "type": "custom",
+            "char_filter": ["icu_nfkc_cf"],
+            "tokenizer": "icu_tokenizer",
+            "filter": ["lowercase", "icu_folding", "indic_norm"],
        },
-        "query_lc": {"tokenizer": "standard", "filter": ["lowercase", "asciifolding"]},
    },
 }

@ -275,7 +344,7 @@ def _add_multilang_fields(cls):
                copy_to="name",
                fields={
                    "raw": fields.KeywordField(ignore_above=256),
-                    "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"),
+                    "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"),
                    "phonetic": fields.TextField(analyzer="name_phonetic"),
                },
            ),
@ -298,7 +367,7 @@ def _add_multilang_fields(cls):
                copy_to="description",
                fields={
                    "raw": fields.KeywordField(ignore_above=256),
-                    "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"),
+                    "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"),
                    "phonetic": fields.TextField(analyzer="name_phonetic"),
                },
            ),
--- a/core/elasticsearch/documents.py
+++ b/core/elasticsearch/documents.py
@ -6,13 +6,13 @@ from core.elasticsearch import COMMON_ANALYSIS, ActiveOnlyMixin, _add_multilang_
 from core.models import Brand, Category, Product


-class _BaseDoc(ActiveOnlyMixin, Document):
+class _BaseDoc(Document):
    name = fields.TextField(
        attr="name",
        analyzer="standard",
        fields={
            "raw": fields.KeywordField(ignore_above=256),
-            "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"),
+            "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"),
            "phonetic": fields.TextField(analyzer="name_phonetic"),
            "auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"),
        },
@ -22,7 +22,7 @@ class _BaseDoc(ActiveOnlyMixin, Document):
        analyzer="standard",
        fields={
            "raw": fields.KeywordField(ignore_above=256),
-            "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"),
+            "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"),
            "phonetic": fields.TextField(analyzer="name_phonetic"),
            "auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"),
        },
@ -44,9 +44,9 @@ class _BaseDoc(ActiveOnlyMixin, Document):
        return getattr(instance, "description", "") or ""


-class ProductDocument(_BaseDoc):
+class ProductDocument(ActiveOnlyMixin, _BaseDoc):
    rating = fields.FloatField(attr="rating")
-    total_order = fields.IntegerField(attr="total_orders")
+    total_orders = fields.IntegerField(attr="total_orders")
    brand_priority = fields.IntegerField(
        attr="brand.priority",
        index=True,
@ -58,6 +58,27 @@ class ProductDocument(_BaseDoc):
        fields={"raw": fields.KeywordField()},
    )

+    brand_name = fields.TextField(
+        attr="brand.name",
+        analyzer="standard",
+        fields={
+            "raw": fields.KeywordField(ignore_above=256),
+            "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"),
+            "phonetic": fields.TextField(analyzer="name_phonetic"),
+            "auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"),
+        },
+    )
+    category_name = fields.TextField(
+        attr="category.name",
+        analyzer="standard",
+        fields={
+            "raw": fields.KeywordField(ignore_above=256),
+            "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"),
+            "phonetic": fields.TextField(analyzer="name_phonetic"),
+            "auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"),
+        },
+    )
+
    class Index(_BaseDoc.Index):
        name = "products"

@ -70,7 +91,7 @@ _add_multilang_fields(ProductDocument)
 registry.register_document(ProductDocument)


-class CategoryDocument(_BaseDoc):
+class CategoryDocument(ActiveOnlyMixin, _BaseDoc):
    priority = fields.IntegerField(attr="priority")

    class Index(_BaseDoc.Index):
@ -85,26 +106,18 @@ _add_multilang_fields(CategoryDocument)
 registry.register_document(CategoryDocument)


-class BrandDocument(ActiveOnlyMixin, Document):
+class BrandDocument(ActiveOnlyMixin, _BaseDoc):
    priority = fields.IntegerField(attr="priority")

-    class Index:
+    class Index(_BaseDoc.Index):
        name = "brands"
-        settings = {
-            "number_of_shards": 1,
-            "number_of_replicas": 0,
-            "analysis": COMMON_ANALYSIS,
-            "index": {"max_ngram_diff": 18},
-        }

    class Django:
        model = Brand
        fields = ["uuid"]

-    def prepare_name(self, instance):
-        return getattr(instance, "name", "") or ""
-

+_add_multilang_fields(BrandDocument)
 registry.register_document(BrandDocument)


@ -114,9 +127,7 @@ class TestModelDocument(Document):

    class Django:
        model = TestModel
-        fields = [
-            "title",
-        ]
+        fields = ["title"]
        ignore_signals = True
        related_models: list = []
        auto_refresh = False
--- a/scripts/Unix/reboot.sh
+++ b/scripts/Unix/reboot.sh
@ -23,6 +23,10 @@ echo "Setting default caches..."
 docker compose exec app poetry run python manage.py set_default_caches
 echo "Default caches set successfully!"

+echo "Building search Index..."
+docker compose exec app poetry run python manage.py search_index --rebuild -f
+echo "Search Index built successfully!"
+
 echo "Cleaning up unused Docker data..."
 docker system prune -f
 echo "Unused Docker data cleaned successfully!"
--- a/scripts/Windows/reboot.ps1
+++ b/scripts/Windows/reboot.ps1
@ -42,6 +42,13 @@ if ($LASTEXITCODE -ne 0) {
 }
 Write-Host "Default caches set successfully!" -ForegroundColor Green

+Write-Host "Building search Index..." -ForegroundColor Magenta
+docker compose exec app poetry run python manage.py search_index --rebuild -f
+if ($LASTEXITCODE -ne 0) {
+    exit $LASTEXITCODE
+}
+Write-Host "Search Index built successfully!" -ForegroundColor Green
+
 Write-Host "Cleaning up unused Docker data..." -ForegroundColor Magenta
 docker system prune -f
 if ($LASTEXITCODE -ne 0) {