diff --git a/core/elasticsearch/__init__.py b/core/elasticsearch/__init__.py index cf6ebbfd..d88d3d6b 100644 --- a/core/elasticsearch/__init__.py +++ b/core/elasticsearch/__init__.py @@ -1,6 +1,5 @@ from django.conf import settings from django.http import Http404 -from django.shortcuts import get_object_or_404 from django.utils.text import slugify from django.utils.translation import gettext_lazy as _ from django_elasticsearch_dsl import fields @@ -12,25 +11,24 @@ from rest_framework.request import Request from core.models import Brand, Category, Product SMART_FIELDS = [ - "name^6", - "name.ngram^5", - "name.phonetic", - "title^4", - "title.ngram^3", - "title.phonetic", + "name^8", + "name.ngram^6", + "name.phonetic^3", + "title^5", + "title.ngram^4", + "title.phonetic^2", "description^2", "description.ngram", "description.phonetic", - "brand__name^3", - "brand__name.ngram", - "brand__name.auto", - "category__name^2", - "category__name.ngram", - "category__name.auto", + "brand_name^4", + "brand_name.ngram^3", + "brand_name.auto^3", + "category_name^3", + "category_name.ngram^2", + "category_name.auto^2", ] functions = [ - # product-level boosts when searching for products { "filter": Q("term", **{"_index": "products"}), "field_value_factor": { @@ -67,7 +65,6 @@ functions = [ "missing": 0, }, }, - # category-level boost when searching for categories { "filter": Q("term", **{"_index": "categories"}), "field_value_factor": { @@ -77,7 +74,6 @@ functions = [ "missing": 0, }, }, - # brand-level boost when searching for brands { "filter": Q("term", **{"_index": "brands"}), "field_value_factor": { @@ -91,34 +87,35 @@ functions = [ def process_query(query: str = "", request: Request | None = None) -> dict[str, list[dict]] | None: - """ - Perform a lenient, typo‑tolerant, multi‑index search. - - * Full‑text with fuzziness for spelling mistakes - * `bool_prefix` for edge‑ngram autocomplete / “icontains” - """ if not query: raise ValueError(_("no search term provided.")) query = query.strip() try: + exact_shoulds = [ + Q("term", **{"name.raw": query}), + Q("term", **{"slug": slugify(query)}), + ] + + text_shoulds = [ + Q( + "multi_match", + query=query, + fields=SMART_FIELDS, + fuzziness="AUTO", + operator="and", + ), + Q( + "multi_match", + query=query, + fields=[f for f in SMART_FIELDS if f.endswith(".auto")], + type="bool_prefix", + ), + ] + query_base = Q( "bool", - should=[ - Q( - "multi_match", - query=query, - fields=SMART_FIELDS, - fuzziness="AUTO", - operator="and", - ), - Q( - "multi_match", - query=query, - fields=[f for f in SMART_FIELDS if f.endswith(".auto")], - type="bool_prefix", - ), - ], + should=exact_shoulds + text_shoulds, minimum_should_match=1, ) @@ -127,39 +124,61 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str, query=query_base, functions=functions, boost_mode="multiply", - score_mode="first", + score_mode="sum", ) search = Search(index=["products", "categories", "brands", "posts"]).query(function_score_query).extra(size=100) response = search.execute() + # Batch-load related image data to avoid N+1 queries results: dict = {"products": [], "categories": [], "brands": [], "posts": []} + uuids_by_index: dict[str, list] = {"products": [], "categories": [], "brands": []} + hit_cache: list = [] + for hit in response.hits: + hit_cache.append(hit) + if getattr(hit, "uuid", None): + uuids_by_index.setdefault(hit.meta.index, []).append(str(hit.uuid)) + + products_by_uuid = {} + brands_by_uuid = {} + cats_by_uuid = {} + + if request: + if uuids_by_index.get("products"): + products_by_uuid = { + str(p.uuid): p + for p in Product.objects.filter(uuid__in=uuids_by_index["products"]) + .select_related("brand", "category") + .prefetch_related("images") + } + if uuids_by_index.get("brands"): + brands_by_uuid = {str(b.uuid): b for b in Brand.objects.filter(uuid__in=uuids_by_index["brands"])} + if uuids_by_index.get("categories"): + cats_by_uuid = {str(c.uuid): c for c in Category.objects.filter(uuid__in=uuids_by_index["categories"])} + + for hit in hit_cache: obj_uuid = getattr(hit, "uuid", None) or hit.meta.id obj_name = getattr(hit, "name", None) or getattr(hit, "title", None) or "N/A" - obj_slug = "" - raw_slug = getattr(hit, "slug", None) - if raw_slug: - obj_slug = raw_slug - elif hit.meta.index == "brands": - obj_slug = slugify(obj_name) - elif hit.meta.index == "categories": - obj_slug = slugify(f"{obj_name}") + obj_slug = getattr(hit, "slug", "") or ( + slugify(obj_name) if hit.meta.index in {"brands", "categories"} else "" + ) image_url = None idx = hit.meta.index if idx == "products" and request: - prod = get_object_or_404(Product, uuid=obj_uuid) - first = prod.images.order_by("priority").first() - if first and first.image: - image_url = request.build_absolute_uri(first.image.url) + prod = products_by_uuid.get(str(obj_uuid)) + if prod: + first = prod.images.order_by("priority").first() + if first and first.image: + image_url = request.build_absolute_uri(first.image.url) elif idx == "brands" and request: - brand = get_object_or_404(Brand, uuid=obj_uuid) - if brand.small_logo: + brand = brands_by_uuid.get(str(obj_uuid)) + if brand and brand.small_logo: image_url = request.build_absolute_uri(brand.small_logo.url) elif idx == "categories" and request: - cat = get_object_or_404(Category, uuid=obj_uuid) - if cat.image: + cat = cats_by_uuid.get(str(obj_uuid)) + if cat and cat.image: image_url = request.build_absolute_uri(cat.image.url) hit_result = { @@ -175,12 +194,8 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str, hit_result["total_orders_debug"] = getattr(hit, "total_orders", 0) hit_result["brand_priority_debug"] = getattr(hit, "brand_priority", 0) hit_result["category_priority_debug"] = getattr(hit, "category_priority", 0) - if idx == "brands": + if idx in ("brands", "categories"): hit_result["priority_debug"] = getattr(hit, "priority", 0) - if idx == "categories": - hit_result["priority_debug"] = getattr(hit, "priority", 0) - if idx == "posts": - pass results[idx].append(hit_result) @@ -190,30 +205,30 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str, LANGUAGE_ANALYZER_MAP = { - "ar": "arabic", "cs": "czech", "da": "danish", "de": "german", "en": "english", "es": "spanish", "fr": "french", - "hi": "hindi", "it": "italian", - "ja": "standard", - "kk": "standard", "nl": "dutch", - "pl": "standard", "pt": "portuguese", "ro": "romanian", + + "ja": "cjk_search", + "zh": "cjk_search", + "ar": "arabic_search", + "hi": "indic_search", + "ru": "russian", - "zh": "standard", + "pl": "standard", + "kk": "standard", } - def _lang_analyzer(lang_code: str) -> str: - """Return the best‑guess ES analyzer for an ISO language code.""" base = lang_code.split("-")[0].lower() - return LANGUAGE_ANALYZER_MAP.get(base, "standard") + return LANGUAGE_ANALYZER_MAP.get(base, "icu_query") class ActiveOnlyMixin: @@ -227,33 +242,87 @@ class ActiveOnlyMixin: COMMON_ANALYSIS = { + "char_filter": { + # ICU normalizer tidies up Unicode (compatibility forms etc.) + "icu_nfkc_cf": {"type": "icu_normalizer", "name": "nfkc_cf"}, + }, "filter": { "edge_ngram_filter": {"type": "edge_ngram", "min_gram": 1, "max_gram": 20}, "ngram_filter": {"type": "ngram", "min_gram": 2, "max_gram": 20}, - "double_metaphone": { - "type": "phonetic", - "encoder": "double_metaphone", - "replace": False, - }, + + # CJK bigramming helps ja/zh when no language plugin is present + "cjk_bigram": {"type": "cjk_bigram"}, + + # ICU casefolding/diacritics for *all* scripts + "icu_folding": {"type": "icu_folding"}, + + # Your existing phonetic encoder (mainly helpful for Latin languages) + "double_metaphone": {"type": "phonetic", "encoder": "double_metaphone", "replace": False}, + + # Script-specific light normalizers + "arabic_norm": {"type": "arabic_normalization"}, + "indic_norm": {"type": "indic_normalization"}, }, "analyzer": { + # Generic query analyzer: ICU normalize+fold across scripts + "icu_query": { + "type": "custom", + "char_filter": ["icu_nfkc_cf"], + "tokenizer": "icu_tokenizer", + "filter": ["lowercase", "icu_folding"], + }, + + # Autocomplete (works well for all scripts thanks to icu_tokenizer) "autocomplete": { - "tokenizer": "standard", - "filter": ["lowercase", "asciifolding", "edge_ngram_filter"], + "type": "custom", + "char_filter": ["icu_nfkc_cf"], + "tokenizer": "icu_tokenizer", + "filter": ["lowercase", "icu_folding", "edge_ngram_filter"], }, "autocomplete_search": { - "tokenizer": "standard", - "filter": ["lowercase", "asciifolding"], + "type": "custom", + "char_filter": ["icu_nfkc_cf"], + "tokenizer": "icu_tokenizer", + "filter": ["lowercase", "icu_folding"], }, + + # Content ngram for recall (again ICU-aware) "name_ngram": { - "tokenizer": "standard", - "filter": ["lowercase", "asciifolding", "ngram_filter"], + "type": "custom", + "char_filter": ["icu_nfkc_cf"], + "tokenizer": "icu_tokenizer", + "filter": ["lowercase", "icu_folding", "ngram_filter"], }, + + # Phonetic for Latin fallback "name_phonetic": { - "tokenizer": "standard", - "filter": ["lowercase", "asciifolding", "double_metaphone"], + "type": "custom", + "char_filter": ["icu_nfkc_cf"], + "tokenizer": "icu_tokenizer", + "filter": ["lowercase", "icu_folding", "double_metaphone"], + }, + + # CJK search analyzer (no stemming; bigram + ICU) + "cjk_search": { + "type": "custom", + "char_filter": ["icu_nfkc_cf"], + "tokenizer": "icu_tokenizer", + "filter": ["lowercase", "icu_folding", "cjk_bigram"], + }, + + # Arabic & Indic light normalizations (no stemming; reliable & fast) + "arabic_search": { + "type": "custom", + "char_filter": ["icu_nfkc_cf"], + "tokenizer": "icu_tokenizer", + "filter": ["lowercase", "icu_folding", "arabic_norm"], + }, + "indic_search": { + "type": "custom", + "char_filter": ["icu_nfkc_cf"], + "tokenizer": "icu_tokenizer", + "filter": ["lowercase", "icu_folding", "indic_norm"], }, - "query_lc": {"tokenizer": "standard", "filter": ["lowercase", "asciifolding"]}, }, } @@ -275,7 +344,7 @@ def _add_multilang_fields(cls): copy_to="name", fields={ "raw": fields.KeywordField(ignore_above=256), - "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"), + "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"), "phonetic": fields.TextField(analyzer="name_phonetic"), }, ), @@ -298,7 +367,7 @@ def _add_multilang_fields(cls): copy_to="description", fields={ "raw": fields.KeywordField(ignore_above=256), - "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"), + "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"), "phonetic": fields.TextField(analyzer="name_phonetic"), }, ), diff --git a/core/elasticsearch/documents.py b/core/elasticsearch/documents.py index cb6af311..f24d7661 100644 --- a/core/elasticsearch/documents.py +++ b/core/elasticsearch/documents.py @@ -6,13 +6,13 @@ from core.elasticsearch import COMMON_ANALYSIS, ActiveOnlyMixin, _add_multilang_ from core.models import Brand, Category, Product -class _BaseDoc(ActiveOnlyMixin, Document): +class _BaseDoc(Document): name = fields.TextField( attr="name", analyzer="standard", fields={ "raw": fields.KeywordField(ignore_above=256), - "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"), + "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"), "phonetic": fields.TextField(analyzer="name_phonetic"), "auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"), }, @@ -22,7 +22,7 @@ class _BaseDoc(ActiveOnlyMixin, Document): analyzer="standard", fields={ "raw": fields.KeywordField(ignore_above=256), - "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"), + "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"), "phonetic": fields.TextField(analyzer="name_phonetic"), "auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"), }, @@ -44,9 +44,9 @@ class _BaseDoc(ActiveOnlyMixin, Document): return getattr(instance, "description", "") or "" -class ProductDocument(_BaseDoc): +class ProductDocument(ActiveOnlyMixin, _BaseDoc): rating = fields.FloatField(attr="rating") - total_order = fields.IntegerField(attr="total_orders") + total_orders = fields.IntegerField(attr="total_orders") brand_priority = fields.IntegerField( attr="brand.priority", index=True, @@ -58,6 +58,27 @@ class ProductDocument(_BaseDoc): fields={"raw": fields.KeywordField()}, ) + brand_name = fields.TextField( + attr="brand.name", + analyzer="standard", + fields={ + "raw": fields.KeywordField(ignore_above=256), + "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"), + "phonetic": fields.TextField(analyzer="name_phonetic"), + "auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"), + }, + ) + category_name = fields.TextField( + attr="category.name", + analyzer="standard", + fields={ + "raw": fields.KeywordField(ignore_above=256), + "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"), + "phonetic": fields.TextField(analyzer="name_phonetic"), + "auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"), + }, + ) + class Index(_BaseDoc.Index): name = "products" @@ -70,7 +91,7 @@ _add_multilang_fields(ProductDocument) registry.register_document(ProductDocument) -class CategoryDocument(_BaseDoc): +class CategoryDocument(ActiveOnlyMixin, _BaseDoc): priority = fields.IntegerField(attr="priority") class Index(_BaseDoc.Index): @@ -85,26 +106,18 @@ _add_multilang_fields(CategoryDocument) registry.register_document(CategoryDocument) -class BrandDocument(ActiveOnlyMixin, Document): +class BrandDocument(ActiveOnlyMixin, _BaseDoc): priority = fields.IntegerField(attr="priority") - class Index: + class Index(_BaseDoc.Index): name = "brands" - settings = { - "number_of_shards": 1, - "number_of_replicas": 0, - "analysis": COMMON_ANALYSIS, - "index": {"max_ngram_diff": 18}, - } class Django: model = Brand fields = ["uuid"] - def prepare_name(self, instance): - return getattr(instance, "name", "") or "" - +_add_multilang_fields(BrandDocument) registry.register_document(BrandDocument) @@ -114,9 +127,7 @@ class TestModelDocument(Document): class Django: model = TestModel - fields = [ - "title", - ] + fields = ["title"] ignore_signals = True related_models: list = [] auto_refresh = False diff --git a/scripts/Unix/reboot.sh b/scripts/Unix/reboot.sh index ea45fd1f..bb0ef21b 100755 --- a/scripts/Unix/reboot.sh +++ b/scripts/Unix/reboot.sh @@ -23,6 +23,10 @@ echo "Setting default caches..." docker compose exec app poetry run python manage.py set_default_caches echo "Default caches set successfully!" +echo "Building search Index..." +docker compose exec app poetry run python manage.py search_index --rebuild -f +echo "Search Index built successfully!" + echo "Cleaning up unused Docker data..." docker system prune -f echo "Unused Docker data cleaned successfully!" diff --git a/scripts/Windows/reboot.ps1 b/scripts/Windows/reboot.ps1 index ec0e2f2c..7cc6af1d 100644 --- a/scripts/Windows/reboot.ps1 +++ b/scripts/Windows/reboot.ps1 @@ -42,6 +42,13 @@ if ($LASTEXITCODE -ne 0) { } Write-Host "Default caches set successfully!" -ForegroundColor Green +Write-Host "Building search Index..." -ForegroundColor Magenta +docker compose exec app poetry run python manage.py search_index --rebuild -f +if ($LASTEXITCODE -ne 0) { + exit $LASTEXITCODE +} +Write-Host "Search Index built successfully!" -ForegroundColor Green + Write-Host "Cleaning up unused Docker data..." -ForegroundColor Magenta docker system prune -f if ($LASTEXITCODE -ne 0) {