Features: 1) Add `sku` and `partnumber` fields, with index and search analyzers, to the Elasticsearch schema; 2) Boost code-like queries against the `sku` and `partnumber` fields; 3) Extract the search construction into a reusable `build_search` function.

Fixes: 1) Correct the boost-replacement logic for `name.ngram` and `title.ngram` that is applied to CJK, RTL, and Indic languages; 2) Simplify how the `AUTO:5,8` fuzziness setting is selected.

Extra: Refactor search response handling so that brands, categories, and products are queried and processed separately, improving modularity.
This commit is contained in:
Egor Pavlovich Gorbunov 2025-09-05 19:42:51 +03:00
parent d811d1e5fe
commit 880f3f19b1
2 changed files with 84 additions and 34 deletions

View file

@ -1,3 +1,5 @@
import re
from django.conf import settings
from django.http import Http404
from django.utils.text import slugify
@ -26,6 +28,12 @@ SMART_FIELDS = [
"category_name^3",
"category_name.ngram^2",
"category_name.auto^2",
"sku^9",
"sku.ngram^6",
"sku.auto^8",
"partnumber^10",
"partnumber.ngram^7",
"partnumber.auto^9",
]
functions = [
@ -101,6 +109,8 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str,
exact_shoulds = [
Q("term", **{"name.raw": {"value": query, "boost": 3.0}}),
Q("term", **{"slug": {"value": slugify(query), "boost": 2.0}}),
Q("term", **{"sku.raw": {"value": query.lower(), "boost": 8.0}}),
Q("term", **{"partnumber.raw": {"value": query.lower(), "boost": 9.0}}),
]
lang = ""
@ -112,19 +122,15 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str,
is_rtl_or_indic = base in {"ar", "hi"}
fields_all = SMART_FIELDS[:]
if is_cjk or is_rtl_or_indic:
fields_all = [f for f in fields_all if ".phonetic" not in f]
if is_cjk or is_rtl_or_indic:
fields_all = [
f.replace("name.ngram^6", "name.ngram^8").replace("title.ngram^4", "title.ngram^6") for f in fields_all
f.replace("name.ngram^8", "name.ngram^10").replace("title.ngram^4", "title.ngram^6") for f in fields_all
]
if is_cjk or is_rtl_or_indic:
fuzzy = None
else:
fuzzy = "AUTO:5,8"
fuzzy = None if (is_cjk or is_rtl_or_indic) else "AUTO:5,8"
is_code_like = bool(re.search(r"[0-9]", query)) and " " not in query
text_shoulds = [
Q(
@ -142,44 +148,62 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str,
),
]
if is_code_like:
text_shoulds.extend(
[
Q("term", **{"sku.raw": {"value": query.lower(), "boost": 12.0}}),
Q("term", **{"partnumber.raw": {"value": query.lower(), "boost": 14.0}}),
Q("prefix", **{"sku.raw": {"value": query.lower(), "boost": 6.0}}),
Q("prefix", **{"partnumber.raw": {"value": query.lower(), "boost": 7.0}}),
]
)
query_base = Q(
"bool",
should=exact_shoulds + text_shoulds,
minimum_should_match=1,
)
search = (
Search(index=["products", "categories", "brands", "posts"])
.query(query_base)
.extra(
rescore={
"window_size": 200,
"query": {
"rescore_query": Q(
"function_score",
query=Q("match_all"),
functions=functions,
boost_mode="sum",
score_mode="sum",
max_boost=2.0,
).to_dict(),
"query_weight": 1.0,
"rescore_query_weight": 1.0,
},
}
def build_search(indexes, size):
return (
Search(index=indexes)
.query(query_base)
.extra(
rescore={
"window_size": 200,
"query": {
"rescore_query": Q(
"function_score",
query=Q("match_all"),
functions=functions,
boost_mode="sum",
score_mode="sum",
max_boost=2.0,
).to_dict(),
"query_weight": 1.0,
"rescore_query_weight": 1.0,
},
}
)
.extra(size=size, track_total_hits=True)
)
.extra(size=100)
)
response = search.execute()
search_cats = build_search(["categories"], size=22)
search_brands = build_search(["brands"], size=22)
search_products = build_search(["products"], size=44)
resp_cats = search_cats.execute()
resp_brands = search_brands.execute()
resp_products = search_products.execute()
results: dict = {"products": [], "categories": [], "brands": [], "posts": []}
uuids_by_index: dict[str, list] = {"products": [], "categories": [], "brands": []}
hit_cache: list = []
for hit in response.hits:
hit_cache.append(hit)
if getattr(hit, "uuid", None):
uuids_by_index.setdefault(hit.meta.index, []).append(str(hit.uuid))
for h in list(resp_cats.hits[:12]) + list(resp_brands.hits[:12]) + list(resp_products.hits[:26]):
hit_cache.append(h)
if getattr(h, "uuid", None):
uuids_by_index.setdefault(h.meta.index, []).append(str(h.uuid))
products_by_uuid = {}
brands_by_uuid = {}
@ -344,6 +368,12 @@ COMMON_ANALYSIS = {
"filter": ["lowercase", "icu_folding", "indic_norm"],
},
},
"normalizer": {
"lc_norm": {
"type": "custom",
"filter": ["lowercase", "icu_folding"],
}
},
}

View file

@ -79,6 +79,26 @@ class ProductDocument(ActiveOnlyMixin, BaseDocument):
},
)
sku = fields.KeywordField(
attr="sku",
normalizer="lc_norm",
fields={
"raw": fields.KeywordField(normalizer="lc_norm"),
"ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"),
"auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"),
},
)
partnumber = fields.KeywordField(
attr="partnumber",
normalizer="lc_norm",
fields={
"raw": fields.KeywordField(normalizer="lc_norm"),
"ngram": fields.TextField(analyzer="name_ngram", search_analyzer="icu_query"),
"auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"),
},
)
def get_queryset(self):
return (
super()