Features: 1) Add seen_keys mechanism to avoid duplicate hits in Elasticsearch query results; 2) Introduce _collect_hits helper function for processing and storing hits; 3) Add exact-match queries for categories, brands, and products to improve search accuracy.

Fixes: 1) Prevent duplicate entries in hit processing by checking `seen_keys`.

Extra: Refactor query-building logic for consistency and readability; minor performance optimizations in query execution.
This commit is contained in:
Egor Pavlovich Gorbunov 2025-10-16 15:19:13 +03:00
parent e894affad7
commit c263182414

View file

@ -228,12 +228,63 @@ def process_query(
# One bucket per index; each bucket receives {"uuid": "<str>"} entries for
# collected hits that expose a truthy ``uuid`` attribute.
uuids_by_index: dict[str, list[dict[str, Any]]] = {
    index_name: [] for index_name in ("products", "categories", "brands")
}
# Every accepted hit, in the order it was collected.
hit_cache: list[Any] = []
# (index name, identifier) pairs already collected; used to skip duplicates.
seen_keys: set[tuple[str, str]] = set()
def _hit_key(hittee: Any) -> tuple[str, str]:
    """Return the de-duplication key for a search hit.

    The key is ``(index name, identifier)``. The identifier is the hit's
    ``uuid`` attribute when it is present and truthy, otherwise the
    document id from ``hittee.meta.id``; it is always stringified.
    """
    identifier = getattr(hittee, "uuid", None) or hittee.meta.id
    return hittee.meta.index, str(identifier)
def _collect_hits(hits: list[Any]) -> None:
    """Append not-yet-seen hits to ``hit_cache`` and record their uuids.

    A hit is skipped when its ``_hit_key`` is already in ``seen_keys``.
    Accepted hits are appended to ``hit_cache`` in iteration order and
    their key is added to ``seen_keys``. When the hit has a truthy
    ``uuid`` attribute, ``{"uuid": str(uuid)}`` is also appended to the
    ``uuids_by_index`` bucket named after the hit's index (the bucket is
    created on demand).
    """
    for hh in hits:
        key = _hit_key(hh)
        if key in seen_keys:
            # Same document already collected from an earlier response.
            continue
        hit_cache.append(hh)
        seen_keys.add(key)
        if getattr(hh, "uuid", None):
            uuids_by_index.setdefault(hh.meta.index, []).append({"uuid": str(hh.uuid)})
# ``term`` clauses per index. Every index matches the raw query against
# ``name.raw`` and the slugified query against ``slug``.
exact_queries_by_index: dict[str, list[Any]] = {
    target_index: [
        Q("term", **{"name.raw": {"value": query}}),
        Q("term", **{"slug": {"value": slugify(query)}}),
    ]
    for target_index in ("categories", "brands", "products")
}
# Products additionally match the lower-cased query against the raw SKU and
# part-number fields.
exact_queries_by_index["products"] += [
    Q("term", **{"sku.raw": {"value": query.lower()}}),
    Q("term", **{"partnumber.raw": {"value": query.lower()}}),
]
# Run a small term-level lookup against each requested index, in the fixed
# order categories -> brands -> products, and feed the hits to _collect_hits
# so they land in hit_cache ahead of anything collected afterwards.
for idx_name in ("categories", "brands", "products"):
    if idx_name not in indexes:
        continue
    shoulds = exact_queries_by_index[idx_name]
    s_exact = (
        Search(index=[idx_name])
        # A document qualifies as soon as any one clause matches.
        .query(Q("bool", should=shoulds, minimum_should_match=1))
        # Request at most five hits and skip total-hit counting.
        .extra(size=5, track_total_hits=False)
    )
    try:
        resp_exact = s_exact.execute()
    except NotFoundError:
        # A NotFoundError from the search is treated as "no hits" here.
        resp_exact = None
    # getattr() with a default also covers the resp_exact-is-None case.
    if getattr(resp_exact, "hits", None):
        _collect_hits(list(resp_exact.hits))
# Merge the regular per-index responses (up to 12 categories, 12 brands and
# 26 products; a falsy response contributes nothing) through the same
# de-duplicating helper, so hits that were already collected are skipped.
_collect_hits(
    list(resp_cats.hits[:12] if resp_cats else [])
    + list(resp_brands.hits[:12] if resp_brands else [])
    + list(resp_products.hits[:26] if resp_products else [])
)