Refactor Elasticsearch documents for efficiency and clarity

Simplified field definitions and improved code consistency by consolidating multi-line settings and aligning formatting. Added `prepare_*` methods to gracefully handle `None` values for indexed fields. Enhanced multilingual support and streamlined query construction for better maintainability.
This commit is contained in:
Egor Pavlovich Gorbunov 2025-05-06 15:32:55 +03:00
parent 64a2fe7726
commit ef553a94a4
3 changed files with 75 additions and 101 deletions

View file

@@ -11,25 +11,21 @@ class PostDocument(ActiveOnlyMixin, Document):
analyzer="standard", analyzer="standard",
fields={ fields={
"raw": fields.KeywordField(ignore_above=256), "raw": fields.KeywordField(ignore_above=256),
"ngram": fields.TextField( "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"),
analyzer="name_ngram", search_analyzer="query_lc"
),
"phonetic": fields.TextField(analyzer="name_phonetic"), "phonetic": fields.TextField(analyzer="name_phonetic"),
}, },
) )
class Index: class Index:
name = "posts" name = "posts"
settings = { settings = {"number_of_shards": 1, "number_of_replicas": 0,
"number_of_shards": 1, "analysis": COMMON_ANALYSIS, "index": {"max_ngram_diff": 18}}
"number_of_replicas": 0,
"analysis": COMMON_ANALYSIS,
"index": {"max_ngram_diff": 18},
}
class Django: class Django:
model = Post model = Post
fields = ["uuid"] fields = ["uuid"]
def prepare_title(self, instance):
return getattr(instance, "title", "") or ""
registry.register_document(PostDocument) registry.register_document(PostDocument)

View file

@@ -40,6 +40,7 @@ def process_query(query: str = ""):
query = query.strip() query = query.strip()
try: try:
# Build the boolean query
q = Q( q = Q(
"bool", "bool",
should=[ should=[
@@ -53,31 +54,32 @@ def process_query(query: str = ""):
Q( Q(
"multi_match", "multi_match",
query=query, query=query,
fields=[f.replace(".auto", ".auto") for f in SMART_FIELDS if ".auto" in f], fields=[f for f in SMART_FIELDS if f.endswith('.auto')],
type="bool_prefix", type="bool_prefix",
), ),
], ],
minimum_should_match=1, minimum_should_match=1,
) )
# Execute search across multiple indices
search = Search(index=["products", "categories", "brands", "posts"]).query(q).extra(size=100) search = Search(index=["products", "categories", "brands", "posts"]).query(q).extra(size=100)
response = search.execute() response = search.execute()
# Collect results, guard against None values
results = {"products": [], "categories": [], "brands": [], "posts": []} results = {"products": [], "categories": [], "brands": [], "posts": []}
for hit in response.hits: for hit in response.hits:
obj_uuid = getattr(hit, "uuid", hit.meta.id) obj_uuid = getattr(hit, "uuid", None) or hit.meta.id
obj_name = getattr(hit, "name", "N/A") obj_name = getattr(hit, "name", None) or "N/A"
obj_slug = getattr(hit, "slug", slugify(hit.name)) # Safely generate a slug
if hit.meta.index == "products": obj_slug = getattr(hit, "slug", None) or slugify(obj_name)
results["products"].append({"uuid": obj_uuid, "name": obj_name, "slug": obj_slug})
elif hit.meta.index == "categories":
results["categories"].append({"uuid": obj_uuid, "name": obj_name, "slug": obj_slug})
elif hit.meta.index == "brands":
results["brands"].append({"uuid": obj_uuid, "name": obj_name, "slug": obj_slug})
elif hit.meta.index == "posts":
results["posts"].append({"uuid": obj_uuid, "name": obj_name, "slug": obj_slug})
idx = hit.meta.index
if idx in results:
results[idx].append({
"uuid": str(obj_uuid),
"name": obj_name,
"slug": obj_slug,
})
return results return results
except NotFoundError: except NotFoundError:
raise Http404 raise Http404
@@ -93,14 +95,14 @@ LANGUAGE_ANALYZER_MAP = {
"fr": "french", "fr": "french",
"hi": "hindi", "hi": "hindi",
"it": "italian", "it": "italian",
"ja": "standard", # Kuromoji plugin recommended for production "ja": "standard",
"kk": "standard", # No builtin Kazakh stemmer falls back to ICU/standard "kk": "standard",
"nl": "dutch", "nl": "dutch",
"pl": "standard", # No builtin Polish stemmer falls back to ICU/standard "pl": "standard",
"pt": "portuguese", "pt": "portuguese",
"ro": "romanian", "ro": "romanian",
"ru": "russian", "ru": "russian",
"zh": "standard", # smartcn / ICU plugin recommended for production "zh": "standard",
} }
@@ -122,58 +124,34 @@ class ActiveOnlyMixin:
COMMON_ANALYSIS = { COMMON_ANALYSIS = {
"filter": { "filter": {
"edge_ngram_filter": { "edge_ngram_filter": {"type": "edge_ngram", "min_gram": 1, "max_gram": 20},
"type": "edge_ngram", "ngram_filter": {"type": "ngram", "min_gram": 2, "max_gram": 20},
"min_gram": 1, "double_metaphone": {"type": "phonetic", "encoder": "double_metaphone", "replace": False},
"max_gram": 20,
},
"ngram_filter": {
"type": "ngram",
"min_gram": 2,
"max_gram": 20,
},
"double_metaphone": {
"type": "phonetic",
"encoder": "double_metaphone",
"replace": False,
},
}, },
"analyzer": { "analyzer": {
"autocomplete": { "autocomplete": {"tokenizer": "standard", "filter": ["lowercase", "asciifolding", "edge_ngram_filter"]},
"tokenizer": "standard", "autocomplete_search": {"tokenizer": "standard", "filter": ["lowercase", "asciifolding"]},
"filter": ["lowercase", "asciifolding", "edge_ngram_filter"], "name_ngram": {"tokenizer": "standard", "filter": ["lowercase", "asciifolding", "ngram_filter"]},
}, "name_phonetic": {"tokenizer": "standard", "filter": ["lowercase", "asciifolding", "double_metaphone"]},
"autocomplete_search": { "query_lc": {"tokenizer": "standard", "filter": ["lowercase", "asciifolding"]},
"tokenizer": "standard",
"filter": ["lowercase", "asciifolding"],
},
"name_ngram": {
"tokenizer": "standard",
"filter": ["lowercase", "asciifolding", "ngram_filter"],
},
"name_phonetic": {
"tokenizer": "standard",
"filter": ["lowercase", "asciifolding", "double_metaphone"],
},
"query_lc": {
"tokenizer": "standard",
"filter": ["lowercase", "asciifolding"],
},
}, },
} }
def _add_multilang_fields(cls): def _add_multilang_fields(cls):
"""
Dynamically add multilingual name/description fields and prepare methods to guard against None.
"""
for code, _lang in settings.LANGUAGES: for code, _lang in settings.LANGUAGES:
lc = code.replace("-", "_").lower() lc = code.replace("-", "_").lower()
analyzer = _lang_analyzer(code) # name_{lc}
name_field = f"name_{lc}"
setattr( setattr(
cls, cls,
f"name_{lc}", name_field,
fields.TextField( fields.TextField(
attr=f"name_{lc}", attr=name_field,
analyzer=analyzer, analyzer=_lang_analyzer(code),
copy_to="name", copy_to="name",
fields={ fields={
"raw": fields.KeywordField(ignore_above=256), "raw": fields.KeywordField(ignore_above=256),
@@ -182,12 +160,19 @@ def _add_multilang_fields(cls):
}, },
), ),
) )
# prepare_name_{lc} to ensure no None values
def make_prepare(attr):
return lambda self, instance: getattr(instance, attr, "") or ""
setattr(cls, f"prepare_{name_field}", make_prepare(name_field))
# description_{lc}
desc_field = f"description_{lc}"
setattr( setattr(
cls, cls,
f"description_{lc}", desc_field,
fields.TextField( fields.TextField(
attr=f"description_{lc}", attr=desc_field,
analyzer=analyzer, analyzer=_lang_analyzer(code),
copy_to="description", copy_to="description",
fields={ fields={
"raw": fields.KeywordField(ignore_above=256), "raw": fields.KeywordField(ignore_above=256),
@@ -196,3 +181,4 @@ def _add_multilang_fields(cls):
}, },
), ),
) )
setattr(cls, f"prepare_{desc_field}", make_prepare(desc_field))

View file

@@ -7,33 +7,24 @@ from core.models import Brand, Category, Product
class _BaseDoc(ActiveOnlyMixin, Document): class _BaseDoc(ActiveOnlyMixin, Document):
name = fields.TextField( name = fields.TextField(
attr="name",
analyzer="standard", analyzer="standard",
fields={ fields={
"raw": fields.KeywordField(ignore_above=256), "raw": fields.KeywordField(ignore_above=256),
"ngram": fields.TextField(analyzer="name_ngram", "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"),
search_analyzer="query_lc"),
"phonetic": fields.TextField(analyzer="name_phonetic"), "phonetic": fields.TextField(analyzer="name_phonetic"),
"auto": fields.TextField( "auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"),
analyzer="autocomplete",
search_analyzer="autocomplete_search",
),
}, },
attr=None,
) )
description = fields.TextField( description = fields.TextField(
attr="description",
analyzer="standard", analyzer="standard",
fields={ fields={
"raw": fields.KeywordField(ignore_above=256), "raw": fields.KeywordField(ignore_above=256),
"ngram": fields.TextField(analyzer="name_ngram", "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"),
search_analyzer="query_lc"),
"phonetic": fields.TextField(analyzer="name_phonetic"), "phonetic": fields.TextField(analyzer="name_phonetic"),
"auto": fields.TextField( "auto": fields.TextField(analyzer="autocomplete", search_analyzer="autocomplete_search"),
analyzer="autocomplete",
search_analyzer="autocomplete_search",
),
}, },
attr=None,
) )
class Index: class Index:
@@ -41,11 +32,15 @@ class _BaseDoc(ActiveOnlyMixin, Document):
"number_of_shards": 1, "number_of_shards": 1,
"number_of_replicas": 0, "number_of_replicas": 0,
"analysis": COMMON_ANALYSIS, "analysis": COMMON_ANALYSIS,
"index": { "index": {"max_ngram_diff": 20},
"max_ngram_diff": 20,
},
} }
def prepare_name(self, instance):
return getattr(instance, "name", "") or ""
def prepare_description(self, instance):
return getattr(instance, "description", "") or ""
class ProductDocument(_BaseDoc): class ProductDocument(_BaseDoc):
rating = fields.FloatField(attr="rating") rating = fields.FloatField(attr="rating")
@@ -81,25 +76,22 @@ class BrandDocument(ActiveOnlyMixin, Document):
analyzer="standard", analyzer="standard",
fields={ fields={
"raw": fields.KeywordField(ignore_above=256), "raw": fields.KeywordField(ignore_above=256),
"ngram": fields.TextField( "ngram": fields.TextField(analyzer="name_ngram", search_analyzer="query_lc"),
analyzer="name_ngram", search_analyzer="query_lc"
),
"phonetic": fields.TextField(analyzer="name_phonetic"), "phonetic": fields.TextField(analyzer="name_phonetic"),
}, },
) )
class Index: class Index:
name = "brands" name = "brands"
settings = { settings = {"number_of_shards": 1, "number_of_replicas": 0,
"number_of_shards": 1, "analysis": COMMON_ANALYSIS, "index": {"max_ngram_diff": 18}}
"number_of_replicas": 0,
"analysis": COMMON_ANALYSIS,
"index": {"max_ngram_diff": 18},
}
class Django: class Django:
model = Brand model = Brand
fields = ["uuid"] fields = ["uuid"]
def prepare_name(self, instance):
return getattr(instance, "name", "") or ""
registry.register_document(BrandDocument) registry.register_document(BrandDocument)