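Features: 1) Apply language-specific adjustments to Elasticsearch queries for CJK (ja, zh) and RTL/Indic (ar, hi) languages; 2) Set query fuzziness by language: no fuzziness for those languages, "AUTO:5,8" for all others (previously always "AUTO"); 3) Refine SMART_FIELDS handling for those languages: drop the `.phonetic` fields and raise the `name.ngram` and `title.ngram` boosts.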

Fixes: 1) Handle requests whose language code is empty or missing: the language now falls back to an empty string, so no language-specific adjustments are applied.

Extra: Remove inline comments and redundant blank lines from the analyzer and filter definitions.
This commit is contained in:
Egor Pavlovich Gorbunov 2025-08-20 06:09:16 +03:00
parent 0a375ad0d1
commit 167dafefb1

View file

@ -97,18 +97,41 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str,
Q("term", **{"slug": slugify(query)}), Q("term", **{"slug": slugify(query)}),
] ]
lang = ""
if request and hasattr(request, "LANGUAGE_CODE") and request.LANGUAGE_CODE:
lang = request.LANGUAGE_CODE.lower()
base = lang.split("-")[0] if lang else ""
is_cjk = base in {"ja", "zh"}
is_rtl_or_indic = base in {"ar", "hi"}
fields_all = SMART_FIELDS[:]
if is_cjk or is_rtl_or_indic:
fields_all = [f for f in fields_all if ".phonetic" not in f]
if is_cjk or is_rtl_or_indic:
fields_all = [
f.replace("name.ngram^6", "name.ngram^8").replace("title.ngram^4", "title.ngram^6") for f in fields_all
]
if is_cjk or is_rtl_or_indic:
fuzzy = None
else:
fuzzy = "AUTO:5,8"
text_shoulds = [ text_shoulds = [
Q( Q(
"multi_match", "multi_match",
query=query, query=query,
fields=SMART_FIELDS, fields=fields_all,
fuzziness="AUTO",
operator="and", operator="and",
**({"fuzziness": fuzzy} if fuzzy else {}),
), ),
Q( Q(
"multi_match", "multi_match",
query=query, query=query,
fields=[f for f in SMART_FIELDS if f.endswith(".auto")], fields=[f for f in fields_all if f.endswith(".auto")],
type="bool_prefix", type="bool_prefix",
), ),
] ]
@ -215,17 +238,16 @@ LANGUAGE_ANALYZER_MAP = {
"nl": "dutch", "nl": "dutch",
"pt": "portuguese", "pt": "portuguese",
"ro": "romanian", "ro": "romanian",
"ja": "cjk_search", "ja": "cjk_search",
"zh": "cjk_search", "zh": "cjk_search",
"ar": "arabic_search", "ar": "arabic_search",
"hi": "indic_search", "hi": "indic_search",
"ru": "russian", "ru": "russian",
"pl": "standard", "pl": "standard",
"kk": "standard", "kk": "standard",
} }
def _lang_analyzer(lang_code: str) -> str: def _lang_analyzer(lang_code: str) -> str:
base = lang_code.split("-")[0].lower() base = lang_code.split("-")[0].lower()
return LANGUAGE_ANALYZER_MAP.get(base, "icu_query") return LANGUAGE_ANALYZER_MAP.get(base, "icu_query")
@ -243,36 +265,24 @@ class ActiveOnlyMixin:
COMMON_ANALYSIS = { COMMON_ANALYSIS = {
"char_filter": { "char_filter": {
# ICU normalizer tidies up Unicode (compatibility forms etc.)
"icu_nfkc_cf": {"type": "icu_normalizer", "name": "nfkc_cf"}, "icu_nfkc_cf": {"type": "icu_normalizer", "name": "nfkc_cf"},
}, },
"filter": { "filter": {
"edge_ngram_filter": {"type": "edge_ngram", "min_gram": 1, "max_gram": 20}, "edge_ngram_filter": {"type": "edge_ngram", "min_gram": 1, "max_gram": 20},
"ngram_filter": {"type": "ngram", "min_gram": 2, "max_gram": 20}, "ngram_filter": {"type": "ngram", "min_gram": 2, "max_gram": 20},
# CJK bigramming helps ja/zh when no language plugin is present
"cjk_bigram": {"type": "cjk_bigram"}, "cjk_bigram": {"type": "cjk_bigram"},
# ICU casefolding/diacritics for *all* scripts
"icu_folding": {"type": "icu_folding"}, "icu_folding": {"type": "icu_folding"},
# Your existing phonetic encoder (mainly helpful for Latin languages)
"double_metaphone": {"type": "phonetic", "encoder": "double_metaphone", "replace": False}, "double_metaphone": {"type": "phonetic", "encoder": "double_metaphone", "replace": False},
# Script-specific light normalizers
"arabic_norm": {"type": "arabic_normalization"}, "arabic_norm": {"type": "arabic_normalization"},
"indic_norm": {"type": "indic_normalization"}, "indic_norm": {"type": "indic_normalization"},
}, },
"analyzer": { "analyzer": {
# Generic query analyzer: ICU normalize+fold across scripts
"icu_query": { "icu_query": {
"type": "custom", "type": "custom",
"char_filter": ["icu_nfkc_cf"], "char_filter": ["icu_nfkc_cf"],
"tokenizer": "icu_tokenizer", "tokenizer": "icu_tokenizer",
"filter": ["lowercase", "icu_folding"], "filter": ["lowercase", "icu_folding"],
}, },
# Autocomplete (works well for all scripts thanks to icu_tokenizer)
"autocomplete": { "autocomplete": {
"type": "custom", "type": "custom",
"char_filter": ["icu_nfkc_cf"], "char_filter": ["icu_nfkc_cf"],
@ -285,32 +295,24 @@ COMMON_ANALYSIS = {
"tokenizer": "icu_tokenizer", "tokenizer": "icu_tokenizer",
"filter": ["lowercase", "icu_folding"], "filter": ["lowercase", "icu_folding"],
}, },
# Content ngram for recall (again ICU-aware)
"name_ngram": { "name_ngram": {
"type": "custom", "type": "custom",
"char_filter": ["icu_nfkc_cf"], "char_filter": ["icu_nfkc_cf"],
"tokenizer": "icu_tokenizer", "tokenizer": "icu_tokenizer",
"filter": ["lowercase", "icu_folding", "ngram_filter"], "filter": ["lowercase", "icu_folding", "ngram_filter"],
}, },
# Phonetic for Latin fallback
"name_phonetic": { "name_phonetic": {
"type": "custom", "type": "custom",
"char_filter": ["icu_nfkc_cf"], "char_filter": ["icu_nfkc_cf"],
"tokenizer": "icu_tokenizer", "tokenizer": "icu_tokenizer",
"filter": ["lowercase", "icu_folding", "double_metaphone"], "filter": ["lowercase", "icu_folding", "double_metaphone"],
}, },
# CJK search analyzer (no stemming; bigram + ICU)
"cjk_search": { "cjk_search": {
"type": "custom", "type": "custom",
"char_filter": ["icu_nfkc_cf"], "char_filter": ["icu_nfkc_cf"],
"tokenizer": "icu_tokenizer", "tokenizer": "icu_tokenizer",
"filter": ["lowercase", "icu_folding", "cjk_bigram"], "filter": ["lowercase", "icu_folding", "cjk_bigram"],
}, },
# Arabic & Indic light normalizations (no stemming; reliable & fast)
"arabic_search": { "arabic_search": {
"type": "custom", "type": "custom",
"char_filter": ["icu_nfkc_cf"], "char_filter": ["icu_nfkc_cf"],