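Features: 1) Implement language-specific adjustments for CJK (ja, zh) and RTL/Indic (ar, hi) languages in Elasticsearch queries; 2) Set query fuzziness based on language: fuzziness is omitted for CJK and RTL/Indic languages and set to "AUTO:5,8" for all others; 3) Refine SMART_FIELDS handling for those languages: drop the `.phonetic` fields and raise the ngram boosts (name.ngram^6 to ^8, title.ngram^4 to ^6);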
Fixes: 1) Correctly handle empty or missing language codes in requests by falling back to the default, language-agnostic query settings; Extra: Remove comments and redundant whitespace from the analyzer and filter definitions;
This commit is contained in:
parent
0a375ad0d1
commit
167dafefb1
1 changed file with 27 additions and 25 deletions
|
|
@ -97,18 +97,41 @@ def process_query(query: str = "", request: Request | None = None) -> dict[str,
|
||||||
Q("term", **{"slug": slugify(query)}),
|
Q("term", **{"slug": slugify(query)}),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
lang = ""
|
||||||
|
if request and hasattr(request, "LANGUAGE_CODE") and request.LANGUAGE_CODE:
|
||||||
|
lang = request.LANGUAGE_CODE.lower()
|
||||||
|
base = lang.split("-")[0] if lang else ""
|
||||||
|
|
||||||
|
is_cjk = base in {"ja", "zh"}
|
||||||
|
is_rtl_or_indic = base in {"ar", "hi"}
|
||||||
|
|
||||||
|
fields_all = SMART_FIELDS[:]
|
||||||
|
|
||||||
|
if is_cjk or is_rtl_or_indic:
|
||||||
|
fields_all = [f for f in fields_all if ".phonetic" not in f]
|
||||||
|
|
||||||
|
if is_cjk or is_rtl_or_indic:
|
||||||
|
fields_all = [
|
||||||
|
f.replace("name.ngram^6", "name.ngram^8").replace("title.ngram^4", "title.ngram^6") for f in fields_all
|
||||||
|
]
|
||||||
|
|
||||||
|
if is_cjk or is_rtl_or_indic:
|
||||||
|
fuzzy = None
|
||||||
|
else:
|
||||||
|
fuzzy = "AUTO:5,8"
|
||||||
|
|
||||||
text_shoulds = [
|
text_shoulds = [
|
||||||
Q(
|
Q(
|
||||||
"multi_match",
|
"multi_match",
|
||||||
query=query,
|
query=query,
|
||||||
fields=SMART_FIELDS,
|
fields=fields_all,
|
||||||
fuzziness="AUTO",
|
|
||||||
operator="and",
|
operator="and",
|
||||||
|
**({"fuzziness": fuzzy} if fuzzy else {}),
|
||||||
),
|
),
|
||||||
Q(
|
Q(
|
||||||
"multi_match",
|
"multi_match",
|
||||||
query=query,
|
query=query,
|
||||||
fields=[f for f in SMART_FIELDS if f.endswith(".auto")],
|
fields=[f for f in fields_all if f.endswith(".auto")],
|
||||||
type="bool_prefix",
|
type="bool_prefix",
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|
@ -215,17 +238,16 @@ LANGUAGE_ANALYZER_MAP = {
|
||||||
"nl": "dutch",
|
"nl": "dutch",
|
||||||
"pt": "portuguese",
|
"pt": "portuguese",
|
||||||
"ro": "romanian",
|
"ro": "romanian",
|
||||||
|
|
||||||
"ja": "cjk_search",
|
"ja": "cjk_search",
|
||||||
"zh": "cjk_search",
|
"zh": "cjk_search",
|
||||||
"ar": "arabic_search",
|
"ar": "arabic_search",
|
||||||
"hi": "indic_search",
|
"hi": "indic_search",
|
||||||
|
|
||||||
"ru": "russian",
|
"ru": "russian",
|
||||||
"pl": "standard",
|
"pl": "standard",
|
||||||
"kk": "standard",
|
"kk": "standard",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
def _lang_analyzer(lang_code: str) -> str:
|
def _lang_analyzer(lang_code: str) -> str:
|
||||||
base = lang_code.split("-")[0].lower()
|
base = lang_code.split("-")[0].lower()
|
||||||
return LANGUAGE_ANALYZER_MAP.get(base, "icu_query")
|
return LANGUAGE_ANALYZER_MAP.get(base, "icu_query")
|
||||||
|
|
@ -243,36 +265,24 @@ class ActiveOnlyMixin:
|
||||||
|
|
||||||
COMMON_ANALYSIS = {
|
COMMON_ANALYSIS = {
|
||||||
"char_filter": {
|
"char_filter": {
|
||||||
# ICU normalizer tidies up Unicode (compatibility forms etc.)
|
|
||||||
"icu_nfkc_cf": {"type": "icu_normalizer", "name": "nfkc_cf"},
|
"icu_nfkc_cf": {"type": "icu_normalizer", "name": "nfkc_cf"},
|
||||||
},
|
},
|
||||||
"filter": {
|
"filter": {
|
||||||
"edge_ngram_filter": {"type": "edge_ngram", "min_gram": 1, "max_gram": 20},
|
"edge_ngram_filter": {"type": "edge_ngram", "min_gram": 1, "max_gram": 20},
|
||||||
"ngram_filter": {"type": "ngram", "min_gram": 2, "max_gram": 20},
|
"ngram_filter": {"type": "ngram", "min_gram": 2, "max_gram": 20},
|
||||||
|
|
||||||
# CJK bigramming helps ja/zh when no language plugin is present
|
|
||||||
"cjk_bigram": {"type": "cjk_bigram"},
|
"cjk_bigram": {"type": "cjk_bigram"},
|
||||||
|
|
||||||
# ICU casefolding/diacritics for *all* scripts
|
|
||||||
"icu_folding": {"type": "icu_folding"},
|
"icu_folding": {"type": "icu_folding"},
|
||||||
|
|
||||||
# Your existing phonetic encoder (mainly helpful for Latin languages)
|
|
||||||
"double_metaphone": {"type": "phonetic", "encoder": "double_metaphone", "replace": False},
|
"double_metaphone": {"type": "phonetic", "encoder": "double_metaphone", "replace": False},
|
||||||
|
|
||||||
# Script-specific light normalizers
|
|
||||||
"arabic_norm": {"type": "arabic_normalization"},
|
"arabic_norm": {"type": "arabic_normalization"},
|
||||||
"indic_norm": {"type": "indic_normalization"},
|
"indic_norm": {"type": "indic_normalization"},
|
||||||
},
|
},
|
||||||
"analyzer": {
|
"analyzer": {
|
||||||
# Generic query analyzer: ICU normalize+fold across scripts
|
|
||||||
"icu_query": {
|
"icu_query": {
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"char_filter": ["icu_nfkc_cf"],
|
"char_filter": ["icu_nfkc_cf"],
|
||||||
"tokenizer": "icu_tokenizer",
|
"tokenizer": "icu_tokenizer",
|
||||||
"filter": ["lowercase", "icu_folding"],
|
"filter": ["lowercase", "icu_folding"],
|
||||||
},
|
},
|
||||||
|
|
||||||
# Autocomplete (works well for all scripts thanks to icu_tokenizer)
|
|
||||||
"autocomplete": {
|
"autocomplete": {
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"char_filter": ["icu_nfkc_cf"],
|
"char_filter": ["icu_nfkc_cf"],
|
||||||
|
|
@ -285,32 +295,24 @@ COMMON_ANALYSIS = {
|
||||||
"tokenizer": "icu_tokenizer",
|
"tokenizer": "icu_tokenizer",
|
||||||
"filter": ["lowercase", "icu_folding"],
|
"filter": ["lowercase", "icu_folding"],
|
||||||
},
|
},
|
||||||
|
|
||||||
# Content ngram for recall (again ICU-aware)
|
|
||||||
"name_ngram": {
|
"name_ngram": {
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"char_filter": ["icu_nfkc_cf"],
|
"char_filter": ["icu_nfkc_cf"],
|
||||||
"tokenizer": "icu_tokenizer",
|
"tokenizer": "icu_tokenizer",
|
||||||
"filter": ["lowercase", "icu_folding", "ngram_filter"],
|
"filter": ["lowercase", "icu_folding", "ngram_filter"],
|
||||||
},
|
},
|
||||||
|
|
||||||
# Phonetic for Latin fallback
|
|
||||||
"name_phonetic": {
|
"name_phonetic": {
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"char_filter": ["icu_nfkc_cf"],
|
"char_filter": ["icu_nfkc_cf"],
|
||||||
"tokenizer": "icu_tokenizer",
|
"tokenizer": "icu_tokenizer",
|
||||||
"filter": ["lowercase", "icu_folding", "double_metaphone"],
|
"filter": ["lowercase", "icu_folding", "double_metaphone"],
|
||||||
},
|
},
|
||||||
|
|
||||||
# CJK search analyzer (no stemming; bigram + ICU)
|
|
||||||
"cjk_search": {
|
"cjk_search": {
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"char_filter": ["icu_nfkc_cf"],
|
"char_filter": ["icu_nfkc_cf"],
|
||||||
"tokenizer": "icu_tokenizer",
|
"tokenizer": "icu_tokenizer",
|
||||||
"filter": ["lowercase", "icu_folding", "cjk_bigram"],
|
"filter": ["lowercase", "icu_folding", "cjk_bigram"],
|
||||||
},
|
},
|
||||||
|
|
||||||
# Arabic & Indic light normalizations (no stemming; reliable & fast)
|
|
||||||
"arabic_search": {
|
"arabic_search": {
|
||||||
"type": "custom",
|
"type": "custom",
|
||||||
"char_filter": ["icu_nfkc_cf"],
|
"char_filter": ["icu_nfkc_cf"],
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue