4888: bring back v1.10.0 into main r=Kerollmops a=ManyTheFish



Co-authored-by: Louis Dureuil <louis@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot]
2024-08-27 14:02:08 +00:00
committed by GitHub
27 changed files with 2618 additions and 99 deletions

View File

@ -339,10 +339,18 @@ impl ValuesCollection {
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
let options = NormalizerOption { lossy: true, ..Default::default() };
let mut detection = StrDetection::new(facet_string, locales);
// Detect the language of the facet string only if several locales are explicitly provided.
let language = match locales {
Some(&[language]) => Some(language),
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
_ => None,
};
let token = Token {
lemma: std::borrow::Cow::Borrowed(facet_string),
script: detection.script(),
language: detection.language(),
language,
..Default::default()
};

View File

@ -360,6 +360,7 @@ mod test {
use super::*;
#[cfg(feature = "japanese")]
#[cfg(not(feature = "chinese-pinyin"))]
#[test]
fn test_kanji_language_detection() {
use crate::index::tests::TempIndex;

View File

@ -110,18 +110,18 @@ impl<'ctx> DatabaseCache<'ctx> {
.map_err(Into::into)
}
fn get_value_from_keys<'v, K1, KC, DC>(
fn get_value_from_keys<'v, K1, KC>(
txn: &'ctx RoTxn<'_>,
cache_key: K1,
db_keys: &'v [KC::EItem],
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
db: Database<KC, Bytes>,
universe: Option<&RoaringBitmap>,
merger: MergeFn,
) -> Result<Option<DC::DItem>>
) -> Result<Option<RoaringBitmap>>
where
K1: Copy + Eq + Hash,
KC: BytesEncode<'v>,
DC: BytesDecodeOwned,
KC::EItem: Sized,
{
if let Entry::Vacant(entry) = cache.entry(cache_key) {
@ -146,16 +146,22 @@ impl<'ctx> DatabaseCache<'ctx> {
entry.insert(bitmap_ptr);
}
match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes)
let bitmap_bytes = match cache.get(&cache_key).unwrap() {
Some(Cow::Borrowed(bytes)) => bytes,
Some(Cow::Owned(bytes)) => bytes.as_slice(),
None => return Ok(None),
};
match (bitmap_bytes, universe) {
(bytes, Some(universe)) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
.map(Some)
.map_err(Into::into)
}
(bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
None => Ok(None),
}
}
}
@ -207,12 +213,13 @@ impl<'ctx> SearchContext<'ctx> {
let keys: Vec<_> =
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
DatabaseCache::get_value_from_keys::<_, _>(
self.txn,
word,
&keys[..],
&mut self.db_cache.word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
universe,
merge_cbo_roaring_bitmaps,
)
}
@ -238,12 +245,13 @@ impl<'ctx> SearchContext<'ctx> {
let keys: Vec<_> =
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
DatabaseCache::get_value_from_keys::<_, _>(
self.txn,
word,
&keys[..],
&mut self.db_cache.exact_word_docids,
self.index.word_fid_docids.remap_data_type::<Bytes>(),
universe,
merge_cbo_roaring_bitmaps,
)
}
@ -294,12 +302,13 @@ impl<'ctx> SearchContext<'ctx> {
let keys: Vec<_> =
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
DatabaseCache::get_value_from_keys::<_, _>(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
universe,
merge_cbo_roaring_bitmaps,
)
}
@ -325,12 +334,13 @@ impl<'ctx> SearchContext<'ctx> {
let keys: Vec<_> =
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
DatabaseCache::get_value_from_keys::<_, _, CboRoaringBitmapCodec>(
DatabaseCache::get_value_from_keys::<_, _>(
self.txn,
prefix,
&keys[..],
&mut self.db_cache.exact_word_prefix_docids,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
universe,
merge_cbo_roaring_bitmaps,
)
}

View File

@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use self::vector_sort::VectorSort;
use crate::localized_attributes_rules::LocalizedFieldIds;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::apply_distinct_rule;
use crate::vector::Embedder;
@ -671,9 +672,44 @@ pub fn execute_search(
tokbuilder.words_dict(dictionary);
}
if let Some(locales) = locales {
tokbuilder.allow_list(locales);
}
let db_locales;
match locales {
Some(locales) => {
if !locales.is_empty() {
tokbuilder.allow_list(locales);
}
}
None => {
// If no locales are specified, we use the locales specified in the localized attributes rules
let localized_attributes_rules = ctx.index.localized_attributes_rules(ctx.txn)?;
let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
let searchable_fields = ctx.index.searchable_fields_ids(ctx.txn)?;
let localized_fields = match &ctx.restricted_fids {
// if AttributeToSearchOn is set, use the restricted list of ids
Some(restricted_fids) => {
let iter = restricted_fids
.exact
.iter()
.chain(restricted_fids.tolerant.iter())
.map(|(fid, _)| *fid);
LocalizedFieldIds::new(&localized_attributes_rules, &fields_ids_map, iter)
}
// Otherwise use the full list of ids coming from the index searchable fields
None => LocalizedFieldIds::new(
&localized_attributes_rules,
&fields_ids_map,
searchable_fields.into_iter(),
),
};
db_locales = localized_fields.all_locales();
if !db_locales.is_empty() {
tokbuilder.allow_list(&db_locales);
}
}
};
let tokenizer = tokbuilder.build();
drop(entered);

View File

@ -6,6 +6,7 @@ pub mod exactness;
pub mod geo_sort;
pub mod integration;
#[cfg(feature = "all-tokenizations")]
#[cfg(not(feature = "chinese-pinyin"))]
pub mod language;
pub mod ngram_split_words;
pub mod proximity;