Compare commits

...

8 Commits

Author SHA1 Message Date
Clément Renault
d7bd3217d6 Only insert facets in bulk, no longer incrementally 2024-01-26 11:07:48 +01:00
Clément Renault
6210377b90 Add more logging to the received chunks 2024-01-25 20:16:59 +01:00
Clément Renault
9e3d1e1bbd Remove unused imports 2024-01-25 17:58:47 +01:00
Clément Renault
bceaf4f981 Add a log of the time taken by incremental facet updating 2024-01-25 17:48:31 +01:00
Clément Renault
d29b301618 Disable the facet search 2024-01-25 17:47:33 +01:00
meili-bors[bot]
a6fa0b97ec Merge #4318
4318: Hide embedders r=ManyTheFish a=dureuill

Hides `embedders` when it is an empty dictionary.

Manual tests:

- getting settings with empty embedders: not displayed
- getting settings with non-empty embedders: displayed like before
- dump with empty embedders: can be imported
- dump with non-empty embedders: can be imported

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2024-01-15 09:37:31 +00:00
Louis Dureuil
38abfec611 Fix tests 2024-01-11 21:35:30 +01:00
Louis Dureuil
84a5c304fc Don't display the embedders setting when it is an empty dict 2024-01-11 21:35:06 +01:00
6 changed files with 123 additions and 142 deletions

View File

@@ -600,11 +600,12 @@ pub fn settings(
),
};
let embedders = index
let embedders: BTreeMap<_, _> = index
.embedding_configs(rtxn)?
.into_iter()
.map(|(name, config)| (name, Setting::Set(config.into())))
.collect();
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };
Ok(Settings {
displayed_attributes: match displayed_attributes {
@@ -631,7 +632,7 @@ pub fn settings(
typo_tolerance: Setting::Set(typo_tolerance),
faceting: Setting::Set(faceting),
pagination: Setting::Set(pagination),
embedders: Setting::Set(embedders),
embedders,
_kind: PhantomData,
})
}

View File

@@ -77,8 +77,7 @@ async fn import_dump_v1_movie_raw() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -239,8 +238,7 @@ async fn import_dump_v1_movie_with_settings() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -387,8 +385,7 @@ async fn import_dump_v1_rubygems_with_settings() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -521,8 +518,7 @@ async fn import_dump_v2_movie_raw() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -667,8 +663,7 @@ async fn import_dump_v2_movie_with_settings() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -812,8 +807,7 @@ async fn import_dump_v2_rubygems_with_settings() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -946,8 +940,7 @@ async fn import_dump_v3_movie_raw() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -1092,8 +1085,7 @@ async fn import_dump_v3_movie_with_settings() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -1237,8 +1229,7 @@ async fn import_dump_v3_rubygems_with_settings() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -1371,8 +1362,7 @@ async fn import_dump_v4_movie_raw() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -1517,8 +1507,7 @@ async fn import_dump_v4_movie_with_settings() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -1662,8 +1651,7 @@ async fn import_dump_v4_rubygems_with_settings() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###
);
@@ -1907,8 +1895,7 @@ async fn import_dump_v6_containing_experimental_features() {
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {}
}
}
"###);

View File

@@ -54,7 +54,7 @@ async fn get_settings() {
let (response, code) = index.settings().await;
assert_eq!(code, 200);
let settings = response.as_object().unwrap();
assert_eq!(settings.keys().len(), 16);
assert_eq!(settings.keys().len(), 15);
assert_eq!(settings["displayedAttributes"], json!(["*"]));
assert_eq!(settings["searchableAttributes"], json!(["*"]));
assert_eq!(settings["filterableAttributes"], json!([]));
@@ -83,7 +83,6 @@ async fn get_settings() {
"maxTotalHits": 1000,
})
);
assert_eq!(settings["embedders"], json!({}));
assert_eq!(settings["proximityPrecision"], json!("byWord"));
}

View File

@@ -61,6 +61,7 @@ impl FacetsUpdateIncremental {
}
}
#[logging_timer::time("FacetsUpdateIncremental::{}")]
pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
let mut cursor = self.delta_data.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {

View File

@@ -76,26 +76,18 @@ pub const FACET_MAX_GROUP_SIZE: u8 = 8;
pub const FACET_GROUP_SIZE: u8 = 4;
pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::collections::BTreeSet;
use std::fs::File;
use std::io::BufReader;
use std::iter::FromIterator;
use charabia::normalizer::{Normalize, NormalizerOption};
use grenad::{CompressionType, SortAlgorithm};
use heed::types::{Bytes, DecodeIgnore, SerdeJson};
use heed::BytesEncode;
use log::debug;
use time::OffsetDateTime;
use self::incremental::FacetsUpdateIncremental;
use super::FacetsUpdateBulk;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::BytesRefCodec;
use crate::update::index_documents::create_sorter;
use crate::update::merge_btreeset_string;
use crate::{BEU16StrCodec, Index, Result, MAX_FACET_VALUE_LENGTH};
use crate::{Index, Result};
pub mod bulk;
pub mod incremental;
@@ -146,115 +138,114 @@ impl<'i> FacetsUpdate<'i> {
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
// See self::comparison_bench::benchmark_facet_indexing
if self.delta_data.len() >= (self.database.len(wtxn)? / 50) {
let field_ids =
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
let bulk_update = FacetsUpdateBulk::new(
self.index,
field_ids,
self.facet_type,
self.delta_data,
self.group_size,
self.min_level_size,
);
bulk_update.execute(wtxn)?;
} else {
let incremental_update = FacetsUpdateIncremental::new(
self.index,
self.facet_type,
self.delta_data,
self.group_size,
self.min_level_size,
self.max_group_size,
);
incremental_update.execute(wtxn)?;
}
// We clear the list of normalized-for-search facets
// and the previous FSTs to compute everything from scratch
self.index.facet_id_normalized_string_strings.clear(wtxn)?;
self.index.facet_id_string_fst.clear(wtxn)?;
// As we can't use the same write transaction to read and write in two different databases
// we must create a temporary sorter that we will write into LMDB afterward.
// As multiple unnormalized facet values can become the same normalized facet value
// we must merge them together.
let mut sorter = create_sorter(
SortAlgorithm::Unstable,
merge_btreeset_string,
CompressionType::None,
None,
None,
None,
// if self.delta_data.len() >= (self.database.len(wtxn)? / 50) {
let field_ids = self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
let bulk_update = FacetsUpdateBulk::new(
self.index,
field_ids,
self.facet_type,
self.delta_data,
self.group_size,
self.min_level_size,
);
bulk_update.execute(wtxn)?;
// } else {
// let incremental_update = FacetsUpdateIncremental::new(
// self.index,
// self.facet_type,
// self.delta_data,
// self.group_size,
// self.min_level_size,
// self.max_group_size,
// );
// incremental_update.execute(wtxn)?;
// }
// We iterate on the list of original, semi-normalized, facet values
// and normalize them for search, inserting them in LMDB in any given order.
let options = NormalizerOption { lossy: true, ..Default::default() };
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let (facet_group_key, ()) = result?;
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
let mut normalized_facet = left_bound.normalize(&options);
let normalized_truncated_facet: String;
if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
normalized_truncated_facet = normalized_facet
.char_indices()
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
.map(|(_, c)| c)
.collect();
normalized_facet = normalized_truncated_facet.into();
}
let set = BTreeSet::from_iter(std::iter::once(left_bound));
let key = (field_id, normalized_facet.as_ref());
let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
sorter.insert(key, val)?;
}
}
// // We clear the list of normalized-for-search facets
// // and the previous FSTs to compute everything from scratch
// self.index.facet_id_normalized_string_strings.clear(wtxn)?;
// self.index.facet_id_string_fst.clear(wtxn)?;
// In this loop we don't need to take care of merging bitmaps
// as the grenad sorter already merged them for us.
let mut merger_iter = sorter.into_stream_merger_iter()?;
while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
self.index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>().put(
wtxn,
key_bytes,
btreeset_bytes,
)?;
}
// // As we can't use the same write transaction to read and write in two different databases
// // we must create a temporary sorter that we will write into LMDB afterward.
// // As multiple unnormalized facet values can become the same normalized facet value
// // we must merge them together.
// let mut sorter = create_sorter(
// SortAlgorithm::Unstable,
// merge_btreeset_string,
// CompressionType::None,
// None,
// None,
// None,
// );
// We compute one FST by string facet
let mut text_fsts = vec![];
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
let database =
self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
for result in database.iter(wtxn)? {
let ((field_id, normalized_facet), _) = result?;
current_fst = match current_fst.take() {
Some((fid, fst_builder)) if fid != field_id => {
let fst = fst_builder.into_set();
text_fsts.push((fid, fst));
Some((field_id, fst::SetBuilder::memory()))
}
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
// // We iterate on the list of original, semi-normalized, facet values
// // and normalize them for search, inserting them in LMDB in any given order.
// let options = NormalizerOption { lossy: true, ..Default::default() };
// let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
// for result in database.iter(wtxn)? {
// let (facet_group_key, ()) = result?;
// if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
// let mut normalized_facet = left_bound.normalize(&options);
// let normalized_truncated_facet: String;
// if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
// normalized_truncated_facet = normalized_facet
// .char_indices()
// .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
// .map(|(_, c)| c)
// .collect();
// normalized_facet = normalized_truncated_facet.into();
// }
// let set = BTreeSet::from_iter(std::iter::once(left_bound));
// let key = (field_id, normalized_facet.as_ref());
// let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
// let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
// sorter.insert(key, val)?;
// }
// }
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(normalized_facet)?;
}
}
// // In this loop we don't need to take care of merging bitmaps
// // as the grenad sorter already merged them for us.
// let mut merger_iter = sorter.into_stream_merger_iter()?;
// while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? {
// self.index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>().put(
// wtxn,
// key_bytes,
// btreeset_bytes,
// )?;
// }
if let Some((field_id, fst_builder)) = current_fst {
let fst = fst_builder.into_set();
text_fsts.push((field_id, fst));
}
// // We compute one FST by string facet
// let mut text_fsts = vec![];
// let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
// let database =
// self.index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
// for result in database.iter(wtxn)? {
// let ((field_id, normalized_facet), _) = result?;
// current_fst = match current_fst.take() {
// Some((fid, fst_builder)) if fid != field_id => {
// let fst = fst_builder.into_set();
// text_fsts.push((fid, fst));
// Some((field_id, fst::SetBuilder::memory()))
// }
// Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
// None => Some((field_id, fst::SetBuilder::memory())),
// };
// We write those FSTs in LMDB now
for (field_id, fst) in text_fsts {
self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
}
// if let Some((_, fst_builder)) = current_fst.as_mut() {
// fst_builder.insert(normalized_facet)?;
// }
// }
// if let Some((field_id, fst_builder)) = current_fst {
// let fst = fst_builder.into_set();
// text_fsts.push((field_id, fst));
// }
// // We write those FSTs in LMDB now
// for (field_id, fst) in text_fsts {
// self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
// }
Ok(())
}

View File

@@ -123,6 +123,8 @@ pub(crate) fn write_typed_chunk_into_index(
) -> Result<(RoaringBitmap, bool)> {
puffin::profile_function!(typed_chunk.to_debug_string());
log::debug!("Received a chunk to process: {}", typed_chunk.to_debug_string());
let mut is_merged_database = false;
match typed_chunk {
TypedChunk::Documents(obkv_documents_iter) => {