Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-08-02 11:50:03 +00:00)

Commit 85824ee203: Try to make facet indexing incremental
Committed by Loïc Lecrenier
Parent: d30c89e345
@@ -32,6 +32,10 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
        let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
        let field_id = FieldId::from_be_bytes(field_id_bytes);

        // document_id_bytes is a big-endian u32
        // merge_cbo_roaring_bitmap works with native endian u32s
        // that is a problem, I think

        let (document_id_bytes, normalized_value_bytes) =
            try_split_array_at::<_, 4>(bytes).unwrap();
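A note on the endianness concern in the comment above: the extracted keys store the document id big-endian (so byte-wise key ordering matches numeric ordering), while a RoaringBitmap holds native u32 values, so the id has to be decoded explicitly at that boundary. A minimal standalone sketch, not part of the commit:

use roaring::RoaringBitmap;

// `document_id_bytes` comes from a big-endian key; decode it explicitly
// rather than reinterpreting the bytes, which would go wrong on
// little-endian hosts.
fn push_docid(bitmap: &mut RoaringBitmap, document_id_bytes: [u8; 4]) {
    let docid = u32::from_be_bytes(document_id_bytes);
    bitmap.insert(docid);
}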
@@ -34,7 +34,6 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::UserError;
use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice};
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
    self, FacetsUpdateBulk, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
@@ -431,23 +430,6 @@ where
        // Merged databases have already been indexed; we start from this count.
        let mut databases_seen = MERGED_DATABASE_COUNT;

        // Run the facets update operation.
        for facet_db in [
            (&self.index.facet_id_string_docids).remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
            (&self.index.facet_id_f64_docids).remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
        ] {
            let mut builder = FacetsUpdateBulk::new(self.index, facet_db);
            builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
            builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
            if let Some(value) = self.config.facet_level_group_size {
                builder.level_group_size(value);
            }
            if let Some(value) = self.config.facet_min_level_size {
                builder.min_level_size(value);
            }
            builder.execute(self.wtxn)?;
        }

        databases_seen += 1;
        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
            databases_seen,
@@ -13,7 +13,9 @@ use super::helpers::{
    valid_lmdb_key, CursorClonableMmap,
};
use super::{ClonableMmap, MergeFn};
use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice};
use crate::update::index_documents::helpers::as_cloneable_grenad;
use crate::update::FacetsUpdateIncremental;
use crate::{
    lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index,
    Result,
@@ -146,6 +148,34 @@ pub(crate) fn write_typed_chunk_into_index(
            )?;
            is_merged_database = true;
        }
        TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
            // merge cbo roaring bitmaps is not the correct merger because the data in the DB
            // is FacetGroupValue and not RoaringBitmap
            // so I need to create my own merging function

            // facet_id_string_docids is encoded as:
            // key: FacetKeyCodec<StrRefCodec>
            // value: CboRoaringBitmapCodec
            // basically

            // TODO: a condition saying "if I have more than 1/50th of the DB to add,
            // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50,
            // it is a ratio I determine empirically

            // for now I only do it incrementally, to see if things work
            let builder = FacetsUpdateIncremental::new(
                index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
            );
            let mut cursor = facet_id_string_docids.into_cursor()?;
            while let Some((key, value)) = cursor.move_on_next()? {
                let key =
                    FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?;
                let value =
                    CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
                builder.insert(wtxn, key.field_id, key.left_bound, &value)?;
            }
            is_merged_database = true;
        }
        TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
            append_entries_into_database(
                facet_id_exists_docids,
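The first comment in the hunk above notes that merge_cbo_roaring_bitmap is not the right merger here because the stored values are FacetGroupValue, not plain RoaringBitmap, so a custom merging function is needed. A sketch of the bitmap-union half of such a merger, assuming plain RoaringBitmap (de)serialization rather than the real CboRoaringBitmapCodec so the example stays self-contained; merging the rest of FacetGroupValue is deliberately left out:

use std::borrow::Cow;
use roaring::RoaringBitmap;

// Union every serialized bitmap written for the same key and
// re-serialize the result.
fn merge_bitmaps(values: &[Cow<'_, [u8]>]) -> std::io::Result<Vec<u8>> {
    let mut merged = RoaringBitmap::new();
    for value in values {
        merged |= RoaringBitmap::deserialize_from(&value[..])?;
    }
    let mut out = Vec::with_capacity(merged.serialized_size());
    merged.serialize_into(&mut out)?;
    Ok(out)
}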
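The TODO in the same hunk proposes switching between the bulk and incremental strategies once the delta exceeds roughly 1/50th of the database. A sketch of that condition; the names BULK_RATIO and should_update_in_bulk are hypothetical, and the ratio is, as the TODO says, a value to determine empirically:

// Hypothetical names; not part of the commit.
const BULK_RATIO: u64 = 50; // i.e. 1/50th of the DB, to be tuned empirically

fn should_update_in_bulk(new_entries: u64, db_entries: u64) -> bool {
    // Rebuild in bulk when the incoming delta is at least 1/50th of the
    // existing DB; otherwise insert entries one by one (the incremental path).
    new_entries.saturating_mul(BULK_RATIO) >= db_entries
}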
@@ -188,17 +218,6 @@ pub(crate) fn write_typed_chunk_into_index(
                }
            }
        }
        TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => {
            // facet_id_string_docids contains the thing that the extractor put into it,
            // so: (FacetKey { field id, level: 0, left_bound }, docids: RoaringBitmap)
            // now we need to either:
            // 1. incrementally add the keys/docids pairs into the DB
            // 2. add the keys/docids into level 0 and then call Facets::execute
            // the choice of solution should be determined by their performance
            // characteristics

            is_merged_database = true;
        }
        TypedChunk::GeoPoints(geo_points) => {
            let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
            let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
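The removed comment above describes the extractor output as (FacetKey { field id, level: 0, left_bound }, docids: RoaringBitmap). For illustration, that key can be pictured as the struct below; the field types are assumptions inferred from how the new code uses key.field_id and key.left_bound, not a copy of the real FacetKey definition:

// Assumed shape of the level-0 facet key, for illustration only.
struct FacetKey<'a> {
    field_id: u16,       // milli's FieldId
    level: u8,           // extractor output is always level 0
    left_bound: &'a str, // at level 0, the facet string itself
}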