From 79671c9faa2bdc2e2dcd83191a31a00e7175d2e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Nov 2024 12:19:32 +0100 Subject: [PATCH 01/37] Implement a first version of the bbqueue channels --- Cargo.lock | 7 ++++ crates/milli/Cargo.toml | 2 ++ crates/milli/src/update/new/channel.rs | 46 ++++++++++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index e4789da4a..e2069db87 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -489,6 +489,11 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bbqueue" +version = "0.5.1" +source = "git+https://github.com/kerollmops/bbqueue#cbb87cc707b5af415ef203bdaf2443e06ba0d6d4" + [[package]] name = "benchmarks" version = "1.12.0" @@ -3611,6 +3616,7 @@ version = "1.12.0" dependencies = [ "allocator-api2", "arroy 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "bbqueue", "big_s", "bimap", "bincode", @@ -3623,6 +3629,7 @@ dependencies = [ "candle-transformers", "charabia", "concat-arrays", + "crossbeam", "crossbeam-channel", "csv", "deserr", diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index a0bd86a42..798a4ea19 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -98,6 +98,8 @@ allocator-api2 = "0.2.18" rustc-hash = "2.0.0" uell = "0.1.0" enum-iterator = "2.1.0" +bbqueue = { git = "https://github.com/kerollmops/bbqueue" } +crossbeam = "0.8.4" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 00b471b52..21cd6b87d 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,6 +1,7 @@ use std::marker::PhantomData; use std::sync::atomic::{AtomicUsize, Ordering}; +use crossbeam::sync::{Parker, Unparker}; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use heed::types::Bytes; use heed::BytesDecode; @@ -8,6 +9,7 @@ use memmap2::Mmap; use roaring::RoaringBitmap; use super::extract::FacetKind; +use super::thread_local::{FullySend, ThreadLocal}; use super::StdResult; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; @@ -16,6 +18,50 @@ use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; use crate::{DocumentId, Index}; +/// Creates a tuple of producer/receivers to be used by +/// the extractors and the writer loop. +/// +/// # Safety +/// +/// Panics if the number of provided bbqueue is not exactly equal +/// to the number of available threads in the rayon threadpool. 
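+///
+/// A minimal usage sketch (hypothetical buffer size; the only hard
+/// requirement checked here is one `BBBuffer` per rayon thread):
+///
+/// ```ignore
+/// let bbbuffers: Vec<_> = (0..rayon::current_num_threads())
+///     .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB per thread
+///     .collect();
+/// let (extractor_sender, writer_receiver) = extractor_writer_bbqueue(&bbbuffers);
+/// ```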
+pub fn extractor_writer_bbqueue( + bbqueue: &[bbqueue::BBBuffer], +) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { + assert_eq!( + bbqueue.len(), + rayon::current_num_threads(), + "You must provide as many BBBuffer as the available number of threads to extract" + ); + + let parker = Parker::new(); + let extractors = ThreadLocal::with_capacity(bbqueue.len()); + let producers = rayon::broadcast(|bi| { + let bbqueue = &bbqueue[bi.index()]; + let (producer, consumer) = bbqueue.try_split_framed().unwrap(); + extractors.get_or(|| FullySend(producer)); + consumer + }); + + ( + ExtractorBbqueueSender { inner: extractors, unparker: parker.unparker().clone() }, + WriterBbqueueReceiver { inner: producers, parker }, + ) +} + +pub struct ExtractorBbqueueSender<'a> { + inner: ThreadLocal>>, + /// Used to wake up the receiver thread, + /// Used everytime we write something in the producer. + unparker: Unparker, +} + +pub struct WriterBbqueueReceiver<'a> { + inner: Vec>, + /// Used to park when no more work is required + parker: Parker, +} + /// The capacity of the channel is currently in number of messages. pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) { let (sender, receiver) = crossbeam_channel::bounded(cap); From 8442db8101ccc7df7e5b5ab98f8be593d659700a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Nov 2024 18:30:44 +0100 Subject: [PATCH 02/37] Implement mostly all senders --- .../cbo_roaring_bitmap_codec.rs | 19 + crates/milli/src/update/new/channel.rs | 641 ++++++++++-------- .../milli/src/update/new/extract/documents.rs | 11 +- .../src/update/new/extract/vectors/mod.rs | 6 +- crates/milli/src/update/new/indexer/mod.rs | 8 +- crates/milli/src/update/new/merger.rs | 17 +- 6 files changed, 398 insertions(+), 304 deletions(-) diff --git a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 257d5bd0a..cae1874dd 100644 --- a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -41,6 +41,25 @@ impl CboRoaringBitmapCodec { } } + pub fn serialize_into_writer( + roaring: &RoaringBitmap, + mut writer: W, + ) -> io::Result<()> { + if roaring.len() <= THRESHOLD as u64 { + // If the number of items (u32s) to encode is less than or equal to the threshold + // it means that it would weigh the same or less than the RoaringBitmap + // header, so we directly encode them using ByteOrder instead. + for integer in roaring { + writer.write_u32::(integer)?; + } + } else { + // Otherwise, we use the classic RoaringBitmapCodec that writes a header. 
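+            // (Worked example, assuming the crate's THRESHOLD of 7: seven u32s
+            // encoded raw take 28 bytes, while the RoaringBitmap serialization
+            // would pay for its header on top of the data, hence the cutoff.)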
+ roaring.serialize_into(writer)?; + } + + Ok(()) + } + pub fn deserialize_from(mut bytes: &[u8]) -> io::Result { if bytes.len() <= THRESHOLD * size_of::() { // If there is threshold or less than threshold integers that can fit into this array diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 21cd6b87d..cacc7b129 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,14 +1,19 @@ +use std::cell::RefCell; use std::marker::PhantomData; -use std::sync::atomic::{AtomicUsize, Ordering}; +use std::num::NonZeroU16; +use std::{mem, slice}; +use bbqueue::framed::{FrameGrantR, FrameProducer}; +use bytemuck::{NoUninit, CheckedBitPattern}; use crossbeam::sync::{Parker, Unparker}; -use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; +use crossbeam_channel::{IntoIter, Receiver, SendError}; use heed::types::Bytes; use heed::BytesDecode; use memmap2::Mmap; use roaring::RoaringBitmap; use super::extract::FacetKind; +use super::ref_cell_ext::RefCellExt; use super::thread_local::{FullySend, ThreadLocal}; use super::StdResult; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; @@ -16,7 +21,7 @@ use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; use crate::index::{db_name, IndexEmbeddingConfig}; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; -use crate::{DocumentId, Index}; +use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// Creates a tuple of producer/receivers to be used by /// the extractors and the writer loop. @@ -26,125 +31,97 @@ use crate::{DocumentId, Index}; /// Panics if the number of provided bbqueue is not exactly equal /// to the number of available threads in the rayon threadpool. pub fn extractor_writer_bbqueue( - bbqueue: &[bbqueue::BBBuffer], + bbbuffers: &[bbqueue::BBBuffer], ) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { assert_eq!( - bbqueue.len(), + bbbuffers.len(), rayon::current_num_threads(), "You must provide as many BBBuffer as the available number of threads to extract" ); + let capacity = bbbuffers.first().unwrap().capacity(); let parker = Parker::new(); - let extractors = ThreadLocal::with_capacity(bbqueue.len()); + let extractors = ThreadLocal::with_capacity(bbbuffers.len()); let producers = rayon::broadcast(|bi| { - let bbqueue = &bbqueue[bi.index()]; + let bbqueue = &bbbuffers[bi.index()]; let (producer, consumer) = bbqueue.try_split_framed().unwrap(); - extractors.get_or(|| FullySend(producer)); + extractors.get_or(|| FullySend(RefCell::new(producer))); consumer }); ( - ExtractorBbqueueSender { inner: extractors, unparker: parker.unparker().clone() }, + ExtractorBbqueueSender { + inner: extractors, + capacity: capacity.checked_sub(9).unwrap(), + unparker: parker.unparker().clone(), + }, WriterBbqueueReceiver { inner: producers, parker }, ) } -pub struct ExtractorBbqueueSender<'a> { - inner: ThreadLocal>>, - /// Used to wake up the receiver thread, - /// Used everytime we write something in the producer. - unparker: Unparker, -} - pub struct WriterBbqueueReceiver<'a> { inner: Vec>, /// Used to park when no more work is required parker: Parker, } -/// The capacity of the channel is currently in number of messages. 
-pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) { - let (sender, receiver) = crossbeam_channel::bounded(cap); - ( - ExtractorSender { - sender, - send_count: Default::default(), - writer_contentious_count: Default::default(), - extractor_contentious_count: Default::default(), - }, - WriterReceiver(receiver), - ) -} - -pub enum KeyValueEntry { - Small { key_length: usize, data: Box<[u8]> }, - Large { key_entry: KeyEntry, data: Mmap }, -} - -impl KeyValueEntry { - pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self { - let mut data = Vec::with_capacity(key.len() + value.len()); - data.extend_from_slice(key); - data.extend_from_slice(value); - KeyValueEntry::Small { key_length: key.len(), data: data.into_boxed_slice() } - } - - fn from_large_key_value(key: &[u8], value: Mmap) -> Self { - KeyValueEntry::Large { key_entry: KeyEntry::from_key(key), data: value } - } - - pub fn key(&self) -> &[u8] { - match self { - KeyValueEntry::Small { key_length, data } => &data[..*key_length], - KeyValueEntry::Large { key_entry, data: _ } => key_entry.entry(), - } - } - - pub fn value(&self) -> &[u8] { - match self { - KeyValueEntry::Small { key_length, data } => &data[*key_length..], - KeyValueEntry::Large { key_entry: _, data } => &data[..], +impl<'a> WriterBbqueueReceiver<'a> { + pub fn read(&mut self) -> Option> { + loop { + for consumer in &mut self.inner { + // mark the frame as auto release + if let Some() = consumer.read() + } + break None; } } } -pub struct KeyEntry { - data: Box<[u8]>, +struct FrameWithHeader<'a> { + header: EntryHeader, + frame: FrameGrantR<'a>, } -impl KeyEntry { - pub fn from_key(key: &[u8]) -> Self { - KeyEntry { data: key.to_vec().into_boxed_slice() } +#[derive(Debug, Clone, Copy, CheckedBitPattern)] +#[repr(u8)] +enum EntryHeader { + /// Wether a put of the key/value pair or a delete of the given key. + DbOperation { + /// The database on which to perform the operation. + database: Database, + /// The key length in the buffer. + /// + /// If None it means that the buffer is dedicated + /// to the key and it is therefore a deletion operation. + key_length: Option, + }, + ArroyDeleteVector { + docid: DocumentId, + }, + /// The embedding is the remaining space and represents a non-aligned [f32]. 
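+    /// (The `f32`s are written right after the header in the frame, so the
+    /// embedding length is deduced from the remaining frame length rather
+    /// than being stored explicitly.)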
+ ArroySetVector { + docid: DocumentId, + embedder_id: u8, + }, +} + +impl EntryHeader { + fn delete_key_size(key_length: u16) -> usize { + mem::size_of::() + key_length as usize } - pub fn entry(&self) -> &[u8] { - self.data.as_ref() + fn put_key_value_size(key_length: u16, value_length: usize) -> usize { + mem::size_of::() + key_length as usize + value_length + } + + fn bytes_of(&self) -> &[u8] { + /// TODO do the variant matching ourselves + todo!() } } -pub enum EntryOperation { - Delete(KeyEntry), - Write(KeyValueEntry), -} - -pub enum WriterOperation { - DbOperation(DbOperation), - ArroyOperation(ArroyOperation), -} - -pub enum ArroyOperation { - DeleteVectors { docid: DocumentId }, - SetVectors { docid: DocumentId, embedder_id: u8, embeddings: Vec }, - SetVector { docid: DocumentId, embedder_id: u8, embedding: Embedding }, - Finish { configs: Vec }, -} - -pub struct DbOperation { - database: Database, - entry: EntryOperation, -} - -#[derive(Debug)] +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(u32)] pub enum Database { Main, Documents, @@ -220,82 +197,46 @@ impl From for Database { } } -impl DbOperation { - pub fn database(&self, index: &Index) -> heed::Database { - self.database.database(index) - } - - pub fn database_name(&self) -> &'static str { - self.database.database_name() - } - - pub fn entry(self) -> EntryOperation { - self.entry - } +pub struct ExtractorBbqueueSender<'a> { + inner: ThreadLocal>>>, + /// The capacity of this frame producer, will never be able to store more than that. + /// + /// Note that the FrameProducer requires up to 9 bytes to encode the length, + /// the capacity has been shrinked accordingly. + /// + /// + capacity: usize, + /// Used to wake up the receiver thread, + /// Used everytime we write something in the producer. + unparker: Unparker, } -pub struct WriterReceiver(Receiver); - -impl IntoIterator for WriterReceiver { - type Item = WriterOperation; - type IntoIter = IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} - -pub struct ExtractorSender { - sender: Sender, - /// The number of message we sent in total in the channel. - send_count: AtomicUsize, - /// The number of times we sent something in a channel that was full. - writer_contentious_count: AtomicUsize, - /// The number of times we sent something in a channel that was empty. 
- extractor_contentious_count: AtomicUsize, -} - -impl Drop for ExtractorSender { - fn drop(&mut self) { - let send_count = *self.send_count.get_mut(); - let writer_contentious_count = *self.writer_contentious_count.get_mut(); - let extractor_contentious_count = *self.extractor_contentious_count.get_mut(); - tracing::debug!( - "Extractor channel stats: {send_count} sends, \ - {writer_contentious_count} writer contentions ({}%), \ - {extractor_contentious_count} extractor contentions ({}%)", - (writer_contentious_count as f32 / send_count as f32) * 100.0, - (extractor_contentious_count as f32 / send_count as f32) * 100.0 - ) - } -} - -impl ExtractorSender { - pub fn docids(&self) -> WordDocidsSender<'_, D> { +impl<'b> ExtractorBbqueueSender<'b> { + pub fn docids<'a, D: DatabaseType>(&'a self) -> WordDocidsSender<'a, 'b, D> { WordDocidsSender { sender: self, _marker: PhantomData } } - pub fn facet_docids(&self) -> FacetDocidsSender<'_> { + pub fn facet_docids<'a>(&'a self) -> FacetDocidsSender<'a, 'b> { FacetDocidsSender { sender: self } } - pub fn field_id_docid_facet_sender(&self) -> FieldIdDocidFacetSender<'_> { - FieldIdDocidFacetSender(self) + pub fn field_id_docid_facet_sender<'a>(&'a self) -> FieldIdDocidFacetSender<'a, 'b> { + FieldIdDocidFacetSender(&self) } - pub fn documents(&self) -> DocumentsSender<'_> { - DocumentsSender(self) + pub fn documents<'a>(&'a self) -> DocumentsSender<'a, 'b> { + DocumentsSender(&self) } - pub fn embeddings(&self) -> EmbeddingSender<'_> { - EmbeddingSender(&self.sender) + pub fn embeddings<'a>(&'a self) -> EmbeddingSender<'a, 'b> { + EmbeddingSender(&self) } - pub fn geo(&self) -> GeoSender<'_> { - GeoSender(&self.sender) + pub fn geo<'a>(&'a self) -> GeoSender<'a, 'b> { + GeoSender(&self) } - fn send_delete_vector(&self, docid: DocumentId) -> StdResult<(), SendError<()>> { + fn send_delete_vector(&self, docid: DocumentId) -> crate::Result<()> { match self .sender .send(WriterOperation::ArroyOperation(ArroyOperation::DeleteVectors { docid })) @@ -305,18 +246,69 @@ impl ExtractorSender { } } - fn send_db_operation(&self, op: DbOperation) -> StdResult<(), SendError<()>> { - if self.sender.is_full() { - self.writer_contentious_count.fetch_add(1, Ordering::SeqCst); - } - if self.sender.is_empty() { - self.extractor_contentious_count.fetch_add(1, Ordering::SeqCst); + fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let key_length = key.len().try_into().unwrap(); + let value_length = value.len(); + let total_length = EntryHeader::put_key_value_size(key_length, value_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); } - self.send_count.fetch_add(1, Ordering::SeqCst); - match self.sender.send(WriterOperation::DbOperation(op)) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), + let payload_header = + EntryHeader::DbOperation { database, key_length: NonZeroU16::new(key_length) }; + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + let (key_out, value_out) = remaining.split_at_mut(key.len()); + key_out.copy_from_slice(key); + value_out.copy_from_slice(value); + + // We could 
commit only the used memory. + grant.commit(total_length); + + break Ok(()); + } + } + + fn delete_entry(&self, database: Database, key: &[u8]) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let key_length = key.len().try_into().unwrap(); + let total_length = EntryHeader::delete_key_size(key_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); + } + + let payload_header = EntryHeader::DbOperation { database, key_length: None }; + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + remaining.copy_from_slice(key); + + // We could commit only the used memory. + grant.commit(total_length); + + break Ok(()); } } } @@ -356,159 +348,237 @@ impl DatabaseType for WordPositionDocids { const DATABASE: Database = Database::WordPositionDocids; } -pub trait DocidsSender { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>; - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>; -} - -pub struct WordDocidsSender<'a, D> { - sender: &'a ExtractorSender, +pub struct WordDocidsSender<'a, 'b, D> { + sender: &'a ExtractorBbqueueSender<'b>, _marker: PhantomData, } -impl DocidsSender for WordDocidsSender<'_, D> { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); - match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), +impl WordDocidsSender<'_, '_, D> { + pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { + let capacity = self.sender.capacity; + let refcell = self.sender.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let key_length = key.len().try_into().unwrap(); + let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); + + let total_length = EntryHeader::put_key_value_size(key_length, value_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); } - } - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } - } -} - -pub struct FacetDocidsSender<'a> { - sender: &'a ExtractorSender, -} - -impl DocidsSender for FacetDocidsSender<'_> { - fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { - let (facet_kind, key) = FacetKind::extract_from_key(key); - let database = Database::from(facet_kind); - let entry = match facet_kind { - // skip level group size - FacetKind::String | FacetKind::Number => { - // add facet group size - let value = [&[1], value].concat(); - EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &value)) - } - _ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)), + let payload_header = EntryHeader::DbOperation { + database: D::DATABASE, + key_length: NonZeroU16::new(key_length), }; - match self.sender.send_db_operation(DbOperation { database, entry }) { - Ok(()) => Ok(()), - 
Err(SendError(_)) => Err(SendError(())), + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + let (key_out, value_out) = remaining.split_at_mut(key.len()); + key_out.copy_from_slice(key); + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; + + // We could commit only the used memory. + grant.commit(total_length); + + break Ok(()); } } - fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete(&self, key: &[u8]) -> crate::Result<()> { + let capacity = self.sender.capacity; + let refcell = self.sender.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let key_length = key.len().try_into().unwrap(); + let total_length = EntryHeader::delete_key_size(key_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); + } + + let payload_header = EntryHeader::DbOperation { database: D::DATABASE, key_length: None }; + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + remaining.copy_from_slice(key); + + // We could commit only the used memory. + grant.commit(total_length); + + break Ok(()); + } + } +} + +pub struct FacetDocidsSender<'a, 'b> { + sender: &'a ExtractorBbqueueSender<'b>, +} + +impl FacetDocidsSender<'_, '_> { + pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { + let capacity = self.sender.capacity; + let refcell = self.sender.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + let (facet_kind, key) = FacetKind::extract_from_key(key); - let database = Database::from(facet_kind); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - match self.sender.send_db_operation(DbOperation { database, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), + let key_length = key.len().try_into().unwrap(); + + let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); + let value_length = match facet_kind { + // We must take the facet group size into account + // when we serialize strings and numbers. + FacetKind::Number | FacetKind::String => value_length + 1, + FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length, + }; + + let total_length = EntryHeader::put_key_value_size(key_length, value_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); + } + + let payload_header = EntryHeader::DbOperation { + database: Database::from(facet_kind), + key_length: NonZeroU16::new(key_length), + }; + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + let (key_out, value_out) = remaining.split_at_mut(key.len()); + key_out.copy_from_slice(key); + + let value_out = match facet_kind { + // We must take the facet group size into account + // when we serialize strings and numbers. 
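+                // (This writes the same one-byte level-0 facet group size that
+                // the old `[&[1], value].concat()` code above produced, but in
+                // place, without the intermediate allocation.)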
+ FacetKind::String | FacetKind::Number => { + let (first, remaining) = value_out.split_first_mut().unwrap(); + *first = 1; + remaining + } + FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out, + }; + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; + + // We could commit only the used memory. + grant.commit(total_length); + + break Ok(()); + } + } + + pub fn delete(&self, key: &[u8]) -> crate::Result<()> { + let capacity = self.sender.capacity; + let refcell = self.sender.inner.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let (facet_kind, key) = FacetKind::extract_from_key(key); + let key_length = key.len().try_into().unwrap(); + + let total_length = EntryHeader::delete_key_size(key_length); + if total_length > capacity { + unreachable!("entry larger that the bbqueue capacity"); + } + + let payload_header = + EntryHeader::DbOperation { database: Database::from(facet_kind), key_length: None }; + + loop { + let mut grant = match producer.grant(total_length) { + Ok(grant) => grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + }; + + let (header, remaining) = grant.split_at_mut(mem::size_of::()); + header.copy_from_slice(payload_header.bytes_of()); + remaining.copy_from_slice(key); + + // We could commit only the used memory. + grant.commit(total_length); + + break Ok(()); } } } -pub struct FieldIdDocidFacetSender<'a>(&'a ExtractorSender); +pub struct FieldIdDocidFacetSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl FieldIdDocidFacetSender<'_> { - pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> { +impl FieldIdDocidFacetSender<'_, '_> { + pub fn write_facet_string(&self, key: &[u8], value: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)); - self.0 - .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) + self.0.write_key_value(Database::FieldIdDocidFacetStrings, key, value) } - pub fn write_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn write_facet_f64(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &[])); - self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) + self.0.write_key_value(Database::FieldIdDocidFacetF64s, key, &[]) } - pub fn delete_facet_string(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete_facet_string(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetStringCodec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - self.0 - .send_db_operation(DbOperation { database: Database::FieldIdDocidFacetStrings, entry }) + self.0.delete_entry(Database::FieldIdDocidFacetStrings, key) } - pub fn delete_facet_f64(&self, key: &[u8]) -> StdResult<(), SendError<()>> { + pub fn delete_facet_f64(&self, key: &[u8]) -> crate::Result<()> { debug_assert!(FieldDocIdFacetF64Codec::bytes_decode(key).is_ok()); - let entry = EntryOperation::Delete(KeyEntry::from_key(key)); - self.0.send_db_operation(DbOperation { database: Database::FieldIdDocidFacetF64s, entry }) + self.0.delete_entry(Database::FieldIdDocidFacetF64s, key) } } -pub struct DocumentsSender<'a>(&'a ExtractorSender); +pub struct 
DocumentsSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl DocumentsSender<'_> { +impl DocumentsSender<'_, '_> { /// TODO do that efficiently pub fn uncompressed( &self, docid: DocumentId, external_id: String, document: &KvReaderFieldId, - ) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( - &docid.to_be_bytes(), - document.as_bytes(), - )); - match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - }?; - - let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value( + ) -> crate::Result<()> { + self.0.write_key_value(Database::Documents, &docid.to_be_bytes(), document.as_bytes())?; + self.0.write_key_value( + Database::ExternalDocumentsIds, external_id.as_bytes(), &docid.to_be_bytes(), - )); - match self - .0 - .send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry }) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + ) } - pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> { - let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes())); - match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - }?; - + pub fn delete(&self, docid: DocumentId, external_id: String) -> crate::Result<()> { + self.0.delete_entry(Database::Documents, &docid.to_be_bytes())?; self.0.send_delete_vector(docid)?; - - let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes())); - match self - .0 - .send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry }) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), - } + self.0.delete_entry(Database::ExternalDocumentsIds, external_id.as_bytes()) } } -pub struct EmbeddingSender<'a>(&'a Sender); +pub struct EmbeddingSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl EmbeddingSender<'_> { +impl EmbeddingSender<'_, '_> { pub fn set_vectors( &self, docid: DocumentId, embedder_id: u8, embeddings: Vec, - ) -> StdResult<(), SendError<()>> { + ) -> crate::Result<()> { self.0 .send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors { docid, @@ -541,33 +611,36 @@ impl EmbeddingSender<'_> { } } -pub struct GeoSender<'a>(&'a Sender); +pub struct GeoSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); -impl GeoSender<'_> { +impl GeoSender<'_, '_> { pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::DbOperation(DbOperation { - database: Database::Main, - entry: EntryOperation::Write(KeyValueEntry::from_large_key_value( - GEO_RTREE_KEY.as_bytes(), - value, - )), - })) - .map_err(|_| SendError(())) + todo!("set rtree from file") + // self.0 + // .send(WriterOperation::DbOperation(DbOperation { + // database: Database::Main, + // entry: EntryOperation::Write(KeyValueEntry::from_large_key_value( + // GEO_RTREE_KEY.as_bytes(), + // value, + // )), + // })) + // .map_err(|_| SendError(())) } pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> StdResult<(), SendError<()>> { - let mut buffer = Vec::new(); - bitmap.serialize_into(&mut buffer).unwrap(); + todo!("serialize directly into bbqueue (as a real roaringbitmap not a cbo)") - self.0 - .send(WriterOperation::DbOperation(DbOperation { - database: Database::Main, - entry: EntryOperation::Write(KeyValueEntry::from_small_key_value( - 
GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(), - &buffer, - )), - })) - .map_err(|_| SendError(())) + // let mut buffer = Vec::new(); + // bitmap.serialize_into(&mut buffer).unwrap(); + + // self.0 + // .send(WriterOperation::DbOperation(DbOperation { + // database: Database::Main, + // entry: EntryOperation::Write(KeyValueEntry::from_small_key_value( + // GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(), + // &buffer, + // )), + // })) + // .map_err(|_| SendError(())) } } diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index aeb1d5694..13307025a 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -12,13 +12,14 @@ use crate::update::new::thread_local::FullySend; use crate::update::new::DocumentChange; use crate::vector::EmbeddingConfigs; use crate::Result; -pub struct DocumentsExtractor<'a> { - document_sender: &'a DocumentsSender<'a>, + +pub struct DocumentsExtractor<'a, 'b> { + document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs, } -impl<'a> DocumentsExtractor<'a> { - pub fn new(document_sender: &'a DocumentsSender<'a>, embedders: &'a EmbeddingConfigs) -> Self { +impl<'a, 'b> DocumentsExtractor<'a, 'b> { + pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self { Self { document_sender, embedders } } } @@ -29,7 +30,7 @@ pub struct DocumentExtractorData { pub field_distribution_delta: HashMap, } -impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { +impl<'a, 'b, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a, 'b> { type Data = FullySend>; fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result { diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 8ac73a8d7..52b13f37d 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -20,7 +20,7 @@ use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAb pub struct EmbeddingExtractor<'a> { embedders: &'a EmbeddingConfigs, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a>, possible_embedding_mistakes: PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, } @@ -28,7 +28,7 @@ pub struct EmbeddingExtractor<'a> { impl<'a> EmbeddingExtractor<'a> { pub fn new( embedders: &'a EmbeddingConfigs, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a>, field_distribution: &'a FieldDistribution, threads: &'a ThreadPoolNoAbort, ) -> Self { @@ -368,7 +368,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { possible_embedding_mistakes: &PossibleEmbeddingMistakes, unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, - sender: &EmbeddingSender<'a>, + sender: EmbeddingSender<'a>, has_manual_generation: Option<&'a str>, ) -> Result<()> { if let Some(external_docid) = has_manual_generation { diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 35dea7a98..88a4c2f77 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -76,7 +76,11 @@ where MSP: Fn() -> bool + Sync, SP: Fn(Progress) + Sync, { - let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000); + /// TODO restrict memory and remove this memory from the extractors bum allocators + let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) + .map(|_| 
bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread + .collect(); + let (extractor_sender, writer_receiver) = extractor_writer_bbqueue(&bbbuffers); let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; @@ -115,7 +119,7 @@ where // document but we need to create a function that collects and compresses documents. let document_sender = extractor_sender.documents(); - let document_extractor = DocumentsExtractor::new(&document_sender, embedders); + let document_extractor = DocumentsExtractor::new(document_sender, embedders); let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); { let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 039c56b9d..f2809b376 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -19,7 +19,7 @@ pub fn merge_and_send_rtree<'extractor, MSP>( datastore: impl IntoIterator>>, rtxn: &RoTxn, index: &Index, - geo_sender: GeoSender<'_>, + geo_sender: GeoSender<'_, '_>, must_stop_processing: &MSP, ) -> Result<()> where @@ -62,19 +62,19 @@ where } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -pub fn merge_and_send_docids<'extractor, MSP>( +pub fn merge_and_send_docids<'extractor, MSP, D>( mut caches: Vec>, database: Database, index: &Index, - docids_sender: impl DocidsSender + Sync, + docids_sender: WordDocidsSender, must_stop_processing: &MSP, ) -> Result<()> where MSP: Fn() -> bool + Sync, + D: DatabaseType + Sync, { transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| { let rtxn = index.read_txn()?; - let mut buffer = Vec::new(); if must_stop_processing() { return Err(InternalError::AbortedIndexation.into()); } @@ -82,8 +82,7 @@ where let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { - let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); - docids_sender.write(key, value).unwrap(); + docids_sender.write(key, &bitmap).unwrap(); Ok(()) } Operation::Delete => { @@ -101,21 +100,19 @@ pub fn merge_and_send_facet_docids<'extractor>( mut caches: Vec>, database: FacetDatabases, index: &Index, - docids_sender: impl DocidsSender + Sync, + docids_sender: FacetDocidsSender, ) -> Result { transpose_and_freeze_caches(&mut caches)? .into_par_iter() .map(|frozen| { let mut facet_field_ids_delta = FacetFieldIdsDelta::default(); let rtxn = index.read_txn()?; - let mut buffer = Vec::new(); merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? 
{ Operation::Write(bitmap) => { facet_field_ids_delta.register_from_key(key); - let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); - docids_sender.write(key, value).unwrap(); + docids_sender.write(key, &bitmap).unwrap(); Ok(()) } Operation::Delete => { From 2094ce8a9a8febbab73efdeb0c477cda1c9c67c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 10:19:59 +0100 Subject: [PATCH 03/37] Move the arroy building after the writing loop --- crates/milli/src/update/new/indexer/mod.rs | 81 +++++++++++----------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 88a4c2f77..f82f4af37 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -76,7 +76,7 @@ where MSP: Fn() -> bool + Sync, SP: Fn(Progress) + Sync, { - /// TODO restrict memory and remove this memory from the extractors bum allocators + /// TODO restrict memory and remove this memory from the extractors bump allocators let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread .collect(); @@ -100,6 +100,7 @@ where send_progress, }; + let mut index_embeddings = index.embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?; @@ -296,7 +297,6 @@ where 'vectors: { - let mut index_embeddings = index.embedding_configs(&rtxn)?; if index_embeddings.is_empty() { break 'vectors; } @@ -322,8 +322,6 @@ where } } } - - embedding_sender.finish(index_embeddings).unwrap(); } 'geo: { @@ -457,46 +455,47 @@ where embeddings.append(embedding).unwrap(); } - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_items(wtxn, docid, &embeddings)?; - } - ArroyOperation::SetVector { docid, embedder_id, embedding } => { - let (_, _, writer, dimensions) = arroy_writers - .get(&embedder_id) - .expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, &embedding)?; - } - ArroyOperation::Finish { configs } => { - let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); - let _entered = span.enter(); - - (indexing_context.send_progress)(Progress::from_step( - Step::WritingEmbeddingsToDatabase, - )); - - for ( - _embedder_index, - (_embedder_name, _embedder, writer, dimensions), - ) in &mut arroy_writers - { - let dimensions = *dimensions; - writer.build_and_quantize( - wtxn, - &mut rng, - dimensions, - false, - &indexing_context.must_stop_processing, - )?; - } - - index.put_embedding_configs(wtxn, configs)?; - } - }, - } + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_items(wtxn, docid, &embeddings)?; + } + ArroyOperation::SetVector { docid, embedder_id, embedding } => { + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_item(wtxn, docid, &embedding)?; + } + _otherwise => unreachable!(), + }, } } + 'vectors: { + let span = + tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); + let _entered = span.enter(); + + if index_embeddings.is_empty() { + break 'vectors; + } + + (indexing_context.send_progress)(Progress::from_step( + Step::WritingEmbeddingsToDatabase, + )); + + for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers { + let 
dimensions = *dimensions; + writer.build_and_quantize( + wtxn, + &mut rng, + dimensions, + false, + &indexing_context.must_stop_processing, + )?; + } + + index.put_embedding_configs(wtxn, index_embeddings)?; + } + (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); let facet_field_ids_delta = extractor_handle.join().unwrap()?; From e1e76f39d044d083bd7bf0552cc20b36d948af7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 13:03:39 +0100 Subject: [PATCH 04/37] Clean up dependencies --- Cargo.lock | 21 ++++----------------- Cargo.toml | 3 --- crates/benchmarks/Cargo.toml | 2 +- crates/dump/Cargo.toml | 2 +- crates/index-scheduler/Cargo.toml | 4 ++-- crates/index-scheduler/src/lib.rs | 10 +++++----- crates/meilisearch-auth/Cargo.toml | 2 +- crates/meilisearch-types/Cargo.toml | 2 +- crates/meilisearch/Cargo.toml | 2 +- crates/milli/Cargo.toml | 3 +-- 10 files changed, 17 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2069db87..8a0a6b3d0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1251,19 +1251,6 @@ dependencies = [ "itertools 0.10.5", ] -[[package]] -name = "crossbeam" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1137cd7e7fc0fb5d3c5a8678be38ec56e819125d8d7907411fe24ccb943faca8" -dependencies = [ - "crossbeam-channel", - "crossbeam-deque", - "crossbeam-epoch", - "crossbeam-queue", - "crossbeam-utils", -] - [[package]] name = "crossbeam-channel" version = "0.5.13" @@ -2621,7 +2608,7 @@ dependencies = [ "big_s", "bincode", "bumpalo", - "crossbeam", + "crossbeam-channel", "csv", "derive_builder 0.20.0", "dump", @@ -3629,7 +3616,6 @@ dependencies = [ "candle-transformers", "charabia", "concat-arrays", - "crossbeam", "crossbeam-channel", "csv", "deserr", @@ -4750,8 +4736,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.6" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#8ff028e484fb6192a0acf5a669eaf18c30cada6e" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f81dc953b2244ddd5e7860cb0bb2a790494b898ef321d4aff8e260efab60cc88" dependencies = [ "bytemuck", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 5e53dbfa5..89a17d8fc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -43,6 +43,3 @@ opt-level = 3 opt-level = 3 [profile.dev.package.roaring] opt-level = 3 - -[patch.crates-io] -roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "clone-iter-slice" } diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml index eec30ea3f..ccd256546 100644 --- a/crates/benchmarks/Cargo.toml +++ b/crates/benchmarks/Cargo.toml @@ -24,7 +24,7 @@ tempfile = "3.14.0" criterion = { version = "0.5.1", features = ["html_reports"] } rand = "0.8.5" rand_chacha = "0.3.1" -roaring = "0.10.6" +roaring = "0.10.7" [build-dependencies] anyhow = "1.0.86" diff --git a/crates/dump/Cargo.toml b/crates/dump/Cargo.toml index f9d2a9a0b..679a97b4e 100644 --- a/crates/dump/Cargo.toml +++ b/crates/dump/Cargo.toml @@ -17,7 +17,7 @@ http = "1.1.0" meilisearch-types = { path = "../meilisearch-types" } once_cell = "1.19.0" regex = "1.10.5" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } tar = "0.4.41" diff --git a/crates/index-scheduler/Cargo.toml 
b/crates/index-scheduler/Cargo.toml index 657dd6dfe..ad4c1b4b9 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -24,7 +24,7 @@ meilisearch-types = { path = "../meilisearch-types" } page_size = "0.6.0" raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } rayon = "1.10.0" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } synchronoise = "1.0.1" @@ -45,7 +45,7 @@ bumpalo = "3.16.0" [dev-dependencies] arroy = "0.5.0" big_s = "1.0.2" -crossbeam = "0.8.4" +crossbeam-channel = "0.5.13" insta = { version = "1.39.0", features = ["json", "redactions"] } maplit = "1.0.2" meili-snap = { path = "../meili-snap" } diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index cef24c1ea..1a1c71bae 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -407,7 +407,7 @@ pub struct IndexScheduler { /// /// See [self.breakpoint()](`IndexScheduler::breakpoint`) for an explanation. #[cfg(test)] - test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, + test_breakpoint_sdr: crossbeam_channel::Sender<(Breakpoint, bool)>, /// A list of planned failures within the [`tick`](IndexScheduler::tick) method of the index scheduler. /// @@ -476,7 +476,7 @@ impl IndexScheduler { /// Create an index scheduler and start its run loop. pub fn new( options: IndexSchedulerOptions, - #[cfg(test)] test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>, + #[cfg(test)] test_breakpoint_sdr: crossbeam_channel::Sender<(Breakpoint, bool)>, #[cfg(test)] planned_failures: Vec<(usize, tests::FailureLocation)>, ) -> Result { std::fs::create_dir_all(&options.tasks_path)?; @@ -2237,7 +2237,7 @@ mod tests { use std::time::Instant; use big_s::S; - use crossbeam::channel::RecvTimeoutError; + use crossbeam_channel::RecvTimeoutError; use file_store::File; use insta::assert_json_snapshot; use maplit::btreeset; @@ -2289,7 +2289,7 @@ mod tests { configuration: impl Fn(&mut IndexSchedulerOptions), ) -> (Self, IndexSchedulerHandle) { let tempdir = TempDir::new().unwrap(); - let (sender, receiver) = crossbeam::channel::bounded(0); + let (sender, receiver) = crossbeam_channel::bounded(0); let indexer_config = IndexerConfig { skip_index_budget: true, ..Default::default() }; @@ -2421,7 +2421,7 @@ mod tests { pub struct IndexSchedulerHandle { _tempdir: TempDir, index_scheduler: IndexScheduler, - test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>, + test_breakpoint_rcv: crossbeam_channel::Receiver<(Breakpoint, bool)>, last_breakpoint: Breakpoint, } diff --git a/crates/meilisearch-auth/Cargo.toml b/crates/meilisearch-auth/Cargo.toml index ae0095ab4..591a40158 100644 --- a/crates/meilisearch-auth/Cargo.toml +++ b/crates/meilisearch-auth/Cargo.toml @@ -17,7 +17,7 @@ hmac = "0.12.1" maplit = "1.0.2" meilisearch-types = { path = "../meilisearch-types" } rand = "0.8.5" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order"] } sha2 = "0.10.8" diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index 349c06080..aca06a018 100644 --- a/crates/meilisearch-types/Cargo.toml +++ 
b/crates/meilisearch-types/Cargo.toml @@ -25,7 +25,7 @@ fst = "0.4.7" memmap2 = "0.9.4" milli = { path = "../milli" } raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde-cs = "0.2.4" serde_json = "1.0.120" diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 2884f0c9c..8e134ebd0 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -103,7 +103,7 @@ tracing-subscriber = { version = "0.3.18", features = ["json"] } tracing-trace = { version = "0.1.0", path = "../tracing-trace" } tracing-actix-web = "0.7.11" build-info = { version = "1.7.0", path = "../build-info" } -roaring = "0.10.2" +roaring = "0.10.7" mopa-maintained = "0.2.3" [dev-dependencies] diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 798a4ea19..b66dec9a4 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -42,7 +42,7 @@ obkv = "0.3.0" once_cell = "1.19.0" ordered-float = "4.2.1" rayon = "1.10.0" -roaring = { version = "0.10.6", features = ["serde"] } +roaring = { version = "0.10.7", features = ["serde"] } rstar = { version = "0.12.0", features = ["serde"] } serde = { version = "1.0.204", features = ["derive"] } serde_json = { version = "1.0.120", features = ["preserve_order", "raw_value"] } @@ -99,7 +99,6 @@ rustc-hash = "2.0.0" uell = "0.1.0" enum-iterator = "2.1.0" bbqueue = { git = "https://github.com/kerollmops/bbqueue" } -crossbeam = "0.8.4" [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } From 6ac5b3b136086b8b25b1eb8dc2d6678e39846262 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 13:36:30 +0100 Subject: [PATCH 05/37] Finish most of the channels types --- crates/milli/src/error.rs | 9 +- crates/milli/src/update/new/channel.rs | 662 +++++++++++------- .../src/update/new/extract/vectors/mod.rs | 2 +- crates/milli/src/update/new/indexer/mod.rs | 132 ++-- 4 files changed, 474 insertions(+), 331 deletions(-) diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 4da57a3e1..800dfa375 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -62,9 +62,14 @@ pub enum InternalError { #[error(transparent)] Store(#[from] MdbError), #[error("Cannot delete {key:?} from database {database_name}: {error}")] - StoreDeletion { database_name: &'static str, key: Vec, error: heed::Error }, + StoreDeletion { database_name: &'static str, key: Box<[u8]>, error: heed::Error }, #[error("Cannot insert {key:?} and value with length {value_length} into database {database_name}: {error}")] - StorePut { database_name: &'static str, key: Vec, value_length: usize, error: heed::Error }, + StorePut { + database_name: &'static str, + key: Box<[u8]>, + value_length: usize, + error: heed::Error, + }, #[error(transparent)] Utf8(#[from] str::Utf8Error), #[error("An indexation process was explicitly aborted")] diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index cacc7b129..d2681c915 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,12 +1,11 @@ use std::cell::RefCell; use std::marker::PhantomData; +use std::mem; use std::num::NonZeroU16; -use std::{mem, slice}; use bbqueue::framed::{FrameGrantR, FrameProducer}; -use bytemuck::{NoUninit, 
CheckedBitPattern}; -use crossbeam::sync::{Parker, Unparker}; -use crossbeam_channel::{IntoIter, Receiver, SendError}; +use bytemuck::{checked, CheckedBitPattern, NoUninit}; +use crossbeam_channel::SendError; use heed::types::Bytes; use heed::BytesDecode; use memmap2::Mmap; @@ -17,21 +16,32 @@ use super::ref_cell_ext::RefCellExt; use super::thread_local::{FullySend, ThreadLocal}; use super::StdResult; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +use crate::index::db_name; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; -use crate::index::{db_name, IndexEmbeddingConfig}; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; use crate::{CboRoaringBitmapCodec, DocumentId, Index}; -/// Creates a tuple of producer/receivers to be used by +/// Creates a tuple of senders/receiver to be used by /// the extractors and the writer loop. /// +/// The `channel_capacity` parameter defines the number of +/// too-large-to-fit-in-BBQueue entries that can be sent through +/// a crossbeam channel. This parameter must stay low to make +/// sure we do not use too much memory. +/// +/// Note that the channel is also used to wake-up the receiver +/// wehn new stuff is available in any BBQueue buffer but we send +/// a message in this queue only if it is empty to avoid filling +/// the channel *and* the BBQueue. +/// /// # Safety /// -/// Panics if the number of provided bbqueue is not exactly equal +/// Panics if the number of provided BBQueues is not exactly equal /// to the number of available threads in the rayon threadpool. pub fn extractor_writer_bbqueue( bbbuffers: &[bbqueue::BBBuffer], + channel_capacity: usize, ) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { assert_eq!( bbbuffers.len(), @@ -40,88 +50,252 @@ pub fn extractor_writer_bbqueue( ); let capacity = bbbuffers.first().unwrap().capacity(); - let parker = Parker::new(); - let extractors = ThreadLocal::with_capacity(bbbuffers.len()); - let producers = rayon::broadcast(|bi| { + // Read the field description to understand this + let capacity = capacity.checked_sub(9).unwrap(); + + let producers = ThreadLocal::with_capacity(bbbuffers.len()); + let consumers = rayon::broadcast(|bi| { let bbqueue = &bbbuffers[bi.index()]; let (producer, consumer) = bbqueue.try_split_framed().unwrap(); - extractors.get_or(|| FullySend(RefCell::new(producer))); + producers.get_or(|| FullySend(RefCell::new(producer))); consumer }); - ( - ExtractorBbqueueSender { - inner: extractors, - capacity: capacity.checked_sub(9).unwrap(), - unparker: parker.unparker().clone(), - }, - WriterBbqueueReceiver { inner: producers, parker }, - ) + let (sender, receiver) = crossbeam_channel::bounded(channel_capacity); + let sender = ExtractorBbqueueSender { sender, producers, capacity }; + let receiver = WriterBbqueueReceiver { receiver, consumers }; + (sender, receiver) +} + +pub struct ExtractorBbqueueSender<'a> { + /// This channel is used to wake-up the receiver and + /// send large entries that cannot fit in the BBQueue. + sender: crossbeam_channel::Sender, + /// A memory buffer, one by thread, is used to serialize + /// the entries directly in this shared, lock-free space. + producers: ThreadLocal>>>, + /// The capacity of this frame producer, will never be able to store more than that. + /// + /// Note that the FrameProducer requires up to 9 bytes to encode the length, + /// the capacity has been shrinked accordingly. 
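+    ///
+    /// For example (illustrative size): a 100 MiB `BBBuffer` gives a
+    /// `capacity` of `100 * 1024 * 1024 - 9` bytes, so even the largest
+    /// grant plus the frame-length header always fits in the queue.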
+    ///
+    ///
+    capacity: usize,
+}
+
+pub struct WriterBbqueueReceiver<'a> {
+    /// Used to wake up when new entries are available either in
+    /// any BBQueue buffer or directly sent through this channel
+    /// (still written to disk).
+    receiver: crossbeam_channel::Receiver<ReceiverAction>,
+    /// The BBQueue frames to read when waking up.
+    consumers: Vec<bbqueue::framed::FrameConsumer<'a>>,
+}
+
+/// The action to perform on the receiver/writer side.
+pub enum ReceiverAction {
+    /// Wake up, you have frames to read for the BBQueue buffers.
+    WakeUp,
+    /// An entry that cannot fit in the BBQueue buffers has been
+    /// written to disk, memory-mapped and must be written in the
+    /// database.
+    LargeEntry {
+        /// The database where the entry must be written.
+        database: Database,
+        /// The key of the entry that must be written in the database.
+        key: Box<[u8]>,
+        /// The large value that must be written.
+        ///
+        /// Note: We can probably use a `File` here and
+        /// use `Database::put_reserved` instead of memory-mapping.
+        value: Mmap,
+    },
+}
+
+impl<'a> WriterBbqueueReceiver<'a> {
+    pub fn recv(&mut self) -> Option<ReceiverAction> {
+        self.receiver.recv().ok()
+    }
+
+    pub fn read(&mut self) -> Option<FrameWithHeader<'a>> {
+        for consumer in &mut self.consumers {
+            if let Some(frame) = consumer.read() {
+                return Some(FrameWithHeader::from(frame));
+            }
+        }
+        None
+    }
+}
+
+pub struct FrameWithHeader<'a> {
+    header: EntryHeader,
+    frame: FrameGrantR<'a>,
+}
+
+impl FrameWithHeader<'_> {
+    pub fn header(&self) -> EntryHeader {
+        self.header
+    }
+
+    pub fn frame(&self) -> &FrameGrantR<'_> {
+        &self.frame
+    }
+}
+
+impl<'a> From<FrameGrantR<'a>> for FrameWithHeader<'a> {
+    fn from(mut frame: FrameGrantR<'a>) -> Self {
+        frame.auto_release(true);
+        FrameWithHeader { header: EntryHeader::from_slice(&frame[..]), frame }
+    }
+}
+
+#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
+#[repr(C)]
+/// Whether a put of the key/value pair or a delete of the given key.
+pub struct DbOperation {
+    /// The database on which to perform the operation.
+    pub database: Database,
+    /// The key length in the buffer.
+    ///
+    /// If None it means that the buffer is dedicated
+    /// to the key and it is therefore a deletion operation.
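+    ///
+    /// Frame layout sketch, as decoded by `key_value` below:
+    /// `[variant u8][DbOperation bytes][key][value…]` for a write and
+    /// `[variant u8][DbOperation bytes][key]` for a deletion.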
+ pub key_length: Option, +} + +impl DbOperation { + pub fn key_value<'a>(&self, frame: &'a FrameGrantR<'_>) -> (&'a [u8], Option<&'a [u8]>) { + /// TODO replace the return type by an enum Write | Delete + let skip = EntryHeader::variant_size() + mem::size_of::(); + match self.key_length { + Some(key_length) => { + let (key, value) = frame[skip..].split_at(key_length.get() as usize); + (key, Some(value)) + } + None => (&frame[skip..], None), + } + } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(transparent)] +pub struct ArroyDeleteVector { + pub docid: DocumentId, +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// The embedding is the remaining space and represents a non-aligned [f32]. +pub struct ArroySetVector { + pub docid: DocumentId, + pub embedder_id: u8, + _padding: [u8; 3], +} + +impl ArroySetVector { + pub fn read_embedding_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + vec: &'v mut Vec, + ) -> &'v [f32] { + vec.clear(); + let skip = EntryHeader::variant_size() + mem::size_of::(); + let bytes = &frame[skip..]; + bytes.chunks_exact(mem::size_of::()).for_each(|bytes| { + let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); + vec.push(f); + }); + &vec[..] + } +} + +#[derive(Debug, Clone, Copy)] +#[repr(u8)] +pub enum EntryHeader { + DbOperation(DbOperation), + ArroyDeleteVector(ArroyDeleteVector), + ArroySetVector(ArroySetVector), +} + +impl EntryHeader { + const fn variant_size() -> usize { + mem::size_of::() + } + + const fn variant_id(&self) -> u8 { + match self { + EntryHeader::DbOperation(_) => 0, + EntryHeader::ArroyDeleteVector(_) => 1, + EntryHeader::ArroySetVector(_) => 2, + } + } + + const fn total_key_value_size(key_length: NonZeroU16, value_length: usize) -> usize { + Self::variant_size() + + mem::size_of::() + + key_length.get() as usize + + value_length + } + + const fn total_key_size(key_length: NonZeroU16) -> usize { + Self::total_key_value_size(key_length, 0) + } + + const fn total_delete_vector_size() -> usize { + Self::variant_size() + mem::size_of::() + } + + /// The `embedding_length` corresponds to the number of `f32` in the embedding. 
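A note on `read_embedding_into_vec` above: a `FrameGrantR` only yields `&[u8]` with no alignment guarantee, so the embedding bytes cannot be reinterpreted as `&[f32]` in place (a `bytemuck::cast_slice` could fail at runtime). The same decoding logic, as a standalone sketch:

```rust
// Decode native-endian f32s from a possibly unaligned byte slice into an
// aligned, reusable Vec. Mirrors the loop in `read_embedding_into_vec`.
fn decode_unaligned_f32s(bytes: &[u8], out: &mut Vec<f32>) {
    out.clear();
    out.extend(
        bytes
            .chunks_exact(std::mem::size_of::<f32>())
            .map(|chunk| f32::from_ne_bytes(chunk.try_into().unwrap())),
    );
}
```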
+ fn total_set_vector_size(embedding_length: usize) -> usize { + Self::variant_size() + + mem::size_of::() + + embedding_length * mem::size_of::() + } + + fn header_size(&self) -> usize { + let payload_size = match self { + EntryHeader::DbOperation(op) => mem::size_of_val(op), + EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), + EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv), + }; + Self::variant_size() + payload_size + } + + fn from_slice(slice: &[u8]) -> EntryHeader { + let (variant_id, remaining) = slice.split_first().unwrap(); + match variant_id { + 0 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::DbOperation(header) + } + 1 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroyDeleteVector(header) + } + 2 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroySetVector(header) + } + id => panic!("invalid variant id: {id}"), + } + } + + fn serialize_into(&self, header_bytes: &mut [u8]) { + let (first, remaining) = header_bytes.split_first_mut().unwrap(); + let payload_bytes = match self { + EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), + EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), + EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv), + }; + *first = self.variant_id(); + remaining.copy_from_slice(payload_bytes); + } +} + +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(u16)] pub enum Database { Main, Documents, @@ -197,20 +371,6 @@ impl From for Database { } } -pub struct ExtractorBbqueueSender<'a> { - inner: ThreadLocal>>>, - /// The capacity of this frame producer, will never be able to store more than that. - /// - /// Note that the FrameProducer requires up to 9 bytes to encode the length, - /// the capacity has been shrinked accordingly. - /// - /// - capacity: usize, - /// Used to wake up the receiver thread, - /// Used everytime we write something in the producer. - unparker: Unparker, -} - impl<'b> ExtractorBbqueueSender<'b> { pub fn docids<'a, D: DatabaseType>(&'a self) -> WordDocidsSender<'a, 'b, D> { WordDocidsSender { sender: self, _marker: PhantomData } @@ -236,80 +396,171 @@ impl<'b> ExtractorBbqueueSender<'b> { GeoSender(&self) } - fn send_delete_vector(&self, docid: DocumentId) -> crate::Result<()> { - match self - .sender - .send(WriterOperation::ArroyOperation(ArroyOperation::DeleteVectors { docid })) - { - Ok(()) => Ok(()), - Err(SendError(_)) => Err(SendError(())), + fn delete_vector(&self, docid: DocumentId) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); + let total_length = EntryHeader::total_delete_vector_size(); + if total_length > capacity { + unreachable!("entry larger that the BBQueue capacity"); } + + // Spin loop to have a frame the size we requested. + let mut grant = loop { + match producer.grant(total_length) { + Ok(grant) => break grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + } + }; + + payload_header.serialize_into(&mut grant); + + // We could commit only the used memory. 
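The grant/commit dance in `delete_vector` above is the pattern every sender method in this file repeats. Isolated, and assuming the forked bbqueue's framed API used by this series, it looks like this:

```rust
use bbqueue::framed::FrameProducer;

// Reserve exactly `payload.len()` bytes, spinning while the writer side
// drains frames, then write and commit the whole reservation.
fn write_frame(producer: &mut FrameProducer<'_>, payload: &[u8]) {
    let mut grant = loop {
        match producer.grant(payload.len()) {
            Ok(grant) => break grant,
            // Queue full: busy-wait until the consumer releases frames.
            Err(bbqueue::Error::InsufficientSize) => continue,
            Err(e) => unreachable!("{e:?}"),
        }
    };
    grant.copy_from_slice(payload);
    grant.commit(payload.len());
}
```

Spinning is acceptable here because the writer thread continuously drains the queues; a producer can only starve if the writer stops.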
+ grant.commit(total_length); + + Ok(()) + } + + fn set_vector( + &self, + docid: DocumentId, + embedder_id: u8, + embedding: &[f32], + ) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let payload_header = + EntryHeader::ArroySetVector(ArroySetVector { docid, embedder_id, _padding: [0; 3] }); + let total_length = EntryHeader::total_set_vector_size(embedding.len()); + if total_length > capacity { + unreachable!("entry larger that the BBQueue capacity"); + } + + // Spin loop to have a frame the size we requested. + let mut grant = loop { + match producer.grant(total_length) { + Ok(grant) => break grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + } + }; + + // payload_header.serialize_into(&mut grant); + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + remaining.copy_from_slice(bytemuck::cast_slice(embedding)); + + // We could commit only the used memory. + grant.commit(total_length); + + Ok(()) } fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { + let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + self.write_key_value_with(database, key_length, value.len(), |buffer| { + let (key_buffer, value_buffer) = buffer.split_at_mut(key.len()); + key_buffer.copy_from_slice(key); + value_buffer.copy_from_slice(value); + Ok(()) + }) + } + + fn write_key_value_with( + &self, + database: Database, + key_length: NonZeroU16, + value_length: usize, + key_value_writer: F, + ) -> crate::Result<()> + where + F: FnOnce(&mut [u8]) -> crate::Result<()>, + { let capacity = self.capacity; - let refcell = self.inner.get().unwrap(); + let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); - let key_length = key.len().try_into().unwrap(); - let value_length = value.len(); - let total_length = EntryHeader::put_key_value_size(key_length, value_length); + let operation = DbOperation { database, key_length: Some(key_length) }; + let payload_header = EntryHeader::DbOperation(operation); + let total_length = EntryHeader::total_key_value_size(key_length, value_length); if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); + unreachable!("entry larger that the BBQueue capacity"); } - let payload_header = - EntryHeader::DbOperation { database, key_length: NonZeroU16::new(key_length) }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, + // Spin loop to have a frame the size we requested. + let mut grant = loop { + match producer.grant(total_length) { + Ok(grant) => break grant, Err(bbqueue::Error::InsufficientSize) => continue, Err(e) => unreachable!("{e:?}"), - }; + } + }; - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - let (key_out, value_out) = remaining.split_at_mut(key.len()); - key_out.copy_from_slice(key); - value_out.copy_from_slice(value); + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + key_value_writer(remaining)?; - // We could commit only the used memory. - grant.commit(total_length); + // We could commit only the used memory. 
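One detail worth spelling out: a framed grant is fixed in size at reservation time, so a value's serialized length must be known before writing. That is why `set_vector` can cast the embedding directly (`&[f32]` to `&[u8]` via `bytemuck::cast_slice` is always alignment-safe) while bitmap callers first call `CboRoaringBitmapCodec::serialized_size`, as in this sketch against a plain `Vec` for clarity:

```rust
// Compute the exact CBO size up front, then stream the bitmap into a buffer
// of that length; the senders below do the same into the reserved grant.
fn cbo_value_bytes(bitmap: &RoaringBitmap) -> std::io::Result<Vec<u8>> {
    let mut out = vec![0u8; CboRoaringBitmapCodec::serialized_size(bitmap)];
    CboRoaringBitmapCodec::serialize_into_writer(bitmap, &mut out[..])?;
    Ok(out)
}
```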
+ grant.commit(total_length); - break Ok(()); - } + Ok(()) } fn delete_entry(&self, database: Database, key: &[u8]) -> crate::Result<()> { + let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + self.delete_entry_with(database, key_length, |buffer| { + buffer.copy_from_slice(key); + Ok(()) + }) + } + + fn delete_entry_with( + &self, + database: Database, + key_length: NonZeroU16, + key_writer: F, + ) -> crate::Result<()> + where + F: FnOnce(&mut [u8]) -> crate::Result<()>, + { let capacity = self.capacity; - let refcell = self.inner.get().unwrap(); + let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); - let key_length = key.len().try_into().unwrap(); - let total_length = EntryHeader::delete_key_size(key_length); + // For deletion we do not specify the key length, + // it's in the remaining bytes. + let operation = DbOperation { database, key_length: None }; + let payload_header = EntryHeader::DbOperation(operation); + let total_length = EntryHeader::total_key_size(key_length); if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); + unreachable!("entry larger that the BBQueue capacity"); } - let payload_header = EntryHeader::DbOperation { database, key_length: None }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, + // Spin loop to have a frame the size we requested. + let mut grant = loop { + match producer.grant(total_length) { + Ok(grant) => break grant, Err(bbqueue::Error::InsufficientSize) => continue, Err(e) => unreachable!("{e:?}"), - }; + } + }; - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - remaining.copy_from_slice(key); + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + key_writer(remaining)?; - // We could commit only the used memory. - grant.commit(total_length); + // We could commit only the used memory. + grant.commit(total_length); - break Ok(()); - } + Ok(()) } } @@ -355,72 +606,18 @@ pub struct WordDocidsSender<'a, 'b, D> { impl WordDocidsSender<'_, '_, D> { pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { - let capacity = self.sender.capacity; - let refcell = self.sender.inner.get().unwrap(); - let mut producer = refcell.0.borrow_mut_or_yield(); - - let key_length = key.len().try_into().unwrap(); + let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); - - let total_length = EntryHeader::put_key_value_size(key_length, value_length); - if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); - } - - let payload_header = EntryHeader::DbOperation { - database: D::DATABASE, - key_length: NonZeroU16::new(key_length), - }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - }; - - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - let (key_out, value_out) = remaining.split_at_mut(key.len()); - key_out.copy_from_slice(key); - CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; - - // We could commit only the used memory. 
- grant.commit(total_length); - - break Ok(()); - } + self.sender.write_key_value_with(D::DATABASE, key_length, value_length, |buffer| { + let (key_buffer, value_buffer) = buffer.split_at_mut(key.len()); + key_buffer.copy_from_slice(key); + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_buffer)?; + Ok(()) + }) } pub fn delete(&self, key: &[u8]) -> crate::Result<()> { - let capacity = self.sender.capacity; - let refcell = self.sender.inner.get().unwrap(); - let mut producer = refcell.0.borrow_mut_or_yield(); - - let key_length = key.len().try_into().unwrap(); - let total_length = EntryHeader::delete_key_size(key_length); - if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); - } - - let payload_header = EntryHeader::DbOperation { database: D::DATABASE, key_length: None }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - }; - - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - remaining.copy_from_slice(key); - - // We could commit only the used memory. - grant.commit(total_length); - - break Ok(()); - } + self.sender.delete_entry(D::DATABASE, key) } } @@ -430,13 +627,10 @@ pub struct FacetDocidsSender<'a, 'b> { impl FacetDocidsSender<'_, '_> { pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { - let capacity = self.sender.capacity; - let refcell = self.sender.inner.get().unwrap(); - let mut producer = refcell.0.borrow_mut_or_yield(); - let (facet_kind, key) = FacetKind::extract_from_key(key); - let key_length = key.len().try_into().unwrap(); + let database = Database::from(facet_kind); + let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); let value_length = match facet_kind { // We must take the facet group size into account @@ -445,26 +639,8 @@ impl FacetDocidsSender<'_, '_> { FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length, }; - let total_length = EntryHeader::put_key_value_size(key_length, value_length); - if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); - } - - let payload_header = EntryHeader::DbOperation { - database: Database::from(facet_kind), - key_length: NonZeroU16::new(key_length), - }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - }; - - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - let (key_out, value_out) = remaining.split_at_mut(key.len()); + self.sender.write_key_value_with(database, key_length, value_length, |buffer| { + let (key_out, value_out) = buffer.split_at_mut(key.len()); key_out.copy_from_slice(key); let value_out = match facet_kind { @@ -477,47 +653,17 @@ impl FacetDocidsSender<'_, '_> { } FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out, }; + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; - // We could commit only the used memory. 
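The extra byte reserved for `Number` and `String` facets above is the `FacetGroupValue` size field: at level 0 of the facet tree every group contains exactly one value. In isolation (a sketch assuming that level-0 convention):

```rust
use std::io;

// Level-0 facet values are `[ size: u8 | CBO bitmap ]`, hence the
// `value_length + 1` reservation and the `*first = 1` write above.
fn write_level0_facet_value(out: &mut [u8], bitmap: &RoaringBitmap) -> io::Result<()> {
    let (size, bitmap_bytes) = out.split_first_mut().unwrap();
    *size = 1; // a level-0 group always holds a single value
    CboRoaringBitmapCodec::serialize_into_writer(bitmap, bitmap_bytes)
}
```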
- grant.commit(total_length); - - break Ok(()); - } + Ok(()) + }) } pub fn delete(&self, key: &[u8]) -> crate::Result<()> { - let capacity = self.sender.capacity; - let refcell = self.sender.inner.get().unwrap(); - let mut producer = refcell.0.borrow_mut_or_yield(); - let (facet_kind, key) = FacetKind::extract_from_key(key); - let key_length = key.len().try_into().unwrap(); - - let total_length = EntryHeader::delete_key_size(key_length); - if total_length > capacity { - unreachable!("entry larger that the bbqueue capacity"); - } - - let payload_header = - EntryHeader::DbOperation { database: Database::from(facet_kind), key_length: None }; - - loop { - let mut grant = match producer.grant(total_length) { - Ok(grant) => grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - }; - - let (header, remaining) = grant.split_at_mut(mem::size_of::()); - header.copy_from_slice(payload_header.bytes_of()); - remaining.copy_from_slice(key); - - // We could commit only the used memory. - grant.commit(total_length); - - break Ok(()); - } + let database = Database::from(facet_kind); + self.sender.delete_entry(database, key) } } @@ -565,7 +711,7 @@ impl DocumentsSender<'_, '_> { pub fn delete(&self, docid: DocumentId, external_id: String) -> crate::Result<()> { self.0.delete_entry(Database::Documents, &docid.to_be_bytes())?; - self.0.send_delete_vector(docid)?; + self.0.delete_vector(docid)?; self.0.delete_entry(Database::ExternalDocumentsIds, external_id.as_bytes()) } } @@ -579,13 +725,10 @@ impl EmbeddingSender<'_, '_> { embedder_id: u8, embeddings: Vec, ) -> crate::Result<()> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors { - docid, - embedder_id, - embeddings, - })) - .map_err(|_| SendError(())) + for embedding in embeddings { + self.set_vector(docid, embedder_id, embedding)?; + } + Ok(()) } pub fn set_vector( @@ -593,21 +736,8 @@ impl EmbeddingSender<'_, '_> { docid: DocumentId, embedder_id: u8, embedding: Embedding, - ) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::SetVector { - docid, - embedder_id, - embedding, - })) - .map_err(|_| SendError(())) - } - - /// Marks all embedders as "to be built" - pub fn finish(self, configs: Vec) -> StdResult<(), SendError<()>> { - self.0 - .send(WriterOperation::ArroyOperation(ArroyOperation::Finish { configs })) - .map_err(|_| SendError(())) + ) -> crate::Result<()> { + self.0.set_vector(docid, embedder_id, &embedding[..]) } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 52b13f37d..42278d443 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -76,7 +76,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { context.data, &self.possible_embedding_mistakes, self.threads, - self.sender, + &self.sender, &context.doc_alloc, )) } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index f82f4af37..1fd60b610 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -40,7 +40,7 @@ use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use 
crate::vector::{ArroyWrapper, EmbeddingConfigs, Embeddings}; +use crate::vector::{ArroyWrapper, EmbeddingConfigs}; use crate::{ Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder, UserError, @@ -80,7 +80,7 @@ where let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread .collect(); - let (extractor_sender, writer_receiver) = extractor_writer_bbqueue(&bbbuffers); + let (extractor_sender, writer_receiver) = extractor_writer_bbqueue(&bbbuffers, 1000); let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; @@ -386,7 +386,11 @@ where }) .collect(); + // Used by by the ArroySetVector to copy the embedding into an + // aligned memory area, required by arroy to accept a new vector. + let mut aligned_embedding = Vec::new(); let mut arroy_writers = arroy_writers?; + { let span = tracing::trace_span!(target: "indexing::write_db", "all"); let _entered = span.enter(); @@ -394,81 +398,85 @@ where let span = tracing::trace_span!(target: "indexing::write_db", "post_merge"); let mut _entered_post_merge = None; - for operation in writer_receiver { + while let Some(action) = writer_receiver.recv() { if _entered_post_merge.is_none() && finished_extraction.load(std::sync::atomic::Ordering::Relaxed) { _entered_post_merge = Some(span.enter()); } - match operation { - WriterOperation::DbOperation(db_operation) => { - let database = db_operation.database(index); - let database_name = db_operation.database_name(); - match db_operation.entry() { - EntryOperation::Delete(e) => match database.delete(wtxn, e.entry()) { - Ok(false) => unreachable!("We tried to delete an unknown key"), - Ok(_) => (), - Err(error) => { - return Err(Error::InternalError( - InternalError::StoreDeletion { - database_name, - key: e.entry().to_owned(), - error, - }, - )); - } - }, - EntryOperation::Write(e) => { - if let Err(error) = database.put(wtxn, e.key(), e.value()) { - return Err(Error::InternalError(InternalError::StorePut { - database_name, - key: e.key().to_owned(), - value_length: e.value().len(), - error, - })); - } - } + + match action { + ReceiverAction::WakeUp => (), + ReceiverAction::LargeEntry { database, key, value } => { + let database_name = database.database_name(); + let database = database.database(index); + if let Err(error) = database.put(wtxn, &key, &value) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key, + value_length: value.len(), + error, + })); } } - WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { - ArroyOperation::DeleteVectors { docid } => { - for ( - _embedder_index, - (_embedder_name, _embedder, writer, dimensions), - ) in &mut arroy_writers - { + } + + while let Some(frame_with_header) = writer_receiver.read() { + match frame_with_header.header() { + EntryHeader::DbOperation(operation) => { + let database_name = operation.database.database_name(); + let database = operation.database.database(index); + let frame = frame_with_header.frame(); + match operation.key_value(frame) { + (key, Some(value)) => { + if let Err(error) = database.put(wtxn, key, value) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key: key.into(), + value_length: value.len(), + error, + })); + } + } + (key, None) => match database.delete(wtxn, key) { + Ok(false) => { + unreachable!("We tried to delete an unknown key: {key:?}") + } + Ok(_) => (), 
+ Err(error) => { + return Err(Error::InternalError( + InternalError::StoreDeletion { + database_name, + key: key.into(), + error, + }, + )); + } + }, + } + } + EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => { + for (_index, (_name, _embedder, writer, dimensions)) in &mut arroy_writers { let dimensions = *dimensions; writer.del_items(wtxn, dimensions, docid)?; } } - ArroyOperation::SetVectors { - docid, - embedder_id, - embeddings: raw_embeddings, - } => { - let (_, _, writer, dimensions) = arroy_writers - .get(&embedder_id) - .expect("requested a missing embedder"); - - let mut embeddings = Embeddings::new(*dimensions); - for embedding in raw_embeddings { - embeddings.append(embedding).unwrap(); - } - - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_items(wtxn, docid, &embeddings)?; + EntryHeader::ArroySetVector(asv) => { + let ArroySetVector { docid, embedder_id, .. } = asv; + let frame = frame_with_header.frame(); + let embedding = asv.read_embedding_into_vec(frame, &mut aligned_embedding); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_item(wtxn, docid, embedding)?; + } } - ArroyOperation::SetVector { docid, embedder_id, embedding } => { - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, &embedding)?; - } - _otherwise => unreachable!(), - }, + } } } + todo!("read the BBQueue once the channel is closed"); + 'vectors: { let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); From 70802eb7c72473fb5cb8a1b0258a9a6ab88b81f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 13:45:47 +0100 Subject: [PATCH 06/37] Fix most issues with the lifetimes --- crates/milli/src/update/new/channel.rs | 7 ++++++ .../new/extract/faceted/extract_facets.rs | 6 ++--- .../src/update/new/extract/vectors/mod.rs | 22 +++++++++---------- crates/milli/src/update/new/indexer/mod.rs | 6 ++--- 4 files changed, 24 insertions(+), 17 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index d2681c915..d1d64814e 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -93,6 +93,7 @@ pub struct WriterBbqueueReceiver<'a> { } /// The action to perform on the receiver/writer side. +#[derive(Debug)] pub enum ReceiverAction { /// Wake up, you have frames to read for the BBQueue buffers. 
WakeUp, @@ -599,6 +600,7 @@ impl DatabaseType for WordPositionDocids { const DATABASE: Database = Database::WordPositionDocids; } +#[derive(Clone, Copy)] pub struct WordDocidsSender<'a, 'b, D> { sender: &'a ExtractorBbqueueSender<'b>, _marker: PhantomData, @@ -621,6 +623,7 @@ impl WordDocidsSender<'_, '_, D> { } } +#[derive(Clone, Copy)] pub struct FacetDocidsSender<'a, 'b> { sender: &'a ExtractorBbqueueSender<'b>, } @@ -667,6 +670,7 @@ impl FacetDocidsSender<'_, '_> { } } +#[derive(Clone, Copy)] pub struct FieldIdDocidFacetSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); impl FieldIdDocidFacetSender<'_, '_> { @@ -691,6 +695,7 @@ impl FieldIdDocidFacetSender<'_, '_> { } } +#[derive(Clone, Copy)] pub struct DocumentsSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); impl DocumentsSender<'_, '_> { @@ -716,6 +721,7 @@ impl DocumentsSender<'_, '_> { } } +#[derive(Clone, Copy)] pub struct EmbeddingSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); impl EmbeddingSender<'_, '_> { @@ -741,6 +747,7 @@ impl EmbeddingSender<'_, '_> { } } +#[derive(Clone, Copy)] pub struct GeoSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); impl GeoSender<'_, '_> { diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 9ad37d52c..490dada65 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -25,14 +25,14 @@ use crate::update::new::DocumentChange; use crate::update::GrenadParameters; use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; -pub struct FacetedExtractorData<'a> { +pub struct FacetedExtractorData<'a, 'b> { attributes_to_extract: &'a [&'a str], - sender: &'a FieldIdDocidFacetSender<'a>, + sender: &'a FieldIdDocidFacetSender<'a, 'b>, grenad_parameters: GrenadParameters, buckets: usize, } -impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> { +impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> { type Data = RefCell>; fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result { diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 42278d443..1110432fa 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -18,17 +18,17 @@ use crate::vector::error::{ use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; -pub struct EmbeddingExtractor<'a> { +pub struct EmbeddingExtractor<'a, 'b> { embedders: &'a EmbeddingConfigs, - sender: EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, } -impl<'a> EmbeddingExtractor<'a> { +impl<'a, 'b> EmbeddingExtractor<'a, 'b> { pub fn new( embedders: &'a EmbeddingConfigs, - sender: EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, threads: &'a ThreadPoolNoAbort, ) -> Self { @@ -43,7 +43,7 @@ pub struct EmbeddingExtractorData<'extractor>( unsafe impl MostlySend for EmbeddingExtractorData<'_> {} -impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { +impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { type Data = RefCell>; fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result { @@ -76,7 +76,7 @@ 
impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { context.data, &self.possible_embedding_mistakes, self.threads, - &self.sender, + self.sender, &context.doc_alloc, )) } @@ -259,7 +259,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { // Currently this is the case as: // 1. BVec are inside of the bumaplo // 2. All other fields are either trivial (u8) or references. -struct Chunks<'a, 'extractor> { +struct Chunks<'a, 'b, 'extractor> { texts: BVec<'a, &'a str>, ids: BVec<'a, DocumentId>, @@ -270,11 +270,11 @@ struct Chunks<'a, 'extractor> { possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, has_manual_generation: Option<&'a str>, } -impl<'a, 'extractor> Chunks<'a, 'extractor> { +impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { #[allow(clippy::too_many_arguments)] pub fn new( embedder: &'a Embedder, @@ -284,7 +284,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { user_provided: &'a RefCell>, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, - sender: &'a EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, doc_alloc: &'a Bump, ) -> Self { let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); @@ -368,7 +368,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { possible_embedding_mistakes: &PossibleEmbeddingMistakes, unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, - sender: EmbeddingSender<'a>, + sender: EmbeddingSender<'a, 'b>, has_manual_generation: Option<&'a str>, ) -> Result<()> { if let Some(external_docid) = has_manual_generation { diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 1fd60b610..982868d93 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -80,7 +80,7 @@ where let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread .collect(); - let (extractor_sender, writer_receiver) = extractor_writer_bbqueue(&bbbuffers, 1000); + let (extractor_sender, mut writer_receiver) = extractor_writer_bbqueue(&bbbuffers, 1000); let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; @@ -302,7 +302,7 @@ where } let embedding_sender = extractor_sender.embeddings(); - let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); + let extractor = EmbeddingExtractor::new(embedders, embedding_sender, field_distribution, request_threads()); let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); { let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); @@ -363,7 +363,6 @@ where let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); let vector_arroy = index.vector_arroy; - let mut rng = rand::rngs::StdRng::seed_from_u64(42); let indexer_span = tracing::Span::current(); let arroy_writers: Result> = embedders .inner_as_ref() @@ -490,6 +489,7 @@ where Step::WritingEmbeddingsToDatabase, )); + let mut rng = rand::rngs::StdRng::seed_from_u64(42); for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers { let dimensions = *dimensions; writer.build_and_quantize( From 08d641336588a51bf7ada203828ba2b9c19123bf Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 13:46:41 +0100 Subject: [PATCH 07/37] Fix result types --- crates/milli/src/update/new/extract/faceted/extract_facets.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 490dada65..f2132ce38 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -318,7 +318,7 @@ impl<'doc> DelAddFacetValue<'doc> { docid: DocumentId, sender: &FieldIdDocidFacetSender, doc_alloc: &Bump, - ) -> std::result::Result<(), crossbeam_channel::SendError<()>> { + ) -> crate::Result<()> { let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); for ((fid, value), deladd) in self.strings { if let Ok(s) = std::str::from_utf8(&value) { From acec45ad7c3414db493132fd37fdd951b61529b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 13:59:29 +0100 Subject: [PATCH 08/37] Send a WakeUp when writing data in the BBQueue buffers --- crates/milli/src/update/new/channel.rs | 24 ++++ crates/milli/src/update/new/indexer/mod.rs | 136 +++++++++++++-------- 2 files changed, 107 insertions(+), 53 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index d1d64814e..0a6d37943 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -422,6 +422,12 @@ impl<'b> ExtractorBbqueueSender<'b> { // We could commit only the used memory. grant.commit(total_length); + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if self.sender.is_empty() { + self.sender.send(ReceiverAction::WakeUp).unwrap(); + } + Ok(()) } @@ -460,6 +466,12 @@ impl<'b> ExtractorBbqueueSender<'b> { // We could commit only the used memory. grant.commit(total_length); + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if self.sender.is_empty() { + self.sender.send(ReceiverAction::WakeUp).unwrap(); + } + Ok(()) } @@ -511,6 +523,12 @@ impl<'b> ExtractorBbqueueSender<'b> { // We could commit only the used memory. grant.commit(total_length); + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if self.sender.is_empty() { + self.sender.send(ReceiverAction::WakeUp).unwrap(); + } + Ok(()) } @@ -561,6 +579,12 @@ impl<'b> ExtractorBbqueueSender<'b> { // We could commit only the used memory. grant.commit(total_length); + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. 
+ if self.sender.is_empty() { + self.sender.send(ReceiverAction::WakeUp).unwrap(); + } + Ok(()) } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 982868d93..835ee240b 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -420,61 +420,27 @@ where } } - while let Some(frame_with_header) = writer_receiver.read() { - match frame_with_header.header() { - EntryHeader::DbOperation(operation) => { - let database_name = operation.database.database_name(); - let database = operation.database.database(index); - let frame = frame_with_header.frame(); - match operation.key_value(frame) { - (key, Some(value)) => { - if let Err(error) = database.put(wtxn, key, value) { - return Err(Error::InternalError(InternalError::StorePut { - database_name, - key: key.into(), - value_length: value.len(), - error, - })); - } - } - (key, None) => match database.delete(wtxn, key) { - Ok(false) => { - unreachable!("We tried to delete an unknown key: {key:?}") - } - Ok(_) => (), - Err(error) => { - return Err(Error::InternalError( - InternalError::StoreDeletion { - database_name, - key: key.into(), - error, - }, - )); - } - }, - } - } - EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => { - for (_index, (_name, _embedder, writer, dimensions)) in &mut arroy_writers { - let dimensions = *dimensions; - writer.del_items(wtxn, dimensions, docid)?; - } - } - EntryHeader::ArroySetVector(asv) => { - let ArroySetVector { docid, embedder_id, .. } = asv; - let frame = frame_with_header.frame(); - let embedding = asv.read_embedding_into_vec(frame, &mut aligned_embedding); - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, embedding)?; - } - } - } + // Every time the is a message in the channel we search + // for new entries in the BBQueue buffers. + write_from_bbqueue( + &mut writer_receiver, + index, + wtxn, + &arroy_writers, + &mut aligned_embedding, + )?; } - } - todo!("read the BBQueue once the channel is closed"); + // Once the extractor/writer channel is closed + // we must process the remaining BBQueue messages. + write_from_bbqueue( + &mut writer_receiver, + index, + wtxn, + &arroy_writers, + &mut aligned_embedding, + )?; + } 'vectors: { let span = @@ -548,6 +514,70 @@ where Ok(()) } +/// A function dedicated to manage all the available BBQueue frames. +/// +/// It reads all the available frames, do the corresponding database operations +/// and stops when no frame are available. 
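Combined with the send-a-WakeUp-only-when-empty logic above, the writer side reduces to a simple loop; this is a condensed sketch of the indexer code in this patch, with `buf` standing in for the reusable aligned embedding buffer:

```rust
// Handle channel actions, then drain every readable BBQueue frame.
while let Some(action) = writer_receiver.recv() {
    if let ReceiverAction::LargeEntry { database, key, value } = action {
        // Too large for the queues: the value arrived memory-mapped.
        database.database(index).put(wtxn, &key, &value)?;
    }
    // Whether it was a WakeUp or a LargeEntry, look for frames afterwards.
    write_from_bbqueue(&mut writer_receiver, index, wtxn, &arroy_writers, &mut buf)?;
}
// A closed channel does not mean empty queues: drain one last time.
write_from_bbqueue(&mut writer_receiver, index, wtxn, &arroy_writers, &mut buf)?;
```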
+fn write_from_bbqueue( + writer_receiver: &mut WriterBbqueueReceiver<'_>, + index: &Index, + wtxn: &mut RwTxn<'_>, + arroy_writers: &HashMap, + aligned_embedding: &mut Vec, +) -> crate::Result<()> { + while let Some(frame_with_header) = writer_receiver.read() { + match frame_with_header.header() { + EntryHeader::DbOperation(operation) => { + let database_name = operation.database.database_name(); + let database = operation.database.database(index); + let frame = frame_with_header.frame(); + match operation.key_value(frame) { + (key, Some(value)) => { + if let Err(error) = database.put(wtxn, key, value) { + return Err(Error::InternalError(InternalError::StorePut { + database_name, + key: key.into(), + value_length: value.len(), + error, + })); + } + } + (key, None) => match database.delete(wtxn, key) { + Ok(false) => { + unreachable!("We tried to delete an unknown key: {key:?}") + } + Ok(_) => (), + Err(error) => { + return Err(Error::InternalError(InternalError::StoreDeletion { + database_name, + key: key.into(), + error, + })); + } + }, + } + } + EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => { + for (_index, (_name, _embedder, writer, dimensions)) in arroy_writers { + let dimensions = *dimensions; + writer.del_items(wtxn, dimensions, docid)?; + } + } + EntryHeader::ArroySetVector(asv) => { + let ArroySetVector { docid, embedder_id, .. } = asv; + let frame = frame_with_header.frame(); + let embedding = asv.read_embedding_into_vec(frame, aligned_embedding); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_item(wtxn, docid, embedding)?; + } + } + } + + Ok(()) +} + #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] fn compute_prefix_database( index: &Index, From cc63802115d864ce169a3f86cf669ed356f8167d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 14:58:03 +0100 Subject: [PATCH 09/37] Modify and return the IndexEmbeddings to write them later --- crates/milli/src/update/new/indexer/mod.rs | 25 +++++++++++----------- crates/milli/src/update/new/steps.rs | 4 ++-- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 835ee240b..89c1b850d 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -117,7 +117,6 @@ where let rtxn = index.read_txn()?; - // document but we need to create a function that collects and compresses documents. 
let document_sender = extractor_sender.documents(); let document_extractor = DocumentsExtractor::new(document_sender, embedders); @@ -180,10 +179,6 @@ where } { - - - - let WordDocidsCaches { word_docids, word_fid_docids, @@ -296,7 +291,6 @@ where } 'vectors: { - if index_embeddings.is_empty() { break 'vectors; } @@ -308,7 +302,14 @@ where let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); let _entered = span.enter(); - extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, Step::ExtractingEmbeddings)?; + extract( + document_changes, + &extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + Step::ExtractingEmbeddings, + )?; } { let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors"); @@ -357,7 +358,7 @@ where finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); - Result::Ok(facet_field_ids_delta) + Result::Ok((facet_field_ids_delta, index_embeddings)) })?; let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); @@ -442,6 +443,10 @@ where )?; } + (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); + + let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?; + 'vectors: { let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); @@ -470,10 +475,6 @@ where index.put_embedding_configs(wtxn, index_embeddings)?; } - (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); - - let facet_field_ids_delta = extractor_handle.join().unwrap()?; - (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets)); if index.facet_search(wtxn)? { diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index 7c2441933..bee1be260 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -11,8 +11,8 @@ pub enum Step { ExtractingEmbeddings, WritingGeoPoints, WritingToDatabase, - WritingEmbeddingsToDatabase, WaitingForExtractors, + WritingEmbeddingsToDatabase, PostProcessingFacets, PostProcessingWords, Finalizing, @@ -29,8 +29,8 @@ impl Step { Step::ExtractingEmbeddings => "extracting embeddings", Step::WritingGeoPoints => "writing geo points", Step::WritingToDatabase => "writing to database", - Step::WritingEmbeddingsToDatabase => "writing embeddings to database", Step::WaitingForExtractors => "waiting for extractors", + Step::WritingEmbeddingsToDatabase => "writing embeddings to database", Step::PostProcessingFacets => "post-processing facets", Step::PostProcessingWords => "post-processing words", Step::Finalizing => "finalizing", From a514ce472acfb6bbe329f01ad3be27f0c487bb20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 14:59:04 +0100 Subject: [PATCH 10/37] Make clippy happy --- crates/milli/src/update/new/channel.rs | 8 ++++---- crates/milli/src/update/new/merger.rs | 7 ------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 0a6d37943..fc05baa89 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -382,19 +382,19 @@ impl<'b> ExtractorBbqueueSender<'b> { } pub fn field_id_docid_facet_sender<'a>(&'a self) -> FieldIdDocidFacetSender<'a, 'b> { - FieldIdDocidFacetSender(&self) + FieldIdDocidFacetSender(self) } pub fn documents<'a>(&'a self) -> DocumentsSender<'a, 'b> { - 
DocumentsSender(&self) + DocumentsSender(self) } pub fn embeddings<'a>(&'a self) -> EmbeddingSender<'a, 'b> { - EmbeddingSender(&self) + EmbeddingSender(self) } pub fn geo<'a>(&'a self) -> GeoSender<'a, 'b> { - GeoSender(&self) + GeoSender(self) } fn delete_vector(&self, docid: DocumentId) -> crate::Result<()> { diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index f2809b376..f8af84177 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -249,10 +249,3 @@ fn merge_cbo_bitmaps( } } } - -/// TODO Return the slice directly from the serialize_into method -fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec) -> &'b [u8] { - buffer.clear(); - CboRoaringBitmapCodec::serialize_into(bitmap, buffer); - buffer.as_slice() -} From 98d4a2909e85c8ec5ba1a6dd4b2a6b2d63cf42c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 16:05:44 +0100 Subject: [PATCH 11/37] Fix the way we spawn the rayon threadpool --- crates/index-scheduler/src/batch.rs | 110 +++--- crates/milli/src/update/new/channel.rs | 36 +- crates/milli/src/update/new/indexer/mod.rs | 440 +++++++++++---------- 3 files changed, 313 insertions(+), 273 deletions(-) diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs index 04cdb912f..bec1fedf5 100644 --- a/crates/index-scheduler/src/batch.rs +++ b/crates/index-scheduler/src/batch.rs @@ -1351,7 +1351,10 @@ impl IndexScheduler { let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); &local_pool } }; @@ -1399,21 +1402,19 @@ impl IndexScheduler { } if tasks.iter().any(|res| res.error.is_none()) { - pool.install(|| { - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - primary_key, - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - ) - }) - .unwrap()?; + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + primary_key, + &document_changes, + embedders, + &|| must_stop_processing.get(), + &send_progress, + )?; tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } @@ -1489,34 +1490,34 @@ impl IndexScheduler { let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); &local_pool } }; - pool.install(|| { - let indexer = - UpdateByFunction::new(candidates, context.clone(), code.clone()); - let document_changes = indexer.into_changes(&primary_key)?; - let embedders = index.embedding_configs(index_wtxn)?; - let embedders = self.embedders(embedders)?; + let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone()); + let document_changes = + pool.install(|| indexer.into_changes(&primary_key)).unwrap()?; - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - None, // cannot change primary key in DocumentEdition - &document_changes, - embedders, - &|| must_stop_processing.get(), - 
&send_progress, - )?; + let embedders = index.embedding_configs(index_wtxn)?; + let embedders = self.embedders(embedders)?; - Result::Ok(()) - }) - .unwrap()?; + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + None, // cannot change primary key in DocumentEdition + &document_changes, + embedders, + &|| must_stop_processing.get(), + &send_progress, + )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } @@ -1641,7 +1642,10 @@ impl IndexScheduler { let pool = match &indexer_config.thread_pool { Some(pool) => pool, None => { - local_pool = ThreadPoolNoAbortBuilder::new().build().unwrap(); + local_pool = ThreadPoolNoAbortBuilder::new() + .thread_name(|i| format!("indexing-thread-{i}")) + .build() + .unwrap(); &local_pool } }; @@ -1652,21 +1656,19 @@ impl IndexScheduler { let embedders = index.embedding_configs(index_wtxn)?; let embedders = self.embedders(embedders)?; - pool.install(|| { - indexer::index( - index_wtxn, - index, - indexer_config.grenad_parameters(), - &db_fields_ids_map, - new_fields_ids_map, - None, // document deletion never changes primary key - &document_changes, - embedders, - &|| must_stop_processing.get(), - &send_progress, - ) - }) - .unwrap()?; + indexer::index( + index_wtxn, + index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + None, // document deletion never changes primary key + &document_changes, + embedders, + &|| must_stop_processing.get(), + &send_progress, + )?; // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done"); } diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index fc05baa89..beba80ac8 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -55,6 +55,12 @@ pub fn extractor_writer_bbqueue( let producers = ThreadLocal::with_capacity(bbbuffers.len()); let consumers = rayon::broadcast(|bi| { + eprintln!( + "hello thread #{:?} (#{:?}, #{:?})", + bi.index(), + std::thread::current().name(), + std::thread::current().id(), + ); let bbqueue = &bbbuffers[bi.index()]; let (producer, consumer) = bbqueue.try_split_framed().unwrap(); producers.get_or(|| FullySend(RefCell::new(producer))); @@ -399,7 +405,15 @@ impl<'b> ExtractorBbqueueSender<'b> { fn delete_vector(&self, docid: DocumentId) -> crate::Result<()> { let capacity = self.capacity; - let refcell = self.producers.get().unwrap(); + let refcell = match self.producers.get() { + Some(refcell) => refcell, + None => panic!( + "hello thread #{:?} (#{:?}, #{:?})", + rayon::current_thread_index(), + std::thread::current().name(), + std::thread::current().id() + ), + }; let mut producer = refcell.0.borrow_mut_or_yield(); let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); @@ -438,7 +452,15 @@ impl<'b> ExtractorBbqueueSender<'b> { embedding: &[f32], ) -> crate::Result<()> { let capacity = self.capacity; - let refcell = self.producers.get().unwrap(); + let refcell = match self.producers.get() { + Some(refcell) => refcell, + None => panic!( + "hello thread #{:?} (#{:?}, #{:?})", + rayon::current_thread_index(), + std::thread::current().name(), + std::thread::current().id() + ), + }; let mut producer = refcell.0.borrow_mut_or_yield(); let payload_header = @@ -496,7 +518,15 @@ impl<'b> ExtractorBbqueueSender<'b> { F: FnOnce(&mut [u8]) -> 
crate::Result<()>, { let capacity = self.capacity; - let refcell = self.producers.get().unwrap(); + let refcell = match self.producers.get() { + Some(refcell) => refcell, + None => panic!( + "hello thread #{:?} (#{:?}, #{:?})", + rayon::current_thread_index(), + std::thread::current().name(), + std::thread::current().id() + ), + }; let mut producer = refcell.0.borrow_mut_or_yield(); let operation = DbOperation { database, key_length: Some(key_length) }; diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 89c1b850d..b7d5431b4 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -62,6 +62,7 @@ mod update_by_function; pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( wtxn: &mut RwTxn, index: &'index Index, + pool: &ThreadPoolNoAbort, grenad_parameters: GrenadParameters, db_fields_ids_map: &'indexer FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, @@ -77,10 +78,15 @@ where SP: Fn(Progress) + Sync, { /// TODO restrict memory and remove this memory from the extractors bump allocators - let bbbuffers: Vec<_> = (0..rayon::current_num_threads()) - .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread - .collect(); - let (extractor_sender, mut writer_receiver) = extractor_writer_bbqueue(&bbbuffers, 1000); + let bbbuffers: Vec<_> = pool + .install(|| { + (0..rayon::current_num_threads()) + .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread + .collect() + }) + .unwrap(); + let (extractor_sender, mut writer_receiver) = + pool.install(|| extractor_writer_bbqueue(&bbbuffers, 1000)).unwrap(); let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; @@ -112,253 +118,255 @@ where let field_distribution = &mut field_distribution; let document_ids = &mut document_ids; let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { - let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); - let _entered = span.enter(); - - let rtxn = index.read_txn()?; - - // document but we need to create a function that collects and compresses documents. - let document_sender = extractor_sender.documents(); - let document_extractor = DocumentsExtractor::new(document_sender, embedders); - let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - { - let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); + pool.install(move || { + let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); let _entered = span.enter(); - extract(document_changes, - &document_extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::ExtractingDocuments, - )?; - } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "documents"); - let _entered = span.enter(); - for document_extractor_data in datastore { - let document_extractor_data = document_extractor_data.0.into_inner(); - for (field, delta) in document_extractor_data.field_distribution_delta { - let current = field_distribution.entry(field).or_default(); - // adding the delta should never cause a negative result, as we are removing fields that previously existed. - *current = current.saturating_add_signed(delta); + + let rtxn = index.read_txn()?; + + // document but we need to create a function that collects and compresses documents. 
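The `pool.install` wrappers above are the actual fix of this patch: `rayon::broadcast` registered one thread-local producer per thread of the pool it ran on, so every subsequent send must execute on one of those same threads, otherwise `producers.get()` returns `None` (which the temporary panics above make loud). In miniature, and assuming a plain `rayon::ThreadPool`:

```rust
// Producers live in a ThreadLocal keyed by the broadcast pool's threads:
// broadcast on pool P, then only ever send from inside P.
fn send_on_pool(pool: &rayon::ThreadPool, sender: &ExtractorBbqueueSender<'_>) {
    pool.install(|| {
        // Runs on a thread of `pool`, so the thread-local producer exists.
        sender.delete_entry(Database::Main, b"some-key").unwrap();
    });
}
```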
+ let document_sender = extractor_sender.documents(); + let document_extractor = DocumentsExtractor::new(document_sender, embedders); + let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + { + let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); + let _entered = span.enter(); + extract(document_changes, + &document_extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + Step::ExtractingDocuments, + )?; + } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "documents"); + let _entered = span.enter(); + for document_extractor_data in datastore { + let document_extractor_data = document_extractor_data.0.into_inner(); + for (field, delta) in document_extractor_data.field_distribution_delta { + let current = field_distribution.entry(field).or_default(); + // adding the delta should never cause a negative result, as we are removing fields that previously existed. + *current = current.saturating_add_signed(delta); + } + document_extractor_data.docids_delta.apply_to(document_ids); } - document_extractor_data.docids_delta.apply_to(document_ids); + + field_distribution.retain(|_, v| *v != 0); } - field_distribution.retain(|_, v| *v != 0); - } + let facet_field_ids_delta; - let facet_field_ids_delta; + { + let caches = { + let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "faceted"); + let _entered = span.enter(); - { - let caches = { - let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "faceted"); - let _entered = span.enter(); + FacetedDocidsExtractor::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + &extractor_sender.field_id_docid_facet_sender(), + Step::ExtractingFacets + )? + }; - FacetedDocidsExtractor::run_extraction( + { + let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted"); + let _entered = span.enter(); + + facet_field_ids_delta = merge_and_send_facet_docids( + caches, + FacetDatabases::new(index), + index, + extractor_sender.facet_docids(), + )?; + } + } + + { + let WordDocidsCaches { + word_docids, + word_fid_docids, + exact_word_docids, + word_position_docids, + fid_word_count_docids, + } = { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); + let _entered = span.enter(); + + WordDocidsExtractors::run_extraction( grenad_parameters, document_changes, indexing_context, &mut extractor_allocs, - &extractor_sender.field_id_docid_facet_sender(), - Step::ExtractingFacets + Step::ExtractingWords )? 
- }; + }; - { - let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted"); - let _entered = span.enter(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_docids, + index.word_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - facet_field_ids_delta = merge_and_send_facet_docids( - caches, - FacetDatabases::new(index), - index, - extractor_sender.facet_docids(), - )?; - } - } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_fid_docids, + index.word_fid_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - { - let WordDocidsCaches { - word_docids, - word_fid_docids, - exact_word_docids, - word_position_docids, - fid_word_count_docids, - } = { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); - let _entered = span.enter(); + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); + let _entered = span.enter(); + merge_and_send_docids( + exact_word_docids, + index.exact_word_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - WordDocidsExtractors::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - Step::ExtractingWords - )? - }; + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); + let _entered = span.enter(); + merge_and_send_docids( + word_position_docids, + index.word_position_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_docids, - index.word_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); + let _entered = span.enter(); + merge_and_send_docids( + fid_word_count_docids, + index.field_id_word_count_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_fid_docids, - index.word_fid_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; + // run the proximity extraction only if the precision is by word + // this works only if the settings didn't change during this transaction. + let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); + if proximity_precision == ProximityPrecision::ByWord { + let caches = { + let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); + let _entered = span.enter(); + + ::run_extraction( + grenad_parameters, + document_changes, + indexing_context, + &mut extractor_allocs, + Step::ExtractingWordProximity, + )? 
+ }; + + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); + let _entered = span.enter(); + + merge_and_send_docids( + caches, + index.word_pair_proximity_docids.remap_types(), + index, + extractor_sender.docids::(), + &indexing_context.must_stop_processing, + )?; + } } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); - let _entered = span.enter(); - merge_and_send_docids( - exact_word_docids, - index.exact_word_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } + 'vectors: { + if index_embeddings.is_empty() { + break 'vectors; + } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); - let _entered = span.enter(); - merge_and_send_docids( - word_position_docids, - index.word_position_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } + let embedding_sender = extractor_sender.embeddings(); + let extractor = EmbeddingExtractor::new(embedders, embedding_sender, field_distribution, request_threads()); + let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + { + let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); + let _entered = span.enter(); - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); - let _entered = span.enter(); - merge_and_send_docids( - fid_word_count_docids, - index.field_id_word_count_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - } + extract( + document_changes, + &extractor, + indexing_context, + &mut extractor_allocs, + &datastore, + Step::ExtractingEmbeddings, + )?; + } + { + let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors"); + let _entered = span.enter(); - // run the proximity extraction only if the precision is by word - // this works only if the settings didn't change during this transaction. - let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); - if proximity_precision == ProximityPrecision::ByWord { - let caches = { - let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); - let _entered = span.enter(); - - ::run_extraction( - grenad_parameters, - document_changes, - indexing_context, - &mut extractor_allocs, - Step::ExtractingWordProximity, - )? 
- }; - - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); - let _entered = span.enter(); - - merge_and_send_docids( - caches, - index.word_pair_proximity_docids.remap_types(), - index, - extractor_sender.docids::(), - &indexing_context.must_stop_processing, - )?; - } - } - - 'vectors: { - if index_embeddings.is_empty() { - break 'vectors; - } - - let embedding_sender = extractor_sender.embeddings(); - let extractor = EmbeddingExtractor::new(embedders, embedding_sender, field_distribution, request_threads()); - let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); - { - let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); - let _entered = span.enter(); - - extract( - document_changes, - &extractor, - indexing_context, - &mut extractor_allocs, - &datastore, - Step::ExtractingEmbeddings, - )?; - } - { - let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors"); - let _entered = span.enter(); - - for config in &mut index_embeddings { - 'data: for data in datastore.iter_mut() { - let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided); + for config in &mut index_embeddings { + 'data: for data in datastore.iter_mut() { + let data = &mut data.get_mut().0; + let Some(deladd) = data.remove(&config.name) else { continue 'data; }; + deladd.apply_to(&mut config.user_provided); + } } } } - } - 'geo: { - let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else { - break 'geo; - }; - let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); + 'geo: { + let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? 
else {
+                    break 'geo;
+                };
+                let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());

-            {
-                let span = tracing::trace_span!(target: "indexing::documents::extract", "geo");
-                let _entered = span.enter();
+                {
+                    let span = tracing::trace_span!(target: "indexing::documents::extract", "geo");
+                    let _entered = span.enter();

-                extract(
-                    document_changes,
-                    &extractor,
-                    indexing_context,
-                    &mut extractor_allocs,
-                    &datastore,
-                    Step::WritingGeoPoints
+                    extract(
+                        document_changes,
+                        &extractor,
+                        indexing_context,
+                        &mut extractor_allocs,
+                        &datastore,
+                        Step::WritingGeoPoints
+                    )?;
+                }
+
+                merge_and_send_rtree(
+                    datastore,
+                    &rtxn,
+                    index,
+                    extractor_sender.geo(),
+                    &indexing_context.must_stop_processing,
                 )?;
             }

-            merge_and_send_rtree(
-                datastore,
-                &rtxn,
-                index,
-                extractor_sender.geo(),
-                &indexing_context.must_stop_processing,
-            )?;
-        }
+            (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase));

-        (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase));
+            finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed);

-        finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed);
-
-        Result::Ok((facet_field_ids_delta, index_embeddings))
+            Result::Ok((facet_field_ids_delta, index_embeddings))
+        }).unwrap()
     })?;

     let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);

From e83534a4305963c857423cf03c3612e4e31a2b07 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Wed, 27 Nov 2024 16:27:43 +0100
Subject: [PATCH 12/37] Fix the indexer::index to correctly use the
 rayon::ThreadPool

---
 crates/milli/src/update/new/channel.rs     | 49 +++++-----------------
 crates/milli/src/update/new/indexer/mod.rs | 17 ++++----
 2 files changed, 19 insertions(+), 47 deletions(-)

diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs
index beba80ac8..70c4a6042 100644
--- a/crates/milli/src/update/new/channel.rs
+++ b/crates/milli/src/update/new/channel.rs
@@ -4,6 +4,7 @@ use std::mem;
 use std::num::NonZeroU16;
 
 use bbqueue::framed::{FrameGrantR, FrameProducer};
+use bbqueue::BBBuffer;
 use bytemuck::{checked, CheckedBitPattern, NoUninit};
 use crossbeam_channel::SendError;
 use heed::types::Bytes;
@@ -25,6 +26,9 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index};
 /// Creates a tuple of senders/receiver to be used by
 /// the extractors and the writer loop.
 ///
+/// The `bbbuffer_capacity` represents the number of bytes allocated
+/// to each BBQueue buffer and is not the sum of all of them.
+///
 /// The `channel_capacity` parameter defines the number of
 /// too-large-to-fit-in-BBQueue entries that can be sent through
 /// a crossbeam channel. This parameter must stay low to make
 /// sure a maximum of the BBQueue buffers is used.
 ///
 /// Panics if the number of provided BBQueues is not exactly equal
 /// to the number of available threads in the rayon threadpool.
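///
/// For callers that think in terms of a global memory budget rather than a
/// per-buffer one, a hypothetical sizing helper (not part of this patch)
/// could derive the per-thread capacity:
///
/// ```no_run
/// fn per_thread_bbqueue_capacity(total_budget_bytes: usize) -> usize {
///     total_budget_bytes / rayon::current_num_threads().max(1)
/// }
/// ```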
pub fn extractor_writer_bbqueue( - bbbuffers: &[bbqueue::BBBuffer], + bbbuffers: &mut Vec, + bbbuffer_capacity: usize, channel_capacity: usize, ) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { - assert_eq!( - bbbuffers.len(), - rayon::current_num_threads(), - "You must provide as many BBBuffer as the available number of threads to extract" - ); + bbbuffers.resize_with(rayon::current_num_threads(), || BBBuffer::new(bbbuffer_capacity)); let capacity = bbbuffers.first().unwrap().capacity(); // Read the field description to understand this @@ -55,12 +56,6 @@ pub fn extractor_writer_bbqueue( let producers = ThreadLocal::with_capacity(bbbuffers.len()); let consumers = rayon::broadcast(|bi| { - eprintln!( - "hello thread #{:?} (#{:?}, #{:?})", - bi.index(), - std::thread::current().name(), - std::thread::current().id(), - ); let bbqueue = &bbbuffers[bi.index()]; let (producer, consumer) = bbqueue.try_split_framed().unwrap(); producers.get_or(|| FullySend(RefCell::new(producer))); @@ -405,15 +400,7 @@ impl<'b> ExtractorBbqueueSender<'b> { fn delete_vector(&self, docid: DocumentId) -> crate::Result<()> { let capacity = self.capacity; - let refcell = match self.producers.get() { - Some(refcell) => refcell, - None => panic!( - "hello thread #{:?} (#{:?}, #{:?})", - rayon::current_thread_index(), - std::thread::current().name(), - std::thread::current().id() - ), - }; + let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); @@ -452,15 +439,7 @@ impl<'b> ExtractorBbqueueSender<'b> { embedding: &[f32], ) -> crate::Result<()> { let capacity = self.capacity; - let refcell = match self.producers.get() { - Some(refcell) => refcell, - None => panic!( - "hello thread #{:?} (#{:?}, #{:?})", - rayon::current_thread_index(), - std::thread::current().name(), - std::thread::current().id() - ), - }; + let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); let payload_header = @@ -518,15 +497,7 @@ impl<'b> ExtractorBbqueueSender<'b> { F: FnOnce(&mut [u8]) -> crate::Result<()>, { let capacity = self.capacity; - let refcell = match self.producers.get() { - Some(refcell) => refcell, - None => panic!( - "hello thread #{:?} (#{:?}, #{:?})", - rayon::current_thread_index(), - std::thread::current().name(), - std::thread::current().id() - ), - }; + let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); let operation = DbOperation { database, key_length: Some(key_length) }; diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index b7d5431b4..3a4406aef 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -77,17 +77,18 @@ where MSP: Fn() -> bool + Sync, SP: Fn(Progress) + Sync, { - /// TODO restrict memory and remove this memory from the extractors bump allocators - let bbbuffers: Vec<_> = pool + let mut bbbuffers = Vec::new(); + let finished_extraction = AtomicBool::new(false); + let (extractor_sender, mut writer_receiver) = pool .install(|| { - (0..rayon::current_num_threads()) - .map(|_| bbqueue::BBBuffer::new(100 * 1024 * 1024)) // 100 MiB by thread - .collect() + /// TODO restrict memory and remove this memory from the extractors bump allocators + extractor_writer_bbqueue( + &mut bbbuffers, + 100 * 1024 * 1024, // 100 MiB + 1000, + ) }) .unwrap(); - let (extractor_sender, mut writer_receiver) = - 
pool.install(|| extractor_writer_bbqueue(&bbbuffers, 1000)).unwrap(); - let finished_extraction = AtomicBool::new(false); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); From da650f834ee4fcb12d4a38a0e545f548bb06660f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 17:04:49 +0100 Subject: [PATCH 13/37] Plug the NoPanicThreadPool in the tests and benchmarks --- crates/benchmarks/benches/indexing.rs | 31 +++++++++++++++++++ crates/benchmarks/benches/utils.rs | 1 + crates/fuzzers/src/bin/fuzz-indexing.rs | 1 + crates/milli/src/index.rs | 3 ++ .../milli/src/search/new/tests/integration.rs | 1 + .../milli/src/update/index_documents/mod.rs | 11 +++++++ .../milli/tests/search/facet_distribution.rs | 1 + crates/milli/tests/search/mod.rs | 1 + crates/milli/tests/search/query_criteria.rs | 1 + crates/milli/tests/search/typo_tolerance.rs | 1 + 10 files changed, 52 insertions(+) diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 2f33c3454..d3f307be3 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -157,6 +157,7 @@ fn indexing_songs_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -223,6 +224,7 @@ fn reindexing_songs_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -267,6 +269,7 @@ fn reindexing_songs_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -335,6 +338,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -411,6 +415,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -455,6 +460,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -495,6 +501,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -562,6 +569,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -628,6 +636,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -694,6 +703,7 @@ fn indexing_wiki(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + 
&milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -759,6 +769,7 @@ fn reindexing_wiki(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -803,6 +814,7 @@ fn reindexing_wiki(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -870,6 +882,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -946,6 +959,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -991,6 +1005,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1032,6 +1047,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1098,6 +1114,7 @@ fn indexing_movies_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1163,6 +1180,7 @@ fn reindexing_movies_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1207,6 +1225,7 @@ fn reindexing_movies_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1274,6 +1293,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1321,6 +1341,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec Index { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/fuzzers/src/bin/fuzz-indexing.rs b/crates/fuzzers/src/bin/fuzz-indexing.rs index f335938b9..ee927940f 100644 --- a/crates/fuzzers/src/bin/fuzz-indexing.rs +++ b/crates/fuzzers/src/bin/fuzz-indexing.rs @@ -135,6 +135,7 @@ fn main() { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index fe83877a7..268d33cd9 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1821,6 +1821,7 @@ pub(crate) mod tests { indexer::index( wtxn, &self.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ 
-1911,6 +1912,7 @@ pub(crate) mod tests { indexer::index( wtxn, &self.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -1991,6 +1993,7 @@ pub(crate) mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index 79668b34b..5db5b400b 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -83,6 +83,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { indexer::index( &mut wtxn, &index, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 186cc501d..3988b311c 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -2155,6 +2155,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2216,6 +2217,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2268,6 +2270,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2319,6 +2322,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2372,6 +2376,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2430,6 +2435,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2481,6 +2487,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2532,6 +2539,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2725,6 +2733,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2783,6 +2792,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, @@ -2838,6 +2848,7 @@ mod tests { indexer::index( &mut wtxn, &index.inner, + &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), indexer_config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git 
a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index 61d0697ff..418cdc356 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -64,6 +64,7 @@ fn test_facet_distribution_with_no_facet_values() { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 1287b59d5..08b22d7b6 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -101,6 +101,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index 3e56eeff0..8401f0444 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -333,6 +333,7 @@ fn criteria_ascdesc() { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs index 7ac9a1e4b..dbee296ee 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -142,6 +142,7 @@ fn test_typo_disabled_on_word() { indexer::index( &mut wtxn, &index, + &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), config.grenad_parameters(), &db_fields_ids_map, new_fields_ids_map, From 5c488e20cc07a66aff3794fd94c3c84d47170b31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 18:03:45 +0100 Subject: [PATCH 14/37] Send the geo rtree through crossbeam channel --- crates/milli/src/update/new/channel.rs | 107 +++++++++++++------------ 1 file changed, 56 insertions(+), 51 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 70c4a6042..26e375a5a 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -166,7 +166,6 @@ pub struct DbOperation { impl DbOperation { pub fn key_value<'a>(&self, frame: &'a FrameGrantR<'_>) -> (&'a [u8], Option<&'a [u8]>) { - /// TODO replace the return type by an enum Write | Delete let skip = EntryHeader::variant_size() + mem::size_of::(); match self.key_length { Some(key_length) => { @@ -478,8 +477,7 @@ impl<'b> ExtractorBbqueueSender<'b> { fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); - self.write_key_value_with(database, key_length, value.len(), |buffer| { - let (key_buffer, value_buffer) = buffer.split_at_mut(key.len()); + self.write_key_value_with(database, key_length, value.len(), |key_buffer, value_buffer| { key_buffer.copy_from_slice(key); value_buffer.copy_from_slice(value); Ok(()) @@ -494,7 +492,7 @@ impl<'b> ExtractorBbqueueSender<'b> { key_value_writer: F, ) -> crate::Result<()> where - F: FnOnce(&mut [u8]) -> crate::Result<()>, + F: FnOnce(&mut [u8], &mut [u8]) -> crate::Result<()>, { let capacity = self.capacity; let refcell = self.producers.get().unwrap(); @@ -519,7 +517,8 @@ 
impl<'b> ExtractorBbqueueSender<'b> { let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); payload_header.serialize_into(header_bytes); - key_value_writer(remaining)?; + let (key_buffer, value_buffer) = remaining.split_at_mut(key_length.get() as usize); + key_value_writer(key_buffer, value_buffer)?; // We could commit only the used memory. grant.commit(total_length); @@ -635,12 +634,16 @@ impl WordDocidsSender<'_, '_, D> { pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); - self.sender.write_key_value_with(D::DATABASE, key_length, value_length, |buffer| { - let (key_buffer, value_buffer) = buffer.split_at_mut(key.len()); - key_buffer.copy_from_slice(key); - CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_buffer)?; - Ok(()) - }) + self.sender.write_key_value_with( + D::DATABASE, + key_length, + value_length, + |key_buffer, value_buffer| { + key_buffer.copy_from_slice(key); + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_buffer)?; + Ok(()) + }, + ) } pub fn delete(&self, key: &[u8]) -> crate::Result<()> { @@ -667,25 +670,29 @@ impl FacetDocidsSender<'_, '_> { FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length, }; - self.sender.write_key_value_with(database, key_length, value_length, |buffer| { - let (key_out, value_out) = buffer.split_at_mut(key.len()); - key_out.copy_from_slice(key); + self.sender.write_key_value_with( + database, + key_length, + value_length, + |key_out, value_out| { + key_out.copy_from_slice(key); - let value_out = match facet_kind { - // We must take the facet group size into account - // when we serialize strings and numbers. - FacetKind::String | FacetKind::Number => { - let (first, remaining) = value_out.split_first_mut().unwrap(); - *first = 1; - remaining - } - FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out, - }; + let value_out = match facet_kind { + // We must take the facet group size into account + // when we serialize strings and numbers. 
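// Concretely, the value layout for the String/Number facet databases is one
// leading byte carrying the facet group size (always 1 at this level),
// followed by the CBO-encoded roaring bitmap, while Null/Empty/Exists values
// are the bare bitmap bytes. The arm below reserves that extra byte before
// handing the rest of the buffer to the codec.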
+ FacetKind::String | FacetKind::Number => { + let (first, remaining) = value_out.split_first_mut().unwrap(); + *first = 1; + remaining + } + FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out, + }; - CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; + CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?; - Ok(()) - }) + Ok(()) + }, + ) } pub fn delete(&self, key: &[u8]) -> crate::Result<()> { @@ -777,32 +784,30 @@ pub struct GeoSender<'a, 'b>(&'a ExtractorBbqueueSender<'b>); impl GeoSender<'_, '_> { pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> { - todo!("set rtree from file") - // self.0 - // .send(WriterOperation::DbOperation(DbOperation { - // database: Database::Main, - // entry: EntryOperation::Write(KeyValueEntry::from_large_key_value( - // GEO_RTREE_KEY.as_bytes(), - // value, - // )), - // })) - // .map_err(|_| SendError(())) + self.0 + .sender + .send(ReceiverAction::LargeEntry { + database: Database::Main, + key: GEO_RTREE_KEY.to_string().into_bytes().into_boxed_slice(), + value, + }) + .map_err(|_| SendError(())) } - pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> StdResult<(), SendError<()>> { - todo!("serialize directly into bbqueue (as a real roaringbitmap not a cbo)") + pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> crate::Result<()> { + let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(); + let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + let value_length = bitmap.serialized_size(); - // let mut buffer = Vec::new(); - // bitmap.serialize_into(&mut buffer).unwrap(); - - // self.0 - // .send(WriterOperation::DbOperation(DbOperation { - // database: Database::Main, - // entry: EntryOperation::Write(KeyValueEntry::from_small_key_value( - // GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(), - // &buffer, - // )), - // })) - // .map_err(|_| SendError(())) + self.0.write_key_value_with( + Database::Main, + key_length, + value_length, + |key_buffer, value_buffer| { + key_buffer.copy_from_slice(key); + bitmap.serialize_into(value_buffer)?; + Ok(()) + }, + ) } } From 58eab9a0182323ba4ce458d026726e7253a51917 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 27 Nov 2024 18:06:43 +0100 Subject: [PATCH 15/37] Send large payload through crossbeam --- crates/milli/src/update/new/channel.rs | 263 ++++++++++++++++++--- crates/milli/src/update/new/indexer/mod.rs | 39 ++- 2 files changed, 266 insertions(+), 36 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 26e375a5a..7eaa50df1 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,4 +1,5 @@ use std::cell::RefCell; +use std::io::{self, BufWriter}; use std::marker::PhantomData; use std::mem; use std::num::NonZeroU16; @@ -9,7 +10,7 @@ use bytemuck::{checked, CheckedBitPattern, NoUninit}; use crossbeam_channel::SendError; use heed::types::Bytes; use heed::BytesDecode; -use memmap2::Mmap; +use memmap2::{Mmap, MmapMut}; use roaring::RoaringBitmap; use super::extract::FacetKind; @@ -98,20 +99,63 @@ pub struct WriterBbqueueReceiver<'a> { pub enum ReceiverAction { /// Wake up, you have frames to read for the BBQueue buffers. WakeUp, - /// An entry that cannot fit in the BBQueue buffers has been - /// written to disk, memory-mapped and must be written in the - /// database. - LargeEntry { - /// The database where the entry must be written. 
- database: Database, - /// The key of the entry that must be written in the database. - key: Box<[u8]>, - /// The large value that must be written. - /// - /// Note: We can probably use a `File` here and - /// use `Database::put_reserved` instead of memory-mapping. - value: Mmap, - }, + LargeEntry(LargeEntry), + LargeVector(LargeVector), + LargeVectors(LargeVectors), +} + +/// An entry that cannot fit in the BBQueue buffers has been +/// written to disk, memory-mapped and must be written in the +/// database. +#[derive(Debug)] +pub struct LargeEntry { + /// The database where the entry must be written. + pub database: Database, + /// The key of the entry that must be written in the database. + pub key: Box<[u8]>, + /// The large value that must be written. + /// + /// Note: We can probably use a `File` here and + /// use `Database::put_reserved` instead of memory-mapping. + pub value: Mmap, +} + +/// When an embedding is larger than the available +/// BBQueue space it arrives here. +#[derive(Debug)] +pub struct LargeVector { + /// The document id associated to the large embedding. + pub docid: DocumentId, + /// The embedder id in which to insert the large embedding. + pub embedder_id: u8, + /// The large embedding that must be written. + pub embedding: Mmap, +} + +impl LargeVector { + pub fn read_embedding(&self) -> &[f32] { + bytemuck::cast_slice(&self.embedding) + } +} + +/// When embeddings are larger than the available +/// BBQueue space it arrives here. +#[derive(Debug)] +pub struct LargeVectors { + /// The document id associated to the large embedding. + pub docid: DocumentId, + /// The embedder id in which to insert the large embedding. + pub embedder_id: u8, + /// The dimensions of the embeddings in this payload. + pub dimensions: u16, + /// The large embedding that must be written. + pub embeddings: Mmap, +} + +impl LargeVectors { + pub fn read_embeddings(&self) -> impl Iterator { + self.embeddings.chunks_exact(self.dimensions as usize).map(bytemuck::cast_slice) + } } impl<'a> WriterBbqueueReceiver<'a> { @@ -209,12 +253,55 @@ impl ArroySetVector { } } +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// The embeddings are in the remaining space and represents +/// non-aligned [f32] each with dimensions f32s. +pub struct ArroySetVectors { + pub docid: DocumentId, + pub dimensions: u16, + pub embedder_id: u8, + _padding: u8, +} + +impl ArroySetVectors { + fn remaining_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { + let skip = EntryHeader::variant_size() + mem::size_of::(); + &frame[skip..] + } + + // /// The number of embeddings in this payload. + // pub fn embedding_count(&self, frame: &FrameGrantR<'_>) -> usize { + // let bytes = Self::remaining_bytes(frame); + // bytes.len().checked_div(self.dimensions as usize).unwrap() + // } + + /// Read the embedding at `index` or `None` if out of bounds. 
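/// (The writer side calls this in a loop with an increasing `index` until it
/// returns `None`; that is how all the embeddings of a frame are enumerated.)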
+ pub fn read_embedding_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + index: usize, + vec: &'v mut Vec, + ) -> Option<&'v [f32]> { + vec.clear(); + let bytes = Self::remaining_bytes(frame); + let embedding_size = self.dimensions as usize * mem::size_of::(); + let embedding_bytes = bytes.chunks_exact(embedding_size).nth(index)?; + embedding_bytes.chunks_exact(mem::size_of::()).for_each(|bytes| { + let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); + vec.push(f); + }); + Some(&vec[..]) + } +} + #[derive(Debug, Clone, Copy)] #[repr(u8)] pub enum EntryHeader { DbOperation(DbOperation), ArroyDeleteVector(ArroyDeleteVector), ArroySetVector(ArroySetVector), + ArroySetVectors(ArroySetVectors), } impl EntryHeader { @@ -227,6 +314,7 @@ impl EntryHeader { EntryHeader::DbOperation(_) => 0, EntryHeader::ArroyDeleteVector(_) => 1, EntryHeader::ArroySetVector(_) => 2, + EntryHeader::ArroySetVectors(_) => 3, } } @@ -245,11 +333,15 @@ impl EntryHeader { Self::variant_size() + mem::size_of::() } - /// The `embedding_length` corresponds to the number of `f32` in the embedding. - fn total_set_vector_size(embedding_length: usize) -> usize { - Self::variant_size() - + mem::size_of::() - + embedding_length * mem::size_of::() + /// The `dimensions` corresponds to the number of `f32` in the embedding. + fn total_set_vector_size(dimensions: usize) -> usize { + Self::variant_size() + mem::size_of::() + dimensions * mem::size_of::() + } + + /// The `dimensions` corresponds to the number of `f32` in the embedding. + fn total_set_vectors_size(count: usize, dimensions: usize) -> usize { + let embedding_size = dimensions * mem::size_of::(); + Self::variant_size() + mem::size_of::() + embedding_size * count } fn header_size(&self) -> usize { @@ -257,6 +349,7 @@ impl EntryHeader { EntryHeader::DbOperation(op) => mem::size_of_val(op), EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv), + EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), }; Self::variant_size() + payload_size } @@ -279,6 +372,11 @@ impl EntryHeader { let header = checked::pod_read_unaligned(header_bytes); EntryHeader::ArroySetVector(header) } + 3 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroySetVectors(header) + } id => panic!("invalid variant id: {id}"), } } @@ -289,6 +387,7 @@ impl EntryHeader { EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv), + EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), }; *first = self.variant_id(); remaining.copy_from_slice(payload_bytes); @@ -405,7 +504,7 @@ impl<'b> ExtractorBbqueueSender<'b> { let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); let total_length = EntryHeader::total_delete_vector_size(); if total_length > capacity { - unreachable!("entry larger that the BBQueue capacity"); + panic!("The entry is larger ({total_length} bytes) than the BBQueue capacity ({capacity} bytes)"); } // Spin loop to have a frame the size we requested. 
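// This "spin loop" is the backpressure point of the whole design: `grant`
// keeps failing with `InsufficientSize` while the writer thread still holds
// unread frames, so the producer busy-waits until enough contiguous space is
// freed. The loop in isolation, with signatures as in the forked bbqueue used
// by this series:

use bbqueue::framed::{FrameGrantW, FrameProducer};

fn grant_spinning<'a>(producer: &mut FrameProducer<'a>, total_length: usize) -> FrameGrantW<'a> {
    loop {
        match producer.grant(total_length) {
            Ok(grant) => break grant,
            Err(bbqueue::Error::InsufficientSize) => continue,
            Err(e) => unreachable!("{e:?}"),
        }
    }
}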
@@ -441,11 +540,21 @@ impl<'b> ExtractorBbqueueSender<'b> { let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); - let payload_header = - EntryHeader::ArroySetVector(ArroySetVector { docid, embedder_id, _padding: [0; 3] }); + let arroy_set_vector = ArroySetVector { docid, embedder_id, _padding: [0; 3] }; + let payload_header = EntryHeader::ArroySetVector(arroy_set_vector); let total_length = EntryHeader::total_set_vector_size(embedding.len()); if total_length > capacity { - unreachable!("entry larger that the BBQueue capacity"); + let mut embedding_bytes = bytemuck::cast_slice(embedding); + let mut value_file = tempfile::tempfile().map(BufWriter::new)?; + io::copy(&mut embedding_bytes, &mut value_file)?; + let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; + value_file.sync_all()?; + let embedding = unsafe { Mmap::map(&value_file)? }; + + let large_vector = LargeVector { docid, embedder_id, embedding }; + self.sender.send(ReceiverAction::LargeVector(large_vector)).unwrap(); + + return Ok(()); } // Spin loop to have a frame the size we requested. @@ -457,7 +566,6 @@ impl<'b> ExtractorBbqueueSender<'b> { } }; - // payload_header.serialize_into(&mut grant); let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); payload_header.serialize_into(header_bytes); @@ -475,6 +583,83 @@ impl<'b> ExtractorBbqueueSender<'b> { Ok(()) } + fn set_vectors( + &self, + docid: u32, + embedder_id: u8, + embeddings: &[Vec], + ) -> crate::Result<()> { + let capacity = self.capacity; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + let dimensions = match embeddings.first() { + Some(embedding) => embedding.len(), + None => return Ok(()), + }; + + let arroy_set_vector = ArroySetVectors { + docid, + dimensions: dimensions.try_into().unwrap(), + embedder_id, + _padding: 0, + }; + + let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector); + let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions); + if total_length > capacity { + let mut value_file = tempfile::tempfile().map(BufWriter::new)?; + for embedding in embeddings { + let mut embedding_bytes = bytemuck::cast_slice(embedding); + io::copy(&mut embedding_bytes, &mut value_file)?; + } + + let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; + value_file.sync_all()?; + let embeddings = unsafe { Mmap::map(&value_file)? }; + + let large_vectors = LargeVectors { + docid, + embedder_id, + dimensions: dimensions.try_into().unwrap(), + embeddings, + }; + + self.sender.send(ReceiverAction::LargeVectors(large_vectors)).unwrap(); + + return Ok(()); + } + + // Spin loop to have a frame the size we requested. + let mut grant = loop { + match producer.grant(total_length) { + Ok(grant) => break grant, + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + } + }; + + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + + let output_iter = remaining.chunks_exact_mut(dimensions * mem::size_of::()); + for (embedding, output) in embeddings.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } + + // We could commit only the used memory. 
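// (As written, the whole reserved frame is committed even when the callback
// fills less than it asked for; committing only the written prefix would be a
// follow-up optimization.)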
+ grant.commit(total_length); + + // We only send a wake up message when the channel is empty + // so that we don't fill the channel with too many WakeUps. + if self.sender.is_empty() { + self.sender.send(ReceiverAction::WakeUp).unwrap(); + } + + Ok(()) + } + fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); self.write_key_value_with(database, key_length, value.len(), |key_buffer, value_buffer| { @@ -502,7 +687,22 @@ impl<'b> ExtractorBbqueueSender<'b> { let payload_header = EntryHeader::DbOperation(operation); let total_length = EntryHeader::total_key_value_size(key_length, value_length); if total_length > capacity { - unreachable!("entry larger that the BBQueue capacity"); + let mut key_buffer = vec![0; key_length.get() as usize].into_boxed_slice(); + let value_file = tempfile::tempfile()?; + value_file.set_len(value_length.try_into().unwrap())?; + let mut mmap_mut = unsafe { MmapMut::map_mut(&value_file)? }; + + key_value_writer(&mut key_buffer, &mut mmap_mut)?; + + self.sender + .send(ReceiverAction::LargeEntry(LargeEntry { + database, + key: key_buffer, + value: mmap_mut.make_read_only()?, + })) + .unwrap(); + + return Ok(()); } // Spin loop to have a frame the size we requested. @@ -559,7 +759,7 @@ impl<'b> ExtractorBbqueueSender<'b> { let payload_header = EntryHeader::DbOperation(operation); let total_length = EntryHeader::total_key_size(key_length); if total_length > capacity { - unreachable!("entry larger that the BBQueue capacity"); + panic!("The entry is larger ({total_length} bytes) than the BBQueue capacity ({capacity} bytes)"); } // Spin loop to have a frame the size we requested. @@ -763,10 +963,7 @@ impl EmbeddingSender<'_, '_> { embedder_id: u8, embeddings: Vec, ) -> crate::Result<()> { - for embedding in embeddings { - self.set_vector(docid, embedder_id, embedding)?; - } - Ok(()) + self.0.set_vectors(docid, embedder_id, &embeddings[..]) } pub fn set_vector( @@ -786,11 +983,11 @@ impl GeoSender<'_, '_> { pub fn set_rtree(&self, value: Mmap) -> StdResult<(), SendError<()>> { self.0 .sender - .send(ReceiverAction::LargeEntry { + .send(ReceiverAction::LargeEntry(LargeEntry { database: Database::Main, key: GEO_RTREE_KEY.to_string().into_bytes().into_boxed_slice(), value, - }) + })) .map_err(|_| SendError(())) } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 3a4406aef..9ad7a8f0b 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -16,6 +16,7 @@ use rand::SeedableRng as _; use raw_collections::RawMap; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; +use {LargeEntry, LargeVector}; use super::channel::*; use super::extract::*; @@ -40,7 +41,7 @@ use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::InnerIndexSettings; use crate::update::{FacetsUpdateBulk, GrenadParameters}; -use crate::vector::{ArroyWrapper, EmbeddingConfigs}; +use crate::vector::{ArroyWrapper, EmbeddingConfigs, Embeddings}; use crate::{ Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder, UserError, @@ -132,7 +133,8 @@ where { let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); let _entered = 
span.enter(); - extract(document_changes, + extract( + document_changes, &document_extractor, indexing_context, &mut extractor_allocs, @@ -416,7 +418,7 @@ where match action { ReceiverAction::WakeUp => (), - ReceiverAction::LargeEntry { database, key, value } => { + ReceiverAction::LargeEntry(LargeEntry { database, key, value }) => { let database_name = database.database_name(); let database = database.database(index); if let Err(error) = database.put(wtxn, &key, &value) { @@ -428,6 +430,24 @@ where })); } } + ReceiverAction::LargeVector(large_vector) => { + let embedding = large_vector.read_embedding(); + let LargeVector { docid, embedder_id, .. } = large_vector; + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_item(wtxn, docid, embedding)?; + } + ReceiverAction::LargeVectors(large_vectors) => { + let LargeVectors { docid, embedder_id, .. } = large_vectors; + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + let mut embeddings = Embeddings::new(*dimensions); + for embedding in large_vectors.read_embeddings() { + embeddings.push(embedding.to_vec()).unwrap(); + } + } } // Every time the is a message in the channel we search @@ -582,6 +602,19 @@ fn write_from_bbqueue( writer.del_items(wtxn, *dimensions, docid)?; writer.add_item(wtxn, docid, embedding)?; } + EntryHeader::ArroySetVectors(asvs) => { + let ArroySetVectors { docid, embedder_id, .. } = asvs; + let frame = frame_with_header.frame(); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + writer.del_items(wtxn, *dimensions, docid)?; + for index in 0.. { + match asvs.read_embedding_into_vec(frame, index, aligned_embedding) { + Some(embedding) => writer.add_item(wtxn, docid, embedding)?, + None => break, + } + } + } } } From cc4bd54669b64b6fa195616fb18ca7da38c299a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 13:53:25 +0100 Subject: [PATCH 16/37] Correctly construct the Embeddings struct --- crates/milli/src/update/new/channel.rs | 14 ++++++++++++++ crates/milli/src/update/new/indexer/mod.rs | 13 ++++++------- crates/milli/src/vector/mod.rs | 2 +- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 7eaa50df1..237c19a5c 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -293,6 +293,20 @@ impl ArroySetVectors { }); Some(&vec[..]) } + + /// Read all the embeddings and write them into an aligned `f32` Vec. + pub fn read_all_embeddings_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + vec: &'v mut Vec, + ) -> &'v [f32] { + vec.clear(); + Self::remaining_bytes(frame).chunks_exact(mem::size_of::()).for_each(|bytes| { + let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); + vec.push(f); + }); + &vec[..] + } } #[derive(Debug, Clone, Copy)] diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 9ad7a8f0b..a8a94cb7c 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -442,11 +442,12 @@ where let LargeVectors { docid, embedder_id, .. 
} = large_vectors; let (_, _, writer, dimensions) = arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; let mut embeddings = Embeddings::new(*dimensions); for embedding in large_vectors.read_embeddings() { embeddings.push(embedding.to_vec()).unwrap(); } + writer.del_items(wtxn, *dimensions, docid)?; + writer.add_items(wtxn, docid, &embeddings)?; } } @@ -607,13 +608,11 @@ fn write_from_bbqueue( let frame = frame_with_header.frame(); let (_, _, writer, dimensions) = arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let mut embeddings = Embeddings::new(*dimensions); + let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); + embeddings.append(all_embeddings.to_vec()).unwrap(); writer.del_items(wtxn, *dimensions, docid)?; - for index in 0.. { - match asvs.read_embedding_into_vec(frame, index, aligned_embedding) { - Some(embedding) => writer.add_item(wtxn, docid, embedding)?, - None => break, - } - } + writer.add_items(wtxn, docid, &embeddings)?; } } } diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 3047e6dfc..a1d71ef93 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -475,7 +475,7 @@ impl Embeddings { Ok(()) } - /// Append a flat vector of embeddings a the end of the embeddings. + /// Append a flat vector of embeddings at the end of the embeddings. /// /// If `embeddings.len() % self.dimension != 0`, then the append operation fails. pub fn append(&mut self, mut embeddings: Vec) -> Result<(), Vec> { From 096a28656ee3c1bba1900f2335e33a8a88677070 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 15:15:06 +0100 Subject: [PATCH 17/37] Fix a bug around deleting all the vectors of a doc --- crates/milli/src/update/new/channel.rs | 68 ++++++--------------- crates/milli/src/update/new/indexer/mod.rs | 7 ++- crates/milli/src/update/new/ref_cell_ext.rs | 1 + 3 files changed, 23 insertions(+), 53 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 237c19a5c..38f436837 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -146,15 +146,13 @@ pub struct LargeVectors { pub docid: DocumentId, /// The embedder id in which to insert the large embedding. pub embedder_id: u8, - /// The dimensions of the embeddings in this payload. - pub dimensions: u16, /// The large embedding that must be written. pub embeddings: Mmap, } impl LargeVectors { - pub fn read_embeddings(&self) -> impl Iterator { - self.embeddings.chunks_exact(self.dimensions as usize).map(bytemuck::cast_slice) + pub fn read_embeddings(&self, dimensions: usize) -> impl Iterator { + self.embeddings.chunks_exact(dimensions).map(bytemuck::cast_slice) } } @@ -241,15 +239,18 @@ impl ArroySetVector { &self, frame: &FrameGrantR<'_>, vec: &'v mut Vec, - ) -> &'v [f32] { + ) -> Option<&'v [f32]> { vec.clear(); let skip = EntryHeader::variant_size() + mem::size_of::(); let bytes = &frame[skip..]; + if bytes.is_empty() { + return None; + } bytes.chunks_exact(mem::size_of::()).for_each(|bytes| { let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); vec.push(f); }); - &vec[..] + Some(&vec[..]) } } @@ -259,9 +260,8 @@ impl ArroySetVector { /// non-aligned [f32] each with dimensions f32s. 
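///
/// The payload that follows this header is `count * dimensions` native-endian
/// `f32`s packed back to back, with no alignment guarantee; an empty payload
/// means the document's previous vectors are deleted and nothing is inserted
/// in their place.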
pub struct ArroySetVectors { pub docid: DocumentId, - pub dimensions: u16, pub embedder_id: u8, - _padding: u8, + _padding: [u8; 3], } impl ArroySetVectors { @@ -270,30 +270,6 @@ impl ArroySetVectors { &frame[skip..] } - // /// The number of embeddings in this payload. - // pub fn embedding_count(&self, frame: &FrameGrantR<'_>) -> usize { - // let bytes = Self::remaining_bytes(frame); - // bytes.len().checked_div(self.dimensions as usize).unwrap() - // } - - /// Read the embedding at `index` or `None` if out of bounds. - pub fn read_embedding_into_vec<'v>( - &self, - frame: &FrameGrantR<'_>, - index: usize, - vec: &'v mut Vec, - ) -> Option<&'v [f32]> { - vec.clear(); - let bytes = Self::remaining_bytes(frame); - let embedding_size = self.dimensions as usize * mem::size_of::(); - let embedding_bytes = bytes.chunks_exact(embedding_size).nth(index)?; - embedding_bytes.chunks_exact(mem::size_of::()).for_each(|bytes| { - let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); - vec.push(f); - }); - Some(&vec[..]) - } - /// Read all the embeddings and write them into an aligned `f32` Vec. pub fn read_all_embeddings_into_vec<'v>( &self, @@ -607,18 +583,14 @@ impl<'b> ExtractorBbqueueSender<'b> { let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); + // If there are no vector we specify the dimensions + // to zero to allocate no extra space at all let dimensions = match embeddings.first() { Some(embedding) => embedding.len(), - None => return Ok(()), - }; - - let arroy_set_vector = ArroySetVectors { - docid, - dimensions: dimensions.try_into().unwrap(), - embedder_id, - _padding: 0, + None => 0, }; + let arroy_set_vector = ArroySetVectors { docid, embedder_id, _padding: [0; 3] }; let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector); let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions); if total_length > capacity { @@ -632,13 +604,7 @@ impl<'b> ExtractorBbqueueSender<'b> { value_file.sync_all()?; let embeddings = unsafe { Mmap::map(&value_file)? }; - let large_vectors = LargeVectors { - docid, - embedder_id, - dimensions: dimensions.try_into().unwrap(), - embeddings, - }; - + let large_vectors = LargeVectors { docid, embedder_id, embeddings }; self.sender.send(ReceiverAction::LargeVectors(large_vectors)).unwrap(); return Ok(()); @@ -657,9 +623,11 @@ impl<'b> ExtractorBbqueueSender<'b> { let (header_bytes, remaining) = grant.split_at_mut(header_size); payload_header.serialize_into(header_bytes); - let output_iter = remaining.chunks_exact_mut(dimensions * mem::size_of::()); - for (embedding, output) in embeddings.iter().zip(output_iter) { - output.copy_from_slice(bytemuck::cast_slice(embedding)); + if dimensions != 0 { + let output_iter = remaining.chunks_exact_mut(dimensions * mem::size_of::()); + for (embedding, output) in embeddings.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } } // We could commit only the used memory. 
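// On the reading side, the writer decodes that payload exactly as
// `read_all_embeddings_into_vec` does above; stripped of the frame plumbing,
// it boils down to:

fn unpack_embeddings(bytes: &[u8]) -> Vec<f32> {
    bytes
        .chunks_exact(std::mem::size_of::<f32>())
        .map(|chunk| f32::from_ne_bytes(chunk.try_into().unwrap()))
        .collect()
}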
diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index a8a94cb7c..07cb9d69e 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -443,7 +443,7 @@ where let (_, _, writer, dimensions) = arroy_writers.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); - for embedding in large_vectors.read_embeddings() { + for embedding in large_vectors.read_embeddings(*dimensions) { embeddings.push(embedding.to_vec()).unwrap(); } writer.del_items(wtxn, *dimensions, docid)?; @@ -597,11 +597,12 @@ fn write_from_bbqueue( EntryHeader::ArroySetVector(asv) => { let ArroySetVector { docid, embedder_id, .. } = asv; let frame = frame_with_header.frame(); - let embedding = asv.read_embedding_into_vec(frame, aligned_embedding); let (_, _, writer, dimensions) = arroy_writers.get(&embedder_id).expect("requested a missing embedder"); writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, embedding)?; + if let Some(embedding) = asv.read_embedding_into_vec(frame, aligned_embedding) { + writer.add_item(wtxn, docid, embedding)?; + } } EntryHeader::ArroySetVectors(asvs) => { let ArroySetVectors { docid, embedder_id, .. } = asvs; diff --git a/crates/milli/src/update/new/ref_cell_ext.rs b/crates/milli/src/update/new/ref_cell_ext.rs index c66f4af0a..77f5fa800 100644 --- a/crates/milli/src/update/new/ref_cell_ext.rs +++ b/crates/milli/src/update/new/ref_cell_ext.rs @@ -5,6 +5,7 @@ pub trait RefCellExt { &self, ) -> std::result::Result, std::cell::BorrowMutError>; + #[track_caller] fn borrow_mut_or_yield(&self) -> RefMut<'_, T> { self.try_borrow_mut_or_yield().unwrap() } From b57dd5c58e2944bb607681a4adfcf0b05dd25b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 15:19:57 +0100 Subject: [PATCH 18/37] Remove the Vector variant and use the Vectors --- crates/milli/src/update/new/channel.rs | 126 +-------------------- crates/milli/src/update/new/indexer/mod.rs | 19 ---- 2 files changed, 4 insertions(+), 141 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 38f436837..102a27336 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -100,7 +100,6 @@ pub enum ReceiverAction { /// Wake up, you have frames to read for the BBQueue buffers. WakeUp, LargeEntry(LargeEntry), - LargeVector(LargeVector), LargeVectors(LargeVectors), } @@ -120,24 +119,6 @@ pub struct LargeEntry { pub value: Mmap, } -/// When an embedding is larger than the available -/// BBQueue space it arrives here. -#[derive(Debug)] -pub struct LargeVector { - /// The document id associated to the large embedding. - pub docid: DocumentId, - /// The embedder id in which to insert the large embedding. - pub embedder_id: u8, - /// The large embedding that must be written. - pub embedding: Mmap, -} - -impl LargeVector { - pub fn read_embedding(&self) -> &[f32] { - bytemuck::cast_slice(&self.embedding) - } -} - /// When embeddings are larger than the available /// BBQueue space it arrives here. #[derive(Debug)] @@ -225,35 +206,6 @@ pub struct ArroyDeleteVector { pub docid: DocumentId, } -#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] -#[repr(C)] -/// The embedding is the remaining space and represents a non-aligned [f32]. 
-pub struct ArroySetVector { - pub docid: DocumentId, - pub embedder_id: u8, - _padding: [u8; 3], -} - -impl ArroySetVector { - pub fn read_embedding_into_vec<'v>( - &self, - frame: &FrameGrantR<'_>, - vec: &'v mut Vec, - ) -> Option<&'v [f32]> { - vec.clear(); - let skip = EntryHeader::variant_size() + mem::size_of::(); - let bytes = &frame[skip..]; - if bytes.is_empty() { - return None; - } - bytes.chunks_exact(mem::size_of::()).for_each(|bytes| { - let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); - vec.push(f); - }); - Some(&vec[..]) - } -} - #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] #[repr(C)] /// The embeddings are in the remaining space and represents @@ -290,7 +242,6 @@ impl ArroySetVectors { pub enum EntryHeader { DbOperation(DbOperation), ArroyDeleteVector(ArroyDeleteVector), - ArroySetVector(ArroySetVector), ArroySetVectors(ArroySetVectors), } @@ -303,8 +254,7 @@ impl EntryHeader { match self { EntryHeader::DbOperation(_) => 0, EntryHeader::ArroyDeleteVector(_) => 1, - EntryHeader::ArroySetVector(_) => 2, - EntryHeader::ArroySetVectors(_) => 3, + EntryHeader::ArroySetVectors(_) => 2, } } @@ -323,11 +273,6 @@ impl EntryHeader { Self::variant_size() + mem::size_of::() } - /// The `dimensions` corresponds to the number of `f32` in the embedding. - fn total_set_vector_size(dimensions: usize) -> usize { - Self::variant_size() + mem::size_of::() + dimensions * mem::size_of::() - } - /// The `dimensions` corresponds to the number of `f32` in the embedding. fn total_set_vectors_size(count: usize, dimensions: usize) -> usize { let embedding_size = dimensions * mem::size_of::(); @@ -338,7 +283,6 @@ impl EntryHeader { let payload_size = match self { EntryHeader::DbOperation(op) => mem::size_of_val(op), EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), - EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv), EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), }; Self::variant_size() + payload_size @@ -358,11 +302,6 @@ impl EntryHeader { EntryHeader::ArroyDeleteVector(header) } 2 => { - let header_bytes = &remaining[..mem::size_of::()]; - let header = checked::pod_read_unaligned(header_bytes); - EntryHeader::ArroySetVector(header) - } - 3 => { let header_bytes = &remaining[..mem::size_of::()]; let header = checked::pod_read_unaligned(header_bytes); EntryHeader::ArroySetVectors(header) @@ -376,7 +315,6 @@ impl EntryHeader { let payload_bytes = match self { EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), - EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv), EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), }; *first = self.variant_id(); @@ -520,59 +458,6 @@ impl<'b> ExtractorBbqueueSender<'b> { Ok(()) } - fn set_vector( - &self, - docid: DocumentId, - embedder_id: u8, - embedding: &[f32], - ) -> crate::Result<()> { - let capacity = self.capacity; - let refcell = self.producers.get().unwrap(); - let mut producer = refcell.0.borrow_mut_or_yield(); - - let arroy_set_vector = ArroySetVector { docid, embedder_id, _padding: [0; 3] }; - let payload_header = EntryHeader::ArroySetVector(arroy_set_vector); - let total_length = EntryHeader::total_set_vector_size(embedding.len()); - if total_length > capacity { - let mut embedding_bytes = bytemuck::cast_slice(embedding); - let mut value_file = tempfile::tempfile().map(BufWriter::new)?; - io::copy(&mut embedding_bytes, &mut value_file)?; - let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; 
- value_file.sync_all()?; - let embedding = unsafe { Mmap::map(&value_file)? }; - - let large_vector = LargeVector { docid, embedder_id, embedding }; - self.sender.send(ReceiverAction::LargeVector(large_vector)).unwrap(); - - return Ok(()); - } - - // Spin loop to have a frame the size we requested. - let mut grant = loop { - match producer.grant(total_length) { - Ok(grant) => break grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - } - }; - - let header_size = payload_header.header_size(); - let (header_bytes, remaining) = grant.split_at_mut(header_size); - payload_header.serialize_into(header_bytes); - remaining.copy_from_slice(bytemuck::cast_slice(embedding)); - - // We could commit only the used memory. - grant.commit(total_length); - - // We only send a wake up message when the channel is empty - // so that we don't fill the channel with too many WakeUps. - if self.sender.is_empty() { - self.sender.send(ReceiverAction::WakeUp).unwrap(); - } - - Ok(()) - } - fn set_vectors( &self, docid: u32, @@ -583,12 +468,9 @@ impl<'b> ExtractorBbqueueSender<'b> { let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); - // If there are no vector we specify the dimensions + // If there are no vectors we specify the dimensions // to zero to allocate no extra space at all - let dimensions = match embeddings.first() { - Some(embedding) => embedding.len(), - None => 0, - }; + let dimensions = embeddings.first().map_or(0, |emb| emb.len()); let arroy_set_vector = ArroySetVectors { docid, embedder_id, _padding: [0; 3] }; let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector); @@ -954,7 +836,7 @@ impl EmbeddingSender<'_, '_> { embedder_id: u8, embedding: Embedding, ) -> crate::Result<()> { - self.0.set_vector(docid, embedder_id, &embedding[..]) + self.0.set_vectors(docid, embedder_id, &[embedding]) } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 07cb9d69e..9a6b40efb 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -16,7 +16,6 @@ use rand::SeedableRng as _; use raw_collections::RawMap; use time::OffsetDateTime; pub use update_by_function::UpdateByFunction; -use {LargeEntry, LargeVector}; use super::channel::*; use super::extract::*; @@ -430,14 +429,6 @@ where })); } } - ReceiverAction::LargeVector(large_vector) => { - let embedding = large_vector.read_embedding(); - let LargeVector { docid, embedder_id, .. } = large_vector; - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - writer.add_item(wtxn, docid, embedding)?; - } ReceiverAction::LargeVectors(large_vectors) => { let LargeVectors { docid, embedder_id, .. } = large_vectors; let (_, _, writer, dimensions) = @@ -594,16 +585,6 @@ fn write_from_bbqueue( writer.del_items(wtxn, dimensions, docid)?; } } - EntryHeader::ArroySetVector(asv) => { - let ArroySetVector { docid, embedder_id, .. } = asv; - let frame = frame_with_header.frame(); - let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); - writer.del_items(wtxn, *dimensions, docid)?; - if let Some(embedding) = asv.read_embedding_into_vec(frame, aligned_embedding) { - writer.add_item(wtxn, docid, embedding)?; - } - } EntryHeader::ArroySetVectors(asvs) => { let ArroySetVectors { docid, embedder_id, .. 
} = asvs; let frame = frame_with_header.frame(); From 3c7ac093d39a6fa08eaf5e34814ba967037e80ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 15:43:14 +0100 Subject: [PATCH 19/37] Take the BBQueue capacity into account in the max memory --- crates/milli/src/update/new/channel.rs | 11 +++++++---- crates/milli/src/update/new/indexer/mod.rs | 23 ++++++++++++++-------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 102a27336..1a463be1e 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -27,8 +27,9 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// Creates a tuple of senders/receiver to be used by /// the extractors and the writer loop. /// -/// The `bbqueue_capacity` represent the number of bytes allocated -/// to each BBQueue buffer and is not the sum of all of them. +/// The `total_bbbuffer_capacity` represent the number of bytes +/// allocated to all BBQueue buffer. It will be split by the +/// number of thread. /// /// The `channel_capacity` parameter defines the number of /// too-large-to-fit-in-BBQueue entries that can be sent through @@ -46,10 +47,12 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// to the number of available threads in the rayon threadpool. pub fn extractor_writer_bbqueue( bbbuffers: &mut Vec, - bbbuffer_capacity: usize, + total_bbbuffer_capacity: usize, channel_capacity: usize, ) -> (ExtractorBbqueueSender, WriterBbqueueReceiver) { - bbbuffers.resize_with(rayon::current_num_threads(), || BBBuffer::new(bbbuffer_capacity)); + let current_num_threads = rayon::current_num_threads(); + let bbbuffer_capacity = total_bbbuffer_capacity.checked_div(current_num_threads).unwrap(); + bbbuffers.resize_with(current_num_threads, || BBBuffer::new(bbbuffer_capacity)); let capacity = bbbuffers.first().unwrap().capacity(); // Read the field description to understand this diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 9a6b40efb..99ee89701 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -79,15 +79,22 @@ where { let mut bbbuffers = Vec::new(); let finished_extraction = AtomicBool::new(false); + + // We compute and remove the allocated BBQueues buffers capacity from the indexing memory. 
+ let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or( + (grenad_parameters, 100 * 1024 * 1024 * pool.current_num_threads()), // 100 MiB by thread by default + |max_memory| { + let total_bbbuffer_capacity = max_memory / 10; // 10% of the indexing memory + let new_grenad_parameters = GrenadParameters { + max_memory: Some(max_memory - total_bbbuffer_capacity), + ..grenad_parameters + }; + (new_grenad_parameters, total_bbbuffer_capacity) + }, + ); + let (extractor_sender, mut writer_receiver) = pool - .install(|| { - /// TODO restrict memory and remove this memory from the extractors bump allocators - extractor_writer_bbqueue( - &mut bbbuffers, - 100 * 1024 * 1024, // 100 MiB - 1000, - ) - }) + .install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000)) .unwrap(); let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; From 8a35cd1743ec4ce9e8b872bbd9bb0ede4aaad35d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 16:00:15 +0100 Subject: [PATCH 20/37] Adjust the BBQueue buffers to use 2% instead of 10% --- crates/milli/src/update/new/indexer/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 99ee89701..19f1bca3e 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -84,7 +84,7 @@ where let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or( (grenad_parameters, 100 * 1024 * 1024 * pool.current_num_threads()), // 100 MiB by thread by default |max_memory| { - let total_bbbuffer_capacity = max_memory / 10; // 10% of the indexing memory + let total_bbbuffer_capacity = max_memory / (100 / 2); // 2% of the indexing memory let new_grenad_parameters = GrenadParameters { max_memory: Some(max_memory - total_bbbuffer_capacity), ..grenad_parameters From 14ee7aa84c7fc82e6475f551b1fc9d2b4f8aaff2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Nov 2024 18:02:48 +0100 Subject: [PATCH 21/37] Make sure the BBQueue is at least 50 MiB --- crates/milli/src/update/new/indexer/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 19f1bca3e..e0450ff7d 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -81,10 +81,12 @@ where let finished_extraction = AtomicBool::new(false); // We compute and remove the allocated BBQueues buffers capacity from the indexing memory. 
+        let minimum_capacity = 50 * 1024 * 1024 * pool.current_num_threads(); // 50 MiB
         let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or(
-            (grenad_parameters, 100 * 1024 * 1024 * pool.current_num_threads()), // 100 MiB by thread by default
+            (grenad_parameters, 2 * minimum_capacity), // 100 MiB by thread by default
             |max_memory| {
-                let total_bbbuffer_capacity = max_memory / (100 / 2); // 2% of the indexing memory
+                // 2% of the indexing memory
+                let total_bbbuffer_capacity = (max_memory / 100 / 2).max(minimum_capacity);
                 let new_grenad_parameters = GrenadParameters {
                     max_memory: Some(max_memory - total_bbbuffer_capacity),
                     ..grenad_parameters

From 13f21206a64de13202cec3c2841a8c3654b6899a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 2 Dec 2024 10:03:01 +0100
Subject: [PATCH 22/37] Call the serialize_into_writer method from the serialize_into one

---
 .../roaring_bitmap/cbo_roaring_bitmap_codec.rs | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
index cae1874dd..20a246dcd 100644
--- a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
+++ b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
@@ -27,18 +27,8 @@ impl CboRoaringBitmapCodec {
         }
     }
 
-    pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec<u8>) {
-        if roaring.len() <= THRESHOLD as u64 {
-            // If the number of items (u32s) to encode is less than or equal to the threshold
-            // it means that it would weigh the same or less than the RoaringBitmap
-            // header, so we directly encode them using ByteOrder instead.
-            for integer in roaring {
-                vec.write_u32::<NativeEndian>(integer).unwrap();
-            }
-        } else {
-            // Otherwise, we use the classic RoaringBitmapCodec that writes a header.
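The body being moved out here (its last two lines follow just below) is the whole idea behind the codec, so it is worth seeing in one piece. A standalone sketch, assuming milli's THRESHOLD of 7 and its native-endian convention; roughly, up to seven u32s stored raw cost no more than a serialized RoaringBitmap once its header is counted:

    use std::io::{self, Write};

    use byteorder::{NativeEndian, WriteBytesExt};
    use roaring::RoaringBitmap;

    const THRESHOLD: usize = 7; // assumed value; the real constant lives next to the codec

    fn serialize_cbo<W: Write>(bitmap: &RoaringBitmap, mut writer: W) -> io::Result<()> {
        if bitmap.len() <= THRESHOLD as u64 {
            // Few enough integers: raw native-endian u32s are no bigger
            // than a RoaringBitmap with its serialization header.
            for integer in bitmap {
                writer.write_u32::<NativeEndian>(integer)?;
            }
            Ok(())
        } else {
            // Past the threshold the header pays for itself.
            bitmap.serialize_into(writer)
        }
    }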
- roaring.serialize_into(vec).unwrap(); - } + pub fn serialize_into_vec(roaring: &RoaringBitmap, vec: &mut Vec) { + Self::serialize_into_writer(roaring, vec).unwrap() } pub fn serialize_into_writer( From db4eaf4d2de4140fde57ddfd71af80f8a4ed4826 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:03:27 +0100 Subject: [PATCH 23/37] Rename serialize_into into serialize_into_writer --- crates/milli/src/heed_codec/facet/mod.rs | 2 +- .../heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs | 4 ++-- crates/milli/src/update/new/extract/cache.rs | 8 ++++---- crates/milli/src/update/new/words_prefix_docids.rs | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/milli/src/heed_codec/facet/mod.rs b/crates/milli/src/heed_codec/facet/mod.rs index a8bb5055e..c0870c9fd 100644 --- a/crates/milli/src/heed_codec/facet/mod.rs +++ b/crates/milli/src/heed_codec/facet/mod.rs @@ -97,7 +97,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { fn bytes_encode(value: &'a Self::EItem) -> Result, BoxedError> { let mut v = vec![value.size]; - CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); + CboRoaringBitmapCodec::serialize_into_vec(&value.bitmap, &mut v); Ok(Cow::Owned(v)) } } diff --git a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 20a246dcd..0ab162880 100644 --- a/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/crates/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -152,7 +152,7 @@ impl CboRoaringBitmapCodec { return Ok(None); } - Self::serialize_into(&previous, buffer); + Self::serialize_into_vec(&previous, buffer); Ok(Some(&buffer[..])) } } @@ -178,7 +178,7 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { fn bytes_encode(item: &Self::EItem) -> Result, BoxedError> { let mut vec = Vec::with_capacity(Self::serialized_size(item)); - Self::serialize_into(item, &mut vec); + Self::serialize_into_vec(item, &mut vec); Ok(Cow::Owned(vec)) } } diff --git a/crates/milli/src/update/new/extract/cache.rs b/crates/milli/src/update/new/extract/cache.rs index 26ed0eb44..be077d142 100644 --- a/crates/milli/src/update/new/extract/cache.rs +++ b/crates/milli/src/update/new/extract/cache.rs @@ -415,21 +415,21 @@ fn spill_entry_to_sorter( match deladd { DelAddRoaringBitmap { del: Some(del), add: None } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer); value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: Some(add) } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer); value_writer.insert(DelAdd::Addition, &cbo_buffer)?; } DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer); value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; cbo_buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); + CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer); value_writer.insert(DelAdd::Addition, &cbo_buffer)?; } DelAddRoaringBitmap { del: None, add: None } => return Ok(()), diff --git a/crates/milli/src/update/new/words_prefix_docids.rs b/crates/milli/src/update/new/words_prefix_docids.rs index 
338d22505..7e56beeae 100644 --- a/crates/milli/src/update/new/words_prefix_docids.rs +++ b/crates/milli/src/update/new/words_prefix_docids.rs @@ -76,7 +76,7 @@ impl WordPrefixDocids { .union()?; buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&output, buffer); + CboRoaringBitmapCodec::serialize_into_vec(&output, buffer); index.push(PrefixEntry { prefix, serialized_length: buffer.len() }); file.write_all(buffer) })?; @@ -211,7 +211,7 @@ impl WordPrefixIntegerDocids { .union()?; buffer.clear(); - CboRoaringBitmapCodec::serialize_into(&output, buffer); + CboRoaringBitmapCodec::serialize_into_vec(&output, buffer); index.push(PrefixIntegerEntry { prefix, pos, serialized_length: buffer.len() }); file.write_all(buffer)?; } From 76d0623b11b88c169843bbc61c1b8bff132e9d4f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:05:06 +0100 Subject: [PATCH 24/37] Reduce the number of unwraps --- crates/milli/src/update/new/merger.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index f8af84177..b650b6b53 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -56,7 +56,7 @@ where let rtree_mmap = unsafe { Mmap::map(&file)? }; geo_sender.set_rtree(rtree_mmap).unwrap(); - geo_sender.set_geo_faceted(&faceted).unwrap(); + geo_sender.set_geo_faceted(&faceted)?; Ok(()) } @@ -82,11 +82,11 @@ where let current = database.get(&rtxn, key)?; match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { - docids_sender.write(key, &bitmap).unwrap(); + docids_sender.write(key, &bitmap)?; Ok(()) } Operation::Delete => { - docids_sender.delete(key).unwrap(); + docids_sender.delete(key)?; Ok(()) } Operation::Ignore => Ok(()), @@ -112,12 +112,12 @@ pub fn merge_and_send_facet_docids<'extractor>( match merge_cbo_bitmaps(current, del, add)? { Operation::Write(bitmap) => { facet_field_ids_delta.register_from_key(key); - docids_sender.write(key, &bitmap).unwrap(); + docids_sender.write(key, &bitmap)?; Ok(()) } Operation::Delete => { facet_field_ids_delta.register_from_key(key); - docids_sender.delete(key).unwrap(); + docids_sender.delete(key)?; Ok(()) } Operation::Ignore => Ok(()), From 5b860cb9893ded811150f9ae0332dc89f166ea6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:06:35 +0100 Subject: [PATCH 25/37] Fix english in the doc --- crates/milli/src/update/new/channel.rs | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 1a463be1e..7375354aa 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -27,9 +27,9 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// Creates a tuple of senders/receiver to be used by /// the extractors and the writer loop. /// -/// The `total_bbbuffer_capacity` represent the number of bytes -/// allocated to all BBQueue buffer. It will be split by the -/// number of thread. +/// The `total_bbbuffer_capacity` represents the number of bytes +/// allocated to all BBQueue buffers. It will be split by the +/// number of threads. /// /// The `channel_capacity` parameter defines the number of /// too-large-to-fit-in-BBQueue entries that can be sent through @@ -37,14 +37,9 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// sure we do not use too much memory. 
/// /// Note that the channel is also used to wake-up the receiver -/// wehn new stuff is available in any BBQueue buffer but we send +/// when new stuff is available in any BBQueue buffer but we send /// a message in this queue only if it is empty to avoid filling /// the channel *and* the BBQueue. -/// -/// # Safety -/// -/// Panics if the number of provided BBQueues is not exactly equal -/// to the number of available threads in the rayon threadpool. pub fn extractor_writer_bbqueue( bbbuffers: &mut Vec, total_bbbuffer_capacity: usize, @@ -82,7 +77,7 @@ pub struct ExtractorBbqueueSender<'a> { /// The capacity of this frame producer, will never be able to store more than that. /// /// Note that the FrameProducer requires up to 9 bytes to encode the length, - /// the capacity has been shrinked accordingly. + /// the capacity has been shrunk accordingly. /// /// capacity: usize, From 30eb0e5b5baad02475a73c5ae16f3a1713bd21a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:08:01 +0100 Subject: [PATCH 26/37] Rename recv and read methods to recv_action and recv_frame --- crates/milli/src/update/new/channel.rs | 4 ++-- crates/milli/src/update/new/indexer/mod.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 7375354aa..82e483d18 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -136,11 +136,11 @@ impl LargeVectors { } impl<'a> WriterBbqueueReceiver<'a> { - pub fn recv(&mut self) -> Option { + pub fn recv_action(&mut self) -> Option { self.receiver.recv().ok() } - pub fn read(&mut self) -> Option> { + pub fn recv_frame(&mut self) -> Option> { for consumer in &mut self.consumers { if let Some(frame) = consumer.read() { return Some(FrameWithHeader::from(frame)); diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index e0450ff7d..bd3fedae2 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -417,7 +417,7 @@ where let span = tracing::trace_span!(target: "indexing::write_db", "post_merge"); let mut _entered_post_merge = None; - while let Some(action) = writer_receiver.recv() { + while let Some(action) = writer_receiver.recv_action() { if _entered_post_merge.is_none() && finished_extraction.load(std::sync::atomic::Ordering::Relaxed) { @@ -556,7 +556,7 @@ fn write_from_bbqueue( arroy_writers: &HashMap, aligned_embedding: &mut Vec, ) -> crate::Result<()> { - while let Some(frame_with_header) = writer_receiver.read() { + while let Some(frame_with_header) = writer_receiver.recv_frame() { match frame_with_header.header() { EntryHeader::DbOperation(operation) => { let database_name = operation.database.database_name(); From 5df5eb2db26159f79a0cedaea575bb9a79e098c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:10:48 +0100 Subject: [PATCH 27/37] Clarify a method name --- crates/milli/src/update/new/channel.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 82e483d18..7b083341b 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -215,7 +215,7 @@ pub struct ArroySetVectors { } impl ArroySetVectors { - fn remaining_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { + fn embeddings_bytes<'a>(frame: &'a 
FrameGrantR<'_>) -> &'a [u8] { let skip = EntryHeader::variant_size() + mem::size_of::(); &frame[skip..] } @@ -227,7 +227,7 @@ impl ArroySetVectors { vec: &'v mut Vec, ) -> &'v [f32] { vec.clear(); - Self::remaining_bytes(frame).chunks_exact(mem::size_of::()).for_each(|bytes| { + Self::embeddings_bytes(frame).chunks_exact(mem::size_of::()).for_each(|bytes| { let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); vec.push(f); }); From f7f9a131e400bc995d7ef152e559b1e70ecd85e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:15:58 +0100 Subject: [PATCH 28/37] Improve copying bytes into aligned memory area --- crates/milli/src/update/new/channel.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 7b083341b..7a997c3af 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -226,11 +226,10 @@ impl ArroySetVectors { frame: &FrameGrantR<'_>, vec: &'v mut Vec, ) -> &'v [f32] { - vec.clear(); - Self::embeddings_bytes(frame).chunks_exact(mem::size_of::()).for_each(|bytes| { - let f = bytes.try_into().map(f32::from_ne_bytes).unwrap(); - vec.push(f); - }); + let embeddings_bytes = Self::embeddings_bytes(frame); + let embeddings_count = embeddings_bytes.len() / mem::size_of::(); + vec.resize(embeddings_count, 0.0); + bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes); &vec[..] } } From be7d2fbe63070066538a6450d5e46803990169b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:19:11 +0100 Subject: [PATCH 29/37] Move the EntryHeader up in the file and document the safety related to the size --- crates/milli/src/update/new/channel.rs | 128 +++++++++++++------------ 1 file changed, 66 insertions(+), 62 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 7a997c3af..bebaad686 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -172,68 +172,10 @@ impl<'a> From> for FrameWithHeader<'a> { } } -#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] -#[repr(C)] -/// Wether a put of the key/value pair or a delete of the given key. -pub struct DbOperation { - /// The database on which to perform the operation. - pub database: Database, - /// The key length in the buffer. - /// - /// If None it means that the buffer is dedicated - /// to the key and it is therefore a deletion operation. - pub key_length: Option, -} - -impl DbOperation { - pub fn key_value<'a>(&self, frame: &'a FrameGrantR<'_>) -> (&'a [u8], Option<&'a [u8]>) { - let skip = EntryHeader::variant_size() + mem::size_of::(); - match self.key_length { - Some(key_length) => { - let (key, value) = frame[skip..].split_at(key_length.get() as usize); - (key, Some(value)) - } - None => (&frame[skip..], None), - } - } -} - -#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] -#[repr(transparent)] -pub struct ArroyDeleteVector { - pub docid: DocumentId, -} - -#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] -#[repr(C)] -/// The embeddings are in the remaining space and represents -/// non-aligned [f32] each with dimensions f32s. 
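An aside on the copy introduced by PATCH 28 just above, before the move goes on: the resize-then-cast dance replaces a per-float decode loop with a single memcpy into storage that is already f32-aligned. Freestanding, the same move looks like this:

    // Decode a possibly unaligned byte buffer into an aligned Vec<f32>
    // in one copy, as the new read_all_embeddings_into_vec does.
    fn copy_unaligned_f32s(embeddings_bytes: &[u8], vec: &mut Vec<f32>) {
        let count = embeddings_bytes.len() / std::mem::size_of::<f32>();
        vec.resize(count, 0.0);
        // A Vec<f32>'s storage is always f32-aligned, so viewing it as
        // &mut [u8] is valid and the source alignment stops mattering.
        bytemuck::cast_slice_mut::<f32, u8>(vec.as_mut_slice())
            .copy_from_slice(&embeddings_bytes[..count * std::mem::size_of::<f32>()]);
    }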
-pub struct ArroySetVectors {
-    pub docid: DocumentId,
-    pub embedder_id: u8,
-    _padding: [u8; 3],
-}
-
-impl ArroySetVectors {
-    fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
-        let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
-        &frame[skip..]
-    }
-
-    /// Read all the embeddings and write them into an aligned `f32` Vec.
-    pub fn read_all_embeddings_into_vec<'v>(
-        &self,
-        frame: &FrameGrantR<'_>,
-        vec: &'v mut Vec<f32>,
-    ) -> &'v [f32] {
-        let embeddings_bytes = Self::embeddings_bytes(frame);
-        let embeddings_count = embeddings_bytes.len() / mem::size_of::<f32>();
-        vec.resize(embeddings_count, 0.0);
-        bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes);
-        &vec[..]
-    }
-}
-
+/// A header that is written at the beginning of a bbqueue frame.
+///
+/// Note that the different variants cannot be changed without taking
+/// care of their size in the implementation, like, everywhere.
 #[derive(Debug, Clone, Copy)]
 #[repr(u8)]
 pub enum EntryHeader {
@@ -319,6 +261,68 @@ impl EntryHeader {
     }
 }
 
+#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
+#[repr(C)]
+/// Whether a put of the key/value pair or a delete of the given key.
+pub struct DbOperation {
+    /// The database on which to perform the operation.
+    pub database: Database,
+    /// The key length in the buffer.
+    ///
+    /// If None it means that the buffer is dedicated
+    /// to the key and it is therefore a deletion operation.
+    pub key_length: Option<NonZeroU16>,
+}
+
+impl DbOperation {
+    pub fn key_value<'a>(&self, frame: &'a FrameGrantR<'_>) -> (&'a [u8], Option<&'a [u8]>) {
+        let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
+        match self.key_length {
+            Some(key_length) => {
+                let (key, value) = frame[skip..].split_at(key_length.get() as usize);
+                (key, Some(value))
+            }
+            None => (&frame[skip..], None),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
+#[repr(transparent)]
+pub struct ArroyDeleteVector {
+    pub docid: DocumentId,
+}
+
+#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
+#[repr(C)]
+/// The embeddings are in the remaining space and represent
+/// non-aligned [f32] each with `dimensions` f32s.
+pub struct ArroySetVectors {
+    pub docid: DocumentId,
+    pub embedder_id: u8,
+    _padding: [u8; 3],
+}
+
+impl ArroySetVectors {
+    fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
+        let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
+        &frame[skip..]
+    }
+
+    /// Read all the embeddings and write them into an aligned `f32` Vec.
+    pub fn read_all_embeddings_into_vec<'v>(
+        &self,
+        frame: &FrameGrantR<'_>,
+        vec: &'v mut Vec<f32>,
+    ) -> &'v [f32] {
+        let embeddings_bytes = Self::embeddings_bytes(frame);
+        let embeddings_count = embeddings_bytes.len() / mem::size_of::<f32>();
+        vec.resize(embeddings_count, 0.0);
+        bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes);
+        &vec[..]
+ } +} + #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] #[repr(u16)] pub enum Database { From 263c5a348ee321559b8b98789d70be9950d6ec83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:33:49 +0100 Subject: [PATCH 30/37] Move the spin looping for BBQueue frames into a dedicated function --- Cargo.lock | 13 +++++ crates/milli/Cargo.toml | 1 + crates/milli/src/update/new/channel.rs | 79 ++++++++++++-------------- 3 files changed, 49 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8a0a6b3d0..038b269ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1910,6 +1910,15 @@ dependencies = [ "serde_json", ] +[[package]] +name = "flume" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0e4dd2a88388a1f4ccc7c9ce104604dab68d9f408dc34cd45823d5a9069095" +dependencies = [ + "spin", +] + [[package]] name = "fnv" version = "1.0.7" @@ -3623,6 +3632,7 @@ dependencies = [ "enum-iterator", "filter-parser", "flatten-serde-json", + "flume", "fst", "fxhash", "geoutils", @@ -5180,6 +5190,9 @@ name = "spin" version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] [[package]] name = "spm_precompiled" diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index b66dec9a4..a88401470 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -99,6 +99,7 @@ rustc-hash = "2.0.0" uell = "0.1.0" enum-iterator = "2.1.0" bbqueue = { git = "https://github.com/kerollmops/bbqueue" } +flume = { version = "0.11.1", default-features = false } [dev-dependencies] mimalloc = { version = "0.1.43", default-features = false } diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index bebaad686..e8bb6930c 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -4,10 +4,10 @@ use std::marker::PhantomData; use std::mem; use std::num::NonZeroU16; -use bbqueue::framed::{FrameGrantR, FrameProducer}; +use bbqueue::framed::{FrameGrantR, FrameGrantW, FrameProducer}; use bbqueue::BBBuffer; use bytemuck::{checked, CheckedBitPattern, NoUninit}; -use crossbeam_channel::SendError; +use flume::SendError; use heed::types::Bytes; use heed::BytesDecode; use memmap2::{Mmap, MmapMut}; @@ -33,7 +33,7 @@ use crate::{CboRoaringBitmapCodec, DocumentId, Index}; /// /// The `channel_capacity` parameter defines the number of /// too-large-to-fit-in-BBQueue entries that can be sent through -/// a crossbeam channel. This parameter must stay low to make +/// a flume channel. This parameter must stay low to make /// sure we do not use too much memory. /// /// Note that the channel is also used to wake-up the receiver @@ -61,7 +61,7 @@ pub fn extractor_writer_bbqueue( consumer }); - let (sender, receiver) = crossbeam_channel::bounded(channel_capacity); + let (sender, receiver) = flume::bounded(channel_capacity); let sender = ExtractorBbqueueSender { sender, producers, capacity }; let receiver = WriterBbqueueReceiver { receiver, consumers }; (sender, receiver) @@ -70,7 +70,7 @@ pub fn extractor_writer_bbqueue( pub struct ExtractorBbqueueSender<'a> { /// This channel is used to wake-up the receiver and /// send large entries that cannot fit in the BBQueue. 
- sender: crossbeam_channel::Sender, + sender: flume::Sender, /// A memory buffer, one by thread, is used to serialize /// the entries directly in this shared, lock-free space. producers: ThreadLocal>>>, @@ -87,7 +87,7 @@ pub struct WriterBbqueueReceiver<'a> { /// Used to wake up when new entries are available either in /// any BBQueue buffer or directly sent throught this channel /// (still written to disk). - receiver: crossbeam_channel::Receiver, + receiver: flume::Receiver, /// The BBQueue frames to read when waking-up. consumers: Vec>, } @@ -437,19 +437,9 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = loop { - match producer.grant(total_length) { - Ok(grant) => break grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - } - }; - + let mut grant = reserve_grant(&mut producer, total_length, &self.sender); payload_header.serialize_into(&mut grant); - // We could commit only the used memory. - grant.commit(total_length); - // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. if self.sender.is_empty() { @@ -494,13 +484,7 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = loop { - match producer.grant(total_length) { - Ok(grant) => break grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - } - }; + let mut grant = reserve_grant(&mut producer, total_length, &self.sender); let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); @@ -571,13 +555,7 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = loop { - match producer.grant(total_length) { - Ok(grant) => break grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - } - }; + let mut grant = reserve_grant(&mut producer, total_length, &self.sender); let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); @@ -585,9 +563,6 @@ impl<'b> ExtractorBbqueueSender<'b> { let (key_buffer, value_buffer) = remaining.split_at_mut(key_length.get() as usize); key_value_writer(key_buffer, value_buffer)?; - // We could commit only the used memory. - grant.commit(total_length); - // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. if self.sender.is_empty() { @@ -628,22 +603,13 @@ impl<'b> ExtractorBbqueueSender<'b> { } // Spin loop to have a frame the size we requested. - let mut grant = loop { - match producer.grant(total_length) { - Ok(grant) => break grant, - Err(bbqueue::Error::InsufficientSize) => continue, - Err(e) => unreachable!("{e:?}"), - } - }; + let mut grant = reserve_grant(&mut producer, total_length, &self.sender); let header_size = payload_header.header_size(); let (header_bytes, remaining) = grant.split_at_mut(header_size); payload_header.serialize_into(header_bytes); key_writer(remaining)?; - // We could commit only the used memory. - grant.commit(total_length); - // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. 
if self.sender.is_empty() { @@ -654,6 +620,31 @@ impl<'b> ExtractorBbqueueSender<'b> { } } +/// Try to reserve a frame grant of `total_length` by spin looping +/// on the BBQueue buffer and panics if the receiver has been disconnected. +fn reserve_grant<'b>( + producer: &mut FrameProducer<'b>, + total_length: usize, + sender: &flume::Sender, +) -> FrameGrantW<'b> { + loop { + for _ in 0..10_000 { + match producer.grant(total_length) { + Ok(mut grant) => { + // We could commit only the used memory. + grant.to_commit(total_length); + return grant; + } + Err(bbqueue::Error::InsufficientSize) => continue, + Err(e) => unreachable!("{e:?}"), + } + } + if sender.is_disconnected() { + panic!("channel is disconnected"); + } + } +} + pub enum ExactWordDocids {} pub enum FidWordCountDocids {} pub enum WordDocids {} From bcab61ab1d83738710a69766bf4c3723b1596906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:42:47 +0100 Subject: [PATCH 31/37] Do spurious wake ups on the receiver side --- crates/milli/src/update/new/channel.rs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index e8bb6930c..631fcf74e 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -3,11 +3,12 @@ use std::io::{self, BufWriter}; use std::marker::PhantomData; use std::mem; use std::num::NonZeroU16; +use std::time::Duration; use bbqueue::framed::{FrameGrantR, FrameGrantW, FrameProducer}; use bbqueue::BBBuffer; use bytemuck::{checked, CheckedBitPattern, NoUninit}; -use flume::SendError; +use flume::{RecvTimeoutError, SendError}; use heed::types::Bytes; use heed::BytesDecode; use memmap2::{Mmap, MmapMut}; @@ -136,10 +137,24 @@ impl LargeVectors { } impl<'a> WriterBbqueueReceiver<'a> { + /// Tries to receive an action to do until the timeout occurs + /// and if it does, consider it as a spurious wake up. pub fn recv_action(&mut self) -> Option { - self.receiver.recv().ok() + match self.receiver.recv_timeout(Duration::from_millis(100)) { + Ok(action) => Some(action), + Err(RecvTimeoutError::Timeout) => Some(ReceiverAction::WakeUp), + Err(RecvTimeoutError::Disconnected) => None, + } } + /// Reads all the BBQueue buffers and selects the first available frame. + /// + /// Note: Selecting the first available frame gives preference to + /// frames that will be cleaned up first. It may result in the + /// last frames being more likely to fill up. One potential optimization + /// could involve keeping track of the last processed BBQueue index + /// to cycle through the frames instead of always starting from the + /// beginning. 
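The optimization this comment describes is exactly what PATCH 34 implements further down, with a Cycle over the consumer indices. The core of the trick in miniature: breaking out of a scan leaves the cursor one past the last serviced buffer, so the next scan starts there instead of always favouring buffer zero.

    fn fair_scan_demo() {
        // A cycling cursor over three consumers; each scan visits at
        // most three indices, starting where the previous one stopped.
        let mut look_at = (0..3usize).cycle();

        // Suppose the first scan finds a ready frame at index 1:
        for index in look_at.by_ref().take(3) {
            if index == 1 {
                break;
            }
        }

        // The next scan then begins at index 2, so no buffer starves:
        let order: Vec<usize> = look_at.by_ref().take(3).collect();
        assert_eq!(order, vec![2, 0, 1]);
    }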
pub fn recv_frame(&mut self) -> Option> { for consumer in &mut self.consumers { if let Some(frame) = consumer.read() { From 5e218f3f4daf1594580eb377183770fd4a206a5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 10:44:42 +0100 Subject: [PATCH 32/37] Remove a sync_all (mark my words) --- crates/milli/src/update/new/channel.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 631fcf74e..219f20854 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -489,7 +489,6 @@ impl<'b> ExtractorBbqueueSender<'b> { } let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; - value_file.sync_all()?; let embeddings = unsafe { Mmap::map(&value_file)? }; let large_vectors = LargeVectors { docid, embedder_id, embeddings }; From d5c07ef7b310f8af30a6d5ac0ea2b0da93241709 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 11:02:49 +0100 Subject: [PATCH 33/37] Manage key length conversion error correctly --- crates/milli/src/error.rs | 10 ++-- crates/milli/src/update/new/channel.rs | 53 ++++++++++++++++++---- crates/milli/src/update/new/indexer/mod.rs | 2 +- 3 files changed, 48 insertions(+), 17 deletions(-) diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 800dfa375..a6774a7bd 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -3,6 +3,7 @@ use std::convert::Infallible; use std::fmt::Write; use std::{io, str}; +use bstr::BString; use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use rhai::EvalAltResult; @@ -62,14 +63,9 @@ pub enum InternalError { #[error(transparent)] Store(#[from] MdbError), #[error("Cannot delete {key:?} from database {database_name}: {error}")] - StoreDeletion { database_name: &'static str, key: Box<[u8]>, error: heed::Error }, + StoreDeletion { database_name: &'static str, key: BString, error: heed::Error }, #[error("Cannot insert {key:?} and value with length {value_length} into database {database_name}: {error}")] - StorePut { - database_name: &'static str, - key: Box<[u8]>, - value_length: usize, - error: heed::Error, - }, + StorePut { database_name: &'static str, key: BString, value_length: usize, error: heed::Error }, #[error(transparent)] Utf8(#[from] str::Utf8Error), #[error("An indexation process was explicitly aborted")] diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 219f20854..b0a61bd7f 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -10,7 +10,7 @@ use bbqueue::BBBuffer; use bytemuck::{checked, CheckedBitPattern, NoUninit}; use flume::{RecvTimeoutError, SendError}; use heed::types::Bytes; -use heed::BytesDecode; +use heed::{BytesDecode, MdbError}; use memmap2::{Mmap, MmapMut}; use roaring::RoaringBitmap; @@ -23,7 +23,7 @@ use crate::index::db_name; use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY}; use crate::update::new::KvReaderFieldId; use crate::vector::Embedding; -use crate::{CboRoaringBitmapCodec, DocumentId, Index}; +use crate::{CboRoaringBitmapCodec, DocumentId, Index, InternalError}; /// Creates a tuple of senders/receiver to be used by /// the extractors and the writer loop. 
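On the sync_all deleted by PATCH 32 above: the temporary file only backs a short-lived, read-only mapping consumed by the same process, and mapped reads are served from the page cache whether or not the data ever reached the disk, so durability buys nothing here. The whole spill path for oversized entries reduces to this sketch (the helper name is hypothetical):

    use std::io::{self, BufWriter, Write};

    use memmap2::Mmap;

    // Entries too large for a BBQueue frame are written to an anonymous
    // temporary file and handed to the writer as a read-only mmap.
    fn spill_to_mmap(bytes: &[u8]) -> io::Result<Mmap> {
        let mut file = BufWriter::new(tempfile::tempfile()?);
        file.write_all(bytes)?;
        // into_inner flushes the buffered bytes and returns the File.
        let file = file.into_inner().map_err(|ie| ie.into_error())?;
        // Safety: the file is private to this process and is not written
        // to again, so the mapping cannot change under the reader.
        unsafe { Mmap::map(&file) }
    }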
@@ -524,7 +524,14 @@ impl<'b> ExtractorBbqueueSender<'b> { } fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { - let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length: value.len(), + error: MdbError::BadValSize.into(), + } + })?; self.write_key_value_with(database, key_length, value.len(), |key_buffer, value_buffer| { key_buffer.copy_from_slice(key); value_buffer.copy_from_slice(value); @@ -587,7 +594,13 @@ impl<'b> ExtractorBbqueueSender<'b> { } fn delete_entry(&self, database: Database, key: &[u8]) -> crate::Result<()> { - let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StoreDeletion { + database_name: database.database_name(), + key: key.into(), + error: MdbError::BadValSize.into(), + } + })?; self.delete_entry_with(database, key_length, |buffer| { buffer.copy_from_slice(key); Ok(()) @@ -702,8 +715,15 @@ pub struct WordDocidsSender<'a, 'b, D> { impl WordDocidsSender<'_, '_, D> { pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> { - let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: D::DATABASE.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), + } + })?; self.sender.write_key_value_with( D::DATABASE, key_length, @@ -731,7 +751,6 @@ impl FacetDocidsSender<'_, '_> { let (facet_kind, key) = FacetKind::extract_from_key(key); let database = Database::from(facet_kind); - let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); let value_length = CboRoaringBitmapCodec::serialized_size(bitmap); let value_length = match facet_kind { // We must take the facet group size into account @@ -739,6 +758,14 @@ impl FacetDocidsSender<'_, '_> { FacetKind::Number | FacetKind::String => value_length + 1, FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_length, }; + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), + } + })?; self.sender.write_key_value_with( database, @@ -862,12 +889,20 @@ impl GeoSender<'_, '_> { } pub fn set_geo_faceted(&self, bitmap: &RoaringBitmap) -> crate::Result<()> { - let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(); - let key_length = NonZeroU16::new(key.len().try_into().unwrap()).unwrap(); + let database = Database::Main; let value_length = bitmap.serialized_size(); + let key = GEO_FACETED_DOCUMENTS_IDS_KEY.as_bytes(); + let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { + InternalError::StorePut { + database_name: database.database_name(), + key: key.into(), + value_length, + error: MdbError::BadValSize.into(), + } + })?; self.0.write_key_value_with( - Database::Main, + database, key_length, value_length, |key_buffer, value_buffer| { diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index bd3fedae2..7262c65cb 100644 --- 
a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -432,7 +432,7 @@ where if let Err(error) = database.put(wtxn, &key, &value) { return Err(Error::InternalError(InternalError::StorePut { database_name, - key, + key: bstr::BString::from(&key[..]), value_length: value.len(), error, })); From e9f34fb4b1d8ec674818218009055f21cb2e68e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 2 Dec 2024 11:49:01 +0100 Subject: [PATCH 34/37] Make the frame consumer pulling fair --- crates/milli/src/update/new/channel.rs | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index b0a61bd7f..a2f16983e 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -1,8 +1,10 @@ use std::cell::RefCell; use std::io::{self, BufWriter}; +use std::iter::Cycle; use std::marker::PhantomData; use std::mem; use std::num::NonZeroU16; +use std::ops::Range; use std::time::Duration; use bbqueue::framed::{FrameGrantR, FrameGrantW, FrameProducer}; @@ -64,7 +66,11 @@ pub fn extractor_writer_bbqueue( let (sender, receiver) = flume::bounded(channel_capacity); let sender = ExtractorBbqueueSender { sender, producers, capacity }; - let receiver = WriterBbqueueReceiver { receiver, consumers }; + let receiver = WriterBbqueueReceiver { + receiver, + look_at_consumer: (0..consumers.len()).cycle(), + consumers, + }; (sender, receiver) } @@ -89,6 +95,9 @@ pub struct WriterBbqueueReceiver<'a> { /// any BBQueue buffer or directly sent throught this channel /// (still written to disk). receiver: flume::Receiver, + /// Indicates the consumer to observe. This cycling range + /// ensures fair distribution of work among consumers. + look_at_consumer: Cycle>, /// The BBQueue frames to read when waking-up. consumers: Vec>, } @@ -148,16 +157,9 @@ impl<'a> WriterBbqueueReceiver<'a> { } /// Reads all the BBQueue buffers and selects the first available frame. - /// - /// Note: Selecting the first available frame gives preference to - /// frames that will be cleaned up first. It may result in the - /// last frames being more likely to fill up. One potential optimization - /// could involve keeping track of the last processed BBQueue index - /// to cycle through the frames instead of always starting from the - /// beginning. pub fn recv_frame(&mut self) -> Option> { - for consumer in &mut self.consumers { - if let Some(frame) = consumer.read() { + for index in self.look_at_consumer.by_ref().take(self.consumers.len()) { + if let Some(frame) = self.consumers[index].read() { return Some(FrameWithHeader::from(frame)); } } @@ -511,9 +513,6 @@ impl<'b> ExtractorBbqueueSender<'b> { } } - // We could commit only the used memory. - grant.commit(total_length); - // We only send a wake up message when the channel is empty // so that we don't fill the channel with too many WakeUps. 
        if self.sender.is_empty() {

From 767259be7e5e7c8a69a802ddae9a434e349849e7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 2 Dec 2024 11:53:42 +0100
Subject: [PATCH 35/37] Prefer returning an abort indexation rather than throwing a panic

---
 crates/milli/src/update/new/channel.rs | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs
index a2f16983e..b749eb7fe 100644
--- a/crates/milli/src/update/new/channel.rs
+++ b/crates/milli/src/update/new/channel.rs
@@ -25,7 +25,7 @@ use crate::index::db_name;
 use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY};
 use crate::update::new::KvReaderFieldId;
 use crate::vector::Embedding;
-use crate::{CboRoaringBitmapCodec, DocumentId, Index, InternalError};
+use crate::{CboRoaringBitmapCodec, DocumentId, Error, Index, InternalError};
 
 /// Creates a tuple of senders/receiver to be used by
 /// the extractors and the writer loop.
@@ -454,7 +454,7 @@ impl<'b> ExtractorBbqueueSender<'b> {
         }
 
         // Spin loop to have a frame the size we requested.
-        let mut grant = reserve_grant(&mut producer, total_length, &self.sender);
+        let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?;
         payload_header.serialize_into(&mut grant);
 
         // We only send a wake up message when the channel is empty
@@ -500,7 +500,7 @@ impl<'b> ExtractorBbqueueSender<'b> {
         }
 
         // Spin loop to have a frame the size we requested.
-        let mut grant = reserve_grant(&mut producer, total_length, &self.sender);
+        let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?;
 
         let header_size = payload_header.header_size();
         let (header_bytes, remaining) = grant.split_at_mut(header_size);
@@ -575,7 +575,7 @@ impl<'b> ExtractorBbqueueSender<'b> {
         }
 
         // Spin loop to have a frame the size we requested.
-        let mut grant = reserve_grant(&mut producer, total_length, &self.sender);
+        let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?;
 
         let header_size = payload_header.header_size();
         let (header_bytes, remaining) = grant.split_at_mut(header_size);
@@ -629,7 +629,7 @@ impl<'b> ExtractorBbqueueSender<'b> {
         }
 
         // Spin loop to have a frame the size we requested.
-        let mut grant = reserve_grant(&mut producer, total_length, &self.sender);
+        let mut grant = reserve_grant(&mut producer, total_length, &self.sender)?;
 
         let header_size = payload_header.header_size();
         let (header_bytes, remaining) = grant.split_at_mut(header_size);
@@ -652,21 +652,21 @@ fn reserve_grant<'b>(
     producer: &mut FrameProducer<'b>,
     total_length: usize,
     sender: &flume::Sender<ReceiverAction>,
-) -> FrameGrantW<'b> {
+) -> crate::Result<FrameGrantW<'b>> {
     loop {
         for _ in 0..10_000 {
             match producer.grant(total_length) {
                 Ok(mut grant) => {
                     // We could commit only the used memory.
grant.to_commit(total_length); - return grant; + return Ok(grant); } Err(bbqueue::Error::InsufficientSize) => continue, Err(e) => unreachable!("{e:?}"), } } if sender.is_disconnected() { - panic!("channel is disconnected"); + return Err(Error::InternalError(InternalError::AbortedIndexation)); } } } From d040aff10124c72b4785f295163189943704fb4d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Dec 2024 16:30:14 +0100 Subject: [PATCH 36/37] Stop allocating 1GiB for documents --- crates/meilisearch-types/src/document_formats.rs | 2 +- crates/milli/src/update/new/indexer/document_changes.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch-types/src/document_formats.rs b/crates/meilisearch-types/src/document_formats.rs index 096349448..008be4022 100644 --- a/crates/meilisearch-types/src/document_formats.rs +++ b/crates/meilisearch-types/src/document_formats.rs @@ -214,7 +214,7 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result { // We memory map to be able to deserialize into a RawMap that // does not allocate when possible and only materialize the first/top level. let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? }; - let mut doc_alloc = Bump::with_capacity(1024 * 1024 * 1024); // 1MiB + let mut doc_alloc = Bump::with_capacity(1024 * 1024); // 1MiB let mut out = BufWriter::new(output); let mut deserializer = serde_json::Deserializer::from_slice(&input); diff --git a/crates/milli/src/update/new/indexer/document_changes.rs b/crates/milli/src/update/new/indexer/document_changes.rs index bfb369680..2a5c25525 100644 --- a/crates/milli/src/update/new/indexer/document_changes.rs +++ b/crates/milli/src/update/new/indexer/document_changes.rs @@ -70,7 +70,7 @@ impl< F: FnOnce(&'extractor Bump) -> Result, { let doc_alloc = - doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024)))); + doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024)))); let doc_alloc = doc_alloc.0.take(); let fields_ids_map = fields_ids_map_store .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into()); From e905a72d731f0e6dc581d9b7ad02b94e594aa94e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Dec 2024 18:13:56 +0100 Subject: [PATCH 37/37] remove mimalloc on Windows --- crates/benchmarks/benches/indexing.rs | 1 + crates/benchmarks/benches/search_geo.rs | 1 + crates/benchmarks/benches/search_songs.rs | 1 + crates/benchmarks/benches/search_wiki.rs | 1 + crates/meilisearch/src/main.rs | 4 ++-- crates/milli/src/lib.rs | 1 + 6 files changed, 7 insertions(+), 2 deletions(-) diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index d3f307be3..870e56686 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -16,6 +16,7 @@ use rand::seq::SliceRandom; use rand_chacha::rand_core::SeedableRng; use roaring::RoaringBitmap; +#[cfg(not(windows))] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/benchmarks/benches/search_geo.rs b/crates/benchmarks/benches/search_geo.rs index faea4e3e0..72503ce57 100644 --- a/crates/benchmarks/benches/search_geo.rs +++ b/crates/benchmarks/benches/search_geo.rs @@ -5,6 +5,7 @@ use criterion::{criterion_group, criterion_main}; use milli::update::Settings; use utils::Conf; +#[cfg(not(windows))] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/benchmarks/benches/search_songs.rs 
b/crates/benchmarks/benches/search_songs.rs index a1245528f..bef014a0e 100644 --- a/crates/benchmarks/benches/search_songs.rs +++ b/crates/benchmarks/benches/search_songs.rs @@ -5,6 +5,7 @@ use criterion::{criterion_group, criterion_main}; use milli::update::Settings; use utils::Conf; +#[cfg(not(windows))] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/benchmarks/benches/search_wiki.rs b/crates/benchmarks/benches/search_wiki.rs index b792c2645..24eb5c8d1 100644 --- a/crates/benchmarks/benches/search_wiki.rs +++ b/crates/benchmarks/benches/search_wiki.rs @@ -5,6 +5,7 @@ use criterion::{criterion_group, criterion_main}; use milli::update::Settings; use utils::Conf; +#[cfg(not(windows))] #[global_allocator] static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; diff --git a/crates/meilisearch/src/main.rs b/crates/meilisearch/src/main.rs index c0652bf1e..b4b46bec4 100644 --- a/crates/meilisearch/src/main.rs +++ b/crates/meilisearch/src/main.rs @@ -20,14 +20,14 @@ use meilisearch::{ LogStderrType, Opt, SubscriberForSecondLayer, }; use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE}; -use mimalloc::MiMalloc; use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; use tracing::level_filters::LevelFilter; use tracing_subscriber::layer::SubscriberExt as _; use tracing_subscriber::Layer; +#[cfg(not(windows))] #[global_allocator] -static ALLOC: MiMalloc = MiMalloc; +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; fn default_log_route_layer() -> LogRouteType { None.with_filter(tracing_subscriber::filter::Targets::new().with_target("", LevelFilter::OFF)) diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 48b03b6cc..1fc876f79 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -1,6 +1,7 @@ #![cfg_attr(all(test, fuzzing), feature(no_coverage))] #![allow(clippy::type_complexity)] +#[cfg(not(windows))] #[cfg(test)] #[global_allocator] pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
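A closing note on PATCH 37: one cfg attribute placed above the others conditionally compiles the entire item, so on Windows neither the global_allocator attribute nor the static exists and the binary falls back to the system allocator, with no other code change. The final shape, runnable as-is with a mimalloc dependency:

    #[cfg(not(windows))]
    #[global_allocator]
    static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;

    fn main() {
        // Served by mimalloc on non-Windows targets and by the system
        // allocator on Windows; the code itself is identical.
        let sizes: Vec<usize> = (0..4).map(|shift| 1usize << shift).collect();
        println!("{sizes:?}");
    }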