Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-11-03 17:36:29 +00:00)
	Only run word pair proximity docids extraction if proximity_precision enables it
commit ad52c950ba
parent 3d29226a7f
committed by Clément Renault
Cargo.lock (generated): 1470 changes
File diff suppressed because it is too large.
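The commit's point is to skip an entire extraction pass when the index's proximityPrecision setting makes its output useless. A minimal, hypothetical sketch of that guard (the enum variants mirror Meilisearch's byWord/byAttribute setting values; the function names and call site are illustrative, not the actual milli code):

#[derive(Clone, Copy, PartialEq, Eq)]
enum ProximityPrecision {
    ByWord,      // full word-pair proximity ranking
    ByAttribute, // cheaper approximation, no word pair docids needed
}

fn run_extractions(proximity_precision: ProximityPrecision) {
    // Word pair proximity docids are only consumed by byWord ranking,
    // so the whole (expensive) extraction is skipped otherwise.
    if proximity_precision == ProximityPrecision::ByWord {
        extract_word_pair_proximity_docids();
    }
}

fn extract_word_pair_proximity_docids() {
    // ... expensive pass over all documents ...
}

fn main() {
    run_extractions(ProximityPrecision::ByAttribute); // skips the extraction
}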
@@ -33,13 +33,11 @@ use meilisearch_types::milli::heed::CompactionOption;
 use meilisearch_types::milli::update::new::indexer::{
     self, retrieve_or_guess_primary_key, UpdateByFunction,
 };
-use meilisearch_types::milli::update::{
-    IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
-};
+use meilisearch_types::milli::update::{IndexDocumentsMethod, Settings as MilliSettings};
 use meilisearch_types::milli::vector::parsed_vectors::{
     ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME,
 };
-use meilisearch_types::milli::{self, Filter, Object};
+use meilisearch_types::milli::{self, Filter};
 use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
 use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task};
 use meilisearch_types::{compression, Index, VERSION_FILE_NAME};
@@ -50,7 +48,7 @@ use uuid::Uuid;
 
 use crate::autobatcher::{self, BatchKind};
 use crate::utils::{self, swap_index_uid_in_task};
-use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId};
+use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId};
 
 /// Represents a combination of tasks that can all be processed at the same time.
 ///
@@ -11,6 +11,6 @@ edition.workspace = true
 license.workspace = true
 
 [dependencies]
-insta = { version = "^1.39.0", features = ["json", "redactions"] }
+insta = { version = "=1.39.0", features = ["json", "redactions"] }
 md5 = "0.7.0"
 once_cell = "1.19"
@@ -94,10 +94,12 @@ tracing = "0.1.40"
 ureq = { version = "2.10.0", features = ["json"] }
 url = "2.5.2"
 rayon-par-bridge = "0.1.0"
-hashbrown = "0.14.5"
+hashbrown = "0.15.0"
 raw-collections = { git = "https://github.com/dureuill/raw-collections.git", version = "0.1.0" }
 bumpalo = "3.16.0"
 thread_local = "1.1.8"
+allocator-api2 = "0.2.18"
+rustc-hash = "2.0.0"
 
 [dev-dependencies]
 mimalloc = { version = "0.1.43", default-features = false }
@@ -83,6 +83,8 @@ pub fn writer_into_reader(
     grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
 }
 
+/// # Safety
+/// We use memory mapping inside. So, according to the Rust community, it's unsafe.
 pub unsafe fn as_cloneable_grenad(
     reader: &grenad::Reader<BufReader<File>>,
 ) -> Result<grenad::Reader<CursorClonableMmap>> {
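The added `# Safety` section documents why the function is `unsafe`: it memory-maps the underlying file, so the file must not be mutated or truncated while readers are alive. A self-contained sketch of that contract with memmap2 directly (the tempfile crate is an assumption for illustration; this is not the grenad call site):

use std::io::Write;

fn main() -> std::io::Result<()> {
    let mut file = tempfile::tempfile()?;
    file.write_all(b"finished, immutable payload")?;

    // Safety: we keep `file` private and never mutate or truncate it while
    // the mapping is alive, which is exactly what the new `# Safety`
    // section asks of `as_cloneable_grenad` callers.
    let mmap = unsafe { memmap2::Mmap::map(&file)? };
    let shared = std::sync::Arc::new(mmap);
    let clone = std::sync::Arc::clone(&shared); // cheap handle clone, not a byte copy
    assert_eq!(&clone[..], &shared[..]);
    Ok(())
}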
@@ -1,42 +1,33 @@
-use std::fs::File;
 use std::marker::PhantomData;
+use std::sync::atomic::{AtomicUsize, Ordering};
 
 use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
-use grenad::Merger;
 use heed::types::Bytes;
-use memmap2::Mmap;
 use roaring::RoaringBitmap;
 
 use super::extract::FacetKind;
 use super::StdResult;
-use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
+use crate::index::main_key::DOCUMENTS_IDS_KEY;
 use crate::update::new::KvReaderFieldId;
-use crate::update::MergeDeladdCboRoaringBitmaps;
 use crate::{DocumentId, Index};
 
 /// The capacity of the channel is currently in number of messages.
-pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) {
+pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) {
     let (sender, receiver) = crossbeam_channel::bounded(cap);
     (
-        MergerSender {
+        ExtractorSender {
             sender,
             send_count: Default::default(),
             writer_contentious_count: Default::default(),
-            merger_contentious_count: Default::default(),
+            extractor_contentious_count: Default::default(),
         },
         WriterReceiver(receiver),
     )
 }
 
-/// The capacity of the channel is currently in number of messages.
-pub fn extractors_merger_channels(cap: usize) -> (ExtractorSender, MergerReceiver) {
-    let (sender, receiver) = crossbeam_channel::bounded(cap);
-    (ExtractorSender(sender), MergerReceiver(receiver))
-}
-
-pub enum KeyValueEntry {
-    SmallInMemory { key_length: usize, data: Box<[u8]> },
-    LargeOnDisk { key: Box<[u8]>, value: Mmap },
+pub struct KeyValueEntry {
+    pub key_length: usize,
+    pub data: Box<[u8]>,
 }
 
 impl KeyValueEntry {
@@ -44,32 +35,22 @@ impl KeyValueEntry {
         let mut data = Vec::with_capacity(key.len() + value.len());
         data.extend_from_slice(key);
         data.extend_from_slice(value);
-        KeyValueEntry::SmallInMemory { key_length: key.len(), data: data.into_boxed_slice() }
+        KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() }
     }
 
     pub fn from_small_key_bitmap(key: &[u8], bitmap: RoaringBitmap) -> Self {
         let mut data = Vec::with_capacity(key.len() + bitmap.serialized_size());
         data.extend_from_slice(key);
         bitmap.serialize_into(&mut data).unwrap();
-        KeyValueEntry::SmallInMemory { key_length: key.len(), data: data.into_boxed_slice() }
-    }
-
-    pub fn from_large_key_value(key: &[u8], value: Mmap) -> Self {
-        KeyValueEntry::LargeOnDisk { key: key.to_vec().into_boxed_slice(), value }
+        KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() }
     }
 
     pub fn key(&self) -> &[u8] {
-        match self {
-            KeyValueEntry::SmallInMemory { key_length, data } => &data.as_ref()[..*key_length],
-            KeyValueEntry::LargeOnDisk { key, value: _ } => key.as_ref(),
-        }
+        &self.data[..self.key_length]
    }
 
     pub fn value(&self) -> &[u8] {
-        match self {
-            KeyValueEntry::SmallInMemory { key_length, data } => &data.as_ref()[*key_length..],
-            KeyValueEntry::LargeOnDisk { key: _, value } => value.as_ref(),
-        }
+        &self.data[self.key_length..]
     }
 }
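With the `LargeOnDisk`/mmap variant gone, a `KeyValueEntry` is just one contiguous buffer holding the key immediately followed by the value, split on demand by `key_length`. A standalone sketch of that layout with a usage check (a rewrite for illustration, not the module itself):

struct KeyValueEntry {
    key_length: usize,
    data: Box<[u8]>, // key bytes immediately followed by value bytes
}

impl KeyValueEntry {
    fn from_small_key_value(key: &[u8], value: &[u8]) -> Self {
        // One allocation for both parts; no per-field boxes.
        let mut data = Vec::with_capacity(key.len() + value.len());
        data.extend_from_slice(key);
        data.extend_from_slice(value);
        KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() }
    }

    fn key(&self) -> &[u8] {
        &self.data[..self.key_length]
    }

    fn value(&self) -> &[u8] {
        &self.data[self.key_length..]
    }
}

fn main() {
    let entry = KeyValueEntry::from_small_key_value(b"word", &[1, 2]);
    assert_eq!(entry.key(), b"word");
    assert_eq!(entry.value(), &[1, 2]);
}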
@@ -92,37 +73,6 @@ pub enum EntryOperation {
     Write(KeyValueEntry),
 }
 
-pub struct DocumentEntry {
-    docid: DocumentId,
-    content: Box<[u8]>,
-}
-
-impl DocumentEntry {
-    pub fn new_uncompressed(docid: DocumentId, content: Box<KvReaderFieldId>) -> Self {
-        DocumentEntry { docid, content: content.into() }
-    }
-
-    pub fn new_compressed(docid: DocumentId, content: Box<[u8]>) -> Self {
-        DocumentEntry { docid, content }
-    }
-
-    pub fn key(&self) -> [u8; 4] {
-        self.docid.to_be_bytes()
-    }
-
-    pub fn content(&self) -> &[u8] {
-        &self.content
-    }
-}
-
-pub struct DocumentDeletionEntry(DocumentId);
-
-impl DocumentDeletionEntry {
-    pub fn key(&self) -> [u8; 4] {
-        self.0.to_be_bytes()
-    }
-}
-
 pub struct WriterOperation {
     database: Database,
     entry: EntryOperation,
@@ -206,34 +156,32 @@ impl IntoIterator for WriterReceiver {
     }
 }
 
-pub struct MergerSender {
+pub struct ExtractorSender {
     sender: Sender<WriterOperation>,
-    /// The number of message we send in total in the channel.
-    send_count: std::cell::Cell<usize>,
+    /// The number of message we sent in total in the channel.
+    send_count: AtomicUsize,
     /// The number of times we sent something in a channel that was full.
-    writer_contentious_count: std::cell::Cell<usize>,
+    writer_contentious_count: AtomicUsize,
     /// The number of times we sent something in a channel that was empty.
-    merger_contentious_count: std::cell::Cell<usize>,
+    extractor_contentious_count: AtomicUsize,
 }
 
-impl Drop for MergerSender {
+impl Drop for ExtractorSender {
     fn drop(&mut self) {
+        let send_count = *self.send_count.get_mut();
+        let writer_contentious_count = *self.writer_contentious_count.get_mut();
+        let extractor_contentious_count = *self.extractor_contentious_count.get_mut();
         eprintln!(
-            "Merger channel stats: {} sends, {} writer contentions ({}%), {} merger contentions ({}%)",
-            self.send_count.get(),
-            self.writer_contentious_count.get(),
-            (self.writer_contentious_count.get() as f32 / self.send_count.get() as f32) * 100.0,
-            self.merger_contentious_count.get(),
-            (self.merger_contentious_count.get() as f32 / self.send_count.get() as f32) * 100.0
+            "Extractor channel stats: {send_count} sends, \
+            {writer_contentious_count} writer contentions ({}%), \
+            {extractor_contentious_count} extractor contentions ({}%)",
+            (writer_contentious_count as f32 / send_count as f32) * 100.0,
+            (extractor_contentious_count as f32 / send_count as f32) * 100.0
        )
     }
 }
 
-impl MergerSender {
-    pub fn main(&self) -> MainSender<'_> {
-        MainSender(self)
-    }
-
+impl ExtractorSender {
     pub fn docids<D: DatabaseType>(&self) -> WordDocidsSender<'_, D> {
         WordDocidsSender { sender: self, _marker: PhantomData }
     }
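Moving the counters from `std::cell::Cell<usize>` to `AtomicUsize` is what lets the renamed `ExtractorSender` be shared across extractor threads: `Cell` is not `Sync`, atomics are. A minimal sketch of the two access modes the diff uses, shared `fetch_add` on the hot path and exclusive `get_mut` in `Drop` (illustrative type, not the real sender):

use std::sync::atomic::{AtomicUsize, Ordering};

struct Stats {
    send_count: AtomicUsize,
}

impl Stats {
    fn record_send(&self) {
        // Shared access (&self) is fine: the atomic synchronizes internally.
        self.send_count.fetch_add(1, Ordering::SeqCst);
    }
}

impl Drop for Stats {
    fn drop(&mut self) {
        // Drop has exclusive access, so get_mut reads without an atomic op.
        let sends = *self.send_count.get_mut();
        eprintln!("{sends} sends");
    }
}

fn main() {
    let stats = Stats { send_count: AtomicUsize::new(0) };
    stats.record_send();
    stats.record_send();
} // Drop prints: 2 sends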
@@ -263,12 +211,12 @@ impl MergerSender {
 
     fn send(&self, op: WriterOperation) -> StdResult<(), SendError<()>> {
         if self.sender.is_full() {
-            self.writer_contentious_count.set(self.writer_contentious_count.get() + 1);
+            self.writer_contentious_count.fetch_add(1, Ordering::SeqCst);
         }
         if self.sender.is_empty() {
-            self.merger_contentious_count.set(self.merger_contentious_count.get() + 1);
+            self.extractor_contentious_count.fetch_add(1, Ordering::SeqCst);
         }
-        self.send_count.set(self.send_count.get() + 1);
+        self.send_count.fetch_add(1, Ordering::SeqCst);
         match self.sender.send(op) {
             Ok(()) => Ok(()),
             Err(SendError(_)) => Err(SendError(())),
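The contention counters are heuristics sampled just before each send: a full channel means the writer side is the bottleneck, an empty one means the extractors are. A standalone sketch of that probing pattern on a bounded crossbeam channel (a toy program, not the indexer):

use std::sync::atomic::{AtomicUsize, Ordering};

fn main() {
    let (sender, receiver) = crossbeam_channel::bounded::<u32>(1);
    let writer_contentious_count = AtomicUsize::new(0);

    let drain = std::thread::spawn(move || for _ in receiver {});

    for n in 0..100 {
        // If the channel is already full, this send is about to block:
        // record it as one writer contention before sending.
        if sender.is_full() {
            writer_contentious_count.fetch_add(1, Ordering::SeqCst);
        }
        sender.send(n).unwrap();
    }
    drop(sender); // closes the channel so the drain thread exits
    drain.join().unwrap();

    eprintln!("{} writer contentions", writer_contentious_count.load(Ordering::SeqCst));
}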
@@ -276,129 +224,48 @@ impl MergerSender {
     }
 }
 
-pub struct MainSender<'a>(&'a MergerSender);
-
-impl MainSender<'_> {
-    pub fn write_words_fst(&self, value: Mmap) -> StdResult<(), SendError<()>> {
-        let entry = EntryOperation::Write(KeyValueEntry::from_large_key_value(
-            WORDS_FST_KEY.as_bytes(),
-            value,
-        ));
-        match self.0.send(WriterOperation { database: Database::Main, entry }) {
-            Ok(()) => Ok(()),
-            Err(SendError(_)) => Err(SendError(())),
-        }
-    }
-
-    pub fn write_words_prefixes_fst(&self, value: Mmap) -> StdResult<(), SendError<()>> {
-        let entry = EntryOperation::Write(KeyValueEntry::from_large_key_value(
-            WORDS_PREFIXES_FST_KEY.as_bytes(),
-            value,
-        ));
-        match self.0.send(WriterOperation { database: Database::Main, entry }) {
-            Ok(()) => Ok(()),
-            Err(SendError(_)) => Err(SendError(())),
-        }
-    }
-
-    pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
-        let entry = EntryOperation::Delete(KeyEntry::from_key(key));
-        match self.0.send(WriterOperation { database: Database::Main, entry }) {
-            Ok(()) => Ok(()),
-            Err(SendError(_)) => Err(SendError(())),
-        }
-    }
-}
-
 pub enum ExactWordDocids {}
 pub enum FidWordCountDocids {}
 pub enum WordDocids {}
 pub enum WordFidDocids {}
 pub enum WordPairProximityDocids {}
 pub enum WordPositionDocids {}
-pub enum FacetDocids {}
 
 pub trait DatabaseType {
     const DATABASE: Database;
 }
 
-pub trait MergerOperationType {
-    fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation;
-}
-
 impl DatabaseType for ExactWordDocids {
     const DATABASE: Database = Database::ExactWordDocids;
 }
 
-impl MergerOperationType for ExactWordDocids {
-    fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
-        MergerOperation::ExactWordDocidsMerger(merger)
-    }
-}
-
 impl DatabaseType for FidWordCountDocids {
     const DATABASE: Database = Database::FidWordCountDocids;
 }
 
-impl MergerOperationType for FidWordCountDocids {
-    fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
-        MergerOperation::FidWordCountDocidsMerger(merger)
-    }
-}
-
 impl DatabaseType for WordDocids {
     const DATABASE: Database = Database::WordDocids;
 }
 
-impl MergerOperationType for WordDocids {
-    fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
-        MergerOperation::WordDocidsMerger(merger)
-    }
-}
-
 impl DatabaseType for WordFidDocids {
     const DATABASE: Database = Database::WordFidDocids;
 }
 
-impl MergerOperationType for WordFidDocids {
-    fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
-        MergerOperation::WordFidDocidsMerger(merger)
-    }
-}
-
 impl DatabaseType for WordPairProximityDocids {
     const DATABASE: Database = Database::WordPairProximityDocids;
 }
 
-impl MergerOperationType for WordPairProximityDocids {
-    fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
-        MergerOperation::WordPairProximityDocidsMerger(merger)
-    }
-}
-
 impl DatabaseType for WordPositionDocids {
     const DATABASE: Database = Database::WordPositionDocids;
 }
 
-impl MergerOperationType for WordPositionDocids {
-    fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
-        MergerOperation::WordPositionDocidsMerger(merger)
-    }
-}
-
-impl MergerOperationType for FacetDocids {
-    fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
-        MergerOperation::FacetDocidsMerger(merger)
-    }
-}
-
 pub trait DocidsSender {
     fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>;
     fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>;
 }
 
 pub struct WordDocidsSender<'a, D> {
-    sender: &'a MergerSender,
+    sender: &'a ExtractorSender,
     _marker: PhantomData<D>,
 }
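The surviving `ExactWordDocids`, `WordDocids`, etc. are uninhabited marker types: they carry no data and exist only so `DatabaseType::DATABASE` can select the target database at compile time, as in `sender.docids::<WordDocids>()`. A minimal standalone sketch of the pattern (the `Database` enum here is a stand-in for the real one):

#[derive(Debug, Clone, Copy)]
enum Database {
    WordDocids,
    WordPositionDocids,
}

trait DatabaseType {
    const DATABASE: Database;
}

// Uninhabited enums: impossible to construct, usable only as type-level tags.
enum WordDocids {}
enum WordPositionDocids {}

impl DatabaseType for WordDocids {
    const DATABASE: Database = Database::WordDocids;
}
impl DatabaseType for WordPositionDocids {
    const DATABASE: Database = Database::WordPositionDocids;
}

fn write_to<D: DatabaseType>(key: &[u8]) {
    // The destination is resolved at compile time from the type parameter.
    println!("writing {key:?} to {:?}", D::DATABASE);
}

fn main() {
    write_to::<WordDocids>(b"house");
    write_to::<WordPositionDocids>(b"house");
}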
@@ -421,7 +288,7 @@ impl<D: DatabaseType> DocidsSender for WordDocidsSender<'_, D> {
 }
 
 pub struct FacetDocidsSender<'a> {
-    sender: &'a MergerSender,
+    sender: &'a ExtractorSender,
 }
 
 impl DocidsSender for FacetDocidsSender<'_> {
@@ -456,7 +323,7 @@ impl DocidsSender for FacetDocidsSender<'_> {
 }
 
 pub struct FacetSearchableSender<'a> {
-    sender: &'a MergerSender,
+    sender: &'a ExtractorSender,
 }
 
 impl FacetSearchableSender<'_> {
@@ -481,25 +348,9 @@ impl FacetSearchableSender<'_> {
             Err(SendError(_)) => Err(SendError(())),
         }
     }
-
-    pub fn write_fst(&self, key: &[u8], value: Mmap) -> StdResult<(), SendError<()>> {
-        let entry = EntryOperation::Write(KeyValueEntry::from_large_key_value(key, value));
-        match self.sender.send(WriterOperation { database: Database::FacetIdStringFst, entry }) {
-            Ok(()) => Ok(()),
-            Err(SendError(_)) => Err(SendError(())),
-        }
-    }
-
-    pub fn delete_fst(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
-        let entry = EntryOperation::Delete(KeyEntry::from_key(key));
-        match self.sender.send(WriterOperation { database: Database::FacetIdStringFst, entry }) {
-            Ok(()) => Ok(()),
-            Err(SendError(_)) => Err(SendError(())),
-        }
-    }
 }
 
-pub struct DocumentsSender<'a>(&'a MergerSender);
+pub struct DocumentsSender<'a>(&'a ExtractorSender);
 
 impl DocumentsSender<'_> {
     /// TODO do that efficiently
@@ -542,86 +393,3 @@ impl DocumentsSender<'_> {
         }
     }
 }
-
-pub enum MergerOperation {
-    ExactWordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
-    FidWordCountDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
-    WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
-    WordFidDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
-    WordPairProximityDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
-    WordPositionDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
-    FacetDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
-    DeleteDocument { docid: DocumentId, external_id: String },
-    InsertDocument { docid: DocumentId, external_id: String, document: Box<KvReaderFieldId> },
-    FinishedDocument,
-}
-
-pub struct MergerReceiver(Receiver<MergerOperation>);
-
-impl IntoIterator for MergerReceiver {
-    type Item = MergerOperation;
-    type IntoIter = IntoIter<Self::Item>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        self.0.into_iter()
-    }
-}
-
-pub struct ExtractorSender(Sender<MergerOperation>);
-
-impl ExtractorSender {
-    pub fn document_sender(&self) -> DocumentSender<'_> {
-        DocumentSender(Some(&self.0))
-    }
-
-    pub fn send_searchable<D: MergerOperationType>(
-        &self,
-        merger: Merger<File, MergeDeladdCboRoaringBitmaps>,
-    ) -> StdResult<(), SendError<()>> {
-        match self.0.send(D::new_merger_operation(merger)) {
-            Ok(()) => Ok(()),
-            Err(SendError(_)) => Err(SendError(())),
-        }
-    }
-}
-
-pub struct DocumentSender<'a>(Option<&'a Sender<MergerOperation>>);
-
-impl DocumentSender<'_> {
-    pub fn insert(
-        &self,
-        docid: DocumentId,
-        external_id: String,
-        document: Box<KvReaderFieldId>,
-    ) -> StdResult<(), SendError<()>> {
-        let sender = self.0.unwrap();
-        match sender.send(MergerOperation::InsertDocument { docid, external_id, document }) {
-            Ok(()) => Ok(()),
-            Err(SendError(_)) => Err(SendError(())),
-        }
-    }
-
-    pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> {
-        let sender = self.0.unwrap();
-        match sender.send(MergerOperation::DeleteDocument { docid, external_id }) {
-            Ok(()) => Ok(()),
-            Err(SendError(_)) => Err(SendError(())),
-        }
-    }
-
-    pub fn finish(mut self) -> StdResult<(), SendError<()>> {
-        let sender = self.0.take().unwrap();
-        match sender.send(MergerOperation::FinishedDocument) {
-            Ok(()) => Ok(()),
-            Err(SendError(_)) => Err(SendError(())),
-        }
-    }
-}
-
-impl Drop for DocumentSender<'_> {
-    fn drop(&mut self) {
-        if let Some(sender) = self.0.take() {
-            let _ = sender.send(MergerOperation::FinishedDocument);
-        }
-    }
-}
@@ -1,299 +1,611 @@
-use std::fmt::Write as _;
-use std::mem;
-use std::num::NonZeroUsize;
+//! # How the Merge Algorithm works
+//!
+//! Each extractor creates #Threads caches and balances the entries
+//! based on the hash of the keys. To do that we can use the
+//! hashbrown::hash_map::RawEntryBuilderMut::from_key_hashed_nocheck.
+//! This way we can compute the hash on our own, decide on the cache to
+//! target, and insert it into the right HashMap.
+//!
+//! #Thread  -> caches
+//! t1       -> [t1c1, t1c2, t1c3]
+//! t2       -> [t2c1, t2c2, t2c3]
+//! t3       -> [t3c1, t3c2, t3c3]
+//!
+//! When the extractors are done filling the caches, we want to merge
+//! the content of all the caches. We do a transpose and each thread is
+//! assigned the associated cache. By doing that we know that every key
+//! is put in a known cache and will collide with keys in the other
+//! caches of the other threads.
+//!
+//! #Thread  -> caches
+//! t1       -> [t1c1, t2c1, t3c1]
+//! t2       -> [t1c2, t2c2, t3c2]
+//! t3       -> [t1c3, t2c3, t3c3]
+//!
+//! When we encounter a miss in the other caches we must still try
+//! to find it in the spilled entries. This is the reason why we use
+//! a grenad sorter/reader so that we can seek "efficiently" for a key.
+//!
+//! ## More Detailed Algorithm
+//!
+//! Each sub-cache has an in-memory HashMap and some spilled
+//! lexicographically ordered entries on disk (grenad). We first iterate
+//! over the spilled entries of all the caches at once by using a merge
+//! join algorithm. This algorithm will merge the entries by using its
+//! merge function.
+//!
+//! Every time a merged entry is emitted by the merge join algorithm we also
+//! fetch the value from the other in-memory caches (HashMaps) to finish
+//! the merge. Every time we retrieve an entry from the in-memory caches
+//! we mark them with a tombstone for later.
+//!
+//! Once we are done with the spilled entries we iterate over the in-memory
+//! HashMaps. We iterate over the first one, retrieve the content from the
+//! other ones and mark them with a tombstone again. We also make sure
+//! to ignore the dead (tombstoned) ones.
+//!
+//! ## Memory Control
+//!
+//! We can detect that there is no more memory available when the
+//! bump allocator reaches a threshold. When this is the case we
+//! freeze the cache. There is one bump allocator per thread and the
+//! memory must be well balanced as we manage one type of extraction
+//! at a time with well-balanced documents.
+//!
+//! It means that the unknown new keys added to the
+//! cache are directly spilled to disk: basically a key followed by a
+//! del/add bitmap. For the known keys we can keep modifying them in
+//! the materialized version in the cache: update the del/add bitmaps.
+//!
+//! For now we can use a grenad sorter for spilling even though I think
+//! it's not the most efficient way (too many files open, sorting entries).
+
+use std::cmp::Ordering;
+use std::collections::binary_heap::PeekMut;
+use std::collections::BinaryHeap;
+use std::fs::File;
+use std::hash::BuildHasher;
+use std::io::BufReader;
+use std::{io, iter, mem};
 
-use grenad::{MergeFunction, Sorter};
-use roaring::bitmap::Statistics;
+use bumpalo::Bump;
+use grenad::ReaderCursor;
+use hashbrown::hash_map::RawEntryMut;
+use hashbrown::HashMap;
+use raw_collections::map::FrozenMap;
 use roaring::RoaringBitmap;
-use smallvec::SmallVec;
+use rustc_hash::FxBuildHasher;
 
-use super::lru::Lru;
 use crate::update::del_add::{DelAdd, KvWriterDelAdd};
-use crate::CboRoaringBitmapCodec;
+use crate::update::new::indexer::document_changes::MostlySend;
+use crate::update::new::KvReaderDelAdd;
+use crate::update::MergeDeladdCboRoaringBitmaps;
+use crate::{CboRoaringBitmapCodec, Result};
 
-const KEY_SIZE: usize = 12;
+/// A cache that stores byte keys associated to CboDelAddRoaringBitmaps.
+///
+/// Internally balances the content over `N` buckets for future merging.
+pub struct BalancedCaches<'extractor> {
+    hasher: FxBuildHasher,
+    alloc: &'extractor Bump,
+    max_memory: Option<usize>,
+    caches: InnerCaches<'extractor>,
+}
 
-#[derive(Debug)]
-pub struct CboCachedSorter<MF> {
-    cache: Lru<SmallVec<[u8; KEY_SIZE]>, DelAddRoaringBitmap>,
-    sorter: Sorter<MF>,
+enum InnerCaches<'extractor> {
+    Normal(NormalCaches<'extractor>),
+    Spilling(SpillingCaches<'extractor>),
+}
+
+impl<'extractor> BalancedCaches<'extractor> {
+    pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
+        Self {
+            hasher: FxBuildHasher,
+            max_memory,
+            caches: InnerCaches::Normal(NormalCaches {
+                caches: iter::repeat_with(|| HashMap::with_hasher_in(FxBuildHasher, alloc))
+                    .take(buckets)
+                    .collect(),
+            }),
+            alloc,
+        }
+    }
+
+    fn buckets(&self) -> usize {
+        match &self.caches {
+            InnerCaches::Normal(caches) => caches.caches.len(),
+            InnerCaches::Spilling(caches) => caches.caches.len(),
+        }
+    }
+
+    pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> Result<()> {
+        if self.max_memory.map_or(false, |mm| self.alloc.allocated_bytes() >= mm) {
+            self.start_spilling()?;
+        }
+
+        let buckets = self.buckets();
+        match &mut self.caches {
+            InnerCaches::Normal(normal) => {
+                normal.insert_del_u32(&self.hasher, self.alloc, buckets, key, n);
+                Ok(())
+            }
+            InnerCaches::Spilling(spilling) => {
+                spilling.insert_del_u32(&self.hasher, buckets, key, n)
+            }
+        }
+    }
+
+    pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> Result<()> {
+        if self.max_memory.map_or(false, |mm| self.alloc.allocated_bytes() >= mm) {
+            self.start_spilling()?;
+        }
+
+        let buckets = self.buckets();
+        match &mut self.caches {
+            InnerCaches::Normal(normal) => {
+                normal.insert_add_u32(&self.hasher, self.alloc, buckets, key, n);
+                Ok(())
+            }
+            InnerCaches::Spilling(spilling) => {
+                spilling.insert_add_u32(&self.hasher, buckets, key, n)
+            }
+        }
+    }
+
+    /// Make sure the cache is no longer allocating data
+    /// and writes every new and unknown entry to disk.
+    fn start_spilling(&mut self) -> Result<()> {
+        let BalancedCaches { hasher: _, alloc, max_memory: _, caches } = self;
+
+        if let InnerCaches::Normal(normal_caches) = caches {
+            eprintln!(
+                "We are spilling after we allocated {} bytes on thread #{}",
+                alloc.allocated_bytes(),
+                rayon::current_thread_index().unwrap_or(0)
+            );
+
+            let allocated: usize = normal_caches.caches.iter().map(|m| m.allocation_size()).sum();
+            eprintln!("The last allocated HashMap took {allocated} bytes");
+
+            let dummy = NormalCaches { caches: Vec::new() };
+            let NormalCaches { caches: cache_maps } = mem::replace(normal_caches, dummy);
+            *caches = InnerCaches::Spilling(SpillingCaches::from_cache_maps(cache_maps));
+        }
+
+        Ok(())
+    }
+
+    pub fn freeze(&mut self) -> Result<Vec<FrozenCache<'_, 'extractor>>> {
+        match &mut self.caches {
+            InnerCaches::Normal(NormalCaches { caches }) => caches
+                .iter_mut()
+                .enumerate()
+                .map(|(bucket, map)| {
+                    Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() })
+                })
+                .collect(),
+            InnerCaches::Spilling(SpillingCaches { caches, spilled_entries, .. }) => caches
+                .iter_mut()
+                .zip(mem::take(spilled_entries))
+                .enumerate()
+                .map(|(bucket, (map, sorter))| {
+                    let spilled = sorter
+                        .into_reader_cursors()?
+                        .into_iter()
+                        .map(ReaderCursor::into_inner)
+                        .map(BufReader::new)
+                        .map(|bufreader| grenad::Reader::new(bufreader).map_err(Into::into))
+                        .collect::<Result<_>>()?;
+                    Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled })
+                })
+                .collect(),
+        }
+    }
+}
+
+unsafe impl MostlySend for BalancedCaches<'_> {}
+
+struct NormalCaches<'extractor> {
+    caches: Vec<HashMap<&'extractor [u8], DelAddRoaringBitmap, FxBuildHasher, &'extractor Bump>>,
+}
+
+impl<'extractor> NormalCaches<'extractor> {
+    pub fn insert_del_u32(
+        &mut self,
+        hasher: &FxBuildHasher,
+        alloc: &'extractor Bump,
+        buckets: usize,
+        key: &[u8],
+        n: u32,
+    ) {
+        let hash = hasher.hash_one(key);
+        let bucket = compute_bucket_from_hash(buckets, hash);
+
+        match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
+            RawEntryMut::Occupied(mut entry) => {
+                entry.get_mut().del.get_or_insert_with(RoaringBitmap::default).insert(n);
+            }
+            RawEntryMut::Vacant(entry) => {
+                entry.insert_hashed_nocheck(
+                    hash,
+                    alloc.alloc_slice_copy(key),
+                    DelAddRoaringBitmap::new_del_u32(n),
+                );
+            }
+        }
+    }
+
+    pub fn insert_add_u32(
+        &mut self,
+        hasher: &FxBuildHasher,
+        alloc: &'extractor Bump,
+        buckets: usize,
+        key: &[u8],
+        n: u32,
+    ) {
+        let hash = hasher.hash_one(key);
+        let bucket = compute_bucket_from_hash(buckets, hash);
+        match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
+            RawEntryMut::Occupied(mut entry) => {
+                entry.get_mut().add.get_or_insert_with(RoaringBitmap::default).insert(n);
+            }
+            RawEntryMut::Vacant(entry) => {
+                entry.insert_hashed_nocheck(
+                    hash,
+                    alloc.alloc_slice_copy(key),
+                    DelAddRoaringBitmap::new_add_u32(n),
+                );
+            }
+        }
+    }
+}
+
+struct SpillingCaches<'extractor> {
+    caches: Vec<HashMap<&'extractor [u8], DelAddRoaringBitmap, FxBuildHasher, &'extractor Bump>>,
+    spilled_entries: Vec<grenad::Sorter<MergeDeladdCboRoaringBitmaps>>,
     deladd_buffer: Vec<u8>,
     cbo_buffer: Vec<u8>,
-    total_insertions: usize,
-    fitted_in_key: usize,
 }
 
-impl<MF> CboCachedSorter<MF> {
-    pub fn new(cap: NonZeroUsize, sorter: Sorter<MF>) -> Self {
-        CboCachedSorter {
-            cache: Lru::new(cap),
-            sorter,
+impl<'extractor> SpillingCaches<'extractor> {
+    fn from_cache_maps(
+        caches: Vec<
+            HashMap<&'extractor [u8], DelAddRoaringBitmap, FxBuildHasher, &'extractor Bump>,
+        >,
+    ) -> SpillingCaches<'extractor> {
+        SpillingCaches {
+            spilled_entries: iter::repeat_with(|| {
+                let mut builder = grenad::SorterBuilder::new(MergeDeladdCboRoaringBitmaps);
+                builder.dump_threshold(0);
+                builder.allow_realloc(false);
+                builder.build()
+            })
+            .take(caches.len())
+            .collect(),
+            caches,
             deladd_buffer: Vec::new(),
             cbo_buffer: Vec::new(),
-            total_insertions: 0,
-            fitted_in_key: 0,
         }
     }
-}
 
-impl<MF: MergeFunction> CboCachedSorter<MF> {
-    pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
-        match self.cache.get_mut(key) {
-            Some(DelAddRoaringBitmap { del, add: _ }) => {
-                del.get_or_insert_with(RoaringBitmap::default).insert(n);
-            }
-            None => {
-                self.total_insertions += 1;
-                self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
-                let value = DelAddRoaringBitmap::new_del_u32(n);
-                if let Some((key, deladd)) = self.cache.push(key.into(), value) {
-                    self.write_entry(key, deladd)?;
-                }
-            }
-        }
-
-        Ok(())
-    }
-
-    pub fn insert_del(
+    pub fn insert_del_u32(
         &mut self,
+        hasher: &FxBuildHasher,
+        buckets: usize,
         key: &[u8],
-        bitmap: RoaringBitmap,
-    ) -> grenad::Result<(), MF::Error> {
-        match self.cache.get_mut(key) {
-            Some(DelAddRoaringBitmap { del, add: _ }) => {
-                *del.get_or_insert_with(RoaringBitmap::default) |= bitmap;
-            }
-            None => {
-                self.total_insertions += 1;
-                self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
-                let value = DelAddRoaringBitmap::new_del(bitmap);
-                if let Some((key, deladd)) = self.cache.push(key.into(), value) {
-                    self.write_entry(key, deladd)?;
-                }
-            }
-        }
-
-        Ok(())
-    }
-
-    pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
-        match self.cache.get_mut(key) {
-            Some(DelAddRoaringBitmap { del: _, add }) => {
-                add.get_or_insert_with(RoaringBitmap::default).insert(n);
-            }
-            None => {
-                self.total_insertions += 1;
-                self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
-                let value = DelAddRoaringBitmap::new_add_u32(n);
+        n: u32,
+    ) -> Result<()> {
+        let hash = hasher.hash_one(key);
+        let bucket = compute_bucket_from_hash(buckets, hash);
+        match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
+            RawEntryMut::Occupied(mut entry) => {
+                entry.get_mut().del.get_or_insert_with(RoaringBitmap::default).insert(n);
+                Ok(())
+            }
+            RawEntryMut::Vacant(_entry) => {
+                let deladd = DelAddRoaringBitmap::new_del_u32(n);
+                spill_entry_to_sorter(
+                    &mut self.spilled_entries[bucket],
+                    &mut self.deladd_buffer,
+                    &mut self.cbo_buffer,
+                    key,
+                    deladd,
+                )
				
			||||||
                if let Some((key, deladd)) = self.cache.push(key.into(), value) {
 | 
					 | 
				
			||||||
                    self.write_entry(key, deladd)?;
 | 
					 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Ok(())
 | 
					    pub fn insert_add_u32(
 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
    pub fn insert_add(
 | 
					 | 
				
			||||||
        &mut self,
 | 
					        &mut self,
 | 
				
			||||||
 | 
					        hasher: &FxBuildHasher,
 | 
				
			||||||
 | 
					        buckets: usize,
 | 
				
			||||||
        key: &[u8],
 | 
					        key: &[u8],
 | 
				
			||||||
        bitmap: RoaringBitmap,
 | 
					        n: u32,
 | 
				
			||||||
    ) -> grenad::Result<(), MF::Error> {
 | 
					    ) -> Result<()> {
 | 
				
			||||||
        match self.cache.get_mut(key) {
 | 
					        let hash = hasher.hash_one(key);
 | 
				
			||||||
            Some(DelAddRoaringBitmap { del: _, add }) => {
 | 
					        let bucket = compute_bucket_from_hash(buckets, hash);
 | 
				
			||||||
                *add.get_or_insert_with(RoaringBitmap::default) |= bitmap;
 | 
					        match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
 | 
				
			||||||
            }
 | 
					            RawEntryMut::Occupied(mut entry) => {
 | 
				
			||||||
            None => {
 | 
					                entry.get_mut().add.get_or_insert_with(RoaringBitmap::default).insert(n);
 | 
				
			||||||
                self.total_insertions += 1;
 | 
					 | 
				
			||||||
                self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
 | 
					 | 
				
			||||||
                let value = DelAddRoaringBitmap::new_add(bitmap);
 | 
					 | 
				
			||||||
                if let Some((key, deladd)) = self.cache.push(key.into(), value) {
 | 
					 | 
				
			||||||
                    self.write_entry(key, deladd)?;
 | 
					 | 
				
			||||||
                }
 | 
					 | 
				
			||||||
            }
 | 
					 | 
				
			||||||
        }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
                Ok(())
 | 
					                Ok(())
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
 | 
					            RawEntryMut::Vacant(_entry) => {
 | 
				
			||||||
    pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) -> grenad::Result<(), MF::Error> {
 | 
					                let deladd = DelAddRoaringBitmap::new_add_u32(n);
 | 
				
			||||||
        match self.cache.get_mut(key) {
 | 
					                spill_entry_to_sorter(
 | 
				
			||||||
            Some(DelAddRoaringBitmap { del, add }) => {
 | 
					                    &mut self.spilled_entries[bucket],
 | 
				
			||||||
                del.get_or_insert_with(RoaringBitmap::default).insert(n);
 | 
					                    &mut self.deladd_buffer,
 | 
				
			||||||
                add.get_or_insert_with(RoaringBitmap::default).insert(n);
 | 
					                    &mut self.cbo_buffer,
 | 
				
			||||||
 | 
					                    key,
 | 
				
			||||||
 | 
					                    deladd,
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
            None => {
 | 
					 | 
				
			||||||
                self.total_insertions += 1;
 | 
					 | 
				
			||||||
                self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
 | 
					 | 
				
			||||||
                let value = DelAddRoaringBitmap::new_del_add_u32(n);
 | 
					 | 
				
			||||||
                if let Some((key, deladd)) = self.cache.push(key.into(), value) {
 | 
					 | 
				
			||||||
                    self.write_entry(key, deladd)?;
 | 
					 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        Ok(())
 | 
					#[inline]
 | 
				
			||||||
 | 
					fn compute_bucket_from_hash(buckets: usize, hash: u64) -> usize {
 | 
				
			||||||
 | 
					    hash as usize % buckets
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
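
Editorial aside (not part of the commit): the insert methods above hash each key once, and the same hash both selects the bucket and drives the raw-entry lookup, so a given key always lands in the same per-bucket map and spilled sorter. A minimal, dependency-free sketch of that routing, with std's DefaultHasher standing in for the crate's FxBuildHasher:

    use std::collections::hash_map::DefaultHasher;
    use std::hash::{BuildHasher, BuildHasherDefault};

    // Same modulo reduction as compute_bucket_from_hash above.
    fn compute_bucket_from_hash(buckets: usize, hash: u64) -> usize {
        hash as usize % buckets
    }

    fn main() {
        let hasher = BuildHasherDefault::<DefaultHasher>::default();
        let buckets = 4; // the real code sizes this from the thread pool
        for key in [b"word1".as_slice(), b"word2".as_slice(), b"word1".as_slice()] {
            let hash = hasher.hash_one(key);
            let bucket = compute_bucket_from_hash(buckets, hash);
            // Equal keys always hash to the same bucket, which is what lets
            // each bucket be frozen and merged independently later on.
            assert!(bucket < buckets);
            println!("{:?} -> bucket {bucket}", std::str::from_utf8(key).unwrap());
        }
    }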
 
-    fn write_entry<A: AsRef<[u8]>>(
-        &mut self,
-        key: A,
+fn spill_entry_to_sorter(
+    spilled_entries: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
+    deladd_buffer: &mut Vec<u8>,
+    cbo_buffer: &mut Vec<u8>,
+    key: &[u8],
     deladd: DelAddRoaringBitmap,
-    ) -> grenad::Result<(), MF::Error> {
-        /// TODO we must create a serialization trait to correctly serialize bitmaps
-        self.deladd_buffer.clear();
-        let mut value_writer = KvWriterDelAdd::new(&mut self.deladd_buffer);
+) -> Result<()> {
+    deladd_buffer.clear();
+    let mut value_writer = KvWriterDelAdd::new(deladd_buffer);
+
     match deladd {
         DelAddRoaringBitmap { del: Some(del), add: None } => {
-                self.cbo_buffer.clear();
-                CboRoaringBitmapCodec::serialize_into(&del, &mut self.cbo_buffer);
-                value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?;
+            cbo_buffer.clear();
+            CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
+            value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
         }
         DelAddRoaringBitmap { del: None, add: Some(add) } => {
-                self.cbo_buffer.clear();
-                CboRoaringBitmapCodec::serialize_into(&add, &mut self.cbo_buffer);
-                value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?;
+            cbo_buffer.clear();
+            CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
+            value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
         }
         DelAddRoaringBitmap { del: Some(del), add: Some(add) } => {
-                self.cbo_buffer.clear();
-                CboRoaringBitmapCodec::serialize_into(&del, &mut self.cbo_buffer);
-                value_writer.insert(DelAdd::Deletion, &self.cbo_buffer)?;
+            cbo_buffer.clear();
+            CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
+            value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
 
-                self.cbo_buffer.clear();
-                CboRoaringBitmapCodec::serialize_into(&add, &mut self.cbo_buffer);
-                value_writer.insert(DelAdd::Addition, &self.cbo_buffer)?;
+            cbo_buffer.clear();
+            CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
+            value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
         }
         DelAddRoaringBitmap { del: None, add: None } => return Ok(()),
     }
 
     let bytes = value_writer.into_inner().unwrap();
-        self.sorter.insert(key, bytes)
+    spilled_entries.insert(key, bytes).map_err(Into::into)
 }
 
-    pub fn direct_insert(&mut self, key: &[u8], val: &[u8]) -> grenad::Result<(), MF::Error> {
-        self.sorter.insert(key, val)
+pub struct FrozenCache<'a, 'extractor> {
+    bucket: usize,
+    cache: FrozenMap<'a, 'extractor, &'extractor [u8], DelAddRoaringBitmap, FxBuildHasher>,
+    spilled: Vec<grenad::Reader<BufReader<File>>>,
 }
 
-    pub fn into_sorter(mut self) -> grenad::Result<Sorter<MF>, MF::Error> {
-        let mut all_n_containers = Vec::new();
-        let mut all_n_array_containers = Vec::new();
-        let mut all_n_bitset_containers = Vec::new();
-        let mut all_n_values_array_containers = Vec::new();
-        let mut all_n_values_bitset_containers = Vec::new();
-        let mut all_cardinality = Vec::new();
+pub fn transpose_and_freeze_caches<'a, 'extractor>(
+    caches: &'a mut [BalancedCaches<'extractor>],
+) -> Result<Vec<Vec<FrozenCache<'a, 'extractor>>>> {
+    let width = caches.first().map(BalancedCaches::buckets).unwrap_or(0);
+    let mut bucket_caches: Vec<_> = iter::repeat_with(Vec::new).take(width).collect();
 
-        let default_arc = Lru::new(NonZeroUsize::MIN);
-        for (key, deladd) in mem::replace(&mut self.cache, default_arc) {
-            for bitmap in [&deladd.del, &deladd.add].into_iter().flatten() {
-                let Statistics {
-                    n_containers,
-                    n_array_containers,
-                    n_bitset_containers,
-                    n_values_array_containers,
-                    n_values_bitset_containers,
-                    cardinality,
-                    ..
-                } = bitmap.statistics();
-                all_n_containers.push(n_containers);
-                all_n_array_containers.push(n_array_containers);
-                all_n_bitset_containers.push(n_bitset_containers);
-                all_n_values_array_containers.push(n_values_array_containers);
-                all_n_values_bitset_containers.push(n_values_bitset_containers);
-                all_cardinality.push(cardinality as u32);
-            }
-
-            self.write_entry(key, deladd)?;
-        }
-
-        let mut output = String::new();
-
-        for (name, mut slice) in [
-            ("n_containers", all_n_containers),
-            ("n_array_containers", all_n_array_containers),
-            ("n_bitset_containers", all_n_bitset_containers),
-            ("n_values_array_containers", all_n_values_array_containers),
-            ("n_values_bitset_containers", all_n_values_bitset_containers),
-            ("cardinality", all_cardinality),
-        ] {
-            let _ = writeln!(&mut output, "{name} (p100) {:?}", Stats::from_slice(&mut slice));
-            // let _ = writeln!(&mut output, "{name} (p99)  {:?}", Stats::from_slice_p99(&mut slice));
-        }
-
-        let _ = writeln!(
-            &mut output,
-            "LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions",
-            self.fitted_in_key,
-            (self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0,
-            self.total_insertions,
-        );
-
-        eprintln!("{output}");
-
-        Ok(self.sorter)
+    for thread_cache in caches {
+        for frozen in thread_cache.freeze()? {
+            bucket_caches[frozen.bucket].push(frozen);
         }
     }
 
-#[derive(Default, Debug)]
-struct Stats {
-    pub len: usize,
-    pub average: f32,
-    pub mean: u32,
-    pub min: u32,
-    pub max: u32,
+    Ok(bucket_caches)
 }
 
-impl Stats {
-    fn from_slice(slice: &mut [u32]) -> Stats {
-        slice.sort_unstable();
-        Self::from_sorted_slice(slice)
+/// Merges the caches that must be all associated to the same bucket.
+///
+/// # Panics
+///
+/// - If the bucket IDs in these frozen caches are not exactly the same.
+pub fn merge_caches<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()>
+where
+    F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>,
+{
+    let mut maps = Vec::new();
+    let mut readers = Vec::new();
+    let mut current_bucket = None;
+    for FrozenCache { bucket, cache, ref mut spilled } in frozen {
+        assert_eq!(*current_bucket.get_or_insert(bucket), bucket);
+        maps.push(cache);
+        readers.append(spilled);
     }
 
-    fn from_slice_p99(slice: &mut [u32]) -> Stats {
-        slice.sort_unstable();
-        let new_len = slice.len() - (slice.len() as f32 / 100.0) as usize;
-        match slice.get(..new_len) {
-            Some(slice) => Self::from_sorted_slice(slice),
-            None => Stats::default(),
+    // First manage the spilled entries by looking into the HashMaps,
+    // merge them and mark them as dummy.
+    let mut heap = BinaryHeap::new();
+    for (source_index, source) in readers.into_iter().enumerate() {
+        let mut cursor = source.into_cursor()?;
+        if cursor.move_on_next()?.is_some() {
+            heap.push(Entry { cursor, source_index });
         }
     }
 
-    fn from_sorted_slice(slice: &[u32]) -> Stats {
-        let sum: f64 = slice.iter().map(|i| *i as f64).sum();
-        let average = (sum / slice.len() as f64) as f32;
-        let mean = *slice.len().checked_div(2).and_then(|middle| slice.get(middle)).unwrap_or(&0);
-        let min = *slice.first().unwrap_or(&0);
-        let max = *slice.last().unwrap_or(&0);
-        Stats { len: slice.len(), average, mean, min, max }
+    loop {
+        let mut first_entry = match heap.pop() {
+            Some(entry) => entry,
+            None => break,
+        };
+
+        let (first_key, first_value) = match first_entry.cursor.current() {
+            Some((key, value)) => (key, value),
+            None => break,
+        };
+
+        let mut output = DelAddRoaringBitmap::from_bytes(first_value)?;
+        while let Some(mut entry) = heap.peek_mut() {
+            if let Some((key, _value)) = entry.cursor.current() {
+                if first_key == key {
+                    let new = DelAddRoaringBitmap::from_bytes(first_value)?;
+                    output = output.merge(new);
+                    // When we are done with the current value of this entry,
+                    // move it forward and let the heap reorganize itself (on drop).
+                    if entry.cursor.move_on_next()?.is_none() {
+                        PeekMut::pop(entry);
+                    }
+                } else {
+                    break;
+                }
             }
         }
 
-#[derive(Debug, Clone)]
+        // Once we merged all of the spilled bitmaps we must also
+        // fetch the entries from the non-spilled entries (the HashMaps).
+        for (map_index, map) in maps.iter_mut().enumerate() {
+            if first_entry.source_index != map_index {
+                if let Some(new) = map.get_mut(first_key) {
+                    output = output.merge(mem::take(new));
+                }
+            }
+        }
+
+        // We send the merged entry outside.
+        (f)(first_key, output)?;
+
+        // Don't forget to put the first entry back into the heap.
+        if first_entry.cursor.move_on_next()?.is_some() {
+            heap.push(first_entry)
+        }
+    }
+
+    // Then manage the content on the HashMap entries that weren't taken (mem::take).
+    while let Some(mut map) = maps.pop() {
+        for (key, output) in map.iter_mut() {
+            let mut output = mem::take(output);
+
+            // Make sure we don't try to work with entries already managed by the spilled
+            if !output.is_empty() {
+                for rhs in maps.iter_mut() {
+                    if let Some(new) = rhs.get_mut(key) {
+                        output = output.merge(mem::take(new));
+                    }
+                }
+
+                // We send the merged entry outside.
+                (f)(key, output)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+struct Entry<R> {
+    cursor: ReaderCursor<R>,
+    source_index: usize,
+}
+
+impl<R> Ord for Entry<R> {
+    fn cmp(&self, other: &Entry<R>) -> Ordering {
+        let skey = self.cursor.current().map(|(k, _)| k);
+        let okey = other.cursor.current().map(|(k, _)| k);
+        skey.cmp(&okey).then(self.source_index.cmp(&other.source_index)).reverse()
+    }
+}
+
+impl<R> Eq for Entry<R> {}
+
+impl<R> PartialEq for Entry<R> {
+    fn eq(&self, other: &Entry<R>) -> bool {
+        self.cmp(other) == Ordering::Equal
+    }
+}
+
+impl<R> PartialOrd for Entry<R> {
+    fn partial_cmp(&self, other: &Entry<R>) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
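Editorial aside (not part of the commit): `Entry`'s reversed `Ord` above turns std's max-heap `BinaryHeap` into a min-heap over each reader's current key, the classic k-way merge over sorted runs. A dependency-free sketch of the same idea, with in-memory sorted runs standing in for the spilled grenad readers:

    use std::cmp::Reverse;
    use std::collections::BinaryHeap;

    fn main() {
        // Three already-sorted runs, as grenad sorters would produce them.
        let runs: Vec<Vec<(&[u8], u32)>> = vec![
            vec![(b"a".as_slice(), 1), (b"c".as_slice(), 3)],
            vec![(b"b".as_slice(), 2)],
            vec![(b"a".as_slice(), 10), (b"d".as_slice(), 4)],
        ];

        // Reverse(...) flips the comparison, so the smallest key pops first.
        let mut heap = BinaryHeap::new();
        for (source_index, run) in runs.iter().enumerate() {
            if let Some(&(key, _)) = run.first() {
                heap.push(Reverse((key, source_index, 0usize)));
            }
        }

        let mut merged = Vec::new();
        while let Some(Reverse((key, source_index, pos))) = heap.pop() {
            merged.push((key, runs[source_index][pos].1));
            if let Some(&(next_key, _)) = runs[source_index].get(pos + 1) {
                heap.push(Reverse((next_key, source_index, pos + 1)));
            }
        }

        // Keys come out globally sorted; equal keys surface back to back and
        // would be folded into one DelAddRoaringBitmap in the real code.
        let keys: Vec<_> = merged.iter().map(|(k, _)| *k).collect();
        assert_eq!(
            keys,
            [b"a".as_slice(), b"a".as_slice(), b"b".as_slice(), b"c".as_slice(), b"d".as_slice()]
        );
    }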
+#[derive(Debug, Default, Clone)]
 pub struct DelAddRoaringBitmap {
-    pub(crate) del: Option<RoaringBitmap>,
-    pub(crate) add: Option<RoaringBitmap>,
+    pub del: Option<RoaringBitmap>,
+    pub add: Option<RoaringBitmap>,
 }
 
 impl DelAddRoaringBitmap {
-    fn new_del_add_u32(n: u32) -> Self {
-        DelAddRoaringBitmap {
-            del: Some(RoaringBitmap::from([n])),
-            add: Some(RoaringBitmap::from([n])),
-        }
+    fn from_bytes(bytes: &[u8]) -> io::Result<DelAddRoaringBitmap> {
+        let reader = KvReaderDelAdd::from_slice(bytes);
+
+        let del = match reader.get(DelAdd::Deletion) {
+            Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
+            None => None,
+        };
+
+        let add = match reader.get(DelAdd::Addition) {
+            Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
+            None => None,
+        };
+
+        Ok(DelAddRoaringBitmap { del, add })
     }
 
-    fn new_del(bitmap: RoaringBitmap) -> Self {
-        DelAddRoaringBitmap { del: Some(bitmap), add: None }
+    pub fn empty() -> DelAddRoaringBitmap {
+        DelAddRoaringBitmap { del: None, add: None }
     }
 
-    fn new_del_u32(n: u32) -> Self {
+    pub fn is_empty(&self) -> bool {
+        let DelAddRoaringBitmap { del, add } = self;
+        del.is_none() && add.is_none()
+    }
+
+    pub fn insert_del_u32(&mut self, n: u32) {
+        self.del.get_or_insert_with(RoaringBitmap::new).insert(n);
+    }
+
+    pub fn insert_add_u32(&mut self, n: u32) {
+        self.add.get_or_insert_with(RoaringBitmap::new).insert(n);
+    }
+
+    pub fn new_del_u32(n: u32) -> Self {
         DelAddRoaringBitmap { del: Some(RoaringBitmap::from([n])), add: None }
     }
 
-    fn new_add(bitmap: RoaringBitmap) -> Self {
-        DelAddRoaringBitmap { del: None, add: Some(bitmap) }
-    }
-
-    fn new_add_u32(n: u32) -> Self {
+    pub fn new_add_u32(n: u32) -> Self {
         DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) }
     }
+
+    pub fn merge(self, rhs: DelAddRoaringBitmap) -> DelAddRoaringBitmap {
+        let DelAddRoaringBitmap { del, add } = self;
+        let DelAddRoaringBitmap { del: ndel, add: nadd } = rhs;
+
+        let del = match (del, ndel) {
+            (None, None) => None,
+            (None, Some(del)) | (Some(del), None) => Some(del),
+            (Some(del), Some(ndel)) => Some(del | ndel),
+        };
+
+        let add = match (add, nadd) {
+            (None, None) => None,
+            (None, Some(add)) | (Some(add), None) => Some(add),
+            (Some(add), Some(nadd)) => Some(add | nadd),
+        };
+
+        DelAddRoaringBitmap { del, add }
+    }
+
+    pub fn apply_to(&self, documents_ids: &mut RoaringBitmap) {
+        let DelAddRoaringBitmap { del, add } = self;
+
+        if let Some(del) = del {
+            *documents_ids -= del;
+        }
+
+        if let Some(add) = add {
+            *documents_ids |= add;
+        }
+    }
 }
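
Editorial aside (not part of the commit): `merge` unions the deletion and addition sides independently, treating `None` as the identity, while `apply_to` removes the deletions before inserting the additions. A small sketch of those semantics, assuming the `roaring` crate that the code above uses:

    use roaring::RoaringBitmap;

    fn bitmap(ids: &[u32]) -> RoaringBitmap {
        ids.iter().copied().collect()
    }

    fn main() {
        // Two partial deltas for the same key, as two sources might produce them.
        let (del_a, add_a) = (Some(bitmap(&[1])), None::<RoaringBitmap>);
        let (del_b, add_b) = (None::<RoaringBitmap>, Some(bitmap(&[2, 3])));

        // merge: union each side independently.
        let del = match (del_a, del_b) {
            (None, None) => None,
            (None, Some(x)) | (Some(x), None) => Some(x),
            (Some(x), Some(y)) => Some(x | y),
        };
        let add = match (add_a, add_b) {
            (None, None) => None,
            (None, Some(x)) | (Some(x), None) => Some(x),
            (Some(x), Some(y)) => Some(x | y),
        };

        // apply_to: deletions first, then additions.
        let mut documents_ids = bitmap(&[1, 5]);
        if let Some(del) = &del {
            documents_ids -= del;
        }
        if let Some(add) = &add {
            documents_ids |= add;
        }
        assert_eq!(documents_ids, bitmap(&[2, 3, 5]));
    }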
 
milli/src/update/new/extract/documents.rs (new file, 73 additions)
@@ -0,0 +1,73 @@
+use std::cell::RefCell;
+
+use bumpalo::Bump;
+
+use super::DelAddRoaringBitmap;
+use crate::update::new::channel::DocumentsSender;
+use crate::update::new::document::write_to_obkv;
+use crate::update::new::indexer::document_changes::{
+    DocumentChangeContext, Extractor, FullySend, RefCellExt as _,
+};
+use crate::update::new::DocumentChange;
+use crate::Result;
+
+pub struct DocumentsExtractor<'a> {
+    documents_sender: &'a DocumentsSender<'a>,
+}
+
+impl<'a> DocumentsExtractor<'a> {
+    pub fn new(documents_sender: &'a DocumentsSender<'a>) -> Self {
+        Self { documents_sender }
+    }
+}
+
+impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
+    type Data = FullySend<RefCell<DelAddRoaringBitmap>>;
+
+    fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
+        Ok(FullySend(RefCell::new(DelAddRoaringBitmap::empty())))
+    }
+
+    fn process(
+        &self,
+        change: DocumentChange,
+        context: &DocumentChangeContext<Self::Data>,
+    ) -> Result<()> {
+        let mut document_buffer = Vec::new();
+        let mut delta_documents_ids = context.data.0.borrow_mut_or_yield();
+
+        let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield();
+        let new_fields_ids_map = &*new_fields_ids_map;
+        let new_fields_ids_map = new_fields_ids_map.local_map();
+
+        let external_docid = change.external_docid().to_owned();
+
+        // document but we need to create a function that collects and compresses documents.
+        match change {
+            DocumentChange::Deletion(deletion) => {
+                let docid = deletion.docid();
+                self.documents_sender.delete(docid, external_docid).unwrap();
+                delta_documents_ids.insert_del_u32(docid);
+            }
+            /// TODO: change NONE by SOME(vector) when implemented
+            DocumentChange::Update(update) => {
+                let docid = update.docid();
+                let content =
+                    update.new(&context.txn, context.index, &context.db_fields_ids_map)?;
+                let content =
+                    write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
+                self.documents_sender.uncompressed(docid, external_docid, content).unwrap();
+            }
+            DocumentChange::Insertion(insertion) => {
+                let docid = insertion.docid();
+                let content = insertion.new();
+                let content =
+                    write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
+                self.documents_sender.uncompressed(docid, external_docid, content).unwrap();
+                delta_documents_ids.insert_add_u32(docid);
+                // extracted_dictionary_sender.send(self, dictionary: &[u8]);
+            }
+        }
+        Ok(())
+    }
+}
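
Editorial aside (not part of the commit): the extractor's per-thread `Data` is a single `DelAddRoaringBitmap` of docid deltas, so finishing an indexing run amounts to folding every thread-local delta into the index-wide `documents_ids`. A sketch of that bookkeeping, again assuming the `roaring` crate:

    use roaring::RoaringBitmap;

    #[derive(Default)]
    struct Delta {
        del: RoaringBitmap,
        add: RoaringBitmap,
    }

    fn main() {
        // One delta per extraction thread, as init_data creates them.
        let mut deltas = vec![Delta::default(), Delta::default()];
        deltas[0].del.insert(4); // thread 0 processed a Deletion of docid 4
        deltas[0].add.insert(10); // and an Insertion of docid 10
        deltas[1].add.insert(11); // thread 1 processed an Insertion of docid 11

        // Fold the thread-local deltas into the index-wide documents_ids.
        let mut documents_ids: RoaringBitmap = [1u32, 4].iter().copied().collect();
        for delta in deltas {
            documents_ids -= delta.del;
            documents_ids |= delta.add;
        }
        assert_eq!(documents_ids, [1u32, 10, 11].iter().copied().collect::<RoaringBitmap>());
    }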
@@ -1,16 +1,12 @@
 | 
				
			|||||||
use std::cell::RefCell;
 | 
					use std::cell::RefCell;
 | 
				
			||||||
use std::collections::HashSet;
 | 
					use std::collections::HashSet;
 | 
				
			||||||
use std::fmt::Debug;
 | 
					 | 
				
			||||||
use std::fs::File;
 | 
					 | 
				
			||||||
use std::ops::DerefMut as _;
 | 
					use std::ops::DerefMut as _;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
use bumpalo::Bump;
 | 
					use bumpalo::Bump;
 | 
				
			||||||
use grenad::{MergeFunction, Merger};
 | 
					 | 
				
			||||||
use heed::RoTxn;
 | 
					use heed::RoTxn;
 | 
				
			||||||
use rayon::iter::{ParallelBridge as _, ParallelIterator as _};
 | 
					 | 
				
			||||||
use serde_json::Value;
 | 
					use serde_json::Value;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
use super::super::cache::CboCachedSorter;
 | 
					use super::super::cache::BalancedCaches;
 | 
				
			||||||
use super::facet_document::extract_document_facets;
 | 
					use super::facet_document::extract_document_facets;
 | 
				
			||||||
use super::FacetKind;
 | 
					use super::FacetKind;
 | 
				
			||||||
use crate::facet::value_encoding::f64_into_bytes;
 | 
					use crate::facet::value_encoding::f64_into_bytes;
 | 
				
			||||||
@@ -20,44 +16,30 @@ use crate::update::new::indexer::document_changes::{
 | 
				
			|||||||
    IndexingContext, RefCellExt, ThreadLocal,
 | 
					    IndexingContext, RefCellExt, ThreadLocal,
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
use crate::update::new::DocumentChange;
 | 
					use crate::update::new::DocumentChange;
 | 
				
			||||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
 | 
					use crate::update::GrenadParameters;
 | 
				
			||||||
use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH};
 | 
					use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
pub struct FacetedExtractorData<'extractor> {
 | 
					pub struct FacetedExtractorData<'a> {
 | 
				
			||||||
    attributes_to_extract: &'extractor [&'extractor str],
 | 
					    attributes_to_extract: &'a [&'a str],
 | 
				
			||||||
    grenad_parameters: GrenadParameters,
 | 
					    grenad_parameters: GrenadParameters,
 | 
				
			||||||
    max_memory: Option<usize>,
 | 
					    buckets: usize,
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> {
 | 
					impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> {
 | 
				
			||||||
    type Data = FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>;
 | 
					    type Data = RefCell<BalancedCaches<'extractor>>;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    fn init_data(
 | 
					    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
 | 
				
			||||||
        &self,
 | 
					        Ok(RefCell::new(BalancedCaches::new_in(
 | 
				
			||||||
        _extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
 | 
					            self.buckets,
 | 
				
			||||||
    ) -> Result<Self::Data> {
 | 
					            self.grenad_parameters.max_memory,
 | 
				
			||||||
        Ok(FullySend(RefCell::new(CboCachedSorter::new(
 | 
					            extractor_alloc,
 | 
				
			||||||
            // TODO use a better value
 | 
					        )))
 | 
				
			||||||
            1_000_000.try_into().unwrap(),
 | 
					 | 
				
			||||||
            create_sorter(
 | 
					 | 
				
			||||||
                grenad::SortAlgorithm::Stable,
 | 
					 | 
				
			||||||
                MergeDeladdCboRoaringBitmaps,
 | 
					 | 
				
			||||||
                self.grenad_parameters.chunk_compression_type,
 | 
					 | 
				
			||||||
                self.grenad_parameters.chunk_compression_level,
 | 
					 | 
				
			||||||
                self.grenad_parameters.max_nb_chunks,
 | 
					 | 
				
			||||||
                self.max_memory,
 | 
					 | 
				
			||||||
                // *NOTE*: this must not be set to true:
 | 
					 | 
				
			||||||
                // 1. we're already using max parallelism in the pool, so it wouldn't help
 | 
					 | 
				
			||||||
                // 2. it creates correctness issues if it causes to yield a borrow-mut wielding task
 | 
					 | 
				
			||||||
                false,
 | 
					 | 
				
			||||||
            ),
 | 
					 | 
				
			||||||
        ))))
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    fn process(
 | 
					    fn process(
 | 
				
			||||||
        &self,
 | 
					        &self,
 | 
				
			||||||
        change: DocumentChange,
 | 
					        change: DocumentChange,
 | 
				
			||||||
        context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
 | 
					        context: &DocumentChangeContext<Self::Data>,
 | 
				
			||||||
    ) -> Result<()> {
 | 
					    ) -> Result<()> {
 | 
				
			||||||
        FacetedDocidsExtractor::extract_document_change(context, self.attributes_to_extract, change)
 | 
					        FacetedDocidsExtractor::extract_document_change(context, self.attributes_to_extract, change)
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
@@ -67,16 +49,14 @@ pub struct FacetedDocidsExtractor;
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
impl FacetedDocidsExtractor {
 | 
					impl FacetedDocidsExtractor {
 | 
				
			||||||
    fn extract_document_change(
 | 
					    fn extract_document_change(
 | 
				
			||||||
        context: &DocumentChangeContext<
 | 
					        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
 | 
				
			||||||
            FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
 | 
					 | 
				
			||||||
        >,
 | 
					 | 
				
			||||||
        attributes_to_extract: &[&str],
 | 
					        attributes_to_extract: &[&str],
 | 
				
			||||||
        document_change: DocumentChange,
 | 
					        document_change: DocumentChange,
 | 
				
			||||||
    ) -> Result<()> {
 | 
					    ) -> Result<()> {
 | 
				
			||||||
        let index = &context.index;
 | 
					        let index = &context.index;
 | 
				
			||||||
        let rtxn = &context.txn;
 | 
					        let rtxn = &context.txn;
 | 
				
			||||||
        let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
 | 
					        let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
 | 
				
			||||||
        let mut cached_sorter = context.data.0.borrow_mut_or_yield();
 | 
					        let mut cached_sorter = context.data.borrow_mut_or_yield();
 | 
				
			||||||
        match document_change {
 | 
					        match document_change {
 | 
				
			||||||
            DocumentChange::Deletion(inner) => extract_document_facets(
 | 
					            DocumentChange::Deletion(inner) => extract_document_facets(
 | 
				
			||||||
                attributes_to_extract,
 | 
					                attributes_to_extract,
 | 
				
			||||||
@@ -86,7 +66,7 @@ impl FacetedDocidsExtractor {
 | 
				
			|||||||
                    Self::facet_fn_with_options(
 | 
					                    Self::facet_fn_with_options(
 | 
				
			||||||
                        &context.doc_alloc,
 | 
					                        &context.doc_alloc,
 | 
				
			||||||
                        cached_sorter.deref_mut(),
 | 
					                        cached_sorter.deref_mut(),
 | 
				
			||||||
                        CboCachedSorter::insert_del_u32,
 | 
					                        BalancedCaches::insert_del_u32,
 | 
				
			||||||
                        inner.docid(),
 | 
					                        inner.docid(),
 | 
				
			||||||
                        fid,
 | 
					                        fid,
 | 
				
			||||||
                        value,
 | 
					                        value,
 | 
				
			||||||
@@ -102,7 +82,7 @@ impl FacetedDocidsExtractor {
 | 
				
			|||||||
                        Self::facet_fn_with_options(
 | 
					                        Self::facet_fn_with_options(
 | 
				
			||||||
                            &context.doc_alloc,
 | 
					                            &context.doc_alloc,
 | 
				
			||||||
                            cached_sorter.deref_mut(),
 | 
					                            cached_sorter.deref_mut(),
 | 
				
			||||||
                            CboCachedSorter::insert_del_u32,
 | 
					                            BalancedCaches::insert_del_u32,
 | 
				
			||||||
                            inner.docid(),
 | 
					                            inner.docid(),
 | 
				
			||||||
                            fid,
 | 
					                            fid,
 | 
				
			||||||
                            value,
 | 
					                            value,
 | 
				
			||||||
@@ -118,7 +98,7 @@ impl FacetedDocidsExtractor {
 | 
				
			|||||||
                        Self::facet_fn_with_options(
 | 
					                        Self::facet_fn_with_options(
 | 
				
			||||||
                            &context.doc_alloc,
 | 
					                            &context.doc_alloc,
 | 
				
			||||||
                            cached_sorter.deref_mut(),
 | 
					                            cached_sorter.deref_mut(),
 | 
				
			||||||
                            CboCachedSorter::insert_add_u32,
 | 
					                            BalancedCaches::insert_add_u32,
 | 
				
			||||||
                            inner.docid(),
 | 
					                            inner.docid(),
 | 
				
			||||||
                            fid,
 | 
					                            fid,
 | 
				
			||||||
                            value,
 | 
					                            value,
 | 
				
			||||||
@@ -134,7 +114,7 @@ impl FacetedDocidsExtractor {
 | 
				
			|||||||
                    Self::facet_fn_with_options(
 | 
					                    Self::facet_fn_with_options(
 | 
				
			||||||
                        &context.doc_alloc,
 | 
					                        &context.doc_alloc,
 | 
				
			||||||
                        cached_sorter.deref_mut(),
 | 
					                        cached_sorter.deref_mut(),
 | 
				
			||||||
                        CboCachedSorter::insert_add_u32,
 | 
					                        BalancedCaches::insert_add_u32,
 | 
				
			||||||
                        inner.docid(),
 | 
					                        inner.docid(),
 | 
				
			||||||
                        fid,
 | 
					                        fid,
 | 
				
			||||||
                        value,
 | 
					                        value,
 | 
				
			||||||
@@ -144,25 +124,20 @@ impl FacetedDocidsExtractor {
 | 
				
			|||||||
        }
 | 
					        }
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    fn facet_fn_with_options<MF>(
 | 
					    fn facet_fn_with_options<'extractor>(
 | 
				
			||||||
        doc_alloc: &Bump,
 | 
					        doc_alloc: &Bump,
 | 
				
			||||||
        cached_sorter: &mut CboCachedSorter<MF>,
 | 
					        cached_sorter: &mut BalancedCaches<'extractor>,
 | 
				
			||||||
        cache_fn: impl Fn(&mut CboCachedSorter<MF>, &[u8], u32) -> grenad::Result<(), MF::Error>,
 | 
					        cache_fn: impl Fn(&mut BalancedCaches<'extractor>, &[u8], u32) -> Result<()>,
 | 
				
			||||||
        docid: DocumentId,
 | 
					        docid: DocumentId,
 | 
				
			||||||
        fid: FieldId,
 | 
					        fid: FieldId,
 | 
				
			||||||
        value: &Value,
 | 
					        value: &Value,
 | 
				
			||||||
    ) -> Result<()>
 | 
					    ) -> Result<()> {
 | 
				
			||||||
    where
 | 
					 | 
				
			||||||
        MF: MergeFunction,
 | 
					 | 
				
			||||||
        MF::Error: Debug,
 | 
					 | 
				
			||||||
        grenad::Error<MF::Error>: Into<crate::Error>,
 | 
					 | 
				
			||||||
    {
 | 
					 | 
				
			||||||
        let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
 | 
					        let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
 | 
				
			||||||
        // Exists
 | 
					        // Exists
 | 
				
			||||||
        // key: fid
 | 
					        // key: fid
 | 
				
			||||||
        buffer.push(FacetKind::Exists as u8);
 | 
					        buffer.push(FacetKind::Exists as u8);
 | 
				
			||||||
        buffer.extend_from_slice(&fid.to_be_bytes());
 | 
					        buffer.extend_from_slice(&fid.to_be_bytes());
 | 
				
			||||||
        cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)?;
 | 
					        cache_fn(cached_sorter, &buffer, docid)?;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        match value {
 | 
					        match value {
 | 
				
			||||||
            // Number
 | 
					            // Number
 | 
				
			||||||
@@ -177,8 +152,7 @@ impl FacetedDocidsExtractor {
 | 
				
			|||||||
                    buffer.push(0); // level 0
 | 
					                    buffer.push(0); // level 0
 | 
				
			||||||
                    buffer.extend_from_slice(&ordered);
 | 
					                    buffer.extend_from_slice(&ordered);
 | 
				
			||||||
                    buffer.extend_from_slice(&n.to_be_bytes());
 | 
					                    buffer.extend_from_slice(&n.to_be_bytes());
 | 
				
			||||||
 | 
					                    cache_fn(cached_sorter, &buffer, docid)
 | 
				
			||||||
                    cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)
 | 
					 | 
				
			||||||
                } else {
 | 
					                } else {
 | 
				
			||||||
                    Ok(())
 | 
					                    Ok(())
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
@@ -193,7 +167,7 @@ impl FacetedDocidsExtractor {
 | 
				
			|||||||
                buffer.extend_from_slice(&fid.to_be_bytes());
 | 
					                buffer.extend_from_slice(&fid.to_be_bytes());
 | 
				
			||||||
                buffer.push(0); // level 0
 | 
					                buffer.push(0); // level 0
 | 
				
			||||||
                buffer.extend_from_slice(truncated.as_bytes());
 | 
					                buffer.extend_from_slice(truncated.as_bytes());
 | 
				
			||||||
                cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)
 | 
					                cache_fn(cached_sorter, &buffer, docid)
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
            // Null
 | 
					            // Null
 | 
				
			||||||
            // key: fid
 | 
					            // key: fid
 | 
				
			||||||
@@ -201,7 +175,7 @@ impl FacetedDocidsExtractor {
 | 
				
			|||||||
                buffer.clear();
 | 
					                buffer.clear();
 | 
				
			||||||
                buffer.push(FacetKind::Null as u8);
 | 
					                buffer.push(FacetKind::Null as u8);
 | 
				
			||||||
                buffer.extend_from_slice(&fid.to_be_bytes());
 | 
					                buffer.extend_from_slice(&fid.to_be_bytes());
 | 
				
			||||||
                cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)
 | 
					                cache_fn(cached_sorter, &buffer, docid)
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
            // Empty
 | 
					            // Empty
 | 
				
			||||||
            // key: fid
 | 
					            // key: fid
 | 
				
			||||||
@@ -209,13 +183,13 @@ impl FacetedDocidsExtractor {
 | 
				
			|||||||
                buffer.clear();
 | 
					                buffer.clear();
 | 
				
			||||||
                buffer.push(FacetKind::Empty as u8);
 | 
					                buffer.push(FacetKind::Empty as u8);
 | 
				
			||||||
                buffer.extend_from_slice(&fid.to_be_bytes());
 | 
					                buffer.extend_from_slice(&fid.to_be_bytes());
 | 
				
			||||||
                cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)
 | 
					                cache_fn(cached_sorter, &buffer, docid)
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
            Value::Object(o) if o.is_empty() => {
 | 
					            Value::Object(o) if o.is_empty() => {
 | 
				
			||||||
                buffer.clear();
 | 
					                buffer.clear();
 | 
				
			||||||
                buffer.push(FacetKind::Empty as u8);
 | 
					                buffer.push(FacetKind::Empty as u8);
 | 
				
			||||||
                buffer.extend_from_slice(&fid.to_be_bytes());
 | 
					                buffer.extend_from_slice(&fid.to_be_bytes());
 | 
				
			||||||
                cache_fn(cached_sorter, &buffer, docid).map_err(Into::into)
 | 
                 cache_fn(cached_sorter, &buffer, docid)
             }
             // Otherwise, do nothing
             /// TODO: What about Value::Bool?
@@ -242,16 +216,13 @@ fn truncate_str(s: &str) -> &str {
 
 impl DocidsExtractor for FacetedDocidsExtractor {
     #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
-    fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
+    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
         grenad_parameters: GrenadParameters,
         document_changes: &DC,
         indexing_context: IndexingContext<'fid, 'indexer, 'index>,
-        extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
-    ) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
-        let max_memory = grenad_parameters.max_memory_by_thread();
-
+        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+    ) -> Result<Vec<BalancedCaches<'extractor>>> {
         let index = indexing_context.index;
-
         let rtxn = index.read_txn()?;
         let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
         let attributes_to_extract: Vec<_> =
@@ -266,7 +237,7 @@ impl DocidsExtractor for FacetedDocidsExtractor {
             let extractor = FacetedExtractorData {
                 attributes_to_extract: &attributes_to_extract,
                 grenad_parameters,
-                max_memory,
+                buckets: rayon::current_num_threads(),
             };
             for_each_document_change(
                 document_changes,
@@ -276,26 +247,7 @@ impl DocidsExtractor for FacetedDocidsExtractor {
                 &datastore,
             )?;
         }
-        {
-            let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
-            let span =
-                tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
-            let _entered = span.enter();
 
-            let readers: Vec<_> = datastore
-                .into_iter()
-                .par_bridge()
-                .map(|cached_sorter| {
-                    let cached_sorter = cached_sorter.0.into_inner();
-                    let sorter = cached_sorter.into_sorter()?;
-                    sorter.into_reader_cursors()
-                })
-                .collect();
-
-            for reader in readers {
-                builder.extend(reader?);
-            }
-            Ok(builder.build())
-        }
+        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
     }
 }
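The hunk above replaces the whole merger-building block with a single collect over the per-thread datastore. A minimal standalone sketch of that collection pattern, using a plain Vec<RefCell<T>> as a stand-in for the real ThreadLocal datastore (the stand-in is an assumption of this sketch, not code from the commit):

    use std::cell::RefCell;

    // Stand-in for the per-thread datastore: once extraction is done no thread
    // holds a borrow anymore, so each RefCell can be unwrapped with into_inner.
    fn collect_caches<T>(datastore: Vec<RefCell<T>>) -> Vec<T> {
        datastore.into_iter().map(RefCell::into_inner).collect()
    }

    fn main() {
        let datastore = vec![RefCell::new(1u32), RefCell::new(2), RefCell::new(3)];
        assert_eq!(collect_caches(datastore), vec![1, 2, 3]);
    }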
@@ -1,234 +0,0 @@
-use std::borrow::Borrow;
-use std::hash::{BuildHasher, Hash};
-use std::iter::repeat_with;
-use std::mem;
-use std::num::NonZeroUsize;
-
-use hashbrown::hash_map::{DefaultHashBuilder, Entry};
-use hashbrown::HashMap;
-
-#[derive(Debug)]
-pub struct Lru<K, V, S = DefaultHashBuilder> {
-    lookup: HashMap<K, usize, S>,
-    storage: FixedSizeList<LruNode<K, V>>,
-}
-
-impl<K: Eq + Hash, V> Lru<K, V> {
-    /// Creates a new LRU cache that holds at most `capacity` elements.
-    pub fn new(capacity: NonZeroUsize) -> Self {
-        Self { lookup: HashMap::new(), storage: FixedSizeList::new(capacity.get()) }
-    }
-}
-
-impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
-    /// Creates a new LRU cache that holds at most `capacity` elements
-    /// and uses the provided hash builder to hash keys.
-    pub fn with_hasher(capacity: NonZeroUsize, hash_builder: S) -> Lru<K, V, S> {
-        Self {
-            lookup: HashMap::with_hasher(hash_builder),
-            storage: FixedSizeList::new(capacity.get()),
-        }
-    }
-}
-
-impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
-    /// Returns a mutable reference to the value of the key in the cache or `None` if it is not present in the cache.
-    ///
-    /// Moves the key to the head of the LRU list if it exists.
-    pub fn get_mut<Q>(&mut self, key: &Q) -> Option<&mut V>
-    where
-        K: Borrow<Q>,
-        Q: Hash + Eq + ?Sized,
-    {
-        let idx = *self.lookup.get(key)?;
-        self.storage.move_front(idx).map(|node| &mut node.value)
-    }
-}
-
-impl<K: Clone + Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
-    pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> {
-        match self.lookup.entry(key) {
-            Entry::Occupied(occ) => {
-                // It's fine to unwrap here because:
-                // * the entry already exists
-                let node = self.storage.move_front(*occ.get()).unwrap();
-                let old_value = mem::replace(&mut node.value, value);
-                let old_key = occ.replace_key();
-                Some((old_key, old_value))
-            }
-            Entry::Vacant(vac) => {
-                let key = vac.key().clone();
-                if self.storage.is_full() {
-                    // It's fine to unwrap here because:
-                    // * the cache capacity is non zero
-                    // * the cache is full
-                    let idx = self.storage.back_idx();
-                    let node = self.storage.move_front(idx).unwrap();
-                    let LruNode { key, value } = mem::replace(node, LruNode { key, value });
-                    vac.insert(idx);
-                    self.lookup.remove(&key);
-                    Some((key, value))
-                } else {
-                    // It's fine to unwrap here because:
-                    // * the cache capacity is non zero
-                    // * the cache is not full
-                    let (idx, _) = self.storage.push_front(LruNode { key, value }).unwrap();
-                    vac.insert(idx);
-                    None
-                }
-            }
-        }
-    }
-}
-
-impl<K, V, S> IntoIterator for Lru<K, V, S> {
-    type Item = (K, V);
-    type IntoIter = IntoIter<K, V>;
-
-    fn into_iter(self) -> Self::IntoIter {
-        IntoIter { lookup_iter: self.lookup.into_iter(), nodes: self.storage.nodes }
-    }
-}
-
-pub struct IntoIter<K, V> {
-    lookup_iter: hashbrown::hash_map::IntoIter<K, usize>,
-    nodes: Box<[Option<FixedSizeListNode<LruNode<K, V>>>]>,
-}
-
-impl<K, V> Iterator for IntoIter<K, V> {
-    type Item = (K, V);
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let (_key, idx) = self.lookup_iter.next()?;
-        let LruNode { key, value } = self.nodes.get_mut(idx)?.take()?.data;
-        Some((key, value))
-    }
-}
-
-#[derive(Debug)]
-struct LruNode<K, V> {
-    key: K,
-    value: V,
-}
-
-#[derive(Debug)]
-struct FixedSizeListNode<T> {
-    prev: usize,
-    next: usize,
-    data: T,
-}
-
-#[derive(Debug)]
-struct FixedSizeList<T> {
-    nodes: Box<[Option<FixedSizeListNode<T>>]>,
-    /// Also corresponds to the first `None` in the nodes.
-    length: usize,
-    // TODO Also, we probably do not need one of the front and back cursors.
-    front: usize,
-    back: usize,
-}
-
-impl<T> FixedSizeList<T> {
-    fn new(capacity: usize) -> Self {
-        Self {
-            nodes: repeat_with(|| None).take(capacity).collect::<Vec<_>>().into_boxed_slice(),
-            length: 0,
-            front: usize::MAX,
-            back: usize::MAX,
-        }
-    }
-
-    #[inline]
-    fn capacity(&self) -> usize {
-        self.nodes.len()
-    }
-
-    #[inline]
-    fn len(&self) -> usize {
-        self.length
-    }
-
-    #[inline]
-    fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    #[inline]
-    fn is_full(&self) -> bool {
-        self.len() == self.capacity()
-    }
-
-    #[inline]
-    fn back_idx(&self) -> usize {
-        self.back
-    }
-
-    #[inline]
-    fn next(&mut self) -> Option<usize> {
-        if self.is_full() {
-            None
-        } else {
-            let current_free = self.length;
-            self.length += 1;
-            Some(current_free)
-        }
-    }
-
-    #[inline]
-    fn node_mut(&mut self, idx: usize) -> Option<&mut FixedSizeListNode<T>> {
-        self.nodes.get_mut(idx).and_then(|node| node.as_mut())
-    }
-
-    #[inline]
-    fn node_ref(&self, idx: usize) -> Option<&FixedSizeListNode<T>> {
-        self.nodes.get(idx).and_then(|node| node.as_ref())
-    }
-
-    #[inline]
-    fn move_front(&mut self, idx: usize) -> Option<&mut T> {
-        let node = self.nodes.get_mut(idx)?.take()?;
-        if let Some(prev) = self.node_mut(node.prev) {
-            prev.next = node.next;
-        } else {
-            self.front = node.next;
-        }
-        if let Some(next) = self.node_mut(node.next) {
-            next.prev = node.prev;
-        } else {
-            self.back = node.prev;
-        }
-
-        if let Some(front) = self.node_mut(self.front) {
-            front.prev = idx;
-        }
-        if self.node_ref(self.back).is_none() {
-            self.back = idx;
-        }
-
-        let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode {
-            prev: usize::MAX,
-            next: self.front,
-            data: node.data,
-        });
-        self.front = idx;
-        Some(&mut node.data)
-    }
-
-    #[inline]
-    fn push_front(&mut self, data: T) -> Option<(usize, &mut T)> {
-        let idx = self.next()?;
-        if let Some(front) = self.node_mut(self.front) {
-            front.prev = idx;
-        }
-        if self.node_ref(self.back).is_none() {
-            self.back = idx;
-        }
-        let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode {
-            prev: usize::MAX,
-            next: self.front,
-            data,
-        });
-        self.front = idx;
-        Some((idx, &mut node.data))
-    }
-}
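The file removed above implemented an LRU cache as a hash map indexing into a fixed-capacity doubly linked list stored in a boxed slice. For reference, a usage sketch reconstructed from the deleted API; it assumes the removed `Lru` type is in scope and is not part of the commit:

    use std::num::NonZeroUsize;

    fn lru_demo() {
        // Capacity of two entries; `push` returns the evicted (key, value) pair.
        let mut lru = Lru::new(NonZeroUsize::new(2).unwrap());
        assert!(lru.push("a", 1).is_none());
        assert!(lru.push("b", 2).is_none());
        // Touching "a" moves it to the front of the LRU list.
        assert_eq!(lru.get_mut("a"), Some(&mut 1));
        // The cache is full, so pushing "c" evicts the least recently used, "b".
        assert_eq!(lru.push("c", 3), Some(("b", 2)));
    }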
@@ -1,27 +1,25 @@
 mod cache;
+mod documents;
 mod faceted;
-mod lru;
 mod searchable;
 
-use std::cell::RefCell;
-use std::fs::File;
-
 use bumpalo::Bump;
+pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap};
+pub use documents::*;
 pub use faceted::*;
-use grenad::Merger;
 pub use searchable::*;
 
 use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal};
-use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
+use crate::update::GrenadParameters;
 use crate::Result;
 
 pub trait DocidsExtractor {
-    fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
+    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
         grenad_parameters: GrenadParameters,
         document_changes: &DC,
         indexing_context: IndexingContext<'fid, 'indexer, 'index>,
-        extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
-    ) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>;
+        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+    ) -> Result<Vec<BalancedCaches<'extractor>>>;
 }
 
 /// TODO move in permissive json pointer
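The trait change above threads a new `'extractor` lifetime from the `extractor_allocs` borrow into the returned caches. A reduced sketch of that lifetime flow, with hypothetical names and `Vec<Vec<u8>>` standing in for `ThreadLocal<FullySend<Bump>>`:

    // The caches borrow from the allocators, so the compiler guarantees they
    // cannot outlive the `&'extractor mut` borrow passed to run_extraction.
    struct Cache<'extractor>(&'extractor [u8]);

    fn run_extraction<'extractor>(allocs: &'extractor mut Vec<Vec<u8>>) -> Vec<Cache<'extractor>> {
        allocs.iter().map(|alloc| Cache(alloc.as_slice())).collect()
    }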
@@ -1,113 +1,46 @@
 use std::cell::RefCell;
 use std::collections::HashMap;
-use std::fs::File;
 use std::mem::size_of;
-use std::num::NonZero;
 use std::ops::DerefMut as _;
 
 use bumpalo::collections::vec::Vec as BumpVec;
 use bumpalo::Bump;
-use grenad::{Merger, MergerBuilder};
 use heed::RoTxn;
 
 use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
-use crate::update::new::extract::cache::CboCachedSorter;
+use crate::update::new::extract::cache::BalancedCaches;
 use crate::update::new::extract::perm_json_p::contained_in;
 use crate::update::new::indexer::document_changes::{
     for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
-    IndexingContext, RefCellExt, ThreadLocal,
+    IndexingContext, MostlySend, RefCellExt, ThreadLocal,
 };
 use crate::update::new::DocumentChange;
-use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
+use crate::update::GrenadParameters;
 use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
 
 const MAX_COUNTED_WORDS: usize = 30;
 
-pub struct WordDocidsCachedSorters {
-    word_fid_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
-    word_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
-    exact_word_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
-    word_position_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
-    fid_word_count_docids: CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
+pub struct WordDocidsBalancedCaches<'extractor> {
+    word_fid_docids: BalancedCaches<'extractor>,
+    word_docids: BalancedCaches<'extractor>,
+    exact_word_docids: BalancedCaches<'extractor>,
+    word_position_docids: BalancedCaches<'extractor>,
+    fid_word_count_docids: BalancedCaches<'extractor>,
     fid_word_count: HashMap<FieldId, (usize, usize)>,
     current_docid: Option<DocumentId>,
 }
 
-impl WordDocidsCachedSorters {
-    pub fn new(
-        indexer: GrenadParameters,
-        max_memory: Option<usize>,
-        capacity: NonZero<usize>,
-    ) -> Self {
-        let max_memory = max_memory.map(|max_memory| max_memory / 4);
-
-        let word_fid_docids = CboCachedSorter::new(
-            capacity,
-            create_sorter(
-                grenad::SortAlgorithm::Stable,
-                MergeDeladdCboRoaringBitmaps,
-                indexer.chunk_compression_type,
-                indexer.chunk_compression_level,
-                indexer.max_nb_chunks,
-                max_memory,
-                false,
-            ),
-        );
-        let word_docids = CboCachedSorter::new(
-            capacity,
-            create_sorter(
-                grenad::SortAlgorithm::Stable,
-                MergeDeladdCboRoaringBitmaps,
-                indexer.chunk_compression_type,
-                indexer.chunk_compression_level,
-                indexer.max_nb_chunks,
-                max_memory,
-                false,
-            ),
-        );
-        let exact_word_docids = CboCachedSorter::new(
-            capacity,
-            create_sorter(
-                grenad::SortAlgorithm::Stable,
-                MergeDeladdCboRoaringBitmaps,
-                indexer.chunk_compression_type,
-                indexer.chunk_compression_level,
-                indexer.max_nb_chunks,
-                max_memory,
-                false,
-            ),
-        );
-        let word_position_docids = CboCachedSorter::new(
-            capacity,
-            create_sorter(
-                grenad::SortAlgorithm::Stable,
-                MergeDeladdCboRoaringBitmaps,
-                indexer.chunk_compression_type,
-                indexer.chunk_compression_level,
-                indexer.max_nb_chunks,
-                max_memory,
-                false,
-            ),
-        );
-        let fid_word_count_docids = CboCachedSorter::new(
-            capacity,
-            create_sorter(
-                grenad::SortAlgorithm::Stable,
-                MergeDeladdCboRoaringBitmaps,
-                indexer.chunk_compression_type,
-                indexer.chunk_compression_level,
-                indexer.max_nb_chunks,
-                max_memory,
-                false,
-            ),
-        );
+unsafe impl<'extractor> MostlySend for WordDocidsBalancedCaches<'extractor> {}
 
+impl<'extractor> WordDocidsBalancedCaches<'extractor> {
+    /// TODO Make sure to give the same max_memory to all of them, without splitting it
+    pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
         Self {
-            word_fid_docids,
-            word_docids,
-            exact_word_docids,
-            word_position_docids,
-            fid_word_count_docids,
+            word_fid_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
+            word_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
+            exact_word_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
+            word_position_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
+            fid_word_count_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
             fid_word_count: HashMap::new(),
             current_docid: None,
         }
@@ -198,6 +131,7 @@ impl WordDocidsCachedSorters {
             .entry(field_id)
             .and_modify(|(current_count, _new_count)| *current_count += 1)
             .or_insert((1, 0));
 
         self.current_docid = Some(docid);
+
         Ok(())
@@ -227,37 +161,29 @@ impl WordDocidsCachedSorters {
     }
 }
 
-struct WordDocidsMergerBuilders {
-    word_fid_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
-    word_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
-    exact_word_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
-    word_position_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
-    fid_word_count_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
+pub struct WordDocidsCaches<'extractor> {
+    pub word_docids: Vec<BalancedCaches<'extractor>>,
+    pub word_fid_docids: Vec<BalancedCaches<'extractor>>,
+    pub exact_word_docids: Vec<BalancedCaches<'extractor>>,
+    pub word_position_docids: Vec<BalancedCaches<'extractor>>,
+    pub fid_word_count_docids: Vec<BalancedCaches<'extractor>>,
 }
 
-pub struct WordDocidsMergers {
-    pub word_fid_docids: Merger<File, MergeDeladdCboRoaringBitmaps>,
-    pub word_docids: Merger<File, MergeDeladdCboRoaringBitmaps>,
-    pub exact_word_docids: Merger<File, MergeDeladdCboRoaringBitmaps>,
-    pub word_position_docids: Merger<File, MergeDeladdCboRoaringBitmaps>,
-    pub fid_word_count_docids: Merger<File, MergeDeladdCboRoaringBitmaps>,
-}
-
-impl WordDocidsMergerBuilders {
+impl<'extractor> WordDocidsCaches<'extractor> {
     fn new() -> Self {
         Self {
-            word_fid_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
-            word_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
-            exact_word_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
-            word_position_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
-            fid_word_count_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
+            word_docids: Vec::new(),
+            word_fid_docids: Vec::new(),
+            exact_word_docids: Vec::new(),
+            word_position_docids: Vec::new(),
+            fid_word_count_docids: Vec::new(),
         }
     }
 
-    fn add_sorters(&mut self, other: WordDocidsCachedSorters) -> Result<()> {
-        let WordDocidsCachedSorters {
-            word_fid_docids,
+    fn push(&mut self, other: WordDocidsBalancedCaches<'extractor>) -> Result<()> {
+        let WordDocidsBalancedCaches {
             word_docids,
+            word_fid_docids,
             exact_word_docids,
             word_position_docids,
             fid_word_count_docids,
@@ -265,78 +191,37 @@ impl WordDocidsMergerBuilders {
             current_docid: _,
         } = other;
 
-        let mut word_fid_docids_readers = Ok(vec![]);
-        let mut word_docids_readers = Ok(vec![]);
-        let mut exact_word_docids_readers = Ok(vec![]);
-        let mut word_position_docids_readers = Ok(vec![]);
-        let mut fid_word_count_docids_readers = Ok(vec![]);
-        rayon::scope(|s| {
-            s.spawn(|_| {
-                word_fid_docids_readers =
-                    word_fid_docids.into_sorter().and_then(|s| s.into_reader_cursors());
-            });
-            s.spawn(|_| {
-                word_docids_readers =
-                    word_docids.into_sorter().and_then(|s| s.into_reader_cursors());
-            });
-            s.spawn(|_| {
-                exact_word_docids_readers =
-                    exact_word_docids.into_sorter().and_then(|s| s.into_reader_cursors());
-            });
-            s.spawn(|_| {
-                word_position_docids_readers =
-                    word_position_docids.into_sorter().and_then(|s| s.into_reader_cursors());
-            });
-            s.spawn(|_| {
-                fid_word_count_docids_readers =
-                    fid_word_count_docids.into_sorter().and_then(|s| s.into_reader_cursors());
-            });
-        });
-        self.word_fid_docids.extend(word_fid_docids_readers?);
-        self.word_docids.extend(word_docids_readers?);
-        self.exact_word_docids.extend(exact_word_docids_readers?);
-        self.word_position_docids.extend(word_position_docids_readers?);
-        self.fid_word_count_docids.extend(fid_word_count_docids_readers?);
+        self.word_docids.push(word_docids);
+        self.word_fid_docids.push(word_fid_docids);
+        self.exact_word_docids.push(exact_word_docids);
+        self.word_position_docids.push(word_position_docids);
+        self.fid_word_count_docids.push(fid_word_count_docids);
 
         Ok(())
     }
-
-    fn build(self) -> WordDocidsMergers {
-        WordDocidsMergers {
-            word_fid_docids: self.word_fid_docids.build(),
-            word_docids: self.word_docids.build(),
-            exact_word_docids: self.exact_word_docids.build(),
-            word_position_docids: self.word_position_docids.build(),
-            fid_word_count_docids: self.fid_word_count_docids.build(),
-        }
-    }
 }
 
-pub struct WordDocidsExtractorData<'extractor> {
-    tokenizer: &'extractor DocumentTokenizer<'extractor>,
+pub struct WordDocidsExtractorData<'a> {
    tokenizer: &'a DocumentTokenizer<'a>,
     grenad_parameters: GrenadParameters,
-    max_memory: Option<usize>,
+    buckets: usize,
 }
 
-impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
-    type Data = FullySend<RefCell<WordDocidsCachedSorters>>;
+impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
+    type Data = RefCell<Option<WordDocidsBalancedCaches<'extractor>>>;
 
-    fn init_data(
-        &self,
-        _extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
-    ) -> Result<Self::Data> {
-        Ok(FullySend(RefCell::new(WordDocidsCachedSorters::new(
-            self.grenad_parameters,
-            self.max_memory,
-            // TODO use a better value
-            200_000.try_into().unwrap(),
+    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
+        Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
+            self.buckets,
+            self.grenad_parameters.max_memory,
+            extractor_alloc,
        ))))
     }
 
     fn process(
         &self,
         change: DocumentChange,
-        context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
+        context: &DocumentChangeContext<Self::Data>,
     ) -> Result<()> {
         WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)
     }
@@ -345,16 +230,15 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'extractor> {
 pub struct WordDocidsExtractors;
 
 impl WordDocidsExtractors {
-    pub fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
+    pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
         grenad_parameters: GrenadParameters,
         document_changes: &DC,
         indexing_context: IndexingContext<'fid, 'indexer, 'index>,
-        extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
-    ) -> Result<WordDocidsMergers> {
-        let max_memory = grenad_parameters.max_memory_by_thread();
+        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+    ) -> Result<WordDocidsCaches<'extractor>> {
         let index = indexing_context.index;
-
         let rtxn = index.read_txn()?;
+
         let stop_words = index.stop_words(&rtxn)?;
         let allowed_separators = index.allowed_separators(&rtxn)?;
         let allowed_separators: Option<Vec<_>> =
@@ -392,7 +276,7 @@ impl WordDocidsExtractors {
             let extractor = WordDocidsExtractorData {
                 tokenizer: &document_tokenizer,
                 grenad_parameters,
-                max_memory,
+                buckets: rayon::current_num_threads(),
             };
 
             for_each_document_change(
@@ -404,28 +288,23 @@ impl WordDocidsExtractors {
             )?;
         }
 
-        {
-            let span =
-                tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
-            let _entered = span.enter();
-            let mut builder = WordDocidsMergerBuilders::new();
-            for cache in datastore.into_iter().map(|cache| cache.0.into_inner()) {
-                builder.add_sorters(cache)?;
+        let mut merger = WordDocidsCaches::new();
+        for cache in datastore.into_iter().flat_map(RefCell::into_inner) {
+            merger.push(cache)?;
         }
 
-            Ok(builder.build())
-        }
+        Ok(merger)
     }
 
     fn extract_document_change(
-        context: &DocumentChangeContext<FullySend<RefCell<WordDocidsCachedSorters>>>,
+        context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
         document_tokenizer: &DocumentTokenizer,
         document_change: DocumentChange,
     ) -> Result<()> {
         let index = &context.index;
         let rtxn = &context.txn;
-        let mut cached_sorter = context.data.0.borrow_mut_or_yield();
-        let cached_sorter = cached_sorter.deref_mut();
+        let mut cached_sorter_ref = context.data.borrow_mut_or_yield();
+        let cached_sorter = cached_sorter_ref.as_mut().unwrap();
         let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
         let new_fields_ids_map = new_fields_ids_map.deref_mut();
         let doc_alloc = &context.doc_alloc;
@@ -436,8 +315,7 @@ impl WordDocidsExtractors {
         match document_change {
             DocumentChange::Deletion(inner) => {
                 let mut token_fn = |fname: &str, fid, pos, word: &str| {
-                    cached_sorter
-                        .insert_del_u32(
+                    cached_sorter.insert_del_u32(
                         fid,
                         pos,
                         word,
@@ -445,7 +323,6 @@ impl WordDocidsExtractors {
                         inner.docid(),
                         doc_alloc,
                     )
-                        .map_err(crate::Error::from)
                 };
                 document_tokenizer.tokenize_document(
                     inner.current(rtxn, index, context.db_fields_ids_map)?,
@@ -455,8 +332,7 @@ impl WordDocidsExtractors {
             }
             DocumentChange::Update(inner) => {
                 let mut token_fn = |fname: &str, fid, pos, word: &str| {
-                    cached_sorter
-                        .insert_del_u32(
+                    cached_sorter.insert_del_u32(
                         fid,
                         pos,
                         word,
@@ -464,7 +340,6 @@ impl WordDocidsExtractors {
                         inner.docid(),
                         doc_alloc,
                     )
-                        .map_err(crate::Error::from)
                 };
                 document_tokenizer.tokenize_document(
                     inner.current(rtxn, index, context.db_fields_ids_map)?,
@@ -473,8 +348,7 @@ impl WordDocidsExtractors {
                 )?;
 
                 let mut token_fn = |fname: &str, fid, pos, word: &str| {
-                    cached_sorter
-                        .insert_add_u32(
+                    cached_sorter.insert_add_u32(
                         fid,
                         pos,
                         word,
@@ -482,7 +356,6 @@ impl WordDocidsExtractors {
                         inner.docid(),
                         doc_alloc,
                     )
-                        .map_err(crate::Error::from)
                 };
                 document_tokenizer.tokenize_document(
                     inner.new(rtxn, index, context.db_fields_ids_map)?,
@@ -492,8 +365,7 @@ impl WordDocidsExtractors {
             }
             DocumentChange::Insertion(inner) => {
                 let mut token_fn = |fname: &str, fid, pos, word: &str| {
-                    cached_sorter
-                        .insert_add_u32(
+                    cached_sorter.insert_add_u32(
                         fid,
                         pos,
                         word,
@@ -501,7 +373,6 @@ impl WordDocidsExtractors {
                         inner.docid(),
                         doc_alloc,
                     )
-                        .map_err(crate::Error::from)
                 };
                 document_tokenizer.tokenize_document(
                     inner.new(),
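The new `WordDocidsCaches::push` above replaces the rayon merge scope with plain destructure-and-push bookkeeping, so nothing is merged until every thread is done. A minimal sketch of that pattern with two fields (toy types, not the commit's):

    // Per-thread bundle of caches, destructured field by field into the
    // aggregate so no field can be silently forgotten when the struct grows.
    struct Bundle {
        word_docids: u32,
        exact_word_docids: u32,
    }

    #[derive(Default)]
    struct Caches {
        word_docids: Vec<u32>,
        exact_word_docids: Vec<u32>,
    }

    impl Caches {
        fn push(&mut self, other: Bundle) {
            let Bundle { word_docids, exact_word_docids } = other;
            self.word_docids.push(word_docids);
            self.exact_word_docids.push(exact_word_docids);
        }
    }

    fn main() {
        let mut caches = Caches::default();
        caches.push(Bundle { word_docids: 1, exact_word_docids: 2 });
        assert_eq!(caches.word_docids, vec![1]);
    }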
@@ -8,13 +8,13 @@ use super::tokenize_document::DocumentTokenizer;
 use super::SearchableExtractor;
 use crate::proximity::{index_proximity, MAX_DISTANCE};
 use crate::update::new::document::Document;
-use crate::update::new::extract::cache::CboCachedSorter;
-use crate::update::new::indexer::document_changes::{DocumentChangeContext, FullySend, RefCellExt};
+use crate::update::new::extract::cache::BalancedCaches;
+use crate::update::new::indexer::document_changes::{DocumentChangeContext, RefCellExt};
 use crate::update::new::DocumentChange;
-use crate::update::MergeDeladdCboRoaringBitmaps;
 use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
 
 pub struct WordPairProximityDocidsExtractor;
 
 impl SearchableExtractor for WordPairProximityDocidsExtractor {
     fn attributes_to_extract<'a>(
         rtxn: &'a RoTxn,
@@ -28,11 +28,10 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
     }
 
     // This method is reimplemented to count the number of words in the document in each field
-    // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS.
+    // and to store the docids of the documents that have a number of words in a given field
+    // equal to or under than MAX_COUNTED_WORDS.
     fn extract_document_change(
-        context: &DocumentChangeContext<
-            FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
-        >,
+        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
         document_tokenizer: &DocumentTokenizer,
         document_change: DocumentChange,
     ) -> Result<()> {
@@ -48,7 +47,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
         let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
         let new_fields_ids_map = &mut *new_fields_ids_map;
 
-        let mut cached_sorter = context.data.0.borrow_mut_or_yield();
+        let mut cached_sorter = context.data.borrow_mut_or_yield();
         let cached_sorter = &mut *cached_sorter;
 
         // is a vecdequeue, and will be smol, so can stay on the heap for now
@@ -139,7 +138,7 @@ fn build_key<'a>(
 fn word_positions_into_word_pair_proximity(
     word_positions: &mut VecDeque<(Rc<str>, u16)>,
     word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
-) -> Result<()> {
+) {
     let (head_word, head_position) = word_positions.pop_front().unwrap();
     for (word, position) in word_positions.iter() {
         let prox = index_proximity(head_position as u32, *position as u32) as u8;
@@ -147,7 +146,6 @@ fn word_positions_into_word_pair_proximity(
             word_pair_proximity((head_word.clone(), word.clone()), prox);
         }
     }
-    Ok(())
 }
 
 fn process_document_tokens<'doc>(
@@ -163,7 +161,7 @@ fn process_document_tokens<'doc>(
             .front()
             .map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
         {
-            word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
+            word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
         }
 
         // insert the new word.
@@ -173,7 +171,7 @@ fn process_document_tokens<'doc>(
     document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
 
     while !word_positions.is_empty() {
-        word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
+        word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
    }
 
     Ok(())
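`word_positions_into_word_pair_proximity` above drains the head of a sliding window and pairs it with every word still in range; the hunks only drop its now-useless `Result`. A standalone sketch of that windowing step, simplified by using the plain position distance in place of `index_proximity` and omitting the proximity bounds check:

    use std::collections::VecDeque;
    use std::rc::Rc;

    // Pops the head word and emits one (head, other) pair per word left in the
    // window, with their position distance as the proximity.
    fn drain_head(
        word_positions: &mut VecDeque<(Rc<str>, u16)>,
        word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
    ) {
        let (head_word, head_position) = word_positions.pop_front().unwrap();
        for (word, position) in word_positions.iter() {
            let prox = position.abs_diff(head_position) as u8;
            word_pair_proximity((head_word.clone(), word.clone()), prox);
        }
    }

    fn main() {
        let mut window: VecDeque<(Rc<str>, u16)> =
            [("hello".into(), 0), ("beautiful".into(), 1), ("world".into(), 2)].into();
        drain_head(&mut window, &mut |(a, b), prox| println!("{a} {b} -> {prox}"));
    }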
@@ -3,76 +3,60 @@ mod extract_word_pair_proximity_docids;
 mod tokenize_document;
 
 use std::cell::RefCell;
-use std::fs::File;
 use std::marker::PhantomData;
 
 use bumpalo::Bump;
-pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers};
+pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
 pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
-use grenad::Merger;
 use heed::RoTxn;
-use rayon::iter::{ParallelBridge, ParallelIterator};
 use tokenize_document::{tokenizer_builder, DocumentTokenizer};
 
-use super::cache::CboCachedSorter;
+use super::cache::BalancedCaches;
 use super::DocidsExtractor;
 use crate::update::new::indexer::document_changes::{
     for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
     IndexingContext, ThreadLocal,
 };
 use crate::update::new::DocumentChange;
-use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
+use crate::update::GrenadParameters;
 use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
 
-pub struct SearchableExtractorData<'extractor, EX: SearchableExtractor> {
-    tokenizer: &'extractor DocumentTokenizer<'extractor>,
+pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
+    tokenizer: &'a DocumentTokenizer<'a>,
     grenad_parameters: GrenadParameters,
-    max_memory: Option<usize>,
+    buckets: usize,
     _ex: PhantomData<EX>,
 }
 
-impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
-    for SearchableExtractorData<'extractor, EX>
+impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
+    for SearchableExtractorData<'a, EX>
 {
-    type Data = FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>;
+    type Data = RefCell<BalancedCaches<'extractor>>;
 
-    fn init_data(
-        &self,
-        _extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
-    ) -> Result<Self::Data> {
-        Ok(FullySend(RefCell::new(CboCachedSorter::new(
-            // TODO use a better value
-            1_000_000.try_into().unwrap(),
-            create_sorter(
+    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
+        Ok(RefCell::new(BalancedCaches::new_in(
+            self.buckets,
+            self.grenad_parameters.max_memory,
+            extractor_alloc,
+        )))
			||||||
                grenad::SortAlgorithm::Stable,
 | 
					 | 
				
			||||||
                MergeDeladdCboRoaringBitmaps,
 | 
					 | 
				
			||||||
                self.grenad_parameters.chunk_compression_type,
 | 
					 | 
				
			||||||
                self.grenad_parameters.chunk_compression_level,
 | 
					 | 
				
			||||||
                self.grenad_parameters.max_nb_chunks,
 | 
					 | 
				
			||||||
                self.max_memory,
 | 
					 | 
				
			||||||
                false,
 | 
					 | 
				
			||||||
            ),
 | 
					 | 
				
			||||||
        ))))
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    fn process(
 | 
					    fn process(
 | 
				
			||||||
        &self,
 | 
					        &self,
 | 
				
			||||||
        change: DocumentChange,
 | 
					        change: DocumentChange,
 | 
				
			||||||
        context: &crate::update::new::indexer::document_changes::DocumentChangeContext<Self::Data>,
 | 
					        context: &DocumentChangeContext<Self::Data>,
 | 
				
			||||||
    ) -> Result<()> {
 | 
					    ) -> Result<()> {
 | 
				
			||||||
        EX::extract_document_change(context, self.tokenizer, change)
 | 
					        EX::extract_document_change(context, self.tokenizer, change)
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
pub trait SearchableExtractor: Sized + Sync {
 | 
					pub trait SearchableExtractor: Sized + Sync {
 | 
				
			||||||
    fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
 | 
					    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
 | 
				
			||||||
        grenad_parameters: GrenadParameters,
 | 
					        grenad_parameters: GrenadParameters,
 | 
				
			||||||
        document_changes: &DC,
 | 
					        document_changes: &DC,
 | 
				
			||||||
        indexing_context: IndexingContext<'fid, 'indexer, 'index>,
 | 
					        indexing_context: IndexingContext<'fid, 'indexer, 'index>,
 | 
				
			||||||
        extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
 | 
					        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
 | 
				
			||||||
    ) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
 | 
					    ) -> Result<Vec<BalancedCaches<'extractor>>> {
 | 
				
			||||||
        let max_memory = grenad_parameters.max_memory_by_thread();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        let rtxn = indexing_context.index.read_txn()?;
 | 
					        let rtxn = indexing_context.index.read_txn()?;
 | 
				
			||||||
        let stop_words = indexing_context.index.stop_words(&rtxn)?;
 | 
					        let stop_words = indexing_context.index.stop_words(&rtxn)?;
 | 
				
			||||||
        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
 | 
					        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
 | 
				
			||||||
@@ -104,7 +88,7 @@ pub trait SearchableExtractor: Sized + Sync {
 | 
				
			|||||||
        let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
 | 
					        let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
 | 
				
			||||||
            tokenizer: &document_tokenizer,
 | 
					            tokenizer: &document_tokenizer,
 | 
				
			||||||
            grenad_parameters,
 | 
					            grenad_parameters,
 | 
				
			||||||
            max_memory,
 | 
					            buckets: rayon::current_num_threads(),
 | 
				
			||||||
            _ex: PhantomData,
 | 
					            _ex: PhantomData,
 | 
				
			||||||
        };
 | 
					        };
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -122,37 +106,12 @@ pub trait SearchableExtractor: Sized + Sync {
 | 
				
			|||||||
                &datastore,
 | 
					                &datastore,
 | 
				
			||||||
            )?;
 | 
					            )?;
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
        {
 | 
					 | 
				
			||||||
            let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
 | 
					 | 
				
			||||||
            let span =
 | 
					 | 
				
			||||||
                tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
 | 
					 | 
				
			||||||
            let _entered = span.enter();
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
            let readers: Vec<_> = datastore
 | 
					        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
 | 
				
			||||||
                .into_iter()
 | 
					 | 
				
			||||||
                .par_bridge()
 | 
					 | 
				
			||||||
                .map(|cache_entry| {
 | 
					 | 
				
			||||||
                    let cached_sorter: FullySend<
 | 
					 | 
				
			||||||
                        RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>,
 | 
					 | 
				
			||||||
                    > = cache_entry;
 | 
					 | 
				
			||||||
                    let cached_sorter = cached_sorter.0.into_inner();
 | 
					 | 
				
			||||||
                    let sorter = cached_sorter.into_sorter()?;
 | 
					 | 
				
			||||||
                    sorter.into_reader_cursors()
 | 
					 | 
				
			||||||
                })
 | 
					 | 
				
			||||||
                .collect();
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            for reader in readers {
 | 
					 | 
				
			||||||
                builder.extend(reader?);
 | 
					 | 
				
			||||||
            }
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
            Ok(builder.build())
 | 
					 | 
				
			||||||
        }
 | 
					 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    fn extract_document_change(
 | 
					    fn extract_document_change(
 | 
				
			||||||
        context: &DocumentChangeContext<
 | 
					        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
 | 
				
			||||||
            FullySend<RefCell<CboCachedSorter<MergeDeladdCboRoaringBitmaps>>>,
 | 
					 | 
				
			||||||
        >,
 | 
					 | 
				
			||||||
        document_tokenizer: &DocumentTokenizer,
 | 
					        document_tokenizer: &DocumentTokenizer,
 | 
				
			||||||
        document_change: DocumentChange,
 | 
					        document_change: DocumentChange,
 | 
				
			||||||
    ) -> Result<()>;
 | 
					    ) -> Result<()>;
 | 
				
			||||||
@@ -164,12 +123,12 @@ pub trait SearchableExtractor: Sized + Sync {
 | 
				
			|||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
impl<T: SearchableExtractor> DocidsExtractor for T {
 | 
					impl<T: SearchableExtractor> DocidsExtractor for T {
 | 
				
			||||||
    fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>(
 | 
					    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
 | 
				
			||||||
        grenad_parameters: GrenadParameters,
 | 
					        grenad_parameters: GrenadParameters,
 | 
				
			||||||
        document_changes: &DC,
 | 
					        document_changes: &DC,
 | 
				
			||||||
        indexing_context: IndexingContext<'fid, 'indexer, 'index>,
 | 
					        indexing_context: IndexingContext<'fid, 'indexer, 'index>,
 | 
				
			||||||
        extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
 | 
					        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
 | 
				
			||||||
    ) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> {
 | 
					    ) -> Result<Vec<BalancedCaches<'extractor>>> {
 | 
				
			||||||
        Self::run_extraction(
 | 
					        Self::run_extraction(
 | 
				
			||||||
            grenad_parameters,
 | 
					            grenad_parameters,
 | 
				
			||||||
            document_changes,
 | 
					            document_changes,
 | 
				
			||||||
 
 | 
				
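The mod.rs hunks above replace the grenad sorter-plus-merger pipeline with per-thread BalancedCaches that are handed back as a plain Vec for a later merge step. A rough sketch of that per-thread datastore pattern, using the thread_local and rayon crates as stand-ins for milli's own ThreadLocal wrapper and caches (assumed dependencies, not the real types):

    // Assumed dependencies: rayon = "1", thread_local = "1".
    use std::cell::RefCell;
    use std::collections::HashMap;

    use rayon::prelude::*;
    use thread_local::ThreadLocal;

    type Cache = HashMap<String, u64>;

    fn main() {
        let datastore: ThreadLocal<RefCell<Cache>> = ThreadLocal::new();

        // Each worker lazily creates its own cache (init_data) and then fills
        // it with no cross-thread synchronization at all (process).
        let words = vec!["one", "two", "one", "three", "two", "one"];
        words.par_iter().for_each(|word| {
            let cache = datastore.get_or(|| RefCell::new(Cache::new()));
            *cache.borrow_mut().entry(word.to_string()).or_insert(0) += 1;
        });

        // Hand the per-thread caches back as a Vec, mirroring the new
        // Result<Vec<BalancedCaches>> return type: merging is deferred to the
        // caller instead of building a grenad Merger inside the extractor.
        let caches: Vec<Cache> = datastore.into_iter().map(RefCell::into_inner).collect();
        let mut merged = Cache::new();
        for cache in caches {
            for (word, count) in cache {
                *merged.entry(word).or_insert(0) += count;
            }
        }
        println!("{merged:?}");
    }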
@@ -171,7 +171,6 @@ mod test {
     use bumpalo::Bump;
     use charabia::TokenizerBuilder;
     use meili_snap::snapshot;
-
     use raw_collections::RawMap;
     use serde_json::json;
     use serde_json::value::RawValue;
@@ -1,31 +1,24 @@
 use std::collections::{BTreeSet, HashMap};
 
-use charabia::{normalizer::NormalizerOption, Language, Normalize, StrDetection, Token};
+use charabia::normalizer::NormalizerOption;
+use charabia::{Language, Normalize, StrDetection, Token};
 use grenad::Sorter;
-use heed::{
-    types::{Bytes, SerdeJson},
-    BytesDecode, BytesEncode, RoTxn,
-};
+use heed::types::{Bytes, SerdeJson};
+use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn};
 
+use super::channel::FacetSearchableSender;
+use super::extract::FacetKind;
+use super::fst_merger_builder::FstMergerBuilder;
+use super::KvReaderDelAdd;
+use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
+use crate::heed_codec::StrRefCodec;
+use crate::update::del_add::{DelAdd, KvWriterDelAdd};
+use crate::update::{create_sorter, MergeDeladdBtreesetString};
 use crate::{
-    heed_codec::{
-        facet::{FacetGroupKey, FacetGroupKeyCodec},
-        StrRefCodec,
-    },
-    update::{
-        create_sorter,
-        del_add::{DelAdd, KvWriterDelAdd},
-        MergeDeladdBtreesetString,
-    },
     BEU16StrCodec, FieldId, GlobalFieldsIdsMap, Index, LocalizedAttributesRule, Result,
     MAX_FACET_VALUE_LENGTH,
 };
 
-use super::{
-    channel::FacetSearchableSender, extract::FacetKind, fst_merger_builder::FstMergerBuilder,
-    KvReaderDelAdd,
-};
-
 pub struct FacetSearchBuilder<'indexer> {
     registered_facets: HashMap<FieldId, usize>,
     normalized_facet_string_docids_sorter: Sorter<MergeDeladdBtreesetString>,
@@ -49,6 +42,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
             None,
             None,
             Some(0),
+            false,
         );
 
         Self {
@@ -84,7 +78,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
         }
 
         let locales = self.locales(field_id);
-        let hyper_normalized_value = normalize_facet_string(left_bound, locales.as_deref());
+        let hyper_normalized_value = normalize_facet_string(left_bound, locales);
 
         let set = BTreeSet::from_iter(std::iter::once(left_bound));
 
@@ -103,7 +97,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
     }
 
     fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> {
-        if self.localized_field_ids.get(&field_id).is_none() {
+        if !self.localized_field_ids.contains_key(&field_id) {
            let Some(field_name) = self.global_fields_ids_map.name(field_id) else {
                 unreachable!("Field id {} not found in the global fields ids map", field_id);
             };
@@ -124,7 +118,8 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
     pub fn merge_and_send(
         self,
         index: &Index,
-        rtxn: &RoTxn<'_>,
+        wtxn: &mut RwTxn,
+        rtxn: &RoTxn,
         sender: FacetSearchableSender,
     ) -> Result<()> {
         let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?;
@@ -139,13 +134,14 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
         let mut fst_merger_builder: Option<FstMergerBuilder> = None;
         while let Some((key, deladd)) = merger_iter.next()? {
             let (field_id, normalized_facet_string) =
-                BEU16StrCodec::bytes_decode(&key).map_err(heed::Error::Encoding)?;
+                BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?;
 
             if current_field_id != Some(field_id) {
                 if let Some(fst_merger_builder) = fst_merger_builder {
                     // send the previous fst to the channel
                     let mmap = fst_merger_builder.build(&mut callback)?;
-                    sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap();
+                    // sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap();
+                    todo!("What to do");
                 }
 
                 println!("getting fst for field_id: {}", field_id);
@@ -198,7 +194,8 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
 
         if let (Some(field_id), Some(fst_merger_builder)) = (current_field_id, fst_merger_builder) {
             let mmap = fst_merger_builder.build(&mut callback)?;
-            sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap();
+            // sender.write_fst(&field_id.to_be_bytes(), mmap).unwrap();
+            todo!("What to do");
         }
 
         Ok(())
@@ -209,7 +206,7 @@ fn callback(_bytes: &[u8], _deladd: DelAdd, _is_modified: bool) -> Result<()> {
     Ok(())
 }
 
-fn merge_btreesets<'a>(
+fn merge_btreesets(
     current: Option<&[u8]>,
     del: Option<&[u8]>,
     add: Option<&[u8]>,
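merge_and_send above walks entries sorted by field id and flushes an FstMergerBuilder whenever the field id changes; the commented-out write_fst plus todo!("What to do") mark the send path as unresolved work in progress. A small sketch of that flush-on-boundary loop, with plain strings standing in for FSTs:

    // A flush-on-boundary loop over a stream sorted by field id, with plain
    // string lists standing in for the FSTs the real builder produces.
    fn flush_per_field(entries: &[(u16, &str)], mut flush: impl FnMut(u16, &[&str])) {
        let mut current_field_id: Option<u16> = None;
        let mut buffer: Vec<&str> = Vec::new();
        for &(field_id, value) in entries {
            if current_field_id != Some(field_id) {
                // Field boundary: emit the accumulated builder for the
                // previous field, as merge_and_send does with its fst above.
                if let Some(previous) = current_field_id {
                    flush(previous, &buffer);
                }
                buffer.clear();
                current_field_id = Some(field_id);
            }
            buffer.push(value);
        }
        // The last field never sees a boundary, hence the trailing flush that
        // mirrors the `if let (Some(field_id), Some(fst_merger_builder))` block.
        if let Some(previous) = current_field_id {
            flush(previous, &buffer);
        }
    }

    fn main() {
        let entries = [(0u16, "blue"), (0, "red"), (1, "large"), (1, "small")];
        flush_per_field(&entries, |field_id, values| {
            println!("field {field_id}: {values:?}");
        });
    }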
@@ -49,7 +49,7 @@ impl<'de, 'p, 'indexer: 'de, Mapper: MutFieldIdMapper> Visitor<'de>
                 visitor: MutFieldIdMapVisitor(self.fields_ids_map),
             })?
         {
-            let Some(fid) = fid else {
+            let Some(_fid) = fid else {
                 return Ok(Err(crate::UserError::AttributeLimitReached));
             };
             self.fields_ids_map = fields_ids_map;
@@ -3,7 +3,6 @@ use std::sync::{Arc, RwLock};
 
 use bumpalo::Bump;
 use heed::RoTxn;
-use raw_collections::alloc::RefBump;
 use rayon::iter::IndexedParallelIterator;
 
 use super::super::document_change::DocumentChange;
@@ -104,6 +103,10 @@ pub struct FullySend<T>(pub T);
 // SAFETY: a type **fully** send is always mostly send as well.
 unsafe impl<T> MostlySend for FullySend<T> where T: Send {}
 
+unsafe impl<T> MostlySend for RefCell<T> where T: MostlySend {}
+
+unsafe impl<T> MostlySend for Option<T> where T: MostlySend {}
+
 impl<T> FullySend<T> {
     pub fn into(self) -> T {
         self.0
@@ -256,7 +259,7 @@ pub struct DocumentChangeContext<
     pub doc_alloc: Bump,
 
     /// Data allocated in this allocator is not cleared between each call to `process`, unless the data spills.
-    pub extractor_alloc: RefBump<'extractor>,
+    pub extractor_alloc: &'extractor Bump,
 
     /// Pool of doc allocators, used to retrieve the doc allocator we provided for the documents
     doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>,
@@ -279,14 +282,14 @@ impl<
         index: &'indexer Index,
         db_fields_ids_map: &'indexer FieldsIdsMap,
         new_fields_ids_map: &'fid RwLock<FieldsIdsMap>,
-        extractor_allocs: &'extractor ThreadLocal<FullySend<RefCell<Bump>>>,
+        extractor_allocs: &'extractor ThreadLocal<FullySend<Bump>>,
         doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>,
         datastore: &'data ThreadLocal<T>,
         fields_ids_map_store: &'doc ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
         init_data: F,
     ) -> Result<Self>
     where
-        F: FnOnce(RefBump<'extractor>) -> Result<T>,
+        F: FnOnce(&'extractor Bump) -> Result<T>,
     {
         let doc_alloc =
             doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024))));
@@ -297,9 +300,7 @@ impl<
         let fields_ids_map = &fields_ids_map.0;
         let extractor_alloc = extractor_allocs.get_or_default();
 
-        let extractor_alloc = RefBump::new(extractor_alloc.0.borrow_or_yield());
-
-        let data = datastore.get_or_try(|| init_data(RefBump::clone(&extractor_alloc)))?;
+        let data = datastore.get_or_try(move || init_data(&extractor_alloc.0))?;
 
         let txn = index.read_txn()?;
         Ok(DocumentChangeContext {
@@ -308,7 +309,7 @@ impl<
             db_fields_ids_map,
             new_fields_ids_map: fields_ids_map,
             doc_alloc,
-            extractor_alloc,
+            extractor_alloc: &extractor_alloc.0,
             data,
             doc_allocs,
         })
@@ -319,7 +320,7 @@ impl<
 pub trait Extractor<'extractor>: Sync {
     type Data: MostlySend;
 
-    fn init_data<'doc>(&'doc self, extractor_alloc: RefBump<'extractor>) -> Result<Self::Data>;
+    fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> Result<Self::Data>;
 
     fn process<'doc>(
         &'doc self,
@@ -375,15 +376,17 @@ pub fn for_each_document_change<
         doc_allocs,
         fields_ids_map_store,
     }: IndexingContext<'fid, 'indexer, 'index>,
-    extractor_allocs: &'extractor mut ThreadLocal<FullySend<RefCell<Bump>>>,
+    extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
     datastore: &'data ThreadLocal<EX::Data>,
 ) -> Result<()>
 where
     EX: Extractor<'extractor>,
 {
+    eprintln!("We are resetting the extractor allocators");
     // Clean up and reuse the extractor allocs
     for extractor_alloc in extractor_allocs.iter_mut() {
-        extractor_alloc.0.get_mut().reset();
+        eprintln!("\tWith {} bytes resetted", extractor_alloc.0.allocated_bytes());
+        extractor_alloc.0.reset();
     }
 
     let pi = document_changes.iter();
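The two added unsafe impls propagate the MostlySend marker through RefCell and Option, which is what lets extractor data sit in thread-local RefCells once the FullySend<RefCell<...>> wrappers go away. A compile-checked sketch of that marker-trait induction, under the assumed contract stated in the comment:

    // Assumed contract: MostlySend promises a value may cross threads as long
    // as it is only *used* from one thread at a time; that is weaker than Send.
    unsafe trait MostlySend {}

    struct FullySend<T>(pub T);

    // SAFETY: a fully Send type trivially satisfies the weaker contract.
    unsafe impl<T: Send> MostlySend for FullySend<T> {}
    // SAFETY: RefCell adds no thread affinity; single-threaded use is exactly
    // what the MostlySend contract already demands of T.
    unsafe impl<T: MostlySend> MostlySend for std::cell::RefCell<T> {}
    // SAFETY: Option stores T by value and adds no sharing.
    unsafe impl<T: MostlySend> MostlySend for Option<T> {}

    fn assert_mostly_send<T: MostlySend>() {}

    fn main() {
        // Compiles by induction through the three impls above.
        assert_mostly_send::<std::cell::RefCell<Option<FullySend<Vec<u8>>>>>();
    }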
@@ -80,7 +80,6 @@ mod test {
     use std::sync::RwLock;
 
     use bumpalo::Bump;
-    use raw_collections::alloc::RefBump;
 
     use crate::index::tests::TempIndex;
     use crate::update::new::indexer::document_changes::{
@@ -95,11 +94,7 @@ mod test {
     fn test_deletions() {
         struct DeletionWithData<'extractor> {
             deleted: RefCell<
-                hashbrown::HashSet<
-                    DocumentId,
-                    hashbrown::hash_map::DefaultHashBuilder,
-                    RefBump<'extractor>,
-                >,
+                hashbrown::HashSet<DocumentId, hashbrown::DefaultHashBuilder, &'extractor Bump>,
             >,
         }
 
@@ -110,10 +105,7 @@ mod test {
         impl<'extractor> Extractor<'extractor> for TrackDeletion<'extractor> {
             type Data = DeletionWithData<'extractor>;
 
-            fn init_data(
-                &self,
-                extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
-            ) -> crate::Result<Self::Data> {
+            fn init_data(&self, extractor_alloc: &'extractor Bump) -> crate::Result<Self::Data> {
                 let deleted = RefCell::new(hashbrown::HashSet::new_in(extractor_alloc));
                 Ok(DeletionWithData { deleted })
             }
@@ -173,8 +165,7 @@ mod test {
                 println!("deleted by {index}: {:?}", data.deleted.borrow());
             }
             for alloc in extractor_allocs.iter_mut() {
-                let alloc = &mut alloc.0;
-                alloc.get_mut().reset();
+                alloc.0.reset();
             }
         }
     }
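The test now allocates its HashSet directly in the shared &'extractor Bump instead of going through RefBump. A standalone sketch of such an arena-backed set:

    // Assumed Cargo features: bumpalo = { version = "3", features =
    // ["allocator-api2"] } and hashbrown = "0.14", so that &Bump implements
    // the allocator trait hashbrown's *_in constructors expect.
    use bumpalo::Bump;

    fn main() {
        let bump = Bump::new();

        // The set's buckets live in the arena, so resetting or dropping the
        // arena reclaims every tracked docid at once, with no per-entry frees.
        let mut deleted: hashbrown::HashSet<u32, hashbrown::DefaultHashBuilder, &Bump> =
            hashbrown::HashSet::new_in(&bump);
        deleted.insert(1);
        deleted.insert(2);
        assert!(deleted.contains(&1));
        println!("{} docids tracked in {} arena bytes", deleted.len(), bump.allocated_bytes());
    }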
@@ -1,26 +1,24 @@
-use std::cell::RefCell;
+use std::cmp::Ordering;
 use std::sync::RwLock;
 use std::thread::{self, Builder};
 
 use big_s::S;
-use bumpalo::Bump;
 use document_changes::{
-    for_each_document_change, DocumentChanges, Extractor, FullySend, IndexingContext, RefCellExt,
-    ThreadLocal,
+    for_each_document_change, DocumentChanges, FullySend, IndexingContext, ThreadLocal,
 };
 pub use document_deletion::DocumentDeletion;
 pub use document_operation::DocumentOperation;
+use heed::types::{Bytes, DecodeIgnore, Str};
 use heed::{RoTxn, RwTxn};
+use itertools::{merge_join_by, EitherOrBoth};
 pub use partial_dump::PartialDump;
 use rayon::ThreadPool;
 use time::OffsetDateTime;
 pub use update_by_function::UpdateByFunction;
 
 use super::channel::*;
-use super::document::write_to_obkv;
-use super::document_change::DocumentChange;
 use super::extract::*;
-use super::merger::{merge_grenad_entries, FacetFieldIdsDelta};
+use super::merger::{FacetDatabases, FacetFieldIdsDelta};
 use super::word_fst_builder::PrefixDelta;
 use super::words_prefix_docids::{
     compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids,
@@ -28,75 +26,23 @@ use super::words_prefix_docids::{
 use super::{StdResult, TopLevelMap};
 use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
 use crate::facet::FacetType;
+use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
 use crate::proximity::ProximityPrecision;
-use crate::update::new::channel::ExtractorSender;
+use crate::update::del_add::DelAdd;
+use crate::update::new::word_fst_builder::{PrefixData, WordFstBuilder};
 use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
+use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids};
 use crate::update::settings::InnerIndexSettings;
 use crate::update::{FacetsUpdateBulk, GrenadParameters};
-use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
+use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
 
-pub(crate) mod de;
+pub mod de;
 pub mod document_changes;
 mod document_deletion;
 mod document_operation;
 mod partial_dump;
 mod update_by_function;
 
-struct DocumentExtractor<'a> {
-    document_sender: &'a DocumentSender<'a>,
-}
-
-impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> {
-    type Data = FullySend<()>;
-
-    fn init_data(
-        &self,
-        _extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
-    ) -> Result<Self::Data> {
-        Ok(FullySend(()))
-    }
-
-    fn process(
-        &self,
-        change: DocumentChange,
-        context: &document_changes::DocumentChangeContext<Self::Data>,
-    ) -> Result<()> {
-        let mut document_buffer = Vec::new();
-
-        let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield();
-        let new_fields_ids_map = &*new_fields_ids_map;
-        let new_fields_ids_map = new_fields_ids_map.local_map();
-
-        let external_docid = change.external_docid().to_owned();
-
-        // document but we need to create a function that collects and compresses documents.
-        match change {
-            DocumentChange::Deletion(deletion) => {
-                let docid = deletion.docid();
-                self.document_sender.delete(docid, external_docid).unwrap();
-            }
-            /// TODO: change NONE by SOME(vector) when implemented
-            DocumentChange::Update(update) => {
-                let docid = update.docid();
-                let content =
-                    update.new(&context.txn, context.index, &context.db_fields_ids_map)?;
-                let content =
-                    write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
-                self.document_sender.insert(docid, external_docid, content.boxed()).unwrap();
-            }
-            DocumentChange::Insertion(insertion) => {
-                let docid = insertion.docid();
-                let content = insertion.new();
-                let content =
-                    write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
-                self.document_sender.insert(docid, external_docid, content.boxed()).unwrap();
-                // extracted_dictionary_sender.send(self, dictionary: &[u8]);
-            }
-        }
-        Ok(())
-    }
-}
-
 /// This is the main function of this crate.
 ///
 /// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
@@ -114,12 +60,11 @@ pub fn index<'pl, 'indexer, 'index, DC>(
 where
     DC: DocumentChanges<'pl>,
 {
-    let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
-    // This channel acts as a rendezvous point to ensure that we are one task ahead
-    let (extractor_sender, merger_receiver) = extractors_merger_channels(4);
-
+    // TODO find a better channel limit
+    let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000);
     let new_fields_ids_map = RwLock::new(new_fields_ids_map);
 
+    let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
     let fields_ids_map_store = ThreadLocal::with_capacity(pool.current_num_threads());
     let mut extractor_allocs = ThreadLocal::with_capacity(pool.current_num_threads());
     let doc_allocs = ThreadLocal::with_capacity(pool.current_num_threads());
@@ -132,42 +77,48 @@ where
         fields_ids_map_store: &fields_ids_map_store,
     };
 
-    thread::scope(|s| {
+    thread::scope(|s| -> crate::Result<_> {
         let indexer_span = tracing::Span::current();
         // TODO manage the errors correctly
-        let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
+        let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
             pool.in_place_scope(|_s| {
                 let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract");
                 let _entered = span.enter();
 
                 // document but we need to create a function that collects and compresses documents.
-                    let document_sender = extractor_sender.document_sender();
-                    let document_extractor = DocumentExtractor { document_sender: &document_sender};
+                let rtxn = index.read_txn().unwrap();
+                let document_sender = extractor_sender.documents();
+                let document_extractor = DocumentsExtractor::new(&document_sender);
                 let datastore = ThreadLocal::with_capacity(pool.current_num_threads());
                 for_each_document_change(document_changes, &document_extractor, indexing_context, &mut extractor_allocs, &datastore)?;
 
-                    document_sender.finish().unwrap();
+                let mut documents_ids = index.documents_ids(&rtxn)?;
+                let delta_documents_ids = datastore.into_iter().map(|FullySend(d)| d.into_inner()).reduce(DelAddRoaringBitmap::merge).unwrap_or_default();
+                delta_documents_ids.apply_to(&mut documents_ids);
+                extractor_sender.send_documents_ids(documents_ids).unwrap();
+
+                // document_sender.finish().unwrap();
 
                 const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
-                    let max_memory = TEN_GIB / dbg!(rayon::current_num_threads());
+                let current_num_threads = rayon::current_num_threads();
+                let max_memory = TEN_GIB / current_num_threads;
+                eprintln!("A maximum of {max_memory} bytes will be used for each of the {current_num_threads} threads");
                 let grenad_parameters = GrenadParameters {
                     max_memory: Some(max_memory),
                     ..GrenadParameters::default()
                 };
 
+                let facet_field_ids_delta;
+
                 {
                     let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
                     let _entered = span.enter();
-                        extract_and_send_docids::<
-                            _,
-                            FacetedDocidsExtractor,
-                            FacetDocids,
-                        >(
-                            grenad_parameters,
-                            document_changes,
-                            indexing_context,
-                            &mut extractor_allocs,
-                            &extractor_sender,
+                    facet_field_ids_delta = merge_and_send_facet_docids(
+                        global_fields_ids_map,
+                        FacetedDocidsExtractor::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?,
+                        FacetDatabases::new(index),
+                        index,
+                        extractor_sender.facet_docids(),
                     )?;
                 }
 
@@ -175,37 +126,92 @@ where
 | 
				
			|||||||
                    let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
 | 
					                    let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
 | 
				
			||||||
                    let _entered = span.enter();
 | 
					                    let _entered = span.enter();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
                        let WordDocidsMergers {
 | 
					                    let WordDocidsCaches {
 | 
				
			||||||
                            word_fid_docids,
 | 
					 | 
				
			||||||
                        word_docids,
 | 
					                        word_docids,
 | 
				
			||||||
 | 
					                        word_fid_docids,
 | 
				
			||||||
                        exact_word_docids,
 | 
					                        exact_word_docids,
 | 
				
			||||||
                        word_position_docids,
 | 
					                        word_position_docids,
 | 
				
			||||||
                        fid_word_count_docids,
 | 
					                        fid_word_count_docids,
 | 
				
			||||||
                    } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?;
 | 
					                    } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?;
 | 
				
			||||||
                        extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
 | 
					
 | 
				
			||||||
                        extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
 | 
					                    // TODO Word Docids Merger
 | 
				
			||||||
                        extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
 | 
					                    // extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
 | 
				
			||||||
                        extractor_sender.send_searchable::<WordPositionDocids>(word_position_docids).unwrap();
 | 
					                    {
 | 
				
			||||||
                        extractor_sender.send_searchable::<FidWordCountDocids>(fid_word_count_docids).unwrap();
 | 
					                        let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
 | 
				
			||||||
 | 
					                        let _entered = span.enter();
 | 
				
			||||||
 | 
					                        merge_and_send_docids(
 | 
				
			||||||
 | 
					                            word_docids,
 | 
				
			||||||
 | 
					                            index.word_docids.remap_types(),
 | 
				
			||||||
 | 
					                            index,
 | 
				
			||||||
 | 
					                            extractor_sender.docids::<WordDocids>(),
 | 
				
			||||||
 | 
					                        )?;
 | 
				
			||||||
 | 
					                    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					                    // Word Fid Docids Merging
 | 
				
			||||||
 | 
					                    // extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
 | 
				
			||||||
 | 
					                    {
 | 
				
			||||||
 | 
+                        let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
+                        let _entered = span.enter();
+                        merge_and_send_docids(
+                            word_fid_docids,
+                            index.word_fid_docids.remap_types(),
+                            index,
+                            extractor_sender.docids::<WordFidDocids>()
+                        )?;
+                    }
+
+                    // Exact Word Docids Merging
+                    // extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
+                    {
+                        let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
+                        let _entered = span.enter();
+                        merge_and_send_docids(
+                            exact_word_docids,
+                            index.exact_word_docids.remap_types(),
+                            index,
+                            extractor_sender.docids::<ExactWordDocids>(),
+                        )?;
+                    }
+
+                    // Word Position Docids Merging
+                    // extractor_sender.send_searchable::<WordPositionDocids>(word_position_docids).unwrap();
+                    {
+                        let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
+                        let _entered = span.enter();
+                        merge_and_send_docids(
+                            word_position_docids,
+                            index.word_position_docids.remap_types(),
+                            index,
+                            extractor_sender.docids::<WordPositionDocids>(),
+                        )?;
+                    }
+
+                    // Fid Word Count Docids Merging
+                    // extractor_sender.send_searchable::<FidWordCountDocids>(fid_word_count_docids).unwrap();
+                    {
+                        let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
+                        let _entered = span.enter();
+                        merge_and_send_docids(
+                            fid_word_count_docids,
+                            index.field_id_word_count_docids.remap_types(),
+                            index,
+                            extractor_sender.docids::<FidWordCountDocids>(),
+                        )?;
+                    }
                 }
 
                 // run the proximity extraction only if the precision is by word
                 // this works only if the settings didn't change during this transaction.
-                    let rtxn = index.read_txn().unwrap();
                 let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default();
                 if proximity_precision == ProximityPrecision::ByWord {
                     let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
                     let _entered = span.enter();
-                        extract_and_send_docids::<
-                            _,
-                            WordPairProximityDocidsExtractor,
-                            WordPairProximityDocids,
-                        >(
-                            grenad_parameters,
-                            document_changes,
-                            indexing_context,
-                            &mut extractor_allocs,
-                            &extractor_sender,
-                    )?;
+                    let caches = <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?;
+                    merge_and_send_docids(
+                        caches,
+                        index.word_pair_proximity_docids.remap_types(),
+                        index,
+                        extractor_sender.docids::<WordPairProximityDocids>(),
+                    )?;
                 }
 
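The hunk above is the behavioral core of the change: when the index is configured for by-attribute proximity, the word_pair_proximity_docids database is never read at search time, so the whole extraction pass can be skipped. A minimal sketch of that early-out follows; the enum mirrors milli's ProximityPrecision and `extract_word_pair_proximity` is a hypothetical stand-in, not the milli API.

// Sketch only: the extraction body is a placeholder for the expensive
// word-pair-proximity pass gated in the real code.
#[derive(Clone, Copy, Default, PartialEq, Eq)]
enum ProximityPrecision {
    #[default]
    ByWord,
    ByAttribute,
}

fn extract_word_pair_proximity(precision: Option<ProximityPrecision>) {
    // An unset setting falls back to the default (ByWord), so extraction
    // still runs for indexes that never touched the setting.
    if precision.unwrap_or_default() == ProximityPrecision::ByWord {
        println!("extracting word pair proximity docids");
    }
    // ByAttribute: nothing to do, the database is never queried.
}

fn main() {
    extract_word_pair_proximity(None); // runs: the default is ByWord
    extract_word_pair_proximity(Some(ProximityPrecision::ByAttribute)); // skipped
}
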
@@ -229,27 +235,13 @@ where
                 // - [x] Extract fieldid facet number docids
                 // - [x] Extract fieldid facet string docids
 
-                    Ok(()) as Result<_>
+                // TODO use None when needed
+                Result::Ok(facet_field_ids_delta)
             })
         })?;
 
         let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map);
 
         let indexer_span = tracing::Span::current();
-        // TODO manage the errors correctly
-        let merger_thread = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || {
-            let span =
-                tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "merge");
-            let _entered = span.enter();
-            let rtxn = index.read_txn().unwrap();
-            merge_grenad_entries(
-                merger_receiver,
-                merger_sender,
-                &rtxn,
-                index,
-                global_fields_ids_map,
-            )
-        })?;
 
         for operation in writer_receiver {
            let database = operation.database(index)
@@ -264,18 +256,66 @@ where
         }
 
         /// TODO handle the panicking threads
-        handle.join().unwrap()?;
-        let merger_result = merger_thread.join().unwrap()?;
+        let facet_field_ids_delta = extractor_handle.join().unwrap()?;
 
-        if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta {
-            compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
+        let prefix_delta = {
+            let rtxn = index.read_txn()?;
+            let words_fst = index.words_fst(&rtxn)?;
+            let mut word_fst_builder = WordFstBuilder::new(&words_fst)?;
+            let prefix_settings = index.prefix_settings(&rtxn)?;
+            word_fst_builder.with_prefix_settings(prefix_settings);
+
+            let previous_words = index.word_docids.iter(&rtxn)?.remap_data_type::<DecodeIgnore>();
+            let current_words = index.word_docids.iter(wtxn)?.remap_data_type::<DecodeIgnore>();
+            for eob in merge_join_by(previous_words, current_words, |lhs, rhs| match (lhs, rhs) {
+                (Ok((l, _)), Ok((r, _))) => l.cmp(r),
+                (Err(_), _) | (_, Err(_)) => Ordering::Equal,
+            }) {
+                match eob {
+                    EitherOrBoth::Both(lhs, rhs) => {
+                        if let Some(e) = lhs.err().or(rhs.err()) {
+                            return Err(e.into());
+                        }
+                    }
+                    EitherOrBoth::Left(result) => {
+                        let (word, _) = result?;
+                        word_fst_builder.register_word(DelAdd::Deletion, word.as_ref())?;
+                    }
+                    EitherOrBoth::Right(result) => {
+                        let (word, _) = result?;
+                        word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?;
+                    }
+                }
             }
 
-        if let Some(prefix_delta) = merger_result.prefix_delta {
+            let span = tracing::trace_span!(target: "indexing::documents::merge", "words_fst");
+            let _entered = span.enter();
+
+            let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, &rtxn)?;
+            // extractor_sender.main().write_words_fst(word_fst_mmap).unwrap();
+            index.main.remap_types::<Str, Bytes>().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?;
+            if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data {
+                // extractor_sender.main().write_words_prefixes_fst(prefixes_fst_mmap).unwrap();
+                index.main.remap_types::<Str, Bytes>().put(
+                    wtxn,
+                    WORDS_PREFIXES_FST_KEY,
+                    &prefixes_fst_mmap,
+                )?;
+                Some(prefix_delta)
+            } else {
+                None
+            }
+        };
+
+        // if let Some(facet_field_ids_delta) = merger_result.facet_field_ids_delta {
+        //     compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
+        // }
+
+        if let Some(prefix_delta) = prefix_delta {
             compute_prefix_database(index, wtxn, prefix_delta)?;
         }
 
-        Ok(()) as Result<_>
+        Result::Ok(())
     })?;
 
     // required to into_inner the new_fields_ids_map
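The merge_join_by walk above derives the word FST delta in a single linear pass: both LMDB cursors yield keys in sorted order, so each word is classified as kept (both sides), deleted (previous only), or added (current only) without materializing either key set. A self-contained sketch of that classification over plain sorted slices, using the itertools crate as the real code appears to:

// Sketch: sorted Vecs stand in for the two database iterators.
use itertools::{EitherOrBoth, Itertools};

#[derive(Debug, PartialEq)]
enum DelAdd {
    Deletion,
    Addition,
}

fn word_delta(previous: &[&str], current: &[&str]) -> Vec<(DelAdd, String)> {
    let mut delta = Vec::new();
    for eob in previous.iter().merge_join_by(current.iter(), |l, r| l.cmp(r)) {
        match eob {
            // Present on both sides: the word did not change, nothing to register.
            EitherOrBoth::Both(_, _) => {}
            // Only in the previous set: the word disappeared from the index.
            EitherOrBoth::Left(word) => delta.push((DelAdd::Deletion, word.to_string())),
            // Only in the current set: the word is new.
            EitherOrBoth::Right(word) => delta.push((DelAdd::Addition, word.to_string())),
        }
    }
    delta
}

fn main() {
    let delta = word_delta(&["kitten", "puppy"], &["kitten", "pony"]);
    assert_eq!(delta.len(), 2); // "puppy" deleted, "pony" added
    println!("{delta:?}");
}
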
@@ -347,30 +387,6 @@ fn compute_facet_level_database(
     Ok(())
 }
 
-/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
-/// TODO: manage the errors correctly
-/// TODO: we must have a single trait that also gives the extractor type
-fn extract_and_send_docids<
-    'pl,
-    'fid,
-    'indexer,
-    'index,
-    DC: DocumentChanges<'pl>,
-    E: DocidsExtractor,
-    D: MergerOperationType,
->(
-    grenad_parameters: GrenadParameters,
-    document_changes: &DC,
-    indexing_context: IndexingContext<'fid, 'indexer, 'index>,
-    extractor_allocs: &mut ThreadLocal<FullySend<RefCell<Bump>>>,
-    sender: &ExtractorSender,
-) -> Result<()> {
-    let merger =
-        E::run_extraction(grenad_parameters, document_changes, indexing_context, extractor_allocs)?;
-    sender.send_searchable::<D>(merger).unwrap();
-    Ok(())
-}
-
 /// Returns the primary key that has already been set for this index or the
 /// one we will guess by searching for the first key that contains "id" as a substring,
 /// and whether the primary key changed
 
@@ -1,12 +1,10 @@
 use std::ops::DerefMut;
 
 use rayon::iter::IndexedParallelIterator;
-use serde::Deserializer;
 use serde_json::value::RawValue;
 
-use super::de::FieldAndDocidExtractor;
 use super::document_changes::{DocumentChangeContext, DocumentChanges, MostlySend, RefCellExt};
-use crate::documents::{DocumentIdExtractionError, PrimaryKey};
+use crate::documents::PrimaryKey;
 use crate::update::concurrent_available_ids::ConcurrentAvailableIds;
 use crate::update::new::document::DocumentFromVersions;
 use crate::update::new::document_change::Versions;
 
@@ -1,5 +1,3 @@
-use std::collections::BTreeMap;
-
 use raw_collections::RawMap;
 use rayon::iter::{IndexedParallelIterator, IntoParallelRefIterator, ParallelIterator};
 use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};
@@ -12,8 +10,8 @@ use crate::documents::PrimaryKey;
 use crate::error::{FieldIdMapMissingEntry, InternalError};
 use crate::update::new::document::DocumentFromVersions;
 use crate::update::new::document_change::Versions;
-use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, KvWriterFieldId, Update};
-use crate::{all_obkv_to_json, Error, FieldsIdsMap, GlobalFieldsIdsMap, Object, Result, UserError};
+use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update};
+use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};
 
 pub struct UpdateByFunction {
     documents: RoaringBitmap,
 
@@ -1,222 +1,20 @@
-use std::fs::File;
 use std::io::{self};
 
 use bincode::ErrorKind;
-use grenad::Merger;
 use hashbrown::HashSet;
 use heed::types::Bytes;
 use heed::{Database, RoTxn};
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use roaring::RoaringBitmap;
 
 use super::channel::*;
-use super::extract::FacetKind;
+use super::extract::{
+    merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind,
+};
 use super::facet_search_builder::FacetSearchBuilder;
-use super::word_fst_builder::{PrefixData, PrefixDelta};
-use super::{Deletion, DocumentChange, KvReaderDelAdd, KvReaderFieldId};
+use super::DocumentChange;
 use crate::update::del_add::DelAdd;
-use crate::update::new::channel::MergerOperation;
-use crate::update::new::word_fst_builder::WordFstBuilder;
-use crate::update::MergeDeladdCboRoaringBitmaps;
-use crate::{
-    localized_attributes_rules, CboRoaringBitmapCodec, Error, FieldId, GeoPoint,
-    GlobalFieldsIdsMap, Index, Result,
-};
+use crate::{CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap, Index, Result};
 
-/// TODO We must return some infos/stats
-#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")]
-pub fn merge_grenad_entries(
-    receiver: MergerReceiver,
-    sender: MergerSender,
-    rtxn: &RoTxn,
-    index: &Index,
-    global_fields_ids_map: GlobalFieldsIdsMap<'_>,
-) -> Result<MergerResult> {
-    let mut buffer: Vec<u8> = Vec::new();
-    let mut documents_ids = index.documents_ids(rtxn)?;
-    let mut geo_extractor = GeoExtractor::new(rtxn, index)?;
-    let mut merger_result = MergerResult::default();
-
-    for merger_operation in receiver {
-        match merger_operation {
-            MergerOperation::ExactWordDocidsMerger(merger) => {
-                let span =
-                    tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
-                let _entered = span.enter();
-                merge_and_send_docids(
-                    merger,
-                    /// TODO do a MergerOperation::database(&Index) -> Database<Bytes, Bytes>.
-                    index.exact_word_docids.remap_types(),
-                    rtxn,
-                    &mut buffer,
-                    sender.docids::<ExactWordDocids>(),
-                    |_, _key| Ok(()),
-                )?;
-            }
-            MergerOperation::FidWordCountDocidsMerger(merger) => {
-                let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
-                let _entered = span.enter();
-                merge_and_send_docids(
-                    merger,
-                    index.field_id_word_count_docids.remap_types(),
-                    rtxn,
-                    &mut buffer,
-                    sender.docids::<FidWordCountDocids>(),
-                    |_, _key| Ok(()),
-                )?;
-            }
-            MergerOperation::WordDocidsMerger(merger) => {
-                let words_fst = index.words_fst(rtxn)?;
-                let mut word_fst_builder = WordFstBuilder::new(&words_fst)?;
-                let prefix_settings = index.prefix_settings(rtxn)?;
-                word_fst_builder.with_prefix_settings(prefix_settings);
-
-                {
-                    let span =
-                        tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
-                    let _entered = span.enter();
-
-                    merge_and_send_docids(
-                        merger,
-                        index.word_docids.remap_types(),
-                        rtxn,
-                        &mut buffer,
-                        sender.docids::<WordDocids>(),
-                        |deladd, key| word_fst_builder.register_word(deladd, key),
-                    )?;
-                }
-
-                {
-                    let span =
-                        tracing::trace_span!(target: "indexing::documents::merge", "words_fst");
-                    let _entered = span.enter();
-
-                    let (word_fst_mmap, prefix_data) = word_fst_builder.build(index, rtxn)?;
-                    sender.main().write_words_fst(word_fst_mmap).unwrap();
-                    if let Some(PrefixData { prefixes_fst_mmap, prefix_delta }) = prefix_data {
-                        sender.main().write_words_prefixes_fst(prefixes_fst_mmap).unwrap();
-                        merger_result.prefix_delta = Some(prefix_delta);
-                    }
-                }
-            }
-            MergerOperation::WordFidDocidsMerger(merger) => {
-                let span =
-                    tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
-                let _entered = span.enter();
-                merge_and_send_docids(
-                    merger,
-                    index.word_fid_docids.remap_types(),
-                    rtxn,
-                    &mut buffer,
-                    sender.docids::<WordFidDocids>(),
-                    |_, _key| Ok(()),
-                )?;
-            }
-            MergerOperation::WordPairProximityDocidsMerger(merger) => {
-                let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids");
-                let _entered = span.enter();
-                merge_and_send_docids(
-                    merger,
-                    index.word_pair_proximity_docids.remap_types(),
-                    rtxn,
-                    &mut buffer,
-                    sender.docids::<WordPairProximityDocids>(),
-                    |_, _key| Ok(()),
-                )?;
-            }
-            MergerOperation::WordPositionDocidsMerger(merger) => {
-                let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
-                let _entered = span.enter();
-                merge_and_send_docids(
-                    merger,
-                    index.word_position_docids.remap_types(),
-                    rtxn,
-                    &mut buffer,
-                    sender.docids::<WordPositionDocids>(),
-                    |_, _key| Ok(()),
-                )?;
-            }
-            MergerOperation::InsertDocument { docid, external_id, document } => {
-                let span =
-                    tracing::trace_span!(target: "indexing::documents::merge", "insert_document");
-                let _entered = span.enter();
-                documents_ids.insert(docid);
-                sender.documents().uncompressed(docid, external_id.clone(), &document).unwrap();
-
-                if let Some(geo_extractor) = geo_extractor.as_mut() {
-                    let current = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
-                    let current: Option<&KvReaderFieldId> = current.map(Into::into);
-                    let change = match current {
-                        Some(current) => DocumentChange::Update(todo!()),
-                        None => DocumentChange::Insertion(todo!()),
-                    };
-                    geo_extractor.manage_change(&mut global_fields_ids_map, &change)?;
-                }
-            }
-            MergerOperation::DeleteDocument { docid, external_id } => {
-                let span =
-                    tracing::trace_span!(target: "indexing::documents::merge", "delete_document");
-                let _entered = span.enter();
-                if !documents_ids.remove(docid) {
-                    unreachable!("Tried deleting a document that we do not know about");
-                }
-                sender.documents().delete(docid, external_id.clone()).unwrap();
-
-                if let Some(geo_extractor) = geo_extractor.as_mut() {
-                    let change = DocumentChange::Deletion(Deletion::create(docid, todo!()));
-                    geo_extractor.manage_change(&mut global_fields_ids_map, &change)?;
-                }
-            }
-            MergerOperation::FinishedDocument => {
-                // send the rtree
-            }
-            MergerOperation::FacetDocidsMerger(merger) => {
-                let span =
-                    tracing::trace_span!(target: "indexing::documents::merge", "facet_docids");
-                let _entered = span.enter();
-                let mut facet_field_ids_delta = FacetFieldIdsDelta::new();
-                let localized_attributes_rules =
-                    index.localized_attributes_rules(rtxn)?.unwrap_or_default();
-                let mut facet_search_builder = FacetSearchBuilder::new(
-                    global_fields_ids_map.clone(),
-                    localized_attributes_rules,
-                );
-                merge_and_send_facet_docids(
-                    merger,
-                    FacetDatabases::new(index),
-                    rtxn,
-                    &mut buffer,
-                    sender.facet_docids(),
-                    &mut facet_field_ids_delta,
-                    &mut facet_search_builder,
-                )?;
-
-                merger_result.facet_field_ids_delta = Some(facet_field_ids_delta);
-                // merge and send the facet fst and the searchable facet values
-                facet_search_builder.merge_and_send(index, rtxn, sender.facet_searchable())?;
-            }
-        }
-    }
-
-    {
-        let span = tracing::trace_span!(target: "indexing::documents::merge", "documents_ids");
-        let _entered = span.enter();
-
-        // Send the documents ids unionized with the current one
-        sender.send_documents_ids(documents_ids).unwrap();
-    }
-
-    // ...
-
-    Ok(merger_result)
-}
-
-#[derive(Default, Debug)]
-pub struct MergerResult {
-    /// The delta of the prefixes
-    pub prefix_delta: Option<PrefixDelta>,
-    /// The field ids that have been modified
-    pub facet_field_ids_delta: Option<FacetFieldIdsDelta>,
-}
 
 pub struct GeoExtractor {
     rtree: Option<rstar::RTree<GeoPoint>>,
@@ -267,80 +65,92 @@ impl GeoExtractor {
 }
 
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
-fn merge_and_send_docids(
-    merger: Merger<File, MergeDeladdCboRoaringBitmaps>,
+pub fn merge_and_send_docids<'extractor>(
+    mut caches: Vec<BalancedCaches<'extractor>>,
     database: Database<Bytes, Bytes>,
-    rtxn: &RoTxn<'_>,
-    buffer: &mut Vec<u8>,
-    docids_sender: impl DocidsSender,
-    mut register_key: impl FnMut(DelAdd, &[u8]) -> Result<()>,
+    index: &Index,
+    docids_sender: impl DocidsSender + Sync,
 ) -> Result<()> {
-    let mut merger_iter = merger.into_stream_merger_iter().unwrap();
-    while let Some((key, deladd)) = merger_iter.next().unwrap() {
-        let current = database.get(rtxn, key)?;
-        let deladd: &KvReaderDelAdd = deladd.into();
-        let del = deladd.get(DelAdd::Deletion);
-        let add = deladd.get(DelAdd::Addition);
-
+    transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| {
+        let rtxn = index.read_txn()?;
+        let mut buffer = Vec::new();
+        merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| {
+            let current = database.get(&rtxn, key)?;
             match merge_cbo_bitmaps(current, del, add)? {
                 Operation::Write(bitmap) => {
-                let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer);
+                    let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer);
                     docids_sender.write(key, value).unwrap();
-                register_key(DelAdd::Addition, key)?;
+                    Ok(())
                 }
                 Operation::Delete => {
                     docids_sender.delete(key).unwrap();
-                register_key(DelAdd::Deletion, key)?;
-            }
-            Operation::Ignore => (),
-        }
-    }
-
                     Ok(())
                 }
+                Operation::Ignore => Ok(()),
+            }
+        })
+    })
+}
 
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
-fn merge_and_send_facet_docids(
-    merger: Merger<File, MergeDeladdCboRoaringBitmaps>,
+pub fn merge_and_send_facet_docids<'indexer, 'extractor>(
+    global_fields_ids_map: GlobalFieldsIdsMap<'indexer>,
+    mut caches: Vec<BalancedCaches<'extractor>>,
     database: FacetDatabases,
-    rtxn: &RoTxn<'_>,
-    buffer: &mut Vec<u8>,
-    docids_sender: impl DocidsSender,
-    facet_field_ids_delta: &mut FacetFieldIdsDelta,
-    facet_search_builder: &mut FacetSearchBuilder,
-) -> Result<()> {
-    let mut merger_iter = merger.into_stream_merger_iter().unwrap();
-    while let Some((key, deladd)) = merger_iter.next().unwrap() {
-        let current = database.get_cbo_roaring_bytes_value(rtxn, key)?;
-        let deladd: &KvReaderDelAdd = deladd.into();
-        let del = deladd.get(DelAdd::Deletion);
-        let add = deladd.get(DelAdd::Addition);
+    index: &Index,
+    docids_sender: impl DocidsSender + Sync,
+) -> Result<(FacetFieldIdsDelta, FacetSearchBuilder<'indexer>)> {
+    transpose_and_freeze_caches(&mut caches)?
+        .into_par_iter()
+        .map(|frozen| {
+            let mut facet_field_ids_delta = FacetFieldIdsDelta::default();
+            let rtxn = index.read_txn()?;
+            let localized_attributes_rules = index.localized_attributes_rules(&rtxn)?;
+            let mut facet_search_builder = FacetSearchBuilder::new(
+                global_fields_ids_map.clone(),
+                localized_attributes_rules.unwrap_or_default(),
+            );
+            let mut buffer = Vec::new();
+            merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| {
+                let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?;
                 match merge_cbo_bitmaps(current, del, add)? {
                     Operation::Write(bitmap) => {
                         facet_field_ids_delta.register_from_key(key);
                         facet_search_builder.register_from_key(DelAdd::Addition, key)?;
-                let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer);
+                        let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer);
                         docids_sender.write(key, value).unwrap();
+                        Ok(())
                     }
                     Operation::Delete => {
                         facet_field_ids_delta.register_from_key(key);
                         facet_search_builder.register_from_key(DelAdd::Deletion, key)?;
                         docids_sender.delete(key).unwrap();
-            }
-            Operation::Ignore => (),
-        }
-    }
-
                         Ok(())
                     }
+                    Operation::Ignore => Ok(()),
+                }
+            })?;
 
-struct FacetDatabases<'a> {
+            Ok((facet_field_ids_delta, facet_search_builder))
+        })
+        .reduce(
+            || Ok((FacetFieldIdsDelta::default(), todo!())),
+            |lhs, rhs| {
+                let (lhs_ffid, lhs_fsb) = lhs?;
+                let (rhs_ffid, rhs_fsb) = rhs?;
+                let ffid_merged = lhs_ffid.merge(rhs_ffid);
+                let fsb_merged = todo!();
+                Ok((ffid_merged, fsb_merged))
+            },
+        )
+}
+
+pub struct FacetDatabases<'a> {
     index: &'a Index,
 }
 
 impl<'a> FacetDatabases<'a> {
-    fn new(index: &'a Index) -> Self {
+    pub fn new(index: &'a Index) -> Self {
         Self { index }
     }
 
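Compared with the deleted single-threaded merger loop, the rewritten merge_and_send_docids fans the frozen cache shards out with rayon and opens one read transaction per task (visible in the index.read_txn() call inside the closure). A sketch of just that shape, with Vecs standing in for BalancedCaches and the database, so only the parallel structure is illustrated:

// Sketch only: each rayon task gets its own scratch buffer, mirroring the
// per-task rtxn and `let mut buffer = Vec::new()` in the real function.
use rayon::iter::{IntoParallelIterator, ParallelIterator};

fn merge_shards(shards: Vec<Vec<(String, u32)>>) -> Result<(), String> {
    shards.into_par_iter().try_for_each(|shard| {
        let mut buffer = Vec::new();
        for (key, value) in shard {
            buffer.clear();
            buffer.extend_from_slice(&value.to_ne_bytes());
            // The real code serializes a merged bitmap here and hands it to
            // the DocidsSender channel instead of printing.
            println!("{key} => {buffer:?}");
        }
        Ok(())
    })
}

fn main() {
    let shards = vec![
        vec![("dog".to_string(), 1), ("cat".to_string(), 2)],
        vec![("horse".to_string(), 3)],
    ];
    merge_shards(shards).unwrap();
}
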
@@ -361,7 +171,7 @@ impl<'a> FacetDatabases<'a> {
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, Default)]
 pub struct FacetFieldIdsDelta {
     /// The field ids that have been modified
     modified_facet_string_ids: HashSet<FieldId>,
@@ -369,13 +179,6 @@ pub struct FacetFieldIdsDelta {
 }
 
 impl FacetFieldIdsDelta {
-    fn new() -> Self {
-        Self {
-            modified_facet_string_ids: HashSet::new(),
-            modified_facet_number_ids: HashSet::new(),
-        }
-    }
-
     fn register_facet_string_id(&mut self, field_id: FieldId) {
         self.modified_facet_string_ids.insert(field_id);
     }
@@ -414,6 +217,17 @@ impl FacetFieldIdsDelta {
             Some(self.modified_facet_number_ids.iter().copied().collect())
         }
     }
+
+    pub fn merge(mut self, rhs: Self) -> Self {
+        let Self { modified_facet_number_ids, modified_facet_string_ids } = rhs;
+        modified_facet_number_ids.into_iter().for_each(|fid| {
+            self.modified_facet_number_ids.insert(fid);
+        });
+        modified_facet_string_ids.into_iter().for_each(|fid| {
+            self.modified_facet_string_ids.insert(fid);
+        });
+        self
+    }
 }
 
 enum Operation {
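The new merge method consumes the right-hand delta and unions both field-id sets into the left one; this is what the rayon reduce in merge_and_send_facet_docids relies on to fold per-thread deltas into a single result. A stand-in with u16 field ids that shows the same union semantics (the type and field names here only mirror FacetFieldIdsDelta for illustration):

use std::collections::HashSet;

#[derive(Default, Debug)]
struct Delta {
    modified_facet_string_ids: HashSet<u16>,
    modified_facet_number_ids: HashSet<u16>,
}

impl Delta {
    // Same effect as the per-element inserts above: a plain set union.
    fn merge(mut self, rhs: Self) -> Self {
        self.modified_facet_string_ids.extend(rhs.modified_facet_string_ids);
        self.modified_facet_number_ids.extend(rhs.modified_facet_number_ids);
        self
    }
}

fn main() {
    let lhs = Delta {
        modified_facet_string_ids: HashSet::from([1u16, 2]),
        ..Default::default()
    };
    let rhs = Delta {
        modified_facet_string_ids: HashSet::from([2u16, 3]),
        modified_facet_number_ids: HashSet::from([7u16]),
    };
    let merged = lhs.merge(rhs);
    assert_eq!(merged.modified_facet_string_ids.len(), 3); // {1, 2, 3}
    assert_eq!(merged.modified_facet_number_ids.len(), 1);
}
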
@@ -425,13 +239,10 @@ enum Operation {
 /// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap.
 fn merge_cbo_bitmaps(
     current: Option<&[u8]>,
-    del: Option<&[u8]>,
-    add: Option<&[u8]>,
+    del: Option<RoaringBitmap>,
+    add: Option<RoaringBitmap>,
 ) -> Result<Operation> {
     let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
-    let del = del.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
-    let add = add.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
-
     match (current, del, add) {
         (None, None, None) => Ok(Operation::Ignore), // but it's strange
         (None, None, Some(add)) => Ok(Operation::Write(add)),
 
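Only the first two match arms are visible in this hunk. The sketch below follows the new signature (del and add arrive as bitmaps already deserialized by the cache layer) and assumes the usual del-add combination for the remaining arms: start from current, subtract del, union add, and map an empty result to a delete. The general-case arm is an assumption for illustration, not the committed code.

use roaring::RoaringBitmap;

enum Operation {
    Write(RoaringBitmap),
    Delete,
    Ignore,
}

fn merge(
    current: Option<RoaringBitmap>,
    del: Option<RoaringBitmap>,
    add: Option<RoaringBitmap>,
) -> Operation {
    match (current, del, add) {
        (None, None, None) => Operation::Ignore, // strange, but harmless
        (None, None, Some(add)) => Operation::Write(add),
        (current, del, add) => {
            // Assumed general case: drop deletions, then apply additions.
            let mut bitmap = current.unwrap_or_default();
            if let Some(del) = del {
                bitmap -= del;
            }
            if let Some(add) = add {
                bitmap |= add;
            }
            if bitmap.is_empty() {
                Operation::Delete
            } else {
                Operation::Write(bitmap)
            }
        }
    }
}

fn main() {
    let current: Option<RoaringBitmap> = Some([1u32, 2, 3].into_iter().collect());
    let del: Option<RoaringBitmap> = Some([2u32].into_iter().collect());
    let add: Option<RoaringBitmap> = Some([9u32].into_iter().collect());
    match merge(current, del, add) {
        Operation::Write(bitmap) => assert_eq!(bitmap.iter().collect::<Vec<u32>>(), vec![1, 3, 9]),
        _ => unreachable!(),
    }
}
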
@@ -1,4 +1,7 @@
 pub use document_change::{Deletion, DocumentChange, Insertion, Update};
+pub use merger::{
+    merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases, FacetFieldIdsDelta,
+};
 pub use top_level_map::{CowStr, TopLevelMap};
 
 use super::del_add::DelAdd;
 
@@ -1,38 +1,8 @@
 use std::sync::Arc;
 
-use rayon::iter::{MapInit, ParallelIterator};
+use rayon::iter::ParallelIterator;
 
 pub trait ParallelIteratorExt: ParallelIterator {
-    /// Maps items based on the init function.
-    ///
-    /// The init function is run only as necessary, which is basically once per thread.
-    fn try_map_try_init<F, INIT, T, E, R>(
-        self,
-        init: INIT,
-        map_op: F,
-    ) -> MapInit<
-        Self,
-        impl Fn() -> Result<T, Arc<E>> + Sync + Send + Clone,
-        impl Fn(&mut Result<T, Arc<E>>, Self::Item) -> Result<R, Arc<E>> + Sync + Send + Clone,
-    >
-    where
-        E: Send + Sync,
-        F: Fn(&mut T, Self::Item) -> Result<R, E> + Sync + Send + Clone,
-        INIT: Fn() -> Result<T, E> + Sync + Send + Clone,
-        R: Send,
-    {
-        self.map_init(
-            move || match init() {
-                Ok(t) => Ok(t),
-                Err(err) => Err(Arc::new(err)),
-            },
-            move |result, item| match result {
-                Ok(t) => map_op(t, item).map_err(Arc::new),
-                Err(err) => Err(err.clone()),
-            },
-        )
-    }
-
     /// A method to run a closure on all the items and return an owned error.
     ///
     /// The init function is run only as necessary, which is basically once per thread.
@@ -58,17 +28,6 @@ pub trait ParallelIteratorExt: ParallelIterator {
             Err(err) => Err(Arc::into_inner(err).expect("the error must be only owned by us")),
         }
     }
-
-    fn try_arc_for_each<F, E>(self, op: F) -> Result<(), E>
-    where
-        E: Send + Sync,
-        F: Fn(Self::Item) -> Result<(), Arc<E>> + Sync + Send + Clone,
-    {
-        match self.try_for_each(op) {
-            Ok(()) => Ok(()),
-            Err(err) => Err(Arc::into_inner(err).expect("the error must be only owned by us")),
-        }
-    }
 }
 
 impl<T: ParallelIterator> ParallelIteratorExt for T {}
 
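Both the deleted helpers and the retained one share this trait's Arc error convention: inside a parallel iterator an error may need to be duplicated (rayon closures are Fn, and try_map_try_init cloned the per-thread Err for every subsequent item), so it travels as Arc<E> and is turned back into an owned E at the sequential boundary with Arc::into_inner. A sketch of that round trip under those assumptions:

use std::sync::Arc;

use rayon::iter::{IntoParallelIterator, ParallelIterator};

fn process_all(items: Vec<u32>) -> Result<(), String> {
    let result = items.into_par_iter().try_for_each(|n| {
        if n == 13 {
            // Arc makes the error cheap to duplicate even if E is not Clone.
            Err(Arc::new(format!("unlucky item: {n}")))
        } else {
            Ok(())
        }
    });
    match result {
        Ok(()) => Ok(()),
        // Only valid once the parallel iterator has fully stopped and no
        // other clone of the Arc is still alive.
        Err(err) => Err(Arc::into_inner(err).expect("the error must be only owned by us")),
    }
}

fn main() {
    assert!(process_all(vec![1, 2, 3]).is_ok());
    assert!(process_all(vec![12, 13, 14]).is_err());
}
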
@@ -1,10 +1,16 @@
+use std::cell::RefCell;
 use std::collections::HashSet;
+use std::io::{BufReader, BufWriter, Read, Seek, Write};
 
 use hashbrown::HashMap;
 use heed::types::Bytes;
-use heed::{BytesDecode, Database, RwTxn};
-use roaring::RoaringBitmap;
+use heed::{BytesDecode, Database, RoTxn, RwTxn};
+use rayon::iter::{IntoParallelIterator, ParallelIterator as _};
+use roaring::MultiOps;
+use tempfile::tempfile;
+use thread_local::ThreadLocal;
+
+use super::indexer::document_changes::RefCellExt;
 use crate::heed_codec::StrBEU16Codec;
 use crate::{CboRoaringBitmapCodec, Index, Prefix, Result};
 
@@ -38,22 +44,103 @@ impl WordPrefixDocids {
         prefixes: &HashSet<Prefix>,
     ) -> Result<()> {
         // We fetch the docids associated to the newly added word prefix fst only.
-        let mut docids = RoaringBitmap::new();
-        for prefix in prefixes {
-            docids.clear();
-            let prefix = prefix.as_bytes();
-            for result in self.database.prefix_iter(wtxn, prefix)? {
-                let (_word, data) = result?;
-                docids |= &data;
-            }
+        // And collect the CboRoaringBitmaps pointers in a HashMap.
+        let frozen = FrozenPrefixBitmaps::from_prefixes(self.database, wtxn, prefixes)?;
 
-            self.prefix_database.put(wtxn, prefix, &docids)?;
+        // We access this HashMap in parallel to compute the *union* of all
+        // of them and *serialize* them into files. There is one file per CPU.
+        let local_entries = ThreadLocal::with_capacity(rayon::current_num_threads());
+        prefixes.into_par_iter().map(AsRef::as_ref).try_for_each(|prefix| {
+            let refcell = local_entries.get_or_try(|| {
+                tempfile().map(BufWriter::new).map(|f| RefCell::new((Vec::new(), f, Vec::new())))
+            })?;
+
+            let mut refmut = refcell.borrow_mut_or_yield();
+            let (ref mut index, ref mut file, ref mut buffer) = *refmut;
+
+            let output = frozen
+                .bitmaps(prefix)
+                .unwrap()
+                .iter()
+                .map(|bytes| CboRoaringBitmapCodec::deserialize_from(bytes))
+                .union()?;
+
+            buffer.clear();
+            CboRoaringBitmapCodec::serialize_into(&output, buffer);
+            index.push(PrefixEntry { prefix, serialized_length: buffer.len() });
+            file.write_all(buffer)
+        })?;
+
+        drop(frozen);
+
+        // We iterate over all the collected and serialized bitmaps through
+        // the files and entries to eventually put them in the final database.
+        for refcell in local_entries {
+            let (index, file, mut buffer) = refcell.into_inner();
+            let mut file = file.into_inner().map_err(|e| e.into_error())?;
+            file.rewind()?;
+            let mut file = BufReader::new(file);
+            for PrefixEntry { prefix, serialized_length } in index {
+                buffer.resize(serialized_length, 0);
+                file.read_exact(&mut buffer)?;
+                self.prefix_database.remap_data_type::<Bytes>().put(
+                    wtxn,
+                    prefix.as_bytes(),
+                    &buffer,
+                )?;
+            }
         }
 
         Ok(())
     }
 }
 
+/// Represents a prefix and the length the bitmap takes on disk.
+struct PrefixEntry<'a> {
+    prefix: &'a str,
+    serialized_length: usize,
+}
+
+/// Stores prefixes along with all the pointers to the associated
+/// CBoRoaringBitmaps.
+///
+/// They are collected synchronously and stored into a HashMap. The
+/// synchronous process is doing a small amount of work by just storing
+/// pointers. It can then be accessed in parallel to get the associated
+/// bitmaps pointers.
+struct FrozenPrefixBitmaps<'a, 'rtxn> {
+    prefixes_bitmaps: HashMap<&'a str, Vec<&'rtxn [u8]>>,
+}
+
+impl<'a, 'rtxn> FrozenPrefixBitmaps<'a, 'rtxn> {
+    #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
+    pub fn from_prefixes(
+        database: Database<Bytes, CboRoaringBitmapCodec>,
+        rtxn: &'rtxn RoTxn,
+        prefixes: &'a HashSet<Prefix>,
+    ) -> heed::Result<Self> {
+        let database = database.remap_data_type::<Bytes>();
+
+        let mut prefixes_bitmaps = HashMap::new();
+        for prefix in prefixes {
+            let mut bitmap_bytes = Vec::new();
+            for result in database.prefix_iter(rtxn, prefix.as_bytes())? {
+                let (_word, bytes) = result?;
+                bitmap_bytes.push(bytes);
+            }
+            assert!(prefixes_bitmaps.insert(prefix.as_str(), bitmap_bytes).is_none());
+        }
+
+        Ok(Self { prefixes_bitmaps })
+    }
+
+    pub fn bitmaps(&self, key: &str) -> Option<&[&'rtxn [u8]]> {
+        self.prefixes_bitmaps.get(key).map(AsRef::as_ref)
+    }
+}
+
+unsafe impl<'a, 'rtxn> Sync for FrozenPrefixBitmaps<'a, 'rtxn> {}
+
 struct WordPrefixIntegerDocids {
     database: Database<Bytes, CboRoaringBitmapCodec>,
     prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
 
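The rewritten execute is a three-phase pipeline: a serial scan collects raw value pointers (FrozenPrefixBitmaps), a parallel phase unions and serializes one bitmap per prefix into a per-thread temp file, and a final serial loop replays those files into the write transaction, which must stay on one thread. A miniature of the same shape with in-memory buffers instead of LMDB and temp files (the thread_local and rayon crates are assumed, as in the diff; the summing is a stand-in for the bitmap union):

use std::cell::RefCell;
use std::collections::HashMap;

use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use thread_local::ThreadLocal;

fn union_per_prefix(frozen: &HashMap<&str, Vec<u32>>) -> HashMap<String, u32> {
    // Phase 2: one accumulator per rayon thread, like the BufWriter<File>
    // entries in the real code.
    let locals: ThreadLocal<RefCell<Vec<(String, u32)>>> = ThreadLocal::new();
    frozen.par_iter().for_each(|(prefix, values)| {
        let cell = locals.get_or(|| RefCell::new(Vec::new()));
        // Stand-in for the bitmap union + serialization.
        let total: u32 = values.iter().sum();
        cell.borrow_mut().push((prefix.to_string(), total));
    });

    // Phase 3: drain every per-thread buffer on the calling thread, the
    // only place allowed to touch the write transaction in the real code.
    let mut out = HashMap::new();
    for cell in locals {
        for (prefix, total) in cell.into_inner() {
            out.insert(prefix, total);
        }
    }
    out
}

fn main() {
    let mut frozen = HashMap::new();
    frozen.insert("ki", vec![1, 2, 3]);
    frozen.insert("po", vec![10, 20]);
    let merged = union_per_prefix(&frozen);
    assert_eq!(merged["ki"], 6);
    assert_eq!(merged["po"], 30);
}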