mirror of https://github.com/meilisearch/meilisearch.git
synced 2025-10-31 07:56:28 +00:00

Merge #5187
Some checks failed
Indexing bench (push) / Run and upload benchmarks (push) Waiting to run
Benchmarks of indexing (push) / Run and upload benchmarks (push) Waiting to run
Benchmarks of search for geo (push) / Run and upload benchmarks (push) Waiting to run
Benchmarks of search for songs (push) / Run and upload benchmarks (push) Waiting to run
Benchmarks of search for Wikipedia articles (push) / Run and upload benchmarks (push) Waiting to run
Test suite / Tests on ${{ matrix.os }} (macos-13) (push) Waiting to run
Test suite / Tests on ${{ matrix.os }} (windows-2022) (push) Failing after 12s
Test suite / Tests on ubuntu-20.04 (push) Failing after 20s
Test suite / Tests almost all features (push) Has been skipped
Test suite / Test disabled tokenization (push) Has been skipped
Test suite / Run tests in debug (push) Failing after 16s
Test suite / Run Clippy (push) Successful in 33m58s
Test suite / Run Rustfmt (push) Successful in 11m45s
Run the indexing fuzzer / Setup the action (push) Successful in 1h10m33s
5187: Bring back v1.12.0 of pre-release changes into `main` r=irevoire a=curquiza

Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Louis Dureuil <louis@meilisearch.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: Many the fish <many@meilisearch.com>
		| @@ -18,8 +18,7 @@ bincode = "1.3.3" | ||||
| bstr = "1.9.1" | ||||
| bytemuck = { version = "1.18.0", features = ["extern_crate_alloc"] } | ||||
| byteorder = "1.5.0" | ||||
| # charabia = { version = "0.9.0", default-features = false } | ||||
| charabia = { git = "https://github.com/meilisearch/charabia", branch = "mutualize-char-normalizer", default-features = false } | ||||
| charabia = { version = "0.9.2", default-features = false } | ||||
| concat-arrays = "0.1.2" | ||||
| crossbeam-channel = "0.5.13" | ||||
| deserr = "0.6.2" | ||||
| @@ -28,10 +27,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } | ||||
| fst = "0.4.7" | ||||
| fxhash = "0.2.1" | ||||
| geoutils = "0.5.1" | ||||
| grenad = { version = "0.4.7", default-features = false, features = [ | ||||
|     "rayon",    # TODO Should we keep this feature | ||||
|     "tempfile", | ||||
| ], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" } | ||||
| grenad = { version = "0.5.0", default-features = false, features = ["rayon", "tempfile"] } | ||||
| heed = { version = "0.20.3", default-features = false, features = [ | ||||
|     "serde-json", | ||||
|     "serde-bincode", | ||||
| @@ -42,11 +38,11 @@ json-depth-checker = { path = "../json-depth-checker" } | ||||
| levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } | ||||
| memchr = "2.5.0" | ||||
| memmap2 = "0.9.4" | ||||
| obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" } | ||||
| obkv = "0.3.0" | ||||
| once_cell = "1.19.0" | ||||
| ordered-float = "4.2.1" | ||||
| rayon = "1.10.0" | ||||
| roaring = { version = "0.10.6", features = ["serde"] } | ||||
| roaring = { version = "0.10.7", features = ["serde"] } | ||||
| rstar = { version = "0.12.0", features = ["serde"] } | ||||
| serde = { version = "1.0.204", features = ["derive"] } | ||||
| serde_json = { version = "1.0.120", features = ["preserve_order", "raw_value"] } | ||||
| @@ -95,13 +91,15 @@ ureq = { version = "2.10.0", features = ["json"] } | ||||
| url = "2.5.2" | ||||
| rayon-par-bridge = "0.1.0" | ||||
| hashbrown = "0.15.0" | ||||
| raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" } | ||||
| bumpalo = "3.16.0" | ||||
| bumparaw-collections = "0.1.2" | ||||
| thread_local = "1.1.8" | ||||
| allocator-api2 = "0.2.18" | ||||
| rustc-hash = "2.0.0" | ||||
| uell = "0.1.0" | ||||
| enum-iterator = "2.1.0" | ||||
| bbqueue = { git = "https://github.com/meilisearch/bbqueue" } | ||||
| flume = { version = "0.11.1", default-features = false } | ||||
|  | ||||
| [dev-dependencies] | ||||
| mimalloc = { version = "0.1.43", default-features = false } | ||||
|   | ||||
| @@ -280,7 +280,7 @@ fn starts_with(selector: &str, key: &str) -> bool { | ||||
|  | ||||
| pub fn validate_document_id_str(document_id: &str) -> Option<&str> { | ||||
|     if document_id.is_empty() | ||||
|         || document_id.len() > 512 | ||||
|         || document_id.len() >= 512 | ||||
|         || !document_id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_') | ||||
|     { | ||||
|         None | ||||
|   | ||||
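The hunk above tightens the length check from `> 512` to `>= 512`, so identifiers of exactly 512 bytes are now rejected (the user-facing error message in the next hunk is updated to say 511 bytes accordingly). A minimal standalone sketch of the rule as it stands after this change; the function name is illustrative, not part of the codebase:

```rust
/// Illustrative re-statement of the validation rule from the diff:
/// ASCII alphanumerics, `-` and `_` only, non-empty, and strictly fewer than 512 bytes.
fn is_valid_document_id(document_id: &str) -> bool {
    !document_id.is_empty()
        && document_id.len() < 512
        && document_id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
}

fn main() {
    assert!(is_valid_document_id("doc_42-a"));
    assert!(!is_valid_document_id(&"x".repeat(512))); // exactly 512 bytes is now rejected
}
```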
| @@ -3,6 +3,7 @@ use std::convert::Infallible; | ||||
| use std::fmt::Write; | ||||
| use std::{io, str}; | ||||
|  | ||||
| use bstr::BString; | ||||
| use heed::{Error as HeedError, MdbError}; | ||||
| use rayon::ThreadPoolBuildError; | ||||
| use rhai::EvalAltResult; | ||||
| @@ -61,6 +62,10 @@ pub enum InternalError { | ||||
|     Serialization(#[from] SerializationError), | ||||
|     #[error(transparent)] | ||||
|     Store(#[from] MdbError), | ||||
|     #[error("Cannot delete {key:?} from database {database_name}: {error}")] | ||||
|     StoreDeletion { database_name: &'static str, key: BString, error: heed::Error }, | ||||
|     #[error("Cannot insert {key:?} and value with length {value_length} into database {database_name}: {error}")] | ||||
|     StorePut { database_name: &'static str, key: BString, value_length: usize, error: heed::Error }, | ||||
|     #[error(transparent)] | ||||
|     Utf8(#[from] str::Utf8Error), | ||||
|     #[error("An indexation process was explicitly aborted")] | ||||
| @@ -109,7 +114,7 @@ pub enum UserError { | ||||
|         "Document identifier `{}` is invalid. \ | ||||
| A document identifier can be of type integer or string, \ | ||||
| only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \ | ||||
| and can not be more than 512 bytes.", .document_id.to_string() | ||||
| and can not be more than 511 bytes.", .document_id.to_string() | ||||
|     )] | ||||
|     InvalidDocumentId { document_id: Value }, | ||||
|     #[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))] | ||||
|   | ||||
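The two new `InternalError` variants carry the database name, the key, and (for writes) the value length alongside the underlying heed error. A hedged sketch of constructing one from inside the milli crate; the `word_docids` name and the helper itself are illustrative, only the variant fields come from the diff:

```rust
use bstr::BString;

use crate::error::InternalError;

// Illustrative helper: wrap a failed heed `put` with its context.
fn store_put_error(key: &[u8], value_length: usize, error: heed::Error) -> InternalError {
    InternalError::StorePut {
        database_name: "word_docids", // illustrative database name
        key: BString::from(key),
        value_length,
        error,
    }
}
```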
| @@ -97,7 +97,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { | ||||
|  | ||||
|     fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> { | ||||
|         let mut v = vec![value.size]; | ||||
|         CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); | ||||
|         CboRoaringBitmapCodec::serialize_into_vec(&value.bitmap, &mut v); | ||||
|         Ok(Cow::Owned(v)) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -27,18 +27,27 @@ impl CboRoaringBitmapCodec { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec<u8>) { | ||||
|     pub fn serialize_into_vec(roaring: &RoaringBitmap, vec: &mut Vec<u8>) { | ||||
|         Self::serialize_into_writer(roaring, vec).unwrap() | ||||
|     } | ||||
|  | ||||
|     pub fn serialize_into_writer<W: io::Write>( | ||||
|         roaring: &RoaringBitmap, | ||||
|         mut writer: W, | ||||
|     ) -> io::Result<()> { | ||||
|         if roaring.len() <= THRESHOLD as u64 { | ||||
|             // If the number of items (u32s) to encode is less than or equal to the threshold | ||||
|             // it means that it would weigh the same or less than the RoaringBitmap | ||||
|             // header, so we directly encode them using ByteOrder instead. | ||||
|             for integer in roaring { | ||||
|                 vec.write_u32::<NativeEndian>(integer).unwrap(); | ||||
|                 writer.write_u32::<NativeEndian>(integer)?; | ||||
|             } | ||||
|         } else { | ||||
|             // Otherwise, we use the classic RoaringBitmapCodec that writes a header. | ||||
|             roaring.serialize_into(vec).unwrap(); | ||||
|             roaring.serialize_into(writer)?; | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     pub fn deserialize_from(mut bytes: &[u8]) -> io::Result<RoaringBitmap> { | ||||
| @@ -143,7 +152,7 @@ impl CboRoaringBitmapCodec { | ||||
|             return Ok(None); | ||||
|         } | ||||
|  | ||||
|         Self::serialize_into(&previous, buffer); | ||||
|         Self::serialize_into_vec(&previous, buffer); | ||||
|         Ok(Some(&buffer[..])) | ||||
|     } | ||||
| } | ||||
| @@ -169,7 +178,7 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { | ||||
|  | ||||
|     fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> { | ||||
|         let mut vec = Vec::with_capacity(Self::serialized_size(item)); | ||||
|         Self::serialize_into(item, &mut vec); | ||||
|         Self::serialize_into_vec(item, &mut vec); | ||||
|         Ok(Cow::Owned(vec)) | ||||
|     } | ||||
| } | ||||
|   | ||||
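The rename separates the infallible `Vec`-backed entry point (`serialize_into_vec`) from a new fallible `serialize_into_writer` that targets any `io::Write`. A short usage sketch, assuming it lives inside the milli crate; the module path is an assumption:

```rust
use roaring::RoaringBitmap;

use crate::heed_codec::CboRoaringBitmapCodec; // path assumed

fn encode_both_ways() -> std::io::Result<()> {
    let bitmap: RoaringBitmap = (0u32..3).collect();

    // Vec-backed variant: writing to a Vec cannot fail, so it stays infallible.
    let mut buffer = Vec::new();
    CboRoaringBitmapCodec::serialize_into_vec(&bitmap, &mut buffer);

    // Writer-backed variant: propagates io errors instead of unwrapping them.
    let mut out = Vec::new();
    CboRoaringBitmapCodec::serialize_into_writer(&bitmap, &mut out)?;

    assert_eq!(buffer, out);
    Ok(())
}
```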
| @@ -70,6 +70,8 @@ pub mod main_key { | ||||
|     pub const EMBEDDING_CONFIGS: &str = "embedding_configs"; | ||||
|     pub const SEARCH_CUTOFF: &str = "search_cutoff"; | ||||
|     pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules"; | ||||
|     pub const FACET_SEARCH: &str = "facet_search"; | ||||
|     pub const PREFIX_SEARCH: &str = "prefix_search"; | ||||
| } | ||||
|  | ||||
| pub mod db_name { | ||||
| @@ -1233,6 +1235,10 @@ impl Index { | ||||
|         ) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn delete_words_prefixes_fst(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> { | ||||
|         self.main.remap_key_type::<Str>().delete(wtxn, main_key::WORDS_PREFIXES_FST_KEY) | ||||
|     } | ||||
|  | ||||
|     /// Returns the FST which is the words prefixes dictionary of the engine. | ||||
|     pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn<'t>) -> Result<fst::Set<Cow<'t, [u8]>>> { | ||||
|         match self.main.remap_types::<Str, Bytes>().get(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? { | ||||
| @@ -1562,6 +1568,41 @@ impl Index { | ||||
|         self.main.remap_key_type::<Str>().delete(txn, main_key::PROXIMITY_PRECISION) | ||||
|     } | ||||
|  | ||||
|     pub fn prefix_search(&self, txn: &RoTxn<'_>) -> heed::Result<Option<PrefixSearch>> { | ||||
|         self.main.remap_types::<Str, SerdeBincode<PrefixSearch>>().get(txn, main_key::PREFIX_SEARCH) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn put_prefix_search( | ||||
|         &self, | ||||
|         txn: &mut RwTxn<'_>, | ||||
|         val: PrefixSearch, | ||||
|     ) -> heed::Result<()> { | ||||
|         self.main.remap_types::<Str, SerdeBincode<PrefixSearch>>().put( | ||||
|             txn, | ||||
|             main_key::PREFIX_SEARCH, | ||||
|             &val, | ||||
|         ) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn delete_prefix_search(&self, txn: &mut RwTxn<'_>) -> heed::Result<bool> { | ||||
|         self.main.remap_key_type::<Str>().delete(txn, main_key::PREFIX_SEARCH) | ||||
|     } | ||||
|  | ||||
|     pub fn facet_search(&self, txn: &RoTxn<'_>) -> heed::Result<bool> { | ||||
|         self.main | ||||
|             .remap_types::<Str, SerdeBincode<bool>>() | ||||
|             .get(txn, main_key::FACET_SEARCH) | ||||
|             .map(|v| v.unwrap_or(true)) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn put_facet_search(&self, txn: &mut RwTxn<'_>, val: bool) -> heed::Result<()> { | ||||
|         self.main.remap_types::<Str, SerdeBincode<bool>>().put(txn, main_key::FACET_SEARCH, &val) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn delete_facet_search(&self, txn: &mut RwTxn<'_>) -> heed::Result<bool> { | ||||
|         self.main.remap_key_type::<Str>().delete(txn, main_key::FACET_SEARCH) | ||||
|     } | ||||
|  | ||||
|     pub fn localized_attributes_rules( | ||||
|         &self, | ||||
|         rtxn: &RoTxn<'_>, | ||||
| @@ -1647,12 +1688,9 @@ impl Index { | ||||
|         Ok(res) | ||||
|     } | ||||
|  | ||||
|     pub fn prefix_settings(&self, _rtxn: &RoTxn<'_>) -> Result<PrefixSettings> { | ||||
|         Ok(PrefixSettings { | ||||
|             compute_prefixes: true, | ||||
|             max_prefix_length: 4, | ||||
|             prefix_count_threshold: 100, | ||||
|         }) | ||||
|     pub fn prefix_settings(&self, rtxn: &RoTxn<'_>) -> Result<PrefixSettings> { | ||||
|         let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default(); | ||||
|         Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -1665,9 +1703,17 @@ pub struct IndexEmbeddingConfig { | ||||
|  | ||||
| #[derive(Debug, Deserialize, Serialize)] | ||||
| pub struct PrefixSettings { | ||||
|     pub prefix_count_threshold: u64, | ||||
|     pub prefix_count_threshold: usize, | ||||
|     pub max_prefix_length: usize, | ||||
|     pub compute_prefixes: bool, | ||||
|     pub compute_prefixes: PrefixSearch, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| pub enum PrefixSearch { | ||||
|     #[default] | ||||
|     IndexingTime, | ||||
|     Disabled, | ||||
| } | ||||
|  | ||||
| #[derive(Serialize, Deserialize)] | ||||
| @@ -1688,6 +1734,7 @@ pub(crate) mod tests { | ||||
|  | ||||
|     use crate::error::{Error, InternalError}; | ||||
|     use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; | ||||
|     use crate::progress::Progress; | ||||
|     use crate::update::new::indexer; | ||||
|     use crate::update::settings::InnerIndexSettings; | ||||
|     use crate::update::{ | ||||
| @@ -1764,7 +1811,7 @@ pub(crate) mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             )?; | ||||
|  | ||||
|             if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) { | ||||
| @@ -1775,6 +1822,7 @@ pub(crate) mod tests { | ||||
|                 indexer::index( | ||||
|                     wtxn, | ||||
|                     &self.inner, | ||||
|                     &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|                     indexer_config.grenad_parameters(), | ||||
|                     &db_fields_ids_map, | ||||
|                     new_fields_ids_map, | ||||
| @@ -1782,7 +1830,7 @@ pub(crate) mod tests { | ||||
|                     &document_changes, | ||||
|                     embedders, | ||||
|                     &|| false, | ||||
|                     &|_| (), | ||||
|                     &Progress::default(), | ||||
|                 ) | ||||
|             }) | ||||
|             .unwrap()?; | ||||
| @@ -1854,7 +1902,7 @@ pub(crate) mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             )?; | ||||
|  | ||||
|             if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) { | ||||
| @@ -1865,6 +1913,7 @@ pub(crate) mod tests { | ||||
|                 indexer::index( | ||||
|                     wtxn, | ||||
|                     &self.inner, | ||||
|                     &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|                     indexer_config.grenad_parameters(), | ||||
|                     &db_fields_ids_map, | ||||
|                     new_fields_ids_map, | ||||
| @@ -1872,7 +1921,7 @@ pub(crate) mod tests { | ||||
|                     &document_changes, | ||||
|                     embedders, | ||||
|                     &|| false, | ||||
|                     &|_| (), | ||||
|                     &Progress::default(), | ||||
|                 ) | ||||
|             }) | ||||
|             .unwrap()?; | ||||
| @@ -1934,7 +1983,7 @@ pub(crate) mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
| @@ -1945,6 +1994,7 @@ pub(crate) mod tests { | ||||
|                 indexer::index( | ||||
|                     &mut wtxn, | ||||
|                     &index.inner, | ||||
|                     &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|                     indexer_config.grenad_parameters(), | ||||
|                     &db_fields_ids_map, | ||||
|                     new_fields_ids_map, | ||||
| @@ -1952,7 +2002,7 @@ pub(crate) mod tests { | ||||
|                     &document_changes, | ||||
|                     embedders, | ||||
|                     &|| should_abort.load(Relaxed), | ||||
|                     &|_| (), | ||||
|                     &Progress::default(), | ||||
|                 ) | ||||
|             }) | ||||
|             .unwrap() | ||||
|   | ||||
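Together, `facet_search` and `prefix_search` expose the two new index-level switches: `facet_search` defaults to `true` and `prefix_search` defaults to `PrefixSearch::IndexingTime`. A sketch of how they could be used from inside the milli crate; names are taken from the hunk above, the surrounding wiring is assumed:

```rust
use heed::{RoTxn, RwTxn};

use crate::index::{Index, PrefixSearch};
use crate::Result;

// Disable both features for an index (an open write transaction is assumed).
fn disable_prefix_and_facet_search(index: &Index, wtxn: &mut RwTxn<'_>) -> Result<()> {
    index.put_prefix_search(wtxn, PrefixSearch::Disabled)?;
    index.put_facet_search(wtxn, false)?;
    Ok(())
}

// Read the settings back, falling back to the defaults described above.
fn read_settings(index: &Index, rtxn: &RoTxn<'_>) -> Result<(PrefixSearch, bool)> {
    let prefix = index.prefix_search(rtxn)?.unwrap_or_default(); // IndexingTime
    let facet = index.facet_search(rtxn)?; // already defaults to true internally
    Ok((prefix, facet))
}
```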
| @@ -1,6 +1,7 @@ | ||||
| #![cfg_attr(all(test, fuzzing), feature(no_coverage))] | ||||
| #![allow(clippy::type_complexity)] | ||||
|  | ||||
| #[cfg(not(windows))] | ||||
| #[cfg(test)] | ||||
| #[global_allocator] | ||||
| pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; | ||||
| @@ -30,6 +31,7 @@ pub mod vector; | ||||
| #[macro_use] | ||||
| pub mod snapshot_tests; | ||||
| mod fieldids_weights_map; | ||||
| pub mod progress; | ||||
|  | ||||
| use std::collections::{BTreeMap, HashMap}; | ||||
| use std::convert::{TryFrom, TryInto}; | ||||
|   | ||||
crates/milli/src/progress.rs (new file, 152 lines)
| @@ -0,0 +1,152 @@ | ||||
| use std::any::TypeId; | ||||
| use std::borrow::Cow; | ||||
| use std::sync::atomic::{AtomicU32, Ordering}; | ||||
| use std::sync::{Arc, RwLock}; | ||||
|  | ||||
| use serde::Serialize; | ||||
|  | ||||
| pub trait Step: 'static + Send + Sync { | ||||
|     fn name(&self) -> Cow<'static, str>; | ||||
|     fn current(&self) -> u32; | ||||
|     fn total(&self) -> u32; | ||||
| } | ||||
|  | ||||
| #[derive(Clone, Default)] | ||||
| pub struct Progress { | ||||
|     steps: Arc<RwLock<Vec<(TypeId, Box<dyn Step>)>>>, | ||||
| } | ||||
|  | ||||
| impl Progress { | ||||
|     pub fn update_progress<P: Step>(&self, sub_progress: P) { | ||||
|         let mut steps = self.steps.write().unwrap(); | ||||
|         let step_type = TypeId::of::<P>(); | ||||
|         if let Some(idx) = steps.iter().position(|(id, _)| *id == step_type) { | ||||
|             steps.truncate(idx); | ||||
|         } | ||||
|         steps.push((step_type, Box::new(sub_progress))); | ||||
|     } | ||||
|  | ||||
|     // TODO: This code should be in meilisearch_types but cannot because milli can't depend on meilisearch_types | ||||
|     pub fn as_progress_view(&self) -> ProgressView { | ||||
|         let steps = self.steps.read().unwrap(); | ||||
|  | ||||
|         let mut percentage = 0.0; | ||||
|         let mut prev_factors = 1.0; | ||||
|  | ||||
|         let mut step_view = Vec::with_capacity(steps.len()); | ||||
|         for (_, step) in steps.iter() { | ||||
|             prev_factors *= step.total() as f32; | ||||
|             percentage += step.current() as f32 / prev_factors; | ||||
|  | ||||
|             step_view.push(ProgressStepView { | ||||
|                 current_step: step.name(), | ||||
|                 finished: step.current(), | ||||
|                 total: step.total(), | ||||
|             }); | ||||
|         } | ||||
|  | ||||
|         ProgressView { steps: step_view, percentage: percentage * 100.0 } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// This trait lets you use the AtomicSubStep defined right below. | ||||
| /// The name must be a const that never changes, but that can't be enforced by the type system because it would make the trait non object-safe. | ||||
| /// By forcing the Default trait + the &'static str we make it harder to misuse the trait. | ||||
| pub trait NamedStep: 'static + Send + Sync + Default { | ||||
|     fn name(&self) -> &'static str; | ||||
| } | ||||
|  | ||||
| /// Structure to quickly define steps that need very quick, lockless updating of their current step. | ||||
| /// You can use this struct if: | ||||
| /// - The name of the step doesn't change | ||||
| /// - The total number of steps doesn't change | ||||
| pub struct AtomicSubStep<Name: NamedStep> { | ||||
|     unit_name: Name, | ||||
|     current: Arc<AtomicU32>, | ||||
|     total: u32, | ||||
| } | ||||
|  | ||||
| impl<Name: NamedStep> AtomicSubStep<Name> { | ||||
|     pub fn new(total: u32) -> (Arc<AtomicU32>, Self) { | ||||
|         let current = Arc::new(AtomicU32::new(0)); | ||||
|         (current.clone(), Self { current, total, unit_name: Name::default() }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<Name: NamedStep> Step for AtomicSubStep<Name> { | ||||
|     fn name(&self) -> Cow<'static, str> { | ||||
|         self.unit_name.name().into() | ||||
|     } | ||||
|  | ||||
|     fn current(&self) -> u32 { | ||||
|         self.current.load(Ordering::Relaxed) | ||||
|     } | ||||
|  | ||||
|     fn total(&self) -> u32 { | ||||
|         self.total | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[macro_export] | ||||
| macro_rules! make_enum_progress { | ||||
|     ($visibility:vis enum $name:ident { $($variant:ident,)+ }) => { | ||||
|         #[repr(u8)] | ||||
|         #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] | ||||
|         #[allow(clippy::enum_variant_names)] | ||||
|         $visibility enum $name { | ||||
|             $($variant),+ | ||||
|         } | ||||
|  | ||||
|         impl Step for $name { | ||||
|             fn name(&self) -> Cow<'static, str> { | ||||
|                 use convert_case::Casing; | ||||
|  | ||||
|                 match self { | ||||
|                     $( | ||||
|                         $name::$variant => stringify!($variant).from_case(convert_case::Case::Camel).to_case(convert_case::Case::Lower).into() | ||||
|                     ),+ | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             fn current(&self) -> u32 { | ||||
|                 *self as u32 | ||||
|             } | ||||
|  | ||||
|             fn total(&self) -> u32 { | ||||
|                 Self::CARDINALITY as u32 | ||||
|             } | ||||
|         } | ||||
|     }; | ||||
| } | ||||
|  | ||||
| #[macro_export] | ||||
| macro_rules! make_atomic_progress { | ||||
|     ($struct_name:ident alias $atomic_struct_name:ident => $step_name:literal) => { | ||||
|         #[derive(Default, Debug, Clone, Copy)] | ||||
|         pub struct $struct_name {} | ||||
|         impl NamedStep for $struct_name { | ||||
|             fn name(&self) -> &'static str { | ||||
|                 $step_name | ||||
|             } | ||||
|         } | ||||
|         pub type $atomic_struct_name = AtomicSubStep<$struct_name>; | ||||
|     }; | ||||
| } | ||||
|  | ||||
| make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); | ||||
| make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" ); | ||||
|  | ||||
| #[derive(Debug, Serialize, Clone)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| pub struct ProgressView { | ||||
|     pub steps: Vec<ProgressStepView>, | ||||
|     pub percentage: f32, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Serialize, Clone)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| pub struct ProgressStepView { | ||||
|     pub current_step: Cow<'static, str>, | ||||
|     pub finished: u32, | ||||
|     pub total: u32, | ||||
| } | ||||
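A hedged sketch of driving this new API from a caller: register the per-document atomic sub-step, bump its counter while working, and take a serializable snapshot at any point. The import path follows the `pub mod progress;` added to lib.rs below; the surrounding indexing loop is an assumption:

```rust
use std::sync::atomic::Ordering;

use milli::progress::{AtomicDocumentStep, Progress};

fn index_documents(documents: &[&str]) {
    let progress = Progress::default();

    // `AtomicDocumentStep` is the "document" step declared by `make_atomic_progress!` above.
    let (counter, step) = AtomicDocumentStep::new(documents.len() as u32);
    progress.update_progress(step);

    for _document in documents {
        // ... indexing work for one document ...
        counter.fetch_add(1, Ordering::Relaxed);
    }

    // Serializable view, e.g. to report through the tasks API.
    let view = progress.as_progress_view();
    println!("{:.1}% done", view.percentage);
}
```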
| @@ -3,12 +3,13 @@ use std::collections::BTreeMap; | ||||
| use std::fmt::{self, Debug}; | ||||
|  | ||||
| use bumpalo::Bump; | ||||
| use bumparaw_collections::{RawMap, RawVec, Value}; | ||||
| use liquid::model::{ | ||||
|     ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State, | ||||
|     Value as LiquidValue, | ||||
| }; | ||||
| use liquid::{ObjectView, ValueView}; | ||||
| use raw_collections::{RawMap, RawVec}; | ||||
| use rustc_hash::FxBuildHasher; | ||||
| use serde_json::value::RawValue; | ||||
|  | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd}; | ||||
| @@ -195,7 +196,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc | ||||
| } | ||||
|  | ||||
| impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> { | ||||
|     fn as_debug(&self) -> &dyn fmt::Debug { | ||||
|     fn as_debug(&self) -> &dyn Debug { | ||||
|         self | ||||
|     } | ||||
|     fn render(&self) -> liquid::model::DisplayCow<'_> { | ||||
| @@ -243,14 +244,13 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| struct ParseableValue<'doc> { | ||||
|     value: raw_collections::Value<'doc>, | ||||
|     value: Value<'doc, FxBuildHasher>, | ||||
| } | ||||
|  | ||||
| impl<'doc> ParseableValue<'doc> { | ||||
|     pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self { | ||||
|         let value = raw_collections::Value::from_raw_value(value, doc_alloc).unwrap(); | ||||
|         let value = Value::from_raw_value_and_hasher(value, FxBuildHasher, doc_alloc).unwrap(); | ||||
|         Self { value } | ||||
|     } | ||||
|  | ||||
| @@ -260,19 +260,19 @@ impl<'doc> ParseableValue<'doc> { | ||||
| } | ||||
|  | ||||
| // transparent newtype for implementing ValueView | ||||
| #[repr(transparent)] | ||||
| #[derive(Debug)] | ||||
| struct ParseableMap<'doc>(RawMap<'doc>); | ||||
| #[repr(transparent)] | ||||
| struct ParseableMap<'doc>(RawMap<'doc, FxBuildHasher>); | ||||
|  | ||||
| // transparent newtype for implementing ValueView | ||||
| #[repr(transparent)] | ||||
| #[derive(Debug)] | ||||
| #[repr(transparent)] | ||||
| struct ParseableArray<'doc>(RawVec<'doc>); | ||||
|  | ||||
| impl<'doc> ParseableMap<'doc> { | ||||
|     pub fn as_parseable<'a>(map: &'a RawMap<'doc>) -> &'a ParseableMap<'doc> { | ||||
|     pub fn as_parseable<'a>(map: &'a RawMap<'doc, FxBuildHasher>) -> &'a ParseableMap<'doc> { | ||||
|         // SAFETY: repr(transparent) | ||||
|         unsafe { &*(map as *const RawMap as *const Self) } | ||||
|         unsafe { &*(map as *const RawMap<FxBuildHasher> as *const Self) } | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -447,8 +447,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { | ||||
|     } | ||||
|  | ||||
|     fn render(&self) -> DisplayCow<'_> { | ||||
|         use raw_collections::value::Number; | ||||
|         use raw_collections::Value; | ||||
|         use bumparaw_collections::value::Number; | ||||
|         use bumparaw_collections::Value; | ||||
|  | ||||
|         match &self.value { | ||||
|             Value::Null => LiquidValue::Nil.render(), | ||||
|             Value::Bool(v) => v.render(), | ||||
| @@ -464,8 +465,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { | ||||
|     } | ||||
|  | ||||
|     fn source(&self) -> DisplayCow<'_> { | ||||
|         use raw_collections::value::Number; | ||||
|         use raw_collections::Value; | ||||
|         use bumparaw_collections::value::Number; | ||||
|         use bumparaw_collections::Value; | ||||
|  | ||||
|         match &self.value { | ||||
|             Value::Null => LiquidValue::Nil.source(), | ||||
|             Value::Bool(v) => ValueView::source(v), | ||||
| @@ -481,8 +483,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { | ||||
|     } | ||||
|  | ||||
|     fn type_name(&self) -> &'static str { | ||||
|         use raw_collections::value::Number; | ||||
|         use raw_collections::Value; | ||||
|         use bumparaw_collections::value::Number; | ||||
|         use bumparaw_collections::Value; | ||||
|  | ||||
|         match &self.value { | ||||
|             Value::Null => LiquidValue::Nil.type_name(), | ||||
|             Value::Bool(v) => v.type_name(), | ||||
| @@ -498,7 +501,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { | ||||
|     } | ||||
|  | ||||
|     fn query_state(&self, state: State) -> bool { | ||||
|         use raw_collections::Value; | ||||
|         use bumparaw_collections::Value; | ||||
|  | ||||
|         match &self.value { | ||||
|             Value::Null => ValueView::query_state(&LiquidValue::Nil, state), | ||||
|             Value::Bool(v) => ValueView::query_state(v, state), | ||||
| @@ -515,7 +519,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { | ||||
|     } | ||||
|  | ||||
|     fn to_kstr(&self) -> KStringCow<'_> { | ||||
|         use raw_collections::Value; | ||||
|         use bumparaw_collections::Value; | ||||
|  | ||||
|         match &self.value { | ||||
|             Value::Null => ValueView::to_kstr(&LiquidValue::Nil), | ||||
|             Value::Bool(v) => ValueView::to_kstr(v), | ||||
| @@ -527,12 +532,14 @@ impl<'doc> ValueView for ParseableValue<'doc> { | ||||
|     } | ||||
|  | ||||
|     fn to_value(&self) -> LiquidValue { | ||||
|         use raw_collections::Value; | ||||
|         use bumparaw_collections::value::Number; | ||||
|         use bumparaw_collections::Value; | ||||
|  | ||||
|         match &self.value { | ||||
|             Value::Null => LiquidValue::Nil, | ||||
|             Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)), | ||||
|             Value::Number(number) => match number { | ||||
|                 raw_collections::value::Number::PosInt(number) => { | ||||
|                 Number::PosInt(number) => { | ||||
|                     let number: i64 = match (*number).try_into() { | ||||
|                         Ok(number) => number, | ||||
|                         Err(_) => { | ||||
| @@ -541,12 +548,8 @@ impl<'doc> ValueView for ParseableValue<'doc> { | ||||
|                     }; | ||||
|                     LiquidValue::Scalar(ScalarCow::new(number)) | ||||
|                 } | ||||
|                 raw_collections::value::Number::NegInt(number) => { | ||||
|                     LiquidValue::Scalar(ScalarCow::new(*number)) | ||||
|                 } | ||||
|                 raw_collections::value::Number::Finite(number) => { | ||||
|                     LiquidValue::Scalar(ScalarCow::new(*number)) | ||||
|                 } | ||||
|                 Number::NegInt(number) => LiquidValue::Scalar(ScalarCow::new(*number)), | ||||
|                 Number::Finite(number) => LiquidValue::Scalar(ScalarCow::new(*number)), | ||||
|             }, | ||||
|             Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())), | ||||
|             Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(), | ||||
| @@ -555,8 +558,9 @@ impl<'doc> ValueView for ParseableValue<'doc> { | ||||
|     } | ||||
|  | ||||
|     fn as_scalar(&self) -> Option<liquid::model::ScalarCow<'_>> { | ||||
|         use raw_collections::value::Number; | ||||
|         use raw_collections::Value; | ||||
|         use bumparaw_collections::value::Number; | ||||
|         use bumparaw_collections::Value; | ||||
|  | ||||
|         match &self.value { | ||||
|             Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)), | ||||
|             Value::Number(number) => match number { | ||||
| @@ -576,34 +580,41 @@ impl<'doc> ValueView for ParseableValue<'doc> { | ||||
|     } | ||||
|  | ||||
|     fn is_scalar(&self) -> bool { | ||||
|         use raw_collections::Value; | ||||
|         use bumparaw_collections::Value; | ||||
|  | ||||
|         matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_)) | ||||
|     } | ||||
|  | ||||
|     fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> { | ||||
|         if let raw_collections::Value::Array(array) = &self.value { | ||||
|         if let Value::Array(array) = &self.value { | ||||
|             return Some(ParseableArray::as_parseable(array) as _); | ||||
|         } | ||||
|         None | ||||
|     } | ||||
|  | ||||
|     fn is_array(&self) -> bool { | ||||
|         matches!(&self.value, raw_collections::Value::Array(_)) | ||||
|         matches!(&self.value, bumparaw_collections::Value::Array(_)) | ||||
|     } | ||||
|  | ||||
|     fn as_object(&self) -> Option<&dyn ObjectView> { | ||||
|         if let raw_collections::Value::Object(object) = &self.value { | ||||
|         if let Value::Object(object) = &self.value { | ||||
|             return Some(ParseableMap::as_parseable(object) as _); | ||||
|         } | ||||
|         None | ||||
|     } | ||||
|  | ||||
|     fn is_object(&self) -> bool { | ||||
|         matches!(&self.value, raw_collections::Value::Object(_)) | ||||
|         matches!(&self.value, bumparaw_collections::Value::Object(_)) | ||||
|     } | ||||
|  | ||||
|     fn is_nil(&self) -> bool { | ||||
|         matches!(&self.value, raw_collections::Value::Null) | ||||
|         matches!(&self.value, bumparaw_collections::Value::Null) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Debug for ParseableValue<'_> { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||||
|         f.debug_struct("ParseableValue").field("value", &self.value).finish() | ||||
|     } | ||||
| } | ||||
|  | ||||
|   | ||||
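The crate previously pulled in as `raw-collections` is now consumed as `bumparaw-collections` (see the Cargo.toml hunk above), and its `Value` takes an explicit hasher, here rustc-hash's `FxBuildHasher`. A small sketch of parsing a raw JSON value with the new signature, based only on what the diff shows; the printing logic is illustrative:

```rust
use bumpalo::Bump;
use bumparaw_collections::Value;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue;

fn describe(raw: &RawValue, doc_alloc: &Bump) {
    // Constructor name and hasher argument taken from the diff above.
    let value = Value::from_raw_value_and_hasher(raw, FxBuildHasher, doc_alloc).unwrap();
    match value {
        Value::Null => println!("null"),
        Value::Bool(b) => println!("bool: {b}"),
        Value::Number(_) => println!("number"),
        Value::String(s) => println!("string: {s}"),
        Value::Array(_) => println!("array"),
        Value::Object(_) => println!("object"),
    }
}
```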
| @@ -38,6 +38,16 @@ pub struct RenderPromptError { | ||||
|     pub fault: FaultSource, | ||||
| } | ||||
| impl RenderPromptError { | ||||
|     pub(crate) fn missing_context_with_external_docid( | ||||
|         external_docid: String, | ||||
|         inner: liquid::Error, | ||||
|     ) -> RenderPromptError { | ||||
|         Self { | ||||
|             kind: RenderPromptErrorKind::MissingContextWithExternalDocid(external_docid, inner), | ||||
|             fault: FaultSource::User, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn missing_context(inner: liquid::Error) -> RenderPromptError { | ||||
|         Self { kind: RenderPromptErrorKind::MissingContext(inner), fault: FaultSource::User } | ||||
|     } | ||||
| @@ -47,6 +57,8 @@ impl RenderPromptError { | ||||
| pub enum RenderPromptErrorKind { | ||||
|     #[error("missing field in document: {0}")] | ||||
|     MissingContext(liquid::Error), | ||||
|     #[error("missing field in document `{0}`: {1}")] | ||||
|     MissingContextWithExternalDocid(String, liquid::Error), | ||||
| } | ||||
|  | ||||
| impl From<RenderPromptError> for crate::Error { | ||||
|   | ||||
| @@ -119,6 +119,7 @@ impl Prompt { | ||||
|         'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents | ||||
|     >( | ||||
|         &self, | ||||
|         external_docid: &str, | ||||
|         document: impl crate::update::new::document::Document<'a> + Debug, | ||||
|         field_id_map: &RefCell<GlobalFieldsIdsMap>, | ||||
|         doc_alloc: &'doc Bump, | ||||
| @@ -130,9 +131,12 @@ impl Prompt { | ||||
|             self.max_bytes.unwrap_or_else(default_max_bytes).get(), | ||||
|             doc_alloc, | ||||
|         ); | ||||
|         self.template | ||||
|             .render_to(&mut rendered, &context) | ||||
|             .map_err(RenderPromptError::missing_context)?; | ||||
|         self.template.render_to(&mut rendered, &context).map_err(|liquid_error| { | ||||
|             RenderPromptError::missing_context_with_external_docid( | ||||
|                 external_docid.to_owned(), | ||||
|                 liquid_error, | ||||
|             ) | ||||
|         })?; | ||||
|         Ok(std::str::from_utf8(rendered.into_bump_slice()) | ||||
|             .expect("render can only write UTF-8 because all inputs and processing preserve utf-8")) | ||||
|     } | ||||
|   | ||||
| @@ -207,7 +207,11 @@ impl<'a> Search<'a> { | ||||
|                     Ok(embedding) => embedding, | ||||
|                     Err(error) => { | ||||
|                         tracing::error!(error=%error, "Embedding failed"); | ||||
|                         return Ok((keyword_results, Some(0))); | ||||
|                         return Ok(return_keyword_results( | ||||
|                             self.limit, | ||||
|                             self.offset, | ||||
|                             keyword_results, | ||||
|                         )); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|   | ||||
| @@ -274,7 +274,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | ||||
|                     last_match_last_token_position_plus_one | ||||
|                 } else { | ||||
|                     // we have matched the end of possible tokens, there's nothing to advance | ||||
|                     tokens.len() - 1 | ||||
|                     tokens.len() | ||||
|                 } | ||||
|             }; | ||||
|  | ||||
|   | ||||
| @@ -49,6 +49,7 @@ pub use self::geo_sort::Strategy as GeoSortStrategy; | ||||
| use self::graph_based_ranking_rule::Words; | ||||
| use self::interner::Interned; | ||||
| use self::vector_sort::VectorSort; | ||||
| use crate::index::PrefixSearch; | ||||
| use crate::localized_attributes_rules::LocalizedFieldIds; | ||||
| use crate::score_details::{ScoreDetails, ScoringStrategy}; | ||||
| use crate::search::new::distinct::apply_distinct_rule; | ||||
| @@ -68,6 +69,7 @@ pub struct SearchContext<'ctx> { | ||||
|     pub term_interner: Interner<QueryTerm>, | ||||
|     pub phrase_docids: PhraseDocIdsCache, | ||||
|     pub restricted_fids: Option<RestrictedFids>, | ||||
|     pub prefix_search: PrefixSearch, | ||||
| } | ||||
|  | ||||
| impl<'ctx> SearchContext<'ctx> { | ||||
| @@ -85,6 +87,8 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         let prefix_search = index.prefix_search(txn)?.unwrap_or_default(); | ||||
|  | ||||
|         Ok(Self { | ||||
|             index, | ||||
|             txn, | ||||
| @@ -94,9 +98,14 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|             term_interner: <_>::default(), | ||||
|             phrase_docids: <_>::default(), | ||||
|             restricted_fids: None, | ||||
|             prefix_search, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     pub fn is_prefix_search_allowed(&self) -> bool { | ||||
|         self.prefix_search != PrefixSearch::Disabled | ||||
|     } | ||||
|  | ||||
|     pub fn attributes_to_search_on( | ||||
|         &mut self, | ||||
|         attributes_to_search_on: &'ctx [String], | ||||
|   | ||||
| @@ -28,6 +28,7 @@ pub fn located_query_terms_from_tokens( | ||||
|     words_limit: Option<usize>, | ||||
| ) -> Result<ExtractedTokens> { | ||||
|     let nbr_typos = number_of_typos_allowed(ctx)?; | ||||
|     let allow_prefix_search = ctx.is_prefix_search_allowed(); | ||||
|  | ||||
|     let mut query_terms = Vec::new(); | ||||
|  | ||||
| @@ -94,7 +95,7 @@ pub fn located_query_terms_from_tokens( | ||||
|                         ctx, | ||||
|                         word, | ||||
|                         nbr_typos(word), | ||||
|                         true, | ||||
|                         allow_prefix_search, | ||||
|                         false, | ||||
|                     )?; | ||||
|                     let located_term = LocatedQueryTerm { | ||||
|   | ||||
| @@ -193,15 +193,23 @@ pub fn compute_phrase_docids( | ||||
|     if words.is_empty() { | ||||
|         return Ok(RoaringBitmap::new()); | ||||
|     } | ||||
|     let mut candidates = RoaringBitmap::new(); | ||||
|     let mut candidates = None; | ||||
|     for word in words.iter().flatten().copied() { | ||||
|         if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? { | ||||
|             candidates |= word_docids; | ||||
|             if let Some(candidates) = candidates.as_mut() { | ||||
|                 *candidates &= word_docids; | ||||
|             } else { | ||||
|                 candidates = Some(word_docids); | ||||
|             } | ||||
|         } else { | ||||
|             return Ok(RoaringBitmap::new()); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     let Some(mut candidates) = candidates else { | ||||
|         return Ok(RoaringBitmap::new()); | ||||
|     }; | ||||
|  | ||||
|     let winsize = words.len().min(3); | ||||
|  | ||||
|     for win in words.windows(winsize) { | ||||
|   | ||||
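The old code unioned the word bitmaps starting from an empty set, which over-approximated phrase candidates; the fix intersects them, seeding from `None` so the first word initializes the accumulator. A standalone illustration of the corrected combinator on roaring bitmaps; the helper name is illustrative:

```rust
use roaring::RoaringBitmap;

// Candidates for a phrase are documents containing *every* word: intersect,
// starting from `None` so an empty accumulator doesn't swallow the first set.
fn phrase_candidates(word_docids: &[RoaringBitmap]) -> RoaringBitmap {
    let mut candidates: Option<RoaringBitmap> = None;
    for docids in word_docids {
        match candidates.as_mut() {
            Some(acc) => *acc &= docids,
            None => candidates = Some(docids.clone()),
        }
    }
    candidates.unwrap_or_default()
}

fn main() {
    let a: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
    let b: RoaringBitmap = [2u32, 3, 4].into_iter().collect();
    assert_eq!(phrase_candidates(&[a, b]).iter().collect::<Vec<_>>(), vec![2, 3]);
}
```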
| @@ -5,6 +5,7 @@ use bumpalo::Bump; | ||||
| use heed::EnvOpenOptions; | ||||
| use maplit::{btreemap, hashset}; | ||||
|  | ||||
| use crate::progress::Progress; | ||||
| use crate::update::new::indexer; | ||||
| use crate::update::{IndexDocumentsMethod, IndexerConfig, Settings}; | ||||
| use crate::vector::EmbeddingConfigs; | ||||
| @@ -72,7 +73,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | ||||
|             None, | ||||
|             &mut new_fields_ids_map, | ||||
|             &|| false, | ||||
|             &|_progress| (), | ||||
|             Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|  | ||||
| @@ -83,6 +84,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | ||||
|     indexer::index( | ||||
|         &mut wtxn, | ||||
|         &index, | ||||
|         &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|         config.grenad_parameters(), | ||||
|         &db_fields_ids_map, | ||||
|         new_fields_ids_map, | ||||
| @@ -90,7 +92,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | ||||
|         &document_changes, | ||||
|         embedders, | ||||
|         &|| false, | ||||
|         &|_| (), | ||||
|         &Progress::default(), | ||||
|     ) | ||||
|     .unwrap(); | ||||
|  | ||||
|   | ||||
| @@ -172,6 +172,14 @@ impl<'i> FacetsUpdate<'i> { | ||||
|             incremental_update.execute(wtxn)?; | ||||
|         } | ||||
|  | ||||
|         if !self.index.facet_search(wtxn)? { | ||||
|             // If facet search is disabled, we don't need to compute facet search databases. | ||||
|             // We clear the facet search databases. | ||||
|             self.index.facet_id_string_fst.clear(wtxn)?; | ||||
|             self.index.facet_id_normalized_string_strings.clear(wtxn)?; | ||||
|             return Ok(()); | ||||
|         } | ||||
|  | ||||
|         match self.normalized_delta_data { | ||||
|             Some(data) => index_facet_search(wtxn, data, self.index), | ||||
|             None => Ok(()), | ||||
|   | ||||
| @@ -58,9 +58,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | ||||
|         .map(|s| s.iter().map(String::as_str).collect()); | ||||
|     let old_dictionary: Option<Vec<_>> = | ||||
|         settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); | ||||
|     let del_builder = | ||||
|     let mut del_builder = | ||||
|         tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref()); | ||||
|     let del_tokenizer = del_builder.into_tokenizer(); | ||||
|     let del_tokenizer = del_builder.build(); | ||||
|  | ||||
|     let new_stop_words = settings_diff.new.stop_words.as_ref(); | ||||
|     let new_separators: Option<Vec<_>> = settings_diff | ||||
| @@ -70,9 +70,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | ||||
|         .map(|s| s.iter().map(String::as_str).collect()); | ||||
|     let new_dictionary: Option<Vec<_>> = | ||||
|         settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); | ||||
|     let add_builder = | ||||
|     let mut add_builder = | ||||
|         tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref()); | ||||
|     let add_tokenizer = add_builder.into_tokenizer(); | ||||
|     let add_tokenizer = add_builder.build(); | ||||
|  | ||||
|     // iterate over documents. | ||||
|     let mut cursor = obkv_documents.into_cursor()?; | ||||
|   | ||||
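This hunk adapts to charabia 0.9.2 (see the Cargo.toml change above): the diff switches to `build()`, which borrows the builder (hence the new `mut` bindings), instead of the consuming `into_tokenizer()`. A minimal sketch against charabia directly, independent of milli's `tokenizer_builder` helper and offered as an assumption about the upstream API rather than code from this PR:

```rust
use charabia::TokenizerBuilder;

fn lemmas(text: &str) -> Vec<String> {
    let mut builder = TokenizerBuilder::new();
    let tokenizer = builder.build(); // borrows `builder`, so it must be `mut`
    tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
}
```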
| @@ -34,10 +34,12 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | ||||
|         extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff) | ||||
|     } else { | ||||
|         let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids; | ||||
|         let facet_search = settings_diff.new.facet_search; | ||||
|         extract_facet_string_docids_document_update( | ||||
|             docid_fid_facet_string, | ||||
|             indexer, | ||||
|             localized_field_ids, | ||||
|             facet_search, | ||||
|         ) | ||||
|     } | ||||
| } | ||||
| @@ -51,6 +53,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>( | ||||
|     docid_fid_facet_string: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
|     localized_field_ids: &LocalizedFieldIds, | ||||
|     facet_search: bool, | ||||
| ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
| @@ -96,7 +99,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>( | ||||
|         let normalized_value = str::from_utf8(normalized_value_bytes)?; | ||||
|  | ||||
|         // Facet search normalization | ||||
|         { | ||||
|         if facet_search { | ||||
|             let locales = localized_field_ids.locales(field_id); | ||||
|             let hyper_normalized_value = normalize_facet_string(normalized_value, locales); | ||||
|  | ||||
| @@ -179,8 +182,10 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>( | ||||
|         let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); | ||||
|  | ||||
|         let are_same_locales = old_locales == new_locales; | ||||
|         let reindex_facet_search = | ||||
|             settings_diff.new.facet_search && !settings_diff.old.facet_search; | ||||
|  | ||||
|         if is_same_value && are_same_locales { | ||||
|         if is_same_value && are_same_locales && !reindex_facet_search { | ||||
|             continue; | ||||
|         } | ||||
|  | ||||
| @@ -191,18 +196,26 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>( | ||||
|         let normalized_value = str::from_utf8(normalized_value_bytes)?; | ||||
|  | ||||
|         // Facet search normalization | ||||
|         { | ||||
|             let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); | ||||
|             let new_hyper_normalized_value = if are_same_locales { | ||||
|                 &old_hyper_normalized_value | ||||
|         if settings_diff.new.facet_search { | ||||
|             let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales); | ||||
|             let old_hyper_normalized_value; | ||||
|             let old_hyper_normalized_value = if !settings_diff.old.facet_search | ||||
|                 || deladd_reader.get(DelAdd::Deletion).is_none() | ||||
|             { | ||||
|                 // if the facet search is disabled in the old settings or if no facet string is deleted, | ||||
|                 // we don't need to normalize the facet string. | ||||
|                 None | ||||
|             } else if are_same_locales { | ||||
|                 Some(&new_hyper_normalized_value) | ||||
|             } else { | ||||
|                 &normalize_facet_string(normalized_value, new_locales) | ||||
|                 old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales); | ||||
|                 Some(&old_hyper_normalized_value) | ||||
|             }; | ||||
|  | ||||
|             let set = BTreeSet::from_iter(std::iter::once(normalized_value)); | ||||
|  | ||||
|             // if the facet string is the same, we can put the deletion and addition in the same obkv. | ||||
|             if old_hyper_normalized_value == new_hyper_normalized_value.as_str() { | ||||
|             if old_hyper_normalized_value == Some(&new_hyper_normalized_value) { | ||||
|                 // nothing to do if we delete and re-add the value. | ||||
|                 if is_same_value { | ||||
|                     continue; | ||||
| @@ -222,7 +235,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>( | ||||
|             } else { | ||||
|                 // if the facet string is different, we need to insert the deletion and addition in different obkv because the related key is different. | ||||
|                 // deletion | ||||
|                 if deladd_reader.get(DelAdd::Deletion).is_some() { | ||||
|                 if let Some(old_hyper_normalized_value) = old_hyper_normalized_value { | ||||
|                     // insert old value | ||||
|                     let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; | ||||
|                     buffer.clear(); | ||||
|   | ||||
| @@ -80,7 +80,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | ||||
|     let new_faceted_fids: BTreeSet<_> = | ||||
|         settings_diff.new.faceted_fields_ids.iter().copied().collect(); | ||||
|  | ||||
|     if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids { | ||||
|     if !settings_diff.settings_update_only || settings_diff.reindex_facets() { | ||||
|         let mut cursor = obkv_documents.into_cursor()?; | ||||
|         while let Some((docid_bytes, value)) = cursor.move_on_next()? { | ||||
|             let obkv = obkv::KvReader::from_slice(value); | ||||
| @@ -112,8 +112,10 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | ||||
|                         (field_id, None, add_value) | ||||
|                     } | ||||
|                     EitherOrBoth::Both(&field_id, _) => { | ||||
|                         // during settings update, recompute the changing settings only. | ||||
|                         if settings_diff.settings_update_only { | ||||
|                         // during settings update, recompute the changing settings only unless a global change is detected. | ||||
|                         if settings_diff.settings_update_only | ||||
|                             && !settings_diff.global_facet_settings_changed() | ||||
|                         { | ||||
|                             continue; | ||||
|                         } | ||||
|  | ||||
|   | ||||
| @@ -29,6 +29,7 @@ pub use self::transform::{Transform, TransformOutput}; | ||||
| use super::new::StdResult; | ||||
| use crate::documents::{obkv_to_object, DocumentsBatchReader}; | ||||
| use crate::error::{Error, InternalError}; | ||||
| use crate::index::{PrefixSearch, PrefixSettings}; | ||||
| use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder; | ||||
| pub use crate::update::index_documents::helpers::CursorClonableMmap; | ||||
| use crate::update::{ | ||||
| @@ -82,8 +83,6 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> { | ||||
|  | ||||
| #[derive(Default, Debug, Clone)] | ||||
| pub struct IndexDocumentsConfig { | ||||
|     pub words_prefix_threshold: Option<u32>, | ||||
|     pub max_prefix_length: Option<usize>, | ||||
|     pub words_positions_level_group_size: Option<NonZeroU32>, | ||||
|     pub words_positions_min_level_size: Option<NonZeroU32>, | ||||
|     pub update_method: IndexDocumentsMethod, | ||||
| @@ -565,14 +564,32 @@ where | ||||
|             self.index.words_prefixes_fst(self.wtxn)?.map_data(|cow| cow.into_owned())?; | ||||
|  | ||||
|         // Run the words prefixes update operation. | ||||
|         let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); | ||||
|         if let Some(value) = self.config.words_prefix_threshold { | ||||
|             builder.threshold(value); | ||||
|         let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } = | ||||
|             self.index.prefix_settings(self.wtxn)?; | ||||
|  | ||||
|         // If the prefix search is enabled at indexing time, we compute the prefixes. | ||||
|         if compute_prefixes == PrefixSearch::IndexingTime { | ||||
|             let mut builder = WordsPrefixesFst::new(self.wtxn, self.index); | ||||
|             builder.threshold(prefix_count_threshold); | ||||
|             builder.max_prefix_length(max_prefix_length); | ||||
|             builder.execute()?; | ||||
|         } else { | ||||
|             // If the prefix search is disabled at indexing time, we delete the previous words prefixes fst. | ||||
|             // And all the associated docids databases. | ||||
|             self.index.delete_words_prefixes_fst(self.wtxn)?; | ||||
|             self.index.word_prefix_docids.clear(self.wtxn)?; | ||||
|             self.index.exact_word_prefix_docids.clear(self.wtxn)?; | ||||
|             self.index.word_prefix_position_docids.clear(self.wtxn)?; | ||||
|             self.index.word_prefix_fid_docids.clear(self.wtxn)?; | ||||
|  | ||||
|             databases_seen += 3; | ||||
|             (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|                 databases_seen, | ||||
|                 total_databases: TOTAL_POSTING_DATABASE_COUNT, | ||||
|             }); | ||||
|  | ||||
|             return Ok(()); | ||||
|         } | ||||
|         if let Some(value) = self.config.max_prefix_length { | ||||
|             builder.max_prefix_length(value); | ||||
|         } | ||||
|         builder.execute()?; | ||||
|  | ||||
|         if (self.should_abort)() { | ||||
|             return Err(Error::InternalError(InternalError::AbortedIndexation)); | ||||
| @@ -749,6 +766,7 @@ mod tests { | ||||
|     use crate::documents::mmap_from_objects; | ||||
|     use crate::index::tests::TempIndex; | ||||
|     use crate::index::IndexEmbeddingConfig; | ||||
|     use crate::progress::Progress; | ||||
|     use crate::search::TermsMatchingStrategy; | ||||
|     use crate::update::new::indexer; | ||||
|     use crate::update::Setting; | ||||
| @@ -1947,7 +1965,7 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
| @@ -2131,13 +2149,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2145,7 +2164,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -2192,13 +2211,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2206,7 +2226,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -2244,13 +2264,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2258,7 +2279,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -2295,13 +2316,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2309,7 +2331,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -2348,13 +2370,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2362,7 +2385,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -2406,13 +2429,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2420,7 +2444,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -2457,13 +2481,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2471,7 +2496,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -2508,13 +2533,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2522,7 +2548,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -2701,13 +2727,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2715,7 +2742,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -2759,13 +2786,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2773,7 +2801,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
| @@ -2814,13 +2842,14 @@ mod tests { | ||||
|                 None, | ||||
|                 &mut new_fields_ids_map, | ||||
|                 &|| false, | ||||
|                 &|_progress| (), | ||||
|                 Progress::default(), | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|         indexer::index( | ||||
|             &mut wtxn, | ||||
|             &index.inner, | ||||
|             &crate::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|             indexer_config.grenad_parameters(), | ||||
|             &db_fields_ids_map, | ||||
|             new_fields_ids_map, | ||||
| @@ -2828,7 +2857,7 @@ mod tests { | ||||
|             &document_changes, | ||||
|             embedders, | ||||
|             &|| false, | ||||
|             &|_| (), | ||||
|             &Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|   | ||||
| @@ -667,14 +667,23 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; | ||||
|  | ||||
|         // If only a faceted field has been added, keep only this field. | ||||
|         let must_reindex_facets = settings_diff.reindex_facets(); | ||||
|         let necessary_faceted_field = |id: FieldId| -> bool { | ||||
|             let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); | ||||
|             must_reindex_facets | ||||
|                 && modified_faceted_fields | ||||
|                     .iter() | ||||
|                     .any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long)) | ||||
|         }; | ||||
|         let global_facet_settings_changed = settings_diff.global_facet_settings_changed(); | ||||
|         let facet_fids_changed = settings_diff.facet_fids_changed(); | ||||
|         let necessary_faceted_field = | ||||
|             |id: FieldId| -> bool { | ||||
|                 let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); | ||||
|                 if global_facet_settings_changed { | ||||
|                     settings_diff.new.user_defined_faceted_fields.iter().any(|long| { | ||||
|                         is_faceted_by(long, field_name) || is_faceted_by(field_name, long) | ||||
|                     }) | ||||
|                 } else if facet_fids_changed { | ||||
|                     modified_faceted_fields.iter().any(|long| { | ||||
|                         is_faceted_by(long, field_name) || is_faceted_by(field_name, long) | ||||
|                     }) | ||||
|                 } else { | ||||
|                     false | ||||
|                 } | ||||
|             }; | ||||
|  | ||||
|         // Always provide all fields when vectors are involved because | ||||
|         // we need the fields for the prompt/templating. | ||||
|   | ||||
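The `necessary_faceted_field` closure above encodes two distinct reindexing triggers. The sketch below mirrors that decision with plain string slices instead of `FieldId`s and the settings diff; `is_faceted_by` is re-implemented here on a naive dot-boundary rule and is an assumption, not the actual milli helper.

```rust
/// Naive stand-in: `field` is faceted by `facet` if they are equal or if
/// `facet` is a dotted-path prefix of `field` (e.g. "release_date" facets
/// "release_date.year").
fn is_faceted_by(field: &str, facet: &str) -> bool {
    field == facet
        || (field.starts_with(facet) && field[facet.len()..].starts_with('.'))
}

/// Mirror of the two-trigger decision: a global facet-setting change makes
/// every user-defined faceted field necessary, while a change of the faceted
/// field set only makes the modified fields necessary.
fn necessary_faceted_field(
    field_name: &str,
    global_facet_settings_changed: bool,
    facet_fids_changed: bool,
    user_defined_faceted_fields: &[&str],
    modified_faceted_fields: &[&str],
) -> bool {
    if global_facet_settings_changed {
        user_defined_faceted_fields
            .iter()
            .any(|f| is_faceted_by(f, field_name) || is_faceted_by(field_name, f))
    } else if facet_fids_changed {
        modified_faceted_fields
            .iter()
            .any(|f| is_faceted_by(f, field_name) || is_faceted_by(field_name, f))
    } else {
        false
    }
}
```

For example, with `facet_fids_changed` set and `modified_faceted_fields = &["release_date"]`, the field `"release_date.year"` is reported as necessary.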
File diff suppressed because it is too large
							| @@ -1,7 +1,8 @@ | ||||
| use std::collections::{BTreeMap, BTreeSet}; | ||||
|  | ||||
| use bumparaw_collections::RawMap; | ||||
| use heed::RoTxn; | ||||
| use raw_collections::RawMap; | ||||
| use rustc_hash::FxBuildHasher; | ||||
| use serde_json::value::RawValue; | ||||
|  | ||||
| use super::vector_document::VectorDocument; | ||||
| @@ -385,12 +386,12 @@ pub type Entry<'doc> = (&'doc str, &'doc RawValue); | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct Versions<'doc> { | ||||
|     data: RawMap<'doc>, | ||||
|     data: RawMap<'doc, FxBuildHasher>, | ||||
| } | ||||
|  | ||||
| impl<'doc> Versions<'doc> { | ||||
|     pub fn multiple( | ||||
|         mut versions: impl Iterator<Item = Result<RawMap<'doc>>>, | ||||
|         mut versions: impl Iterator<Item = Result<RawMap<'doc, FxBuildHasher>>>, | ||||
|     ) -> Result<Option<Self>> { | ||||
|         let Some(data) = versions.next() else { return Ok(None) }; | ||||
|         let mut data = data?; | ||||
| @@ -403,7 +404,7 @@ impl<'doc> Versions<'doc> { | ||||
|         Ok(Some(Self::single(data))) | ||||
|     } | ||||
|  | ||||
|     pub fn single(version: RawMap<'doc>) -> Self { | ||||
|     pub fn single(version: RawMap<'doc, FxBuildHasher>) -> Self { | ||||
|         Self { data: version } | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -1,7 +1,10 @@ | ||||
| use bumpalo::Bump; | ||||
| use heed::RoTxn; | ||||
|  | ||||
| use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions}; | ||||
| use super::document::{ | ||||
|     Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, | ||||
| }; | ||||
| use super::extract::perm_json_p; | ||||
| use super::vector_document::{ | ||||
|     MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, | ||||
| }; | ||||
| @@ -164,6 +167,80 @@ impl<'doc> Update<'doc> { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Returns whether the updated version of the document is different from the current version for the passed subset of fields. | ||||
|     /// | ||||
|     /// `true` if at least one top-level field that is exactly a member of `fields`, or a parent of a member of `fields`, changed. | ||||
|     /// Otherwise `false`. | ||||
|     pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>( | ||||
|         &self, | ||||
|         fields: Option<&[&str]>, | ||||
|         rtxn: &'t RoTxn, | ||||
|         index: &'t Index, | ||||
|         mapper: &'t Mapper, | ||||
|     ) -> Result<bool> { | ||||
|         let mut changed = false; | ||||
|         let mut cached_current = None; | ||||
|         let mut updated_selected_field_count = 0; | ||||
|  | ||||
|         for entry in self.updated().iter_top_level_fields() { | ||||
|             let (key, updated_value) = entry?; | ||||
|  | ||||
|             if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             updated_selected_field_count += 1; | ||||
|             let current = match cached_current { | ||||
|                 Some(current) => current, | ||||
|                 None => self.current(rtxn, index, mapper)?, | ||||
|             }; | ||||
|             let current_value = current.top_level_field(key)?; | ||||
|             let Some(current_value) = current_value else { | ||||
|                 changed = true; | ||||
|                 break; | ||||
|             }; | ||||
|  | ||||
|             if current_value.get() != updated_value.get() { | ||||
|                 changed = true; | ||||
|                 break; | ||||
|             } | ||||
|             cached_current = Some(current); | ||||
|         } | ||||
|  | ||||
|         if !self.has_deletion { | ||||
|             // no field deletion, so fields that don't appear in `updated` cannot have changed | ||||
|             return Ok(changed); | ||||
|         } | ||||
|  | ||||
|         if changed { | ||||
|             return Ok(true); | ||||
|         } | ||||
|  | ||||
|         // We saw all updated fields, and set `changed` if any field wasn't in `current`. | ||||
|         // So if there are as many fields in `current` as in `updated`, then nothing changed. | ||||
|         // If there are any more fields in `current`, they are missing from `updated`. | ||||
|         let has_deleted_fields = { | ||||
|             let current = match cached_current { | ||||
|                 Some(current) => current, | ||||
|                 None => self.current(rtxn, index, mapper)?, | ||||
|             }; | ||||
|  | ||||
|             let mut current_selected_field_count = 0; | ||||
|             for entry in current.iter_top_level_fields() { | ||||
|                 let (key, _) = entry?; | ||||
|  | ||||
|                 if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { | ||||
|                     continue; | ||||
|                 } | ||||
|                 current_selected_field_count += 1; | ||||
|             } | ||||
|  | ||||
|             current_selected_field_count != updated_selected_field_count | ||||
|         }; | ||||
|  | ||||
|         Ok(has_deleted_fields) | ||||
|     } | ||||
|  | ||||
|     pub fn updated_vectors( | ||||
|         &self, | ||||
|         doc_alloc: &'doc Bump, | ||||
|   | ||||
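To make the intent of `has_changed_for_fields` above easier to follow, here is a hedged sketch of the same comparison strategy over plain `serde_json` maps instead of the real document types; the `fields` filter and `has_deletion` flag mirror the code above, everything else is illustrative.

```rust
use serde_json::{Map, Value};

/// The update is "changed for `fields`" if any selected top-level field
/// differs between the two versions, or, when the update may delete fields,
/// if the number of selected fields differs (something was removed).
fn has_changed_for_fields(
    current: &Map<String, Value>,
    updated: &Map<String, Value>,
    fields: Option<&[&str]>,
    has_deletion: bool,
) -> bool {
    let selected = |key: &str| fields.map_or(true, |fs| fs.contains(&key));

    let mut updated_selected = 0;
    for (key, updated_value) in updated {
        if !selected(key) {
            continue;
        }
        updated_selected += 1;
        match current.get(key) {
            None => return true, // field appeared
            Some(value) if value != updated_value => return true, // value changed
            Some(_) => {}
        }
    }

    if !has_deletion {
        // Without deletions, fields absent from `updated` cannot have changed.
        return false;
    }

    // A field that used to exist but is missing from the update was deleted,
    // which shows up as a difference in the selected-field counts.
    let current_selected = current.keys().filter(|key| selected(key.as_str())).count();
    current_selected != updated_selected
}
```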
| @@ -69,12 +69,12 @@ use std::io::BufReader; | ||||
| use std::{io, iter, mem}; | ||||
|  | ||||
| use bumpalo::Bump; | ||||
| use bumparaw_collections::bbbul::{BitPacker, BitPacker4x}; | ||||
| use bumparaw_collections::map::FrozenMap; | ||||
| use bumparaw_collections::{Bbbul, FrozenBbbul}; | ||||
| use grenad::ReaderCursor; | ||||
| use hashbrown::hash_map::RawEntryMut; | ||||
| use hashbrown::HashMap; | ||||
| use raw_collections::bbbul::{BitPacker, BitPacker4x}; | ||||
| use raw_collections::map::FrozenMap; | ||||
| use raw_collections::{Bbbul, FrozenBbbul}; | ||||
| use roaring::RoaringBitmap; | ||||
| use rustc_hash::FxBuildHasher; | ||||
|  | ||||
| @@ -177,12 +177,12 @@ impl<'extractor> BalancedCaches<'extractor> { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     pub fn freeze(&mut self) -> Result<Vec<FrozenCache<'_, 'extractor>>> { | ||||
|     pub fn freeze(&mut self, source_id: usize) -> Result<Vec<FrozenCache<'_, 'extractor>>> { | ||||
|         match &mut self.caches { | ||||
|             InnerCaches::Normal(NormalCaches { caches }) => caches | ||||
|                 .iter_mut() | ||||
|                 .enumerate() | ||||
|                 .map(|(bucket, map)| { | ||||
|                 .map(|(bucket_id, map)| { | ||||
|                     // safety: we are transmuting the Bbbul into a FrozenBbbul | ||||
|                     //         that are the same size. | ||||
|                     let map = unsafe { | ||||
| @@ -201,14 +201,19 @@ impl<'extractor> BalancedCaches<'extractor> { | ||||
|                             >, | ||||
|                         >(map) | ||||
|                     }; | ||||
|                     Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() }) | ||||
|                     Ok(FrozenCache { | ||||
|                         source_id, | ||||
|                         bucket_id, | ||||
|                         cache: FrozenMap::new(map), | ||||
|                         spilled: Vec::new(), | ||||
|                     }) | ||||
|                 }) | ||||
|                 .collect(), | ||||
|             InnerCaches::Spilling(SpillingCaches { caches, spilled_entries, .. }) => caches | ||||
|                 .iter_mut() | ||||
|                 .zip(mem::take(spilled_entries)) | ||||
|                 .enumerate() | ||||
|                 .map(|(bucket, (map, sorter))| { | ||||
|                 .map(|(bucket_id, (map, sorter))| { | ||||
|                     let spilled = sorter | ||||
|                         .into_reader_cursors()? | ||||
|                         .into_iter() | ||||
| @@ -234,7 +239,7 @@ impl<'extractor> BalancedCaches<'extractor> { | ||||
|                             >, | ||||
|                         >(map) | ||||
|                     }; | ||||
|                     Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled }) | ||||
|                     Ok(FrozenCache { source_id, bucket_id, cache: FrozenMap::new(map), spilled }) | ||||
|                 }) | ||||
|                 .collect(), | ||||
|         } | ||||
| @@ -415,21 +420,21 @@ fn spill_entry_to_sorter( | ||||
|     match deladd { | ||||
|         DelAddRoaringBitmap { del: Some(del), add: None } => { | ||||
|             cbo_buffer.clear(); | ||||
|             CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); | ||||
|             CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer); | ||||
|             value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; | ||||
|         } | ||||
|         DelAddRoaringBitmap { del: None, add: Some(add) } => { | ||||
|             cbo_buffer.clear(); | ||||
|             CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); | ||||
|             CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer); | ||||
|             value_writer.insert(DelAdd::Addition, &cbo_buffer)?; | ||||
|         } | ||||
|         DelAddRoaringBitmap { del: Some(del), add: Some(add) } => { | ||||
|             cbo_buffer.clear(); | ||||
|             CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer); | ||||
|             CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer); | ||||
|             value_writer.insert(DelAdd::Deletion, &cbo_buffer)?; | ||||
|  | ||||
|             cbo_buffer.clear(); | ||||
|             CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer); | ||||
|             CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer); | ||||
|             value_writer.insert(DelAdd::Addition, &cbo_buffer)?; | ||||
|         } | ||||
|         DelAddRoaringBitmap { del: None, add: None } => return Ok(()), | ||||
| @@ -440,7 +445,8 @@ fn spill_entry_to_sorter( | ||||
| } | ||||
|  | ||||
| pub struct FrozenCache<'a, 'extractor> { | ||||
|     bucket: usize, | ||||
|     bucket_id: usize, | ||||
|     source_id: usize, | ||||
|     cache: FrozenMap< | ||||
|         'a, | ||||
|         'extractor, | ||||
| @@ -457,40 +463,36 @@ pub fn transpose_and_freeze_caches<'a, 'extractor>( | ||||
|     let width = caches.first().map(BalancedCaches::buckets).unwrap_or(0); | ||||
|     let mut bucket_caches: Vec<_> = iter::repeat_with(Vec::new).take(width).collect(); | ||||
|  | ||||
|     for thread_cache in caches { | ||||
|         for frozen in thread_cache.freeze()? { | ||||
|             bucket_caches[frozen.bucket].push(frozen); | ||||
|     for (thread_index, thread_cache) in caches.iter_mut().enumerate() { | ||||
|         for frozen in thread_cache.freeze(thread_index)? { | ||||
|             bucket_caches[frozen.bucket_id].push(frozen); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(bucket_caches) | ||||
| } | ||||
|  | ||||
| /// Merges the caches that must be all associated to the same bucket. | ||||
| /// Merges the caches that must all be associated with the same bucket | ||||
| /// but makes sure to sort the in-memory entries before performing the merges. | ||||
| /// | ||||
| /// # Panics | ||||
| /// | ||||
| /// - If the bucket IDs in these frozen caches are not exactly the same. | ||||
| pub fn merge_caches<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()> | ||||
| pub fn merge_caches_sorted<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()> | ||||
| where | ||||
|     F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>, | ||||
| { | ||||
|     let mut maps = Vec::new(); | ||||
|     let mut readers = Vec::new(); | ||||
|     let mut current_bucket = None; | ||||
|     for FrozenCache { bucket, cache, ref mut spilled } in frozen { | ||||
|         assert_eq!(*current_bucket.get_or_insert(bucket), bucket); | ||||
|         maps.push(cache); | ||||
|         readers.append(spilled); | ||||
|     } | ||||
|  | ||||
|     // First manage the spilled entries by looking into the HashMaps, | ||||
|     // merge them and mark them as dummy. | ||||
|     let mut heap = BinaryHeap::new(); | ||||
|     for (source_index, source) in readers.into_iter().enumerate() { | ||||
|         let mut cursor = source.into_cursor()?; | ||||
|         if cursor.move_on_next()?.is_some() { | ||||
|             heap.push(Entry { cursor, source_index }); | ||||
|     let mut current_bucket = None; | ||||
|     for FrozenCache { source_id, bucket_id, cache, spilled } in frozen { | ||||
|         assert_eq!(*current_bucket.get_or_insert(bucket_id), bucket_id); | ||||
|         maps.push((source_id, cache)); | ||||
|         for reader in spilled { | ||||
|             let mut cursor = reader.into_cursor()?; | ||||
|             if cursor.move_on_next()?.is_some() { | ||||
|                 heap.push(Entry { cursor, source_id }); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -507,25 +509,29 @@ where | ||||
|  | ||||
|         let mut output = DelAddRoaringBitmap::from_bytes(first_value)?; | ||||
|         while let Some(mut entry) = heap.peek_mut() { | ||||
|             if let Some((key, _value)) = entry.cursor.current() { | ||||
|                 if first_key == key { | ||||
|                     let new = DelAddRoaringBitmap::from_bytes(first_value)?; | ||||
|                     output = output.merge(new); | ||||
|                     // When we are done we the current value of this entry move make | ||||
|                     // it move forward and let the heap reorganize itself (on drop) | ||||
|                     if entry.cursor.move_on_next()?.is_none() { | ||||
|                         PeekMut::pop(entry); | ||||
|                     } | ||||
|                 } else { | ||||
|             if let Some((key, value)) = entry.cursor.current() { | ||||
|                 if first_key != key { | ||||
|                     break; | ||||
|                 } | ||||
|  | ||||
|                 let new = DelAddRoaringBitmap::from_bytes(value)?; | ||||
|                 output = output.merge(new); | ||||
|                 // When we are done with the current value of this entry, advance | ||||
|                 // its cursor and let the heap reorganize itself (on drop). | ||||
|                 if entry.cursor.move_on_next()?.is_none() { | ||||
|                     PeekMut::pop(entry); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // Once we merged all of the spilled bitmaps we must also | ||||
|         // fetch the entries from the non-spilled entries (the HashMaps). | ||||
|         for (map_index, map) in maps.iter_mut().enumerate() { | ||||
|             if first_entry.source_index != map_index { | ||||
|         for (source_id, map) in maps.iter_mut() { | ||||
|             debug_assert!( | ||||
|                 !(map.get(first_key).is_some() && first_entry.source_id == *source_id), | ||||
|                 "A thread should not have spilled a key that has been inserted in the cache" | ||||
|             ); | ||||
|             if first_entry.source_id != *source_id { | ||||
|                 if let Some(new) = map.get_mut(first_key) { | ||||
|                     output.union_and_clear_bbbul(new); | ||||
|                 } | ||||
| @@ -537,22 +543,22 @@ where | ||||
|  | ||||
|         // Don't forget to put the first entry back into the heap. | ||||
|         if first_entry.cursor.move_on_next()?.is_some() { | ||||
|             heap.push(first_entry) | ||||
|             heap.push(first_entry); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // Then manage the content on the HashMap entries that weren't taken (mem::take). | ||||
|     while let Some(mut map) = maps.pop() { | ||||
|         for (key, bbbul) in map.iter_mut() { | ||||
|             // Make sure we don't try to work with entries already managed by the spilled | ||||
|             if bbbul.is_empty() { | ||||
|                 continue; | ||||
|             } | ||||
|     while let Some((_, mut map)) = maps.pop() { | ||||
|         // Make sure we don't try to work with entries already handled while merging the spilled entries | ||||
|         let mut ordered_entries: Vec<_> = | ||||
|             map.iter_mut().filter(|(_, bbbul)| !bbbul.is_empty()).collect(); | ||||
|         ordered_entries.sort_unstable_by_key(|(key, _)| *key); | ||||
|  | ||||
|         for (key, bbbul) in ordered_entries { | ||||
|             let mut output = DelAddRoaringBitmap::empty(); | ||||
|             output.union_and_clear_bbbul(bbbul); | ||||
|  | ||||
|             for rhs in maps.iter_mut() { | ||||
|             for (_, rhs) in maps.iter_mut() { | ||||
|                 if let Some(new) = rhs.get_mut(key) { | ||||
|                     output.union_and_clear_bbbul(new); | ||||
|                 } | ||||
| @@ -568,14 +574,14 @@ where | ||||
|  | ||||
| struct Entry<R> { | ||||
|     cursor: ReaderCursor<R>, | ||||
|     source_index: usize, | ||||
|     source_id: usize, | ||||
| } | ||||
|  | ||||
| impl<R> Ord for Entry<R> { | ||||
|     fn cmp(&self, other: &Entry<R>) -> Ordering { | ||||
|         let skey = self.cursor.current().map(|(k, _)| k); | ||||
|         let okey = other.cursor.current().map(|(k, _)| k); | ||||
|         skey.cmp(&okey).then(self.source_index.cmp(&other.source_index)).reverse() | ||||
|         skey.cmp(&okey).then(self.source_id.cmp(&other.source_id)).reverse() | ||||
|     } | ||||
| } | ||||
|  | ||||
|   | ||||
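The heart of `merge_caches_sorted` above is a k-way merge driven by a `BinaryHeap` of cursors. The standalone sketch below shows the same pattern over sorted in-memory vectors instead of grenad readers and frozen maps, so it approximates the idea rather than the real implementation: the heap always exposes the smallest current key, equal keys coming from different sources are merged, and the callback runs exactly once per key.

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn kway_merge<K: Ord + Clone, V, F>(sources: Vec<Vec<(K, V)>>, mut merge: F)
where
    F: FnMut(&K, Vec<V>),
{
    // Each heap entry is (key, source id, position inside that source).
    let mut heap: BinaryHeap<Reverse<(K, usize, usize)>> = BinaryHeap::new();
    let mut sources: Vec<Vec<Option<(K, V)>>> = sources
        .into_iter()
        .map(|source| source.into_iter().map(Some).collect())
        .collect();

    for (source_id, source) in sources.iter().enumerate() {
        if let Some(Some((key, _))) = source.first() {
            heap.push(Reverse((key.clone(), source_id, 0)));
        }
    }

    while let Some(Reverse((key, source_id, pos))) = heap.pop() {
        // Take the value for the smallest key and advance its source.
        let (_, value) = sources[source_id][pos].take().unwrap();
        let mut values = vec![value];
        if let Some(Some((next, _))) = sources[source_id].get(pos + 1) {
            heap.push(Reverse((next.clone(), source_id, pos + 1)));
        }

        // Drain every other source currently sitting on the same key.
        while let Some(Reverse((k, sid, p))) = heap.peek().cloned() {
            if k != key {
                break;
            }
            heap.pop();
            let (_, value) = sources[sid][p].take().unwrap();
            values.push(value);
            if let Some(Some((next, _))) = sources[sid].get(p + 1) {
                heap.push(Reverse((next.clone(), sid, p + 1)));
            }
        }

        // One callback invocation per distinct key, with all its values.
        merge(&key, values);
    }
}
```

Feeding it two sorted sources that share a key calls `merge` once for that key with both values, which loosely mirrors how the hunk above folds the spilled readers before visiting the remaining sorted map entries.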
| @@ -12,13 +12,14 @@ use crate::update::new::thread_local::FullySend; | ||||
| use crate::update::new::DocumentChange; | ||||
| use crate::vector::EmbeddingConfigs; | ||||
| use crate::Result; | ||||
| pub struct DocumentsExtractor<'a> { | ||||
|     document_sender: &'a DocumentsSender<'a>, | ||||
|  | ||||
| pub struct DocumentsExtractor<'a, 'b> { | ||||
|     document_sender: DocumentsSender<'a, 'b>, | ||||
|     embedders: &'a EmbeddingConfigs, | ||||
| } | ||||
|  | ||||
| impl<'a> DocumentsExtractor<'a> { | ||||
|     pub fn new(document_sender: &'a DocumentsSender<'a>, embedders: &'a EmbeddingConfigs) -> Self { | ||||
| impl<'a, 'b> DocumentsExtractor<'a, 'b> { | ||||
|     pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self { | ||||
|         Self { document_sender, embedders } | ||||
|     } | ||||
| } | ||||
| @@ -29,7 +30,7 @@ pub struct DocumentExtractorData { | ||||
|     pub field_distribution_delta: HashMap<String, i64>, | ||||
| } | ||||
|  | ||||
| impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> { | ||||
| impl<'a, 'b, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a, 'b> { | ||||
|     type Data = FullySend<RefCell<DocumentExtractorData>>; | ||||
|  | ||||
|     fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> { | ||||
|   | ||||
| @@ -16,23 +16,23 @@ use crate::update::del_add::DelAdd; | ||||
| use crate::update::new::channel::FieldIdDocidFacetSender; | ||||
| use crate::update::new::extract::perm_json_p; | ||||
| use crate::update::new::indexer::document_changes::{ | ||||
|     extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, | ||||
|     extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, | ||||
| }; | ||||
| use crate::update::new::ref_cell_ext::RefCellExt as _; | ||||
| use crate::update::new::steps::Step; | ||||
| use crate::update::new::steps::IndexingStep; | ||||
| use crate::update::new::thread_local::{FullySend, ThreadLocal}; | ||||
| use crate::update::new::DocumentChange; | ||||
| use crate::update::GrenadParameters; | ||||
| use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; | ||||
|  | ||||
| pub struct FacetedExtractorData<'a> { | ||||
| pub struct FacetedExtractorData<'a, 'b> { | ||||
|     attributes_to_extract: &'a [&'a str], | ||||
|     sender: &'a FieldIdDocidFacetSender<'a>, | ||||
|     sender: &'a FieldIdDocidFacetSender<'a, 'b>, | ||||
|     grenad_parameters: GrenadParameters, | ||||
|     buckets: usize, | ||||
| } | ||||
|  | ||||
| impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> { | ||||
| impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> { | ||||
|     type Data = RefCell<BalancedCaches<'extractor>>; | ||||
|  | ||||
|     fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> { | ||||
| @@ -97,6 +97,15 @@ impl FacetedDocidsExtractor { | ||||
|                 }, | ||||
|             ), | ||||
|             DocumentChange::Update(inner) => { | ||||
|                 if !inner.has_changed_for_fields( | ||||
|                     Some(attributes_to_extract), | ||||
|                     rtxn, | ||||
|                     index, | ||||
|                     context.db_fields_ids_map, | ||||
|                 )? { | ||||
|                     return Ok(()); | ||||
|                 } | ||||
|  | ||||
|                 extract_document_facets( | ||||
|                     attributes_to_extract, | ||||
|                     inner.current(rtxn, index, context.db_fields_ids_map)?, | ||||
| @@ -318,7 +327,7 @@ impl<'doc> DelAddFacetValue<'doc> { | ||||
|         docid: DocumentId, | ||||
|         sender: &FieldIdDocidFacetSender, | ||||
|         doc_alloc: &Bump, | ||||
|     ) -> std::result::Result<(), crossbeam_channel::SendError<()>> { | ||||
|     ) -> crate::Result<()> { | ||||
|         let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc); | ||||
|         for ((fid, value), deladd) in self.strings { | ||||
|             if let Ok(s) = std::str::from_utf8(&value) { | ||||
| @@ -364,26 +373,16 @@ fn truncate_str(s: &str) -> &str { | ||||
|  | ||||
| impl FacetedDocidsExtractor { | ||||
|     #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] | ||||
|     pub fn run_extraction< | ||||
|         'pl, | ||||
|         'fid, | ||||
|         'indexer, | ||||
|         'index, | ||||
|         'extractor, | ||||
|         DC: DocumentChanges<'pl>, | ||||
|         MSP, | ||||
|         SP, | ||||
|     >( | ||||
|     pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( | ||||
|         grenad_parameters: GrenadParameters, | ||||
|         document_changes: &DC, | ||||
|         indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, | ||||
|         indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, | ||||
|         extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>, | ||||
|         sender: &FieldIdDocidFacetSender, | ||||
|         step: Step, | ||||
|         step: IndexingStep, | ||||
|     ) -> Result<Vec<BalancedCaches<'extractor>>> | ||||
|     where | ||||
|         MSP: Fn() -> bool + Sync, | ||||
|         SP: Fn(Progress) + Sync, | ||||
|     { | ||||
|         let index = indexing_context.index; | ||||
|         let rtxn = index.read_txn()?; | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| use std::cell::RefCell; | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Write as _}; | ||||
| use std::io::{self, BufReader, BufWriter, ErrorKind, Read, Seek as _, Write as _}; | ||||
| use std::{iter, mem, result}; | ||||
|  | ||||
| use bumpalo::Bump; | ||||
| @@ -97,30 +97,34 @@ pub struct FrozenGeoExtractorData<'extractor> { | ||||
| impl<'extractor> FrozenGeoExtractorData<'extractor> { | ||||
|     pub fn iter_and_clear_removed( | ||||
|         &mut self, | ||||
|     ) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ { | ||||
|         mem::take(&mut self.removed) | ||||
|     ) -> io::Result<impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_> { | ||||
|         Ok(mem::take(&mut self.removed) | ||||
|             .iter() | ||||
|             .copied() | ||||
|             .map(Ok) | ||||
|             .chain(iterator_over_spilled_geopoints(&mut self.spilled_removed)) | ||||
|             .chain(iterator_over_spilled_geopoints(&mut self.spilled_removed)?)) | ||||
|     } | ||||
|  | ||||
|     pub fn iter_and_clear_inserted( | ||||
|         &mut self, | ||||
|     ) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ { | ||||
|         mem::take(&mut self.inserted) | ||||
|     ) -> io::Result<impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_> { | ||||
|         Ok(mem::take(&mut self.inserted) | ||||
|             .iter() | ||||
|             .copied() | ||||
|             .map(Ok) | ||||
|             .chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted)) | ||||
|             .chain(iterator_over_spilled_geopoints(&mut self.spilled_inserted)?)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn iterator_over_spilled_geopoints( | ||||
|     spilled: &mut Option<BufReader<File>>, | ||||
| ) -> impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_ { | ||||
| ) -> io::Result<impl IntoIterator<Item = io::Result<ExtractedGeoPoint>> + '_> { | ||||
|     let mut spilled = spilled.take(); | ||||
|     iter::from_fn(move || match &mut spilled { | ||||
|     if let Some(spilled) = &mut spilled { | ||||
|         spilled.rewind()?; | ||||
|     } | ||||
|  | ||||
|     Ok(iter::from_fn(move || match &mut spilled { | ||||
|         Some(file) => { | ||||
|             let geopoint_bytes = &mut [0u8; mem::size_of::<ExtractedGeoPoint>()]; | ||||
|             match file.read_exact(geopoint_bytes) { | ||||
| @@ -130,7 +134,7 @@ fn iterator_over_spilled_geopoints( | ||||
|             } | ||||
|         } | ||||
|         None => None, | ||||
|     }) | ||||
|     })) | ||||
| } | ||||
|  | ||||
| impl<'extractor> Extractor<'extractor> for GeoExtractor { | ||||
| @@ -157,7 +161,9 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { | ||||
|         let mut data_ref = context.data.borrow_mut_or_yield(); | ||||
|  | ||||
|         for change in changes { | ||||
|             if max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm) { | ||||
|             if data_ref.spilled_removed.is_none() | ||||
|                 && max_memory.map_or(false, |mm| context.extractor_alloc.allocated_bytes() >= mm) | ||||
|             { | ||||
|                 // We must spill as we allocated too much memory | ||||
|                 data_ref.spilled_removed = tempfile::tempfile().map(BufWriter::new).map(Some)?; | ||||
|                 data_ref.spilled_inserted = tempfile::tempfile().map(BufWriter::new).map(Some)?; | ||||
|   | ||||
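The geo extractor above writes fixed-size geo points to a spill file and now rewinds it before reading it back. Below is a minimal sketch of that spill-then-rewind round trip, assuming the `tempfile` crate and plain `[f64; 2]` records in place of the real `ExtractedGeoPoint`:

```rust
use std::io::{self, BufReader, BufWriter, Read, Seek, Write};

fn spill_and_read_back(points: &[[f64; 2]]) -> io::Result<Vec<[f64; 2]>> {
    // Spill phase: append fixed-size records to a temporary file.
    let mut writer = BufWriter::new(tempfile::tempfile()?);
    for [lat, lng] in points {
        writer.write_all(&lat.to_ne_bytes())?;
        writer.write_all(&lng.to_ne_bytes())?;
    }

    // Read phase: recover the file and rewind it before iterating, which is
    // what the `spilled.rewind()?` call in the hunk above takes care of;
    // without it, reading would resume at the current write position.
    let mut reader = BufReader::new(writer.into_inner()?);
    reader.rewind()?;

    let mut out = Vec::new();
    let mut buf = [0u8; 16];
    loop {
        match reader.read_exact(&mut buf) {
            Ok(()) => {
                let lat = f64::from_ne_bytes(buf[..8].try_into().unwrap());
                let lng = f64::from_ne_bytes(buf[8..].try_into().unwrap());
                out.push([lat, lng]);
            }
            // Stop at end of file.
            Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break,
            Err(e) => return Err(e),
        }
    }
    Ok(out)
}
```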
| @@ -6,30 +6,31 @@ mod searchable; | ||||
| mod vectors; | ||||
|  | ||||
| use bumpalo::Bump; | ||||
| pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap}; | ||||
| pub use cache::{ | ||||
|     merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, | ||||
| }; | ||||
| pub use documents::*; | ||||
| pub use faceted::*; | ||||
| pub use geo::*; | ||||
| pub use searchable::*; | ||||
| pub use vectors::EmbeddingExtractor; | ||||
|  | ||||
| use super::indexer::document_changes::{DocumentChanges, IndexingContext, Progress}; | ||||
| use super::steps::Step; | ||||
| use super::indexer::document_changes::{DocumentChanges, IndexingContext}; | ||||
| use super::steps::IndexingStep; | ||||
| use super::thread_local::{FullySend, ThreadLocal}; | ||||
| use crate::update::GrenadParameters; | ||||
| use crate::Result; | ||||
|  | ||||
| pub trait DocidsExtractor { | ||||
|     fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( | ||||
|     fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( | ||||
|         grenad_parameters: GrenadParameters, | ||||
|         document_changes: &DC, | ||||
|         indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, | ||||
|         indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, | ||||
|         extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>, | ||||
|         step: Step, | ||||
|         step: IndexingStep, | ||||
|     ) -> Result<Vec<BalancedCaches<'extractor>>> | ||||
|     where | ||||
|         MSP: Fn() -> bool + Sync, | ||||
|         SP: Fn(Progress) + Sync; | ||||
|         MSP: Fn() -> bool + Sync; | ||||
| } | ||||
|  | ||||
| /// TODO move in permissive json pointer | ||||
|   | ||||
| @@ -11,10 +11,10 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; | ||||
| use crate::update::new::extract::cache::BalancedCaches; | ||||
| use crate::update::new::extract::perm_json_p::contained_in; | ||||
| use crate::update::new::indexer::document_changes::{ | ||||
|     extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, | ||||
|     extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, | ||||
| }; | ||||
| use crate::update::new::ref_cell_ext::RefCellExt as _; | ||||
| use crate::update::new::steps::Step; | ||||
| use crate::update::new::steps::IndexingStep; | ||||
| use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; | ||||
| use crate::update::new::DocumentChange; | ||||
| use crate::update::GrenadParameters; | ||||
| @@ -28,7 +28,7 @@ pub struct WordDocidsBalancedCaches<'extractor> { | ||||
|     exact_word_docids: BalancedCaches<'extractor>, | ||||
|     word_position_docids: BalancedCaches<'extractor>, | ||||
|     fid_word_count_docids: BalancedCaches<'extractor>, | ||||
|     fid_word_count: HashMap<FieldId, (usize, usize)>, | ||||
|     fid_word_count: HashMap<FieldId, (Option<usize>, Option<usize>)>, | ||||
|     current_docid: Option<DocumentId>, | ||||
| } | ||||
|  | ||||
| @@ -85,8 +85,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { | ||||
|  | ||||
|         self.fid_word_count | ||||
|             .entry(field_id) | ||||
|             .and_modify(|(_current_count, new_count)| *new_count += 1) | ||||
|             .or_insert((0, 1)); | ||||
|             .and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1) | ||||
|             .or_insert((None, Some(1))); | ||||
|         self.current_docid = Some(docid); | ||||
|  | ||||
|         Ok(()) | ||||
| @@ -130,8 +130,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { | ||||
|  | ||||
|         self.fid_word_count | ||||
|             .entry(field_id) | ||||
|             .and_modify(|(current_count, _new_count)| *current_count += 1) | ||||
|             .or_insert((1, 0)); | ||||
|             .and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1) | ||||
|             .or_insert((Some(1), None)); | ||||
|  | ||||
|         self.current_docid = Some(docid); | ||||
|  | ||||
| @@ -141,14 +141,18 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> { | ||||
|     fn flush_fid_word_count(&mut self, buffer: &mut BumpVec<u8>) -> Result<()> { | ||||
|         for (fid, (current_count, new_count)) in self.fid_word_count.drain() { | ||||
|             if current_count != new_count { | ||||
|                 if current_count <= MAX_COUNTED_WORDS { | ||||
|                 if let Some(current_count) = | ||||
|                     current_count.filter(|current_count| *current_count <= MAX_COUNTED_WORDS) | ||||
|                 { | ||||
|                     buffer.clear(); | ||||
|                     buffer.extend_from_slice(&fid.to_be_bytes()); | ||||
|                     buffer.push(current_count as u8); | ||||
|                     self.fid_word_count_docids | ||||
|                         .insert_del_u32(buffer, self.current_docid.unwrap())?; | ||||
|                 } | ||||
|                 if new_count <= MAX_COUNTED_WORDS { | ||||
|                 if let Some(new_count) = | ||||
|                     new_count.filter(|new_count| *new_count <= MAX_COUNTED_WORDS) | ||||
|                 { | ||||
|                     buffer.clear(); | ||||
|                     buffer.extend_from_slice(&fid.to_be_bytes()); | ||||
|                     buffer.push(new_count as u8); | ||||
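The move from `(usize, usize)` to `(Option<usize>, Option<usize>)` above lets the extractor tell "this field did not appear in that version of the document" apart from a real word count. A hedged sketch of the resulting delete/add decision; `MAX_COUNTED_WORDS` is given a made-up value purely for illustration:

```rust
// Hypothetical cap for the sketch; the real constant lives in milli.
const MAX_COUNTED_WORDS: usize = 30;

/// Returns which `(fid, count)` entry to delete and which to add.
/// `None` on either side means the field did not appear in that version of
/// the document, so no count entry must be removed or written for it.
fn fid_word_count_delta(
    current_count: Option<usize>,
    new_count: Option<usize>,
) -> (Option<usize>, Option<usize>) {
    if current_count == new_count {
        // Nothing changed for this field: neither a deletion nor an addition.
        return (None, None);
    }
    let del = current_count.filter(|count| *count <= MAX_COUNTED_WORDS);
    let add = new_count.filter(|count| *count <= MAX_COUNTED_WORDS);
    (del, add)
}
```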
| @@ -235,25 +239,15 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> { | ||||
| pub struct WordDocidsExtractors; | ||||
|  | ||||
| impl WordDocidsExtractors { | ||||
|     pub fn run_extraction< | ||||
|         'pl, | ||||
|         'fid, | ||||
|         'indexer, | ||||
|         'index, | ||||
|         'extractor, | ||||
|         DC: DocumentChanges<'pl>, | ||||
|         MSP, | ||||
|         SP, | ||||
|     >( | ||||
|     pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( | ||||
|         grenad_parameters: GrenadParameters, | ||||
|         document_changes: &DC, | ||||
|         indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, | ||||
|         indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, | ||||
|         extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>, | ||||
|         step: Step, | ||||
|         step: IndexingStep, | ||||
|     ) -> Result<WordDocidsCaches<'extractor>> | ||||
|     where | ||||
|         MSP: Fn() -> bool + Sync, | ||||
|         SP: Fn(Progress) + Sync, | ||||
|     { | ||||
|         let index = indexing_context.index; | ||||
|         let rtxn = index.read_txn()?; | ||||
| @@ -351,6 +345,15 @@ impl WordDocidsExtractors { | ||||
|                 )?; | ||||
|             } | ||||
|             DocumentChange::Update(inner) => { | ||||
|                 if !inner.has_changed_for_fields( | ||||
|                     document_tokenizer.attribute_to_extract, | ||||
|                     &context.rtxn, | ||||
|                     context.index, | ||||
|                     context.db_fields_ids_map, | ||||
|                 )? { | ||||
|                     return Ok(()); | ||||
|                 } | ||||
|  | ||||
|                 let mut token_fn = |fname: &str, fid, pos, word: &str| { | ||||
|                     cached_sorter.insert_del_u32( | ||||
|                         fid, | ||||
|   | ||||
| @@ -70,6 +70,15 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | ||||
|                 )?; | ||||
|             } | ||||
|             DocumentChange::Update(inner) => { | ||||
|                 if !inner.has_changed_for_fields( | ||||
|                     document_tokenizer.attribute_to_extract, | ||||
|                     rtxn, | ||||
|                     index, | ||||
|                     context.db_fields_ids_map, | ||||
|                 )? { | ||||
|                     return Ok(()); | ||||
|                 } | ||||
|  | ||||
|                 let document = inner.current(rtxn, index, context.db_fields_ids_map)?; | ||||
|                 process_document_tokens( | ||||
|                     document, | ||||
|   | ||||
| @@ -14,9 +14,9 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; | ||||
| use super::cache::BalancedCaches; | ||||
| use super::DocidsExtractor; | ||||
| use crate::update::new::indexer::document_changes::{ | ||||
|     extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress, | ||||
|     extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, | ||||
| }; | ||||
| use crate::update::new::steps::Step; | ||||
| use crate::update::new::steps::IndexingStep; | ||||
| use crate::update::new::thread_local::{FullySend, ThreadLocal}; | ||||
| use crate::update::new::DocumentChange; | ||||
| use crate::update::GrenadParameters; | ||||
| @@ -56,16 +56,15 @@ impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> | ||||
| } | ||||
|  | ||||
| pub trait SearchableExtractor: Sized + Sync { | ||||
|     fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( | ||||
|     fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( | ||||
|         grenad_parameters: GrenadParameters, | ||||
|         document_changes: &DC, | ||||
|         indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, | ||||
|         indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, | ||||
|         extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>, | ||||
|         step: Step, | ||||
|         step: IndexingStep, | ||||
|     ) -> Result<Vec<BalancedCaches<'extractor>>> | ||||
|     where | ||||
|         MSP: Fn() -> bool + Sync, | ||||
|         SP: Fn(Progress) + Sync, | ||||
|     { | ||||
|         let rtxn = indexing_context.index.read_txn()?; | ||||
|         let stop_words = indexing_context.index.stop_words(&rtxn)?; | ||||
| @@ -134,16 +133,15 @@ pub trait SearchableExtractor: Sized + Sync { | ||||
| } | ||||
|  | ||||
| impl<T: SearchableExtractor> DocidsExtractor for T { | ||||
|     fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>( | ||||
|     fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( | ||||
|         grenad_parameters: GrenadParameters, | ||||
|         document_changes: &DC, | ||||
|         indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, | ||||
|         indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, | ||||
|         extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>, | ||||
|         step: Step, | ||||
|         step: IndexingStep, | ||||
|     ) -> Result<Vec<BalancedCaches<'extractor>>> | ||||
|     where | ||||
|         MSP: Fn() -> bool + Sync, | ||||
|         SP: Fn(Progress) + Sync, | ||||
|     { | ||||
|         Self::run_extraction( | ||||
|             grenad_parameters, | ||||
|   | ||||
| @@ -176,9 +176,10 @@ pub fn tokenizer_builder<'a>( | ||||
| #[cfg(test)] | ||||
| mod test { | ||||
|     use bumpalo::Bump; | ||||
|     use bumparaw_collections::RawMap; | ||||
|     use charabia::TokenizerBuilder; | ||||
|     use meili_snap::snapshot; | ||||
|     use raw_collections::RawMap; | ||||
|     use rustc_hash::FxBuildHasher; | ||||
|     use serde_json::json; | ||||
|     use serde_json::value::RawValue; | ||||
|  | ||||
| @@ -234,7 +235,7 @@ mod test { | ||||
|  | ||||
|         let bump = Bump::new(); | ||||
|         let document: &RawValue = serde_json::from_str(&document).unwrap(); | ||||
|         let document = RawMap::from_raw_value(document, &bump).unwrap(); | ||||
|         let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, &bump).unwrap(); | ||||
|  | ||||
|         let document = Versions::single(document); | ||||
|         let document = DocumentFromVersions::new(&document); | ||||
|   | ||||
| @@ -18,17 +18,17 @@ use crate::vector::error::{ | ||||
| use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; | ||||
| use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; | ||||
|  | ||||
| pub struct EmbeddingExtractor<'a> { | ||||
| pub struct EmbeddingExtractor<'a, 'b> { | ||||
|     embedders: &'a EmbeddingConfigs, | ||||
|     sender: &'a EmbeddingSender<'a>, | ||||
|     sender: EmbeddingSender<'a, 'b>, | ||||
|     possible_embedding_mistakes: PossibleEmbeddingMistakes, | ||||
|     threads: &'a ThreadPoolNoAbort, | ||||
| } | ||||
|  | ||||
| impl<'a> EmbeddingExtractor<'a> { | ||||
| impl<'a, 'b> EmbeddingExtractor<'a, 'b> { | ||||
|     pub fn new( | ||||
|         embedders: &'a EmbeddingConfigs, | ||||
|         sender: &'a EmbeddingSender<'a>, | ||||
|         sender: EmbeddingSender<'a, 'b>, | ||||
|         field_distribution: &'a FieldDistribution, | ||||
|         threads: &'a ThreadPoolNoAbort, | ||||
|     ) -> Self { | ||||
| @@ -43,7 +43,7 @@ pub struct EmbeddingExtractorData<'extractor>( | ||||
|  | ||||
| unsafe impl MostlySend for EmbeddingExtractorData<'_> {} | ||||
|  | ||||
| impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { | ||||
| impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { | ||||
|     type Data = RefCell<EmbeddingExtractorData<'extractor>>; | ||||
|  | ||||
|     fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result<Self::Data> { | ||||
| @@ -130,6 +130,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { | ||||
|                                 ); | ||||
|                             } else if new_vectors.regenerate { | ||||
|                                 let new_rendered = prompt.render_document( | ||||
|                                     update.external_document_id(), | ||||
|                                     update.current( | ||||
|                                         &context.rtxn, | ||||
|                                         context.index, | ||||
| @@ -139,6 +140,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { | ||||
|                                     &context.doc_alloc, | ||||
|                                 )?; | ||||
|                                 let old_rendered = prompt.render_document( | ||||
|                                     update.external_document_id(), | ||||
|                                     update.merged( | ||||
|                                         &context.rtxn, | ||||
|                                         context.index, | ||||
| @@ -158,6 +160,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { | ||||
|                             } | ||||
|                         } else if old_vectors.regenerate { | ||||
|                             let old_rendered = prompt.render_document( | ||||
|                                 update.external_document_id(), | ||||
|                                 update.current( | ||||
|                                     &context.rtxn, | ||||
|                                     context.index, | ||||
| @@ -167,6 +170,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { | ||||
|                                 &context.doc_alloc, | ||||
|                             )?; | ||||
|                             let new_rendered = prompt.render_document( | ||||
|                                 update.external_document_id(), | ||||
|                                 update.merged( | ||||
|                                     &context.rtxn, | ||||
|                                     context.index, | ||||
| @@ -216,6 +220,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { | ||||
|                                 ); | ||||
|                             } else if new_vectors.regenerate { | ||||
|                                 let rendered = prompt.render_document( | ||||
|                                     insertion.external_document_id(), | ||||
|                                     insertion.inserted(), | ||||
|                                     context.new_fields_ids_map, | ||||
|                                     &context.doc_alloc, | ||||
| @@ -229,6 +234,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { | ||||
|                             } | ||||
|                         } else { | ||||
|                             let rendered = prompt.render_document( | ||||
|                                 insertion.external_document_id(), | ||||
|                                 insertion.inserted(), | ||||
|                                 context.new_fields_ids_map, | ||||
|                                 &context.doc_alloc, | ||||
| @@ -259,7 +265,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { | ||||
| // Currently this is the case as: | ||||
| // 1. BVecs are inside of the bumpalo | ||||
| // 2. All other fields are either trivial (u8) or references. | ||||
| struct Chunks<'a, 'extractor> { | ||||
| struct Chunks<'a, 'b, 'extractor> { | ||||
|     texts: BVec<'a, &'a str>, | ||||
|     ids: BVec<'a, DocumentId>, | ||||
|  | ||||
| @@ -270,11 +276,11 @@ struct Chunks<'a, 'extractor> { | ||||
|     possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, | ||||
|     user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>, | ||||
|     threads: &'a ThreadPoolNoAbort, | ||||
|     sender: &'a EmbeddingSender<'a>, | ||||
|     sender: EmbeddingSender<'a, 'b>, | ||||
|     has_manual_generation: Option<&'a str>, | ||||
| } | ||||
|  | ||||
| impl<'a, 'extractor> Chunks<'a, 'extractor> { | ||||
| impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { | ||||
|     #[allow(clippy::too_many_arguments)] | ||||
|     pub fn new( | ||||
|         embedder: &'a Embedder, | ||||
| @@ -284,7 +290,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { | ||||
|         user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>, | ||||
|         possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, | ||||
|         threads: &'a ThreadPoolNoAbort, | ||||
|         sender: &'a EmbeddingSender<'a>, | ||||
|         sender: EmbeddingSender<'a, 'b>, | ||||
|         doc_alloc: &'a Bump, | ||||
|     ) -> Self { | ||||
|         let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); | ||||
| @@ -368,7 +374,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { | ||||
|         possible_embedding_mistakes: &PossibleEmbeddingMistakes, | ||||
|         unused_vectors_distribution: &UnusedVectorsDistributionBump, | ||||
|         threads: &ThreadPoolNoAbort, | ||||
|         sender: &EmbeddingSender<'a>, | ||||
|         sender: EmbeddingSender<'a, 'b>, | ||||
|         has_manual_generation: Option<&'a str>, | ||||
|     ) -> Result<()> { | ||||
|         if let Some(external_docid) = has_manual_generation { | ||||
|   | ||||
| @@ -103,6 +103,8 @@ impl<'indexer> FacetSearchBuilder<'indexer> { | ||||
|  | ||||
|     #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")] | ||||
|     pub fn merge_and_write(self, index: &Index, wtxn: &mut RwTxn, rtxn: &RoTxn) -> Result<()> { | ||||
|         tracing::trace!("merge facet strings for facet search: {:?}", self.registered_facets); | ||||
|  | ||||
|         let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?; | ||||
|         let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString); | ||||
|         builder.extend(reader); | ||||
| @@ -118,12 +120,15 @@ impl<'indexer> FacetSearchBuilder<'indexer> { | ||||
|                 BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?; | ||||
|  | ||||
|             if current_field_id != Some(field_id) { | ||||
|                 if let Some(fst_merger_builder) = fst_merger_builder { | ||||
|                 if let (Some(current_field_id), Some(fst_merger_builder)) = | ||||
|                     (current_field_id, fst_merger_builder) | ||||
|                 { | ||||
|                     let mmap = fst_merger_builder.build(&mut callback)?; | ||||
|                     index | ||||
|                         .facet_id_string_fst | ||||
|                         .remap_data_type::<Bytes>() | ||||
|                         .put(wtxn, &field_id, &mmap)?; | ||||
|                     index.facet_id_string_fst.remap_data_type::<Bytes>().put( | ||||
|                         wtxn, | ||||
|                         &current_field_id, | ||||
|                         &mmap, | ||||
|                     )?; | ||||
|                 } | ||||
|  | ||||
|                 fst = index.facet_id_string_fst.get(rtxn, &field_id)?; | ||||
|   | ||||
| @@ -1,6 +1,8 @@ | ||||
| use std::ops::ControlFlow; | ||||
|  | ||||
| use bumpalo::Bump; | ||||
| use bumparaw_collections::RawVec; | ||||
| use rustc_hash::FxBuildHasher; | ||||
| use serde::de::{DeserializeSeed, Deserializer as _, Visitor}; | ||||
| use serde_json::value::RawValue; | ||||
|  | ||||
| @@ -360,7 +362,7 @@ impl<'a> DeserrRawValue<'a> { | ||||
| } | ||||
|  | ||||
| pub struct DeserrRawVec<'a> { | ||||
|     vec: raw_collections::RawVec<'a>, | ||||
|     vec: RawVec<'a>, | ||||
|     alloc: &'a Bump, | ||||
| } | ||||
|  | ||||
| @@ -379,7 +381,7 @@ impl<'a> deserr::Sequence for DeserrRawVec<'a> { | ||||
| } | ||||
|  | ||||
| pub struct DeserrRawVecIter<'a> { | ||||
|     it: raw_collections::vec::iter::IntoIter<'a>, | ||||
|     it: bumparaw_collections::vec::iter::IntoIter<'a>, | ||||
|     alloc: &'a Bump, | ||||
| } | ||||
|  | ||||
| @@ -393,7 +395,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> { | ||||
| } | ||||
|  | ||||
| pub struct DeserrRawMap<'a> { | ||||
|     map: raw_collections::RawMap<'a>, | ||||
|     map: bumparaw_collections::RawMap<'a, FxBuildHasher>, | ||||
|     alloc: &'a Bump, | ||||
| } | ||||
|  | ||||
| @@ -416,7 +418,7 @@ impl<'a> deserr::Map for DeserrRawMap<'a> { | ||||
| } | ||||
|  | ||||
| pub struct DeserrRawMapIter<'a> { | ||||
|     it: raw_collections::map::iter::IntoIter<'a>, | ||||
|     it: bumparaw_collections::map::iter::IntoIter<'a>, | ||||
|     alloc: &'a Bump, | ||||
| } | ||||
|  | ||||
| @@ -615,7 +617,7 @@ impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> { | ||||
|     where | ||||
|         A: serde::de::SeqAccess<'de>, | ||||
|     { | ||||
|         let mut raw_vec = raw_collections::RawVec::new_in(self.alloc); | ||||
|         let mut raw_vec = RawVec::new_in(self.alloc); | ||||
|         while let Some(next) = seq.next_element()? { | ||||
|             raw_vec.push(next); | ||||
|         } | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| use std::cell::{Cell, RefCell}; | ||||
| use std::sync::atomic::Ordering; | ||||
| use std::sync::{Arc, RwLock}; | ||||
|  | ||||
| use bumpalo::Bump; | ||||
| @@ -7,8 +8,9 @@ use rayon::iter::IndexedParallelIterator; | ||||
|  | ||||
| use super::super::document_change::DocumentChange; | ||||
| use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; | ||||
| use crate::progress::{AtomicDocumentStep, Progress}; | ||||
| use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _; | ||||
| use crate::update::new::steps::Step; | ||||
| use crate::update::new::steps::IndexingStep; | ||||
| use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; | ||||
| use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result}; | ||||
|  | ||||
| @@ -70,7 +72,7 @@ impl< | ||||
|         F: FnOnce(&'extractor Bump) -> Result<T>, | ||||
|     { | ||||
|         let doc_alloc = | ||||
|             doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024 * 1024)))); | ||||
|             doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024)))); | ||||
|         let doc_alloc = doc_alloc.0.take(); | ||||
|         let fields_ids_map = fields_ids_map_store | ||||
|             .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into()); | ||||
| @@ -133,10 +135,8 @@ pub struct IndexingContext< | ||||
|     'indexer, // covariant lifetime of objects that are borrowed  during the entire indexing operation | ||||
|     'index,   // covariant lifetime of the index | ||||
|     MSP, | ||||
|     SP, | ||||
| > where | ||||
|     MSP: Fn() -> bool + Sync, | ||||
|     SP: Fn(Progress) + Sync, | ||||
| { | ||||
|     pub index: &'index Index, | ||||
|     pub db_fields_ids_map: &'indexer FieldsIdsMap, | ||||
| @@ -144,7 +144,7 @@ pub struct IndexingContext< | ||||
|     pub doc_allocs: &'indexer ThreadLocal<FullySend<Cell<Bump>>>, | ||||
|     pub fields_ids_map_store: &'indexer ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>, | ||||
|     pub must_stop_processing: &'indexer MSP, | ||||
|     pub send_progress: &'indexer SP, | ||||
|     pub progress: &'indexer Progress, | ||||
| } | ||||
|  | ||||
| impl< | ||||
| @@ -152,18 +152,15 @@ impl< | ||||
|         'indexer, // covariant lifetime of objects that are borrowed  during the entire indexing operation | ||||
|         'index,   // covariant lifetime of the index | ||||
|         MSP, | ||||
|         SP, | ||||
|     > Copy | ||||
|     for IndexingContext< | ||||
|         'fid,     // invariant lifetime of fields ids map | ||||
|         'indexer, // covariant lifetime of objects that are borrowed  during the entire indexing operation | ||||
|         'index,   // covariant lifetime of the index | ||||
|         MSP, | ||||
|         SP, | ||||
|     > | ||||
| where | ||||
|     MSP: Fn() -> bool + Sync, | ||||
|     SP: Fn(Progress) + Sync, | ||||
| { | ||||
| } | ||||
|  | ||||
| @@ -172,18 +169,15 @@ impl< | ||||
|         'indexer, // covariant lifetime of objects that are borrowed  during the entire indexing operation | ||||
|         'index,   // covariant lifetime of the index | ||||
|         MSP, | ||||
|         SP, | ||||
|     > Clone | ||||
|     for IndexingContext< | ||||
|         'fid,     // invariant lifetime of fields ids map | ||||
|         'indexer, // covariant lifetime of objects that are borrowed  during the entire indexing operation | ||||
|         'index,   // covariant lifetime of the index | ||||
|         MSP, | ||||
|         SP, | ||||
|     > | ||||
| where | ||||
|     MSP: Fn() -> bool + Sync, | ||||
|     SP: Fn(Progress) + Sync, | ||||
| { | ||||
|     fn clone(&self) -> Self { | ||||
|         *self | ||||
| @@ -202,7 +196,6 @@ pub fn extract< | ||||
|     EX, | ||||
|     DC: DocumentChanges<'pl>, | ||||
|     MSP, | ||||
|     SP, | ||||
| >( | ||||
|     document_changes: &DC, | ||||
|     extractor: &EX, | ||||
| @@ -213,18 +206,18 @@ pub fn extract< | ||||
|         doc_allocs, | ||||
|         fields_ids_map_store, | ||||
|         must_stop_processing, | ||||
|         send_progress, | ||||
|     }: IndexingContext<'fid, 'indexer, 'index, MSP, SP>, | ||||
|         progress, | ||||
|     }: IndexingContext<'fid, 'indexer, 'index, MSP>, | ||||
|     extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>, | ||||
|     datastore: &'data ThreadLocal<EX::Data>, | ||||
|     step: Step, | ||||
|     step: IndexingStep, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     EX: Extractor<'extractor>, | ||||
|     MSP: Fn() -> bool + Sync, | ||||
|     SP: Fn(Progress) + Sync, | ||||
| { | ||||
|     tracing::trace!("We are resetting the extractor allocators"); | ||||
|     progress.update_progress(step); | ||||
|     // Clean up and reuse the extractor allocs | ||||
|     for extractor_alloc in extractor_allocs.iter_mut() { | ||||
|         tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes()); | ||||
| @@ -232,9 +225,11 @@ where | ||||
|     } | ||||
|  | ||||
|     let total_documents = document_changes.len() as u32; | ||||
|     let (step, progress_step) = AtomicDocumentStep::new(total_documents); | ||||
|     progress.update_progress(progress_step); | ||||
|  | ||||
|     let pi = document_changes.iter(CHUNK_SIZE); | ||||
|     pi.enumerate().try_arc_for_each_try_init( | ||||
|     pi.try_arc_for_each_try_init( | ||||
|         || { | ||||
|             DocumentChangeContext::new( | ||||
|                 index, | ||||
| @@ -247,13 +242,10 @@ where | ||||
|                 move |index_alloc| extractor.init_data(index_alloc), | ||||
|             ) | ||||
|         }, | ||||
|         |context, (finished_documents, items)| { | ||||
|         |context, items| { | ||||
|             if (must_stop_processing)() { | ||||
|                 return Err(Arc::new(InternalError::AbortedIndexation.into())); | ||||
|             } | ||||
|             let finished_documents = (finished_documents * CHUNK_SIZE) as u32; | ||||
|  | ||||
|             (send_progress)(Progress::from_step_substep(step, finished_documents, total_documents)); | ||||
|  | ||||
|             // Clean up and reuse the document-specific allocator | ||||
|             context.doc_alloc.reset(); | ||||
| @@ -264,6 +256,7 @@ where | ||||
|             }); | ||||
|  | ||||
|             let res = extractor.process(changes, context).map_err(Arc::new); | ||||
|             step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed); | ||||
|  | ||||
|             // send back the doc_alloc in the pool | ||||
|             context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc)); | ||||
| @@ -271,32 +264,7 @@ where | ||||
|             res | ||||
|         }, | ||||
|     )?; | ||||
|  | ||||
|     (send_progress)(Progress::from_step_substep(step, total_documents, total_documents)); | ||||
|     step.store(total_documents, Ordering::Relaxed); | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| pub struct Progress { | ||||
|     pub finished_steps: u16, | ||||
|     pub total_steps: u16, | ||||
|     pub step_name: &'static str, | ||||
|     pub finished_total_substep: Option<(u32, u32)>, | ||||
| } | ||||
|  | ||||
| impl Progress { | ||||
|     pub fn from_step(step: Step) -> Self { | ||||
|         Self { | ||||
|             finished_steps: step.finished_steps(), | ||||
|             total_steps: Step::total_steps(), | ||||
|             step_name: step.name(), | ||||
|             finished_total_substep: None, | ||||
|         } | ||||
|     } | ||||
|     pub fn from_step_substep(step: Step, finished_substep: u32, total_substep: u32) -> Self { | ||||
|         Self { | ||||
|             finished_total_substep: Some((finished_substep, total_substep)), | ||||
|             ..Progress::from_step(step) | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -92,11 +92,12 @@ mod test { | ||||
|  | ||||
|     use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; | ||||
|     use crate::index::tests::TempIndex; | ||||
|     use crate::progress::Progress; | ||||
|     use crate::update::new::indexer::document_changes::{ | ||||
|         extract, DocumentChangeContext, Extractor, IndexingContext, | ||||
|     }; | ||||
|     use crate::update::new::indexer::DocumentDeletion; | ||||
|     use crate::update::new::steps::Step; | ||||
|     use crate::update::new::steps::IndexingStep; | ||||
|     use crate::update::new::thread_local::{MostlySend, ThreadLocal}; | ||||
|     use crate::update::new::DocumentChange; | ||||
|     use crate::DocumentId; | ||||
| @@ -164,7 +165,7 @@ mod test { | ||||
|             doc_allocs: &doc_allocs, | ||||
|             fields_ids_map_store: &fields_ids_map_store, | ||||
|             must_stop_processing: &(|| false), | ||||
|             send_progress: &(|_progress| {}), | ||||
|             progress: &Progress::default(), | ||||
|         }; | ||||
|  | ||||
|         for _ in 0..3 { | ||||
| @@ -176,7 +177,7 @@ mod test { | ||||
|                 context, | ||||
|                 &mut extractor_allocs, | ||||
|                 &datastore, | ||||
|                 Step::ExtractingDocuments, | ||||
|                 IndexingStep::ExtractingDocuments, | ||||
|             ) | ||||
|             .unwrap(); | ||||
|  | ||||
|   | ||||
| @@ -1,19 +1,23 @@ | ||||
| use std::sync::atomic::Ordering; | ||||
|  | ||||
| use bumpalo::collections::CollectIn; | ||||
| use bumpalo::Bump; | ||||
| use bumparaw_collections::RawMap; | ||||
| use hashbrown::hash_map::Entry; | ||||
| use heed::RoTxn; | ||||
| use memmap2::Mmap; | ||||
| use raw_collections::RawMap; | ||||
| use rayon::slice::ParallelSlice; | ||||
| use rustc_hash::FxBuildHasher; | ||||
| use serde_json::value::RawValue; | ||||
| use serde_json::Deserializer; | ||||
|  | ||||
| use super::super::document_change::DocumentChange; | ||||
| use super::document_changes::{DocumentChangeContext, DocumentChanges, Progress}; | ||||
| use super::document_changes::{DocumentChangeContext, DocumentChanges}; | ||||
| use super::retrieve_or_guess_primary_key; | ||||
| use crate::documents::PrimaryKey; | ||||
| use crate::progress::{AtomicPayloadStep, Progress}; | ||||
| use crate::update::new::document::Versions; | ||||
| use crate::update::new::steps::Step; | ||||
| use crate::update::new::steps::IndexingStep; | ||||
| use crate::update::new::thread_local::MostlySend; | ||||
| use crate::update::new::{Deletion, Insertion, Update}; | ||||
| use crate::update::{AvailableIds, IndexDocumentsMethod}; | ||||
| @@ -44,7 +48,7 @@ impl<'pl> DocumentOperation<'pl> { | ||||
|  | ||||
|     #[allow(clippy::too_many_arguments)] | ||||
|     #[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")] | ||||
|     pub fn into_changes<MSP, SP>( | ||||
|     pub fn into_changes<MSP>( | ||||
|         self, | ||||
|         indexer: &'pl Bump, | ||||
|         index: &Index, | ||||
| @@ -52,12 +56,12 @@ impl<'pl> DocumentOperation<'pl> { | ||||
|         primary_key_from_op: Option<&'pl str>, | ||||
|         new_fields_ids_map: &mut FieldsIdsMap, | ||||
|         must_stop_processing: &MSP, | ||||
|         send_progress: &SP, | ||||
|         progress: Progress, | ||||
|     ) -> Result<(DocumentOperationChanges<'pl>, Vec<PayloadStats>, Option<PrimaryKey<'pl>>)> | ||||
|     where | ||||
|         MSP: Fn() -> bool, | ||||
|         SP: Fn(Progress), | ||||
|     { | ||||
|         progress.update_progress(IndexingStep::PreparingPayloads); | ||||
|         let Self { operations, method } = self; | ||||
|  | ||||
|         let documents_ids = index.documents_ids(rtxn)?; | ||||
| @@ -67,16 +71,14 @@ impl<'pl> DocumentOperation<'pl> { | ||||
|         let mut primary_key = None; | ||||
|  | ||||
|         let payload_count = operations.len(); | ||||
|         let (step, progress_step) = AtomicPayloadStep::new(payload_count as u32); | ||||
|         progress.update_progress(progress_step); | ||||
|  | ||||
|         for (payload_index, operation) in operations.into_iter().enumerate() { | ||||
|             if must_stop_processing() { | ||||
|                 return Err(InternalError::AbortedIndexation.into()); | ||||
|             } | ||||
|             send_progress(Progress::from_step_substep( | ||||
|                 Step::PreparingPayloads, | ||||
|                 payload_index as u32, | ||||
|                 payload_count as u32, | ||||
|             )); | ||||
|             step.store(payload_index as u32, Ordering::Relaxed); | ||||
|  | ||||
|             let mut bytes = 0; | ||||
|             let result = match operation { | ||||
| @@ -117,12 +119,7 @@ impl<'pl> DocumentOperation<'pl> { | ||||
|             }; | ||||
|             operations_stats.push(PayloadStats { document_count, bytes, error }); | ||||
|         } | ||||
|  | ||||
|         send_progress(Progress::from_step_substep( | ||||
|             Step::PreparingPayloads, | ||||
|             payload_count as u32, | ||||
|             payload_count as u32, | ||||
|         )); | ||||
|         step.store(payload_count as u32, Ordering::Relaxed); | ||||
|  | ||||
|         // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone | ||||
|         let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> = | ||||
| @@ -166,8 +163,9 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>( | ||||
|  | ||||
|         // Only guess the primary key if it is the first document | ||||
|         let retrieved_primary_key = if previous_offset == 0 { | ||||
|             let doc = | ||||
|                 RawMap::from_raw_value(doc, indexer).map(Some).map_err(UserError::SerdeJson)?; | ||||
|             let doc = RawMap::from_raw_value_and_hasher(doc, FxBuildHasher, indexer) | ||||
|                 .map(Some) | ||||
|                 .map_err(UserError::SerdeJson)?; | ||||
|  | ||||
|             let result = retrieve_or_guess_primary_key( | ||||
|                 rtxn, | ||||
| @@ -545,8 +543,9 @@ impl MergeChanges for MergeDocumentForReplacement { | ||||
|         match operations.last() { | ||||
|             Some(InnerDocOp::Addition(DocumentOffset { content })) => { | ||||
|                 let document = serde_json::from_slice(content).unwrap(); | ||||
|                 let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) | ||||
|                     .map_err(UserError::SerdeJson)?; | ||||
|                 let document = | ||||
|                     RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) | ||||
|                         .map_err(UserError::SerdeJson)?; | ||||
|  | ||||
|                 if is_new { | ||||
|                     Ok(Some(DocumentChange::Insertion(Insertion::create( | ||||
| @@ -632,8 +631,9 @@ impl MergeChanges for MergeDocumentForUpdates { | ||||
|                     } | ||||
|                 }; | ||||
|                 let document = serde_json::from_slice(content).unwrap(); | ||||
|                 let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) | ||||
|                     .map_err(UserError::SerdeJson)?; | ||||
|                 let document = | ||||
|                     RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) | ||||
|                         .map_err(UserError::SerdeJson)?; | ||||
|  | ||||
|                 Some(Versions::single(document)) | ||||
|             } | ||||
| @@ -647,8 +647,9 @@ impl MergeChanges for MergeDocumentForUpdates { | ||||
|                     }; | ||||
|  | ||||
|                     let document = serde_json::from_slice(content).unwrap(); | ||||
|                     let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) | ||||
|                         .map_err(UserError::SerdeJson)?; | ||||
|                     let document = | ||||
|                         RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) | ||||
|                             .map_err(UserError::SerdeJson)?; | ||||
|                     Ok(document) | ||||
|                 }); | ||||
|                 Versions::multiple(versions)? | ||||
|   | ||||
| @@ -1,9 +1,11 @@ | ||||
| use std::cmp::Ordering; | ||||
| use std::sync::atomic::AtomicBool; | ||||
| use std::sync::{OnceLock, RwLock}; | ||||
| use std::thread::{self, Builder}; | ||||
|  | ||||
| use big_s::S; | ||||
| use document_changes::{extract, DocumentChanges, IndexingContext, Progress}; | ||||
| use bumparaw_collections::RawMap; | ||||
| use document_changes::{extract, DocumentChanges, IndexingContext}; | ||||
| pub use document_deletion::DocumentDeletion; | ||||
| pub use document_operation::{DocumentOperation, PayloadStats}; | ||||
| use hashbrown::HashMap; | ||||
| @@ -12,7 +14,7 @@ use heed::{RoTxn, RwTxn}; | ||||
| use itertools::{merge_join_by, EitherOrBoth}; | ||||
| pub use partial_dump::PartialDump; | ||||
| use rand::SeedableRng as _; | ||||
| use raw_collections::RawMap; | ||||
| use rustc_hash::FxBuildHasher; | ||||
| use time::OffsetDateTime; | ||||
| pub use update_by_function::UpdateByFunction; | ||||
|  | ||||
| @@ -20,7 +22,7 @@ use super::channel::*; | ||||
| use super::extract::*; | ||||
| use super::facet_search_builder::FacetSearchBuilder; | ||||
| use super::merger::FacetFieldIdsDelta; | ||||
| use super::steps::Step; | ||||
| use super::steps::IndexingStep; | ||||
| use super::thread_local::ThreadLocal; | ||||
| use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder}; | ||||
| use super::words_prefix_docids::{ | ||||
| @@ -31,6 +33,7 @@ use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY}; | ||||
| use crate::facet::FacetType; | ||||
| use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; | ||||
| use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; | ||||
| use crate::progress::Progress; | ||||
| use crate::proximity::ProximityPrecision; | ||||
| use crate::update::del_add::DelAdd; | ||||
| use crate::update::new::extract::EmbeddingExtractor; | ||||
| @@ -41,7 +44,7 @@ use crate::update::settings::InnerIndexSettings; | ||||
| use crate::update::{FacetsUpdateBulk, GrenadParameters}; | ||||
| use crate::vector::{ArroyWrapper, EmbeddingConfigs, Embeddings}; | ||||
| use crate::{ | ||||
|     FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, | ||||
|     Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort, | ||||
|     ThreadPoolNoAbortBuilder, UserError, | ||||
| }; | ||||
|  | ||||
| @@ -58,9 +61,10 @@ mod update_by_function; | ||||
| /// | ||||
| /// TODO return stats | ||||
| #[allow(clippy::too_many_arguments)] // clippy: 😝 | ||||
| pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( | ||||
| pub fn index<'pl, 'indexer, 'index, DC, MSP>( | ||||
|     wtxn: &mut RwTxn, | ||||
|     index: &'index Index, | ||||
|     pool: &ThreadPoolNoAbort, | ||||
|     grenad_parameters: GrenadParameters, | ||||
|     db_fields_ids_map: &'indexer FieldsIdsMap, | ||||
|     new_fields_ids_map: FieldsIdsMap, | ||||
| @@ -68,14 +72,44 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>( | ||||
|     document_changes: &DC, | ||||
|     embedders: EmbeddingConfigs, | ||||
|     must_stop_processing: &'indexer MSP, | ||||
|     send_progress: &'indexer SP, | ||||
|     progress: &'indexer Progress, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     DC: DocumentChanges<'pl>, | ||||
|     MSP: Fn() -> bool + Sync, | ||||
|     SP: Fn(Progress) + Sync, | ||||
| { | ||||
|     let (extractor_sender, writer_receiver) = extractor_writer_channel(10_000); | ||||
|     let mut bbbuffers = Vec::new(); | ||||
|     let finished_extraction = AtomicBool::new(false); | ||||
|  | ||||
|     // We reduce the actual memory used to 5%. The reason we do this here and not in Meilisearch | ||||
|     // is because we still use the old indexer for the settings and it is highly impacted by the | ||||
|     // max memory. So we keep the changes here and will remove these changes once we use the new | ||||
|     // indexer to also index settings. Related to #5125 and #5141. | ||||
|     let grenad_parameters = GrenadParameters { | ||||
|         max_memory: grenad_parameters.max_memory.map(|mm| mm * 5 / 100), | ||||
|         ..grenad_parameters | ||||
|     }; | ||||
|  | ||||
|     // We compute and remove the allocated BBQueues buffers capacity from the indexing memory. | ||||
|     let minimum_capacity = 50 * 1024 * 1024 * pool.current_num_threads(); // 50 MiB | ||||
|     let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or( | ||||
|         (grenad_parameters, 2 * minimum_capacity), // 100 MiB per thread by default | ||||
|         |max_memory| { | ||||
|             // 2% of the indexing memory | ||||
|             let total_bbbuffer_capacity = (max_memory / 100 / 2).max(minimum_capacity); | ||||
|             let new_grenad_parameters = GrenadParameters { | ||||
|                 max_memory: Some( | ||||
|                     max_memory.saturating_sub(total_bbbuffer_capacity).max(100 * 1024 * 1024), | ||||
|                 ), | ||||
|                 ..grenad_parameters | ||||
|             }; | ||||
|             (new_grenad_parameters, total_bbbuffer_capacity) | ||||
|         }, | ||||
|     ); | ||||
|  | ||||
|     let (extractor_sender, mut writer_receiver) = pool | ||||
|         .install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000)) | ||||
|         .unwrap(); | ||||
|  | ||||
|     let metadata_builder = MetadataBuilder::from_index(index, wtxn)?; | ||||
|     let new_fields_ids_map = FieldIdMapWithMetadata::new(new_fields_ids_map, metadata_builder); | ||||
| @@ -91,244 +125,274 @@ where | ||||
|         doc_allocs: &doc_allocs, | ||||
|         fields_ids_map_store: &fields_ids_map_store, | ||||
|         must_stop_processing, | ||||
|         send_progress, | ||||
|         progress, | ||||
|     }; | ||||
|  | ||||
|     let mut index_embeddings = index.embedding_configs(wtxn)?; | ||||
|     let mut field_distribution = index.field_distribution(wtxn)?; | ||||
|     let mut document_ids = index.documents_ids(wtxn)?; | ||||
|  | ||||
|     thread::scope(|s| -> Result<()> { | ||||
|         let indexer_span = tracing::Span::current(); | ||||
|         let embedders = &embedders; | ||||
|         let finished_extraction = &finished_extraction; | ||||
|         // prevent moving the field_distribution and document_ids in the inner closure... | ||||
|         let field_distribution = &mut field_distribution; | ||||
|         let document_ids = &mut document_ids; | ||||
|         let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { | ||||
|             let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); | ||||
|             let _entered = span.enter(); | ||||
|  | ||||
|             let rtxn = index.read_txn()?; | ||||
|  | ||||
|             // document but we need to create a function that collects and compresses documents. | ||||
|             let document_sender = extractor_sender.documents(); | ||||
|             let document_extractor = DocumentsExtractor::new(&document_sender, embedders); | ||||
|             let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); | ||||
|  | ||||
|             extract(document_changes, | ||||
|                 &document_extractor, | ||||
|                 indexing_context, | ||||
|                 &mut extractor_allocs, | ||||
|                 &datastore, | ||||
|                 Step::ExtractingDocuments, | ||||
|             )?; | ||||
|  | ||||
|             for document_extractor_data in datastore { | ||||
|                 let document_extractor_data = document_extractor_data.0.into_inner(); | ||||
|                 for (field, delta) in document_extractor_data.field_distribution_delta { | ||||
|                     let current = field_distribution.entry(field).or_default(); | ||||
|                     // adding the delta should never cause a negative result, as we are removing fields that previously existed. | ||||
|                     *current = current.saturating_add_signed(delta); | ||||
|                 } | ||||
|                 document_extractor_data.docids_delta.apply_to(document_ids); | ||||
|             } | ||||
|  | ||||
|             field_distribution.retain(|_, v| *v != 0); | ||||
|  | ||||
|             let facet_field_ids_delta; | ||||
|  | ||||
|             { | ||||
|                 let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted"); | ||||
|             pool.install(move || { | ||||
|                 let span = tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract"); | ||||
|                 let _entered = span.enter(); | ||||
|  | ||||
|                 facet_field_ids_delta = merge_and_send_facet_docids( | ||||
|                     FacetedDocidsExtractor::run_extraction( | ||||
|                         grenad_parameters, | ||||
|                 let rtxn = index.read_txn()?; | ||||
|  | ||||
|                 // document but we need to create a function that collects and compresses documents. | ||||
|                 let document_sender = extractor_sender.documents(); | ||||
|                 let document_extractor = DocumentsExtractor::new(document_sender, embedders); | ||||
|                 let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); | ||||
|                 { | ||||
|                     let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents"); | ||||
|                     let _entered = span.enter(); | ||||
|                     extract( | ||||
|                         document_changes, | ||||
|                         &document_extractor, | ||||
|                         indexing_context, | ||||
|                         &mut extractor_allocs, | ||||
|                         &extractor_sender.field_id_docid_facet_sender(), | ||||
|                         Step::ExtractingFacets | ||||
|                     )?, | ||||
|                     FacetDatabases::new(index), | ||||
|                     index, | ||||
|                     extractor_sender.facet_docids(), | ||||
|                 )?; | ||||
|             } | ||||
|  | ||||
|             { | ||||
|                 let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); | ||||
|                 let _entered = span.enter(); | ||||
|  | ||||
|  | ||||
|                 let WordDocidsCaches { | ||||
|                     word_docids, | ||||
|                     word_fid_docids, | ||||
|                     exact_word_docids, | ||||
|                     word_position_docids, | ||||
|                     fid_word_count_docids, | ||||
|                 } = WordDocidsExtractors::run_extraction( | ||||
|                     grenad_parameters, | ||||
|                     document_changes, | ||||
|                     indexing_context, | ||||
|                     &mut extractor_allocs, | ||||
|                     Step::ExtractingWords | ||||
|                 )?; | ||||
|  | ||||
|                 // TODO Word Docids Merger | ||||
|                 { | ||||
|                     let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); | ||||
|                     let _entered = span.enter(); | ||||
|                     merge_and_send_docids( | ||||
|                         word_docids, | ||||
|                         index.word_docids.remap_types(), | ||||
|                         index, | ||||
|                         extractor_sender.docids::<WordDocids>(), | ||||
|                         &indexing_context.must_stop_processing, | ||||
|                         &datastore, | ||||
|                         IndexingStep::ExtractingDocuments, | ||||
|                     )?; | ||||
|                 } | ||||
|  | ||||
|                 // Word Fid Docids Merging | ||||
|                 { | ||||
|                     let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); | ||||
|                     let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "documents"); | ||||
|                     let _entered = span.enter(); | ||||
|                     merge_and_send_docids( | ||||
|                         word_fid_docids, | ||||
|                         index.word_fid_docids.remap_types(), | ||||
|                         index, | ||||
|                         extractor_sender.docids::<WordFidDocids>(), | ||||
|                         &indexing_context.must_stop_processing, | ||||
|                     )?; | ||||
|                     for document_extractor_data in datastore { | ||||
|                         let document_extractor_data = document_extractor_data.0.into_inner(); | ||||
|                         for (field, delta) in document_extractor_data.field_distribution_delta { | ||||
|                             let current = field_distribution.entry(field).or_default(); | ||||
|                             // adding the delta should never cause a negative result, as we are removing fields that previously existed. | ||||
|                             *current = current.saturating_add_signed(delta); | ||||
|                         } | ||||
|                         document_extractor_data.docids_delta.apply_to(document_ids); | ||||
|                     } | ||||
|  | ||||
|                     field_distribution.retain(|_, v| *v != 0); | ||||
|                 } | ||||
|  | ||||
|                 // Exact Word Docids Merging | ||||
|                 let facet_field_ids_delta; | ||||
|  | ||||
|                 { | ||||
|                     let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); | ||||
|                     let _entered = span.enter(); | ||||
|                     merge_and_send_docids( | ||||
|                         exact_word_docids, | ||||
|                         index.exact_word_docids.remap_types(), | ||||
|                         index, | ||||
|                         extractor_sender.docids::<ExactWordDocids>(), | ||||
|                         &indexing_context.must_stop_processing, | ||||
|                     )?; | ||||
|                 } | ||||
|                     let caches = { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "faceted"); | ||||
|                         let _entered = span.enter(); | ||||
|  | ||||
|                 // Word Position Docids Merging | ||||
|                 { | ||||
|                     let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); | ||||
|                     let _entered = span.enter(); | ||||
|                     merge_and_send_docids( | ||||
|                         word_position_docids, | ||||
|                         index.word_position_docids.remap_types(), | ||||
|                         index, | ||||
|                         extractor_sender.docids::<WordPositionDocids>(), | ||||
|                         &indexing_context.must_stop_processing, | ||||
|                     )?; | ||||
|                 } | ||||
|                         FacetedDocidsExtractor::run_extraction( | ||||
|                                 grenad_parameters, | ||||
|                                 document_changes, | ||||
|                                 indexing_context, | ||||
|                                 &mut extractor_allocs, | ||||
|                                 &extractor_sender.field_id_docid_facet_sender(), | ||||
|                                 IndexingStep::ExtractingFacets | ||||
|                             )? | ||||
|                     }; | ||||
|  | ||||
|                 // Fid Word Count Docids Merging | ||||
|                 { | ||||
|                     let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); | ||||
|                     let _entered = span.enter(); | ||||
|                     merge_and_send_docids( | ||||
|                         fid_word_count_docids, | ||||
|                         index.field_id_word_count_docids.remap_types(), | ||||
|                         index, | ||||
|                         extractor_sender.docids::<FidWordCountDocids>(), | ||||
|                         &indexing_context.must_stop_processing, | ||||
|                     )?; | ||||
|                 } | ||||
|             } | ||||
|                     { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted"); | ||||
|                         let _entered = span.enter(); | ||||
|  | ||||
|             // run the proximity extraction only if the precision is by word | ||||
|             // this works only if the settings didn't change during this transaction. | ||||
|             let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); | ||||
|             if proximity_precision == ProximityPrecision::ByWord { | ||||
|                 let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); | ||||
|                 let _entered = span.enter(); | ||||
|  | ||||
|  | ||||
|                 let caches = <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction( | ||||
|                     grenad_parameters, | ||||
|                     document_changes, | ||||
|                     indexing_context, | ||||
|                     &mut extractor_allocs, | ||||
|                     Step::ExtractingWordProximity, | ||||
|                 )?; | ||||
|  | ||||
|                 merge_and_send_docids( | ||||
|                     caches, | ||||
|                     index.word_pair_proximity_docids.remap_types(), | ||||
|                     index, | ||||
|                     extractor_sender.docids::<WordPairProximityDocids>(), | ||||
|                     &indexing_context.must_stop_processing, | ||||
|                 )?; | ||||
|             } | ||||
|  | ||||
|             'vectors: { | ||||
|                 let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); | ||||
|                 let _entered = span.enter(); | ||||
|  | ||||
|                 let mut index_embeddings = index.embedding_configs(&rtxn)?; | ||||
|                 if index_embeddings.is_empty() { | ||||
|                     break 'vectors; | ||||
|                 } | ||||
|  | ||||
|                 let embedding_sender = extractor_sender.embeddings(); | ||||
|                 let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads()); | ||||
|                 let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); | ||||
|                 extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, Step::ExtractingEmbeddings)?; | ||||
|  | ||||
|                 for config in &mut index_embeddings { | ||||
|                     'data: for data in datastore.iter_mut() { | ||||
|                         let data = &mut data.get_mut().0; | ||||
|                         let Some(deladd) = data.remove(&config.name) else { continue 'data; }; | ||||
|                         deladd.apply_to(&mut config.user_provided); | ||||
|                         facet_field_ids_delta = merge_and_send_facet_docids( | ||||
|                             caches, | ||||
|                             FacetDatabases::new(index), | ||||
|                             index, | ||||
|                             extractor_sender.facet_docids(), | ||||
|                         )?; | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 embedding_sender.finish(index_embeddings).unwrap(); | ||||
|             } | ||||
|                 { | ||||
|                     let WordDocidsCaches { | ||||
|                         word_docids, | ||||
|                         word_fid_docids, | ||||
|                         exact_word_docids, | ||||
|                         word_position_docids, | ||||
|                         fid_word_count_docids, | ||||
|                     } = { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids"); | ||||
|                         let _entered = span.enter(); | ||||
|  | ||||
|             'geo: { | ||||
|                 let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); | ||||
|                 let _entered = span.enter(); | ||||
|                         WordDocidsExtractors::run_extraction( | ||||
|                             grenad_parameters, | ||||
|                             document_changes, | ||||
|                             indexing_context, | ||||
|                             &mut extractor_allocs, | ||||
|                             IndexingStep::ExtractingWords | ||||
|                         )? | ||||
|                     }; | ||||
|  | ||||
|                 let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else { | ||||
|                     break 'geo; | ||||
|                 }; | ||||
|                 let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); | ||||
|                 extract( | ||||
|                     document_changes, | ||||
|                     &extractor, | ||||
|                     indexing_context, | ||||
|                     &mut extractor_allocs, | ||||
|                     &datastore, | ||||
|                     Step::WritingGeoPoints | ||||
|                 )?; | ||||
|                     { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids"); | ||||
|                         let _entered = span.enter(); | ||||
|                         merge_and_send_docids( | ||||
|                             word_docids, | ||||
|                             index.word_docids.remap_types(), | ||||
|                             index, | ||||
|                             extractor_sender.docids::<WordDocids>(), | ||||
|                             &indexing_context.must_stop_processing, | ||||
|                         )?; | ||||
|                     } | ||||
|  | ||||
|                 merge_and_send_rtree( | ||||
|                     datastore, | ||||
|                     &rtxn, | ||||
|                     index, | ||||
|                     extractor_sender.geo(), | ||||
|                     &indexing_context.must_stop_processing, | ||||
|                 )?; | ||||
|             } | ||||
|                     { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids"); | ||||
|                         let _entered = span.enter(); | ||||
|                         merge_and_send_docids( | ||||
|                             word_fid_docids, | ||||
|                             index.word_fid_docids.remap_types(), | ||||
|                             index, | ||||
|                             extractor_sender.docids::<WordFidDocids>(), | ||||
|                             &indexing_context.must_stop_processing, | ||||
|                         )?; | ||||
|                     } | ||||
|  | ||||
|             { | ||||
|                 let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH"); | ||||
|                 let _entered = span.enter(); | ||||
|                 (indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase)); | ||||
|             } | ||||
|                     { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); | ||||
|                         let _entered = span.enter(); | ||||
|                         merge_and_send_docids( | ||||
|                             exact_word_docids, | ||||
|                             index.exact_word_docids.remap_types(), | ||||
|                             index, | ||||
|                             extractor_sender.docids::<ExactWordDocids>(), | ||||
|                             &indexing_context.must_stop_processing, | ||||
|                         )?; | ||||
|                     } | ||||
|  | ||||
|             Result::Ok(facet_field_ids_delta) | ||||
|                     { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids"); | ||||
|                         let _entered = span.enter(); | ||||
|                         merge_and_send_docids( | ||||
|                             word_position_docids, | ||||
|                             index.word_position_docids.remap_types(), | ||||
|                             index, | ||||
|                             extractor_sender.docids::<WordPositionDocids>(), | ||||
|                             &indexing_context.must_stop_processing, | ||||
|                         )?; | ||||
|                     } | ||||
|  | ||||
|                     { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids"); | ||||
|                         let _entered = span.enter(); | ||||
|                         merge_and_send_docids( | ||||
|                             fid_word_count_docids, | ||||
|                             index.field_id_word_count_docids.remap_types(), | ||||
|                             index, | ||||
|                             extractor_sender.docids::<FidWordCountDocids>(), | ||||
|                             &indexing_context.must_stop_processing, | ||||
|                         )?; | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 // run the proximity extraction only if the precision is by word | ||||
|                 // this works only if the settings didn't change during this transaction. | ||||
|                 let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default(); | ||||
|                 if proximity_precision == ProximityPrecision::ByWord { | ||||
|                     let caches = { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids"); | ||||
|                         let _entered = span.enter(); | ||||
|  | ||||
|                         <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction( | ||||
|                             grenad_parameters, | ||||
|                             document_changes, | ||||
|                             indexing_context, | ||||
|                             &mut extractor_allocs, | ||||
|                             IndexingStep::ExtractingWordProximity, | ||||
|                         )? | ||||
|                     }; | ||||
|  | ||||
|                     { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids"); | ||||
|                         let _entered = span.enter(); | ||||
|  | ||||
|                         merge_and_send_docids( | ||||
|                             caches, | ||||
|                             index.word_pair_proximity_docids.remap_types(), | ||||
|                             index, | ||||
|                             extractor_sender.docids::<WordPairProximityDocids>(), | ||||
|                             &indexing_context.must_stop_processing, | ||||
|                         )?; | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 'vectors: { | ||||
|                     if index_embeddings.is_empty() { | ||||
|                         break 'vectors; | ||||
|                     } | ||||
|  | ||||
|                     let embedding_sender = extractor_sender.embeddings(); | ||||
|                     let extractor = EmbeddingExtractor::new(embedders, embedding_sender, field_distribution, request_threads()); | ||||
|                     let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); | ||||
|                     { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors"); | ||||
|                         let _entered = span.enter(); | ||||
|  | ||||
|                         extract( | ||||
|                             document_changes, | ||||
|                             &extractor, | ||||
|                             indexing_context, | ||||
|                             &mut extractor_allocs, | ||||
|                             &datastore, | ||||
|                             IndexingStep::ExtractingEmbeddings, | ||||
|                         )?; | ||||
|                     } | ||||
|                     { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors"); | ||||
|                         let _entered = span.enter(); | ||||
|  | ||||
|                         for config in &mut index_embeddings { | ||||
|                             'data: for data in datastore.iter_mut() { | ||||
|                                 let data = &mut data.get_mut().0; | ||||
|                                 let Some(deladd) = data.remove(&config.name) else { continue 'data; }; | ||||
|                                 deladd.apply_to(&mut config.user_provided); | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 'geo: { | ||||
|                     let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else { | ||||
|                         break 'geo; | ||||
|                     }; | ||||
|                     let datastore = ThreadLocal::with_capacity(rayon::current_num_threads()); | ||||
|  | ||||
|                     { | ||||
|                         let span = tracing::trace_span!(target: "indexing::documents::extract", "geo"); | ||||
|                         let _entered = span.enter(); | ||||
|  | ||||
|                         extract( | ||||
|                             document_changes, | ||||
|                             &extractor, | ||||
|                             indexing_context, | ||||
|                             &mut extractor_allocs, | ||||
|                             &datastore, | ||||
|                             IndexingStep::WritingGeoPoints | ||||
|                         )?; | ||||
|                     } | ||||
|  | ||||
|                     merge_and_send_rtree( | ||||
|                         datastore, | ||||
|                         &rtxn, | ||||
|                         index, | ||||
|                         extractor_sender.geo(), | ||||
|                         &indexing_context.must_stop_processing, | ||||
|                     )?; | ||||
|                 } | ||||
|                 indexing_context.progress.update_progress(IndexingStep::WritingToDatabase); | ||||
|                 finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed); | ||||
|  | ||||
|                 Result::Ok((facet_field_ids_delta, index_embeddings)) | ||||
|             }).unwrap() | ||||
|         })?; | ||||
|  | ||||
|         let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); | ||||
|  | ||||
|         let vector_arroy = index.vector_arroy; | ||||
|         let mut rng = rand::rngs::StdRng::seed_from_u64(42); | ||||
|         let indexer_span = tracing::Span::current(); | ||||
|         let arroy_writers: Result<HashMap<_, _>> = embedders | ||||
|             .inner_as_ref() | ||||
| @@ -351,94 +415,116 @@ where | ||||
|             }) | ||||
|             .collect(); | ||||
|  | ||||
|         // Used by the ArroySetVector to copy the embedding into an | ||||
|         // aligned memory area, required by arroy to accept a new vector. | ||||
|         let mut aligned_embedding = Vec::new(); | ||||
|         let mut arroy_writers = arroy_writers?; | ||||
|         for operation in writer_receiver { | ||||
|             match operation { | ||||
|                 WriterOperation::DbOperation(db_operation) => { | ||||
|                     let database = db_operation.database(index); | ||||
|                     match db_operation.entry() { | ||||
|                         EntryOperation::Delete(e) => { | ||||
|                             if !database.delete(wtxn, e.entry())? { | ||||
|                                 unreachable!("We tried to delete an unknown key") | ||||
|                             } | ||||
|                         } | ||||
|                         EntryOperation::Write(e) => database.put(wtxn, e.key(), e.value())?, | ||||
|                     } | ||||
|  | ||||
|         { | ||||
|             let span = tracing::trace_span!(target: "indexing::write_db", "all"); | ||||
|             let _entered = span.enter(); | ||||
|  | ||||
|             let span = tracing::trace_span!(target: "indexing::write_db", "post_merge"); | ||||
|             let mut _entered_post_merge = None; | ||||
|  | ||||
|             while let Some(action) = writer_receiver.recv_action() { | ||||
|                 if _entered_post_merge.is_none() | ||||
|                     && finished_extraction.load(std::sync::atomic::Ordering::Relaxed) | ||||
|                 { | ||||
|                     _entered_post_merge = Some(span.enter()); | ||||
|                 } | ||||
|                 WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation { | ||||
|                     ArroyOperation::DeleteVectors { docid } => { | ||||
|                         for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in | ||||
|                             &mut arroy_writers | ||||
|                         { | ||||
|                             let dimensions = *dimensions; | ||||
|                             writer.del_items(wtxn, dimensions, docid)?; | ||||
|  | ||||
|                 match action { | ||||
|                     ReceiverAction::WakeUp => (), | ||||
|                     ReceiverAction::LargeEntry(LargeEntry { database, key, value }) => { | ||||
|                         let database_name = database.database_name(); | ||||
|                         let database = database.database(index); | ||||
|                         if let Err(error) = database.put(wtxn, &key, &value) { | ||||
|                             return Err(Error::InternalError(InternalError::StorePut { | ||||
|                                 database_name, | ||||
|                                 key: bstr::BString::from(&key[..]), | ||||
|                                 value_length: value.len(), | ||||
|                                 error, | ||||
|                             })); | ||||
|                         } | ||||
|                     } | ||||
|                     ArroyOperation::SetVectors { | ||||
|                         docid, | ||||
|                         embedder_id, | ||||
|                         embeddings: raw_embeddings, | ||||
|                     } => { | ||||
|                     ReceiverAction::LargeVectors(large_vectors) => { | ||||
|                         let LargeVectors { docid, embedder_id, .. } = large_vectors; | ||||
|                         let (_, _, writer, dimensions) = | ||||
|                             arroy_writers.get(&embedder_id).expect("requested a missing embedder"); | ||||
|                         // TODO: switch to Embeddings | ||||
|                         let mut embeddings = Embeddings::new(*dimensions); | ||||
|                         for embedding in raw_embeddings { | ||||
|                             embeddings.append(embedding).unwrap(); | ||||
|                         for embedding in large_vectors.read_embeddings(*dimensions) { | ||||
|                             embeddings.push(embedding.to_vec()).unwrap(); | ||||
|                         } | ||||
|  | ||||
|                         writer.del_items(wtxn, *dimensions, docid)?; | ||||
|                         writer.add_items(wtxn, docid, &embeddings)?; | ||||
|                     } | ||||
|                     ArroyOperation::SetVector { docid, embedder_id, embedding } => { | ||||
|                         let (_, _, writer, dimensions) = | ||||
|                             arroy_writers.get(&embedder_id).expect("requested a missing embedder"); | ||||
|                         writer.del_items(wtxn, *dimensions, docid)?; | ||||
|                         writer.add_item(wtxn, docid, &embedding)?; | ||||
|                     } | ||||
|                     ArroyOperation::Finish { configs } => { | ||||
|                         let span = tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); | ||||
|                         let _entered = span.enter(); | ||||
|                 } | ||||
|  | ||||
|                         (indexing_context.send_progress)(Progress::from_step( | ||||
|                             Step::WritingEmbeddingsToDatabase, | ||||
|                         )); | ||||
|  | ||||
|                         for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in | ||||
|                             &mut arroy_writers | ||||
|                         { | ||||
|                             let dimensions = *dimensions; | ||||
|                             writer.build_and_quantize( | ||||
|                                 wtxn, | ||||
|                                 &mut rng, | ||||
|                                 dimensions, | ||||
|                                 false, | ||||
|                                 &indexing_context.must_stop_processing, | ||||
|                             )?; | ||||
|                         } | ||||
|  | ||||
|                         index.put_embedding_configs(wtxn, configs)?; | ||||
|                     } | ||||
|                 }, | ||||
|                 // Every time there is a message in the channel we search | ||||
|                 // for new entries in the BBQueue buffers. | ||||
|                 write_from_bbqueue( | ||||
|                     &mut writer_receiver, | ||||
|                     index, | ||||
|                     wtxn, | ||||
|                     &arroy_writers, | ||||
|                     &mut aligned_embedding, | ||||
|                 )?; | ||||
|             } | ||||
|  | ||||
|             // Once the extractor/writer channel is closed | ||||
|             // we must process the remaining BBQueue messages. | ||||
|             write_from_bbqueue( | ||||
|                 &mut writer_receiver, | ||||
|                 index, | ||||
|                 wtxn, | ||||
|                 &arroy_writers, | ||||
|                 &mut aligned_embedding, | ||||
|             )?; | ||||
|         } | ||||
|  | ||||
|         (indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors)); | ||||
|         indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors); | ||||
|  | ||||
|         let facet_field_ids_delta = extractor_handle.join().unwrap()?; | ||||
|         let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?; | ||||
|  | ||||
|         (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets)); | ||||
|         'vectors: { | ||||
|             let span = | ||||
|                 tracing::trace_span!(target: "indexing::vectors", parent: &indexer_span, "build"); | ||||
|             let _entered = span.enter(); | ||||
|  | ||||
|             if index_embeddings.is_empty() { | ||||
|                 break 'vectors; | ||||
|             } | ||||
|  | ||||
|             indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase); | ||||
|             let mut rng = rand::rngs::StdRng::seed_from_u64(42); | ||||
|             for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers { | ||||
|                 let dimensions = *dimensions; | ||||
|                 writer.build_and_quantize( | ||||
|                     wtxn, | ||||
|                     &mut rng, | ||||
|                     dimensions, | ||||
|                     false, | ||||
|                     &indexing_context.must_stop_processing, | ||||
|                 )?; | ||||
|             } | ||||
|  | ||||
|             index.put_embedding_configs(wtxn, index_embeddings)?; | ||||
|         } | ||||
|  | ||||
|         indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets); | ||||
|         if index.facet_search(wtxn)? { | ||||
|             compute_facet_search_database(index, wtxn, global_fields_ids_map)?; | ||||
|         } | ||||
|  | ||||
|         compute_facet_search_database(index, wtxn, global_fields_ids_map)?; | ||||
|         compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; | ||||
|  | ||||
|         (indexing_context.send_progress)(Progress::from_step(Step::PostProcessingWords)); | ||||
|  | ||||
|         indexing_context.progress.update_progress(IndexingStep::PostProcessingWords); | ||||
|         if let Some(prefix_delta) = compute_word_fst(index, wtxn)? { | ||||
|             compute_prefix_database(index, wtxn, prefix_delta, grenad_parameters)?; | ||||
|         } | ||||
|  | ||||
|         (indexing_context.send_progress)(Progress::from_step(Step::Finalizing)); | ||||
|         indexing_context.progress.update_progress(IndexingStep::Finalizing); | ||||
|  | ||||
|         Ok(()) as Result<_> | ||||
|     })?; | ||||
| @@ -464,6 +550,72 @@ where | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// A function dedicated to managing all the available BBQueue frames. | ||||
| /// | ||||
| /// It reads all the available frames, performs the corresponding database operations, | ||||
| /// and stops when no frames are available. | ||||
| fn write_from_bbqueue( | ||||
|     writer_receiver: &mut WriterBbqueueReceiver<'_>, | ||||
|     index: &Index, | ||||
|     wtxn: &mut RwTxn<'_>, | ||||
|     arroy_writers: &HashMap<u8, (&str, &crate::vector::Embedder, ArroyWrapper, usize)>, | ||||
|     aligned_embedding: &mut Vec<f32>, | ||||
| ) -> crate::Result<()> { | ||||
|     while let Some(frame_with_header) = writer_receiver.recv_frame() { | ||||
|         match frame_with_header.header() { | ||||
|             EntryHeader::DbOperation(operation) => { | ||||
|                 let database_name = operation.database.database_name(); | ||||
|                 let database = operation.database.database(index); | ||||
|                 let frame = frame_with_header.frame(); | ||||
|                 match operation.key_value(frame) { | ||||
|                     (key, Some(value)) => { | ||||
|                         if let Err(error) = database.put(wtxn, key, value) { | ||||
|                             return Err(Error::InternalError(InternalError::StorePut { | ||||
|                                 database_name, | ||||
|                                 key: key.into(), | ||||
|                                 value_length: value.len(), | ||||
|                                 error, | ||||
|                             })); | ||||
|                         } | ||||
|                     } | ||||
|                     (key, None) => match database.delete(wtxn, key) { | ||||
|                         Ok(false) => { | ||||
|                             unreachable!("We tried to delete an unknown key: {key:?}") | ||||
|                         } | ||||
|                         Ok(_) => (), | ||||
|                         Err(error) => { | ||||
|                             return Err(Error::InternalError(InternalError::StoreDeletion { | ||||
|                                 database_name, | ||||
|                                 key: key.into(), | ||||
|                                 error, | ||||
|                             })); | ||||
|                         } | ||||
|                     }, | ||||
|                 } | ||||
|             } | ||||
|             EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => { | ||||
|                 for (_index, (_name, _embedder, writer, dimensions)) in arroy_writers { | ||||
|                     let dimensions = *dimensions; | ||||
|                     writer.del_items(wtxn, dimensions, docid)?; | ||||
|                 } | ||||
|             } | ||||
|             EntryHeader::ArroySetVectors(asvs) => { | ||||
|                 let ArroySetVectors { docid, embedder_id, .. } = asvs; | ||||
|                 let frame = frame_with_header.frame(); | ||||
|                 let (_, _, writer, dimensions) = | ||||
|                     arroy_writers.get(&embedder_id).expect("requested a missing embedder"); | ||||
|                 let mut embeddings = Embeddings::new(*dimensions); | ||||
|                 let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); | ||||
|                 embeddings.append(all_embeddings.to_vec()).unwrap(); | ||||
|                 writer.del_items(wtxn, *dimensions, docid)?; | ||||
|                 writer.add_items(wtxn, docid, &embeddings)?; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")] | ||||
| fn compute_prefix_database( | ||||
|     index: &Index, | ||||
| @@ -618,7 +770,7 @@ pub fn retrieve_or_guess_primary_key<'a>( | ||||
|     index: &Index, | ||||
|     new_fields_ids_map: &mut FieldsIdsMap, | ||||
|     primary_key_from_op: Option<&'a str>, | ||||
|     first_document: Option<RawMap<'a>>, | ||||
|     first_document: Option<RawMap<'a, FxBuildHasher>>, | ||||
| ) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> { | ||||
|     // make sure that we have a declared primary key, either fetching it from the index or attempting to guess it. | ||||
|  | ||||
|   | ||||
| @@ -1,6 +1,8 @@ | ||||
| use std::ops::DerefMut; | ||||
|  | ||||
| use bumparaw_collections::RawMap; | ||||
| use rayon::iter::IndexedParallelIterator; | ||||
| use rustc_hash::FxBuildHasher; | ||||
| use serde_json::value::RawValue; | ||||
|  | ||||
| use super::document_changes::{DocumentChangeContext, DocumentChanges}; | ||||
| @@ -75,7 +77,7 @@ where | ||||
|             self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?; | ||||
|         let external_document_id = external_document_id.to_de(); | ||||
|  | ||||
|         let document = raw_collections::RawMap::from_raw_value(document, doc_alloc) | ||||
|         let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc) | ||||
|             .map_err(InternalError::SerdeJson)?; | ||||
|  | ||||
|         let insertion = Insertion::create(docid, external_document_id, Versions::single(document)); | ||||
|   | ||||
| @@ -1,8 +1,9 @@ | ||||
| use raw_collections::RawMap; | ||||
| use bumparaw_collections::RawMap; | ||||
| use rayon::iter::IndexedParallelIterator; | ||||
| use rayon::slice::ParallelSlice as _; | ||||
| use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST}; | ||||
| use roaring::RoaringBitmap; | ||||
| use rustc_hash::FxBuildHasher; | ||||
|  | ||||
| use super::document_changes::DocumentChangeContext; | ||||
| use super::DocumentChanges; | ||||
| @@ -160,8 +161,12 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> { | ||||
|                         if document_id != new_document_id { | ||||
|                             Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey)) | ||||
|                         } else { | ||||
|                             let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc) | ||||
|                                 .map_err(InternalError::SerdeJson)?; | ||||
|                             let raw_new_doc = RawMap::from_raw_value_and_hasher( | ||||
|                                 raw_new_doc, | ||||
|                                 FxBuildHasher, | ||||
|                                 doc_alloc, | ||||
|                             ) | ||||
|                             .map_err(InternalError::SerdeJson)?; | ||||
|  | ||||
|                             Ok(Some(DocumentChange::Update(Update::create( | ||||
|                                 docid, | ||||
|   | ||||
| @@ -9,8 +9,8 @@ use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::channel::*; | ||||
| use super::extract::{ | ||||
|     merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, FacetKind, | ||||
|     GeoExtractorData, | ||||
|     merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, | ||||
|     FacetKind, GeoExtractorData, | ||||
| }; | ||||
| use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result}; | ||||
|  | ||||
| @@ -19,7 +19,7 @@ pub fn merge_and_send_rtree<'extractor, MSP>( | ||||
|     datastore: impl IntoIterator<Item = RefCell<GeoExtractorData<'extractor>>>, | ||||
|     rtxn: &RoTxn, | ||||
|     index: &Index, | ||||
|     geo_sender: GeoSender<'_>, | ||||
|     geo_sender: GeoSender<'_, '_>, | ||||
|     must_stop_processing: &MSP, | ||||
| ) -> Result<()> | ||||
| where | ||||
| @@ -34,7 +34,7 @@ where | ||||
|         } | ||||
|  | ||||
|         let mut frozen = data.into_inner().freeze()?; | ||||
|         for result in frozen.iter_and_clear_removed() { | ||||
|         for result in frozen.iter_and_clear_removed()? { | ||||
|             let extracted_geo_point = result?; | ||||
|             let removed = rtree.remove(&GeoPoint::from(extracted_geo_point)); | ||||
|             debug_assert!(removed.is_some()); | ||||
| @@ -42,7 +42,7 @@ where | ||||
|             debug_assert!(removed); | ||||
|         } | ||||
|  | ||||
|         for result in frozen.iter_and_clear_inserted() { | ||||
|         for result in frozen.iter_and_clear_inserted()? { | ||||
|             let extracted_geo_point = result?; | ||||
|             rtree.insert(GeoPoint::from(extracted_geo_point)); | ||||
|             let inserted = faceted.insert(extracted_geo_point.docid); | ||||
| @@ -56,38 +56,37 @@ where | ||||
|  | ||||
|     let rtree_mmap = unsafe { Mmap::map(&file)? }; | ||||
|     geo_sender.set_rtree(rtree_mmap).unwrap(); | ||||
|     geo_sender.set_geo_faceted(&faceted).unwrap(); | ||||
|     geo_sender.set_geo_faceted(&faceted)?; | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] | ||||
| pub fn merge_and_send_docids<'extractor, MSP>( | ||||
| pub fn merge_and_send_docids<'extractor, MSP, D>( | ||||
|     mut caches: Vec<BalancedCaches<'extractor>>, | ||||
|     database: Database<Bytes, Bytes>, | ||||
|     index: &Index, | ||||
|     docids_sender: impl DocidsSender + Sync, | ||||
|     docids_sender: WordDocidsSender<D>, | ||||
|     must_stop_processing: &MSP, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     MSP: Fn() -> bool + Sync, | ||||
|     D: DatabaseType + Sync, | ||||
| { | ||||
|     transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| { | ||||
|         let rtxn = index.read_txn()?; | ||||
|         let mut buffer = Vec::new(); | ||||
|         if must_stop_processing() { | ||||
|             return Err(InternalError::AbortedIndexation.into()); | ||||
|         } | ||||
|         merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { | ||||
|         merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| { | ||||
|             let current = database.get(&rtxn, key)?; | ||||
|             match merge_cbo_bitmaps(current, del, add)? { | ||||
|                 Operation::Write(bitmap) => { | ||||
|                     let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); | ||||
|                     docids_sender.write(key, value).unwrap(); | ||||
|                     docids_sender.write(key, &bitmap)?; | ||||
|                     Ok(()) | ||||
|                 } | ||||
|                 Operation::Delete => { | ||||
|                     docids_sender.delete(key).unwrap(); | ||||
|                     docids_sender.delete(key)?; | ||||
|                     Ok(()) | ||||
|                 } | ||||
|                 Operation::Ignore => Ok(()), | ||||
| @@ -101,26 +100,24 @@ pub fn merge_and_send_facet_docids<'extractor>( | ||||
|     mut caches: Vec<BalancedCaches<'extractor>>, | ||||
|     database: FacetDatabases, | ||||
|     index: &Index, | ||||
|     docids_sender: impl DocidsSender + Sync, | ||||
|     docids_sender: FacetDocidsSender, | ||||
| ) -> Result<FacetFieldIdsDelta> { | ||||
|     transpose_and_freeze_caches(&mut caches)? | ||||
|         .into_par_iter() | ||||
|         .map(|frozen| { | ||||
|             let mut facet_field_ids_delta = FacetFieldIdsDelta::default(); | ||||
|             let rtxn = index.read_txn()?; | ||||
|             let mut buffer = Vec::new(); | ||||
|             merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| { | ||||
|             merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| { | ||||
|                 let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?; | ||||
|                 match merge_cbo_bitmaps(current, del, add)? { | ||||
|                     Operation::Write(bitmap) => { | ||||
|                         facet_field_ids_delta.register_from_key(key); | ||||
|                         let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer); | ||||
|                         docids_sender.write(key, value).unwrap(); | ||||
|                         docids_sender.write(key, &bitmap)?; | ||||
|                         Ok(()) | ||||
|                     } | ||||
|                     Operation::Delete => { | ||||
|                         facet_field_ids_delta.register_from_key(key); | ||||
|                         docids_sender.delete(key).unwrap(); | ||||
|                         docids_sender.delete(key)?; | ||||
|                         Ok(()) | ||||
|                     } | ||||
|                     Operation::Ignore => Ok(()), | ||||
| @@ -238,8 +235,12 @@ fn merge_cbo_bitmaps( | ||||
|         (Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange | ||||
|         (Some(current), None, Some(add)) => Ok(Operation::Write(current | add)), | ||||
|         (Some(current), Some(del), add) => { | ||||
|             debug_assert!( | ||||
|                 del.is_subset(&current), | ||||
|                 "del is not a subset of current, which must be impossible." | ||||
|             ); | ||||
|             let output = match add { | ||||
|                 Some(add) => (&current - del) | add, | ||||
|                 Some(add) => (&current - (&del - &add)) | (add - del), | ||||
|                 None => &current - del, | ||||
|             }; | ||||
|             if output.is_empty() { | ||||
| @@ -252,10 +253,3 @@ fn merge_cbo_bitmaps( | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// TODO Return the slice directly from the serialize_into method | ||||
| fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec<u8>) -> &'b [u8] { | ||||
|     buffer.clear(); | ||||
|     CboRoaringBitmapCodec::serialize_into(bitmap, buffer); | ||||
|     buffer.as_slice() | ||||
| } | ||||
|   | ||||
| @@ -5,6 +5,7 @@ pub trait RefCellExt<T: ?Sized> { | ||||
|         &self, | ||||
|     ) -> std::result::Result<RefMut<'_, T>, std::cell::BorrowMutError>; | ||||
|  | ||||
|     #[track_caller] | ||||
|     fn borrow_mut_or_yield(&self) -> RefMut<'_, T> { | ||||
|         self.try_borrow_mut_or_yield().unwrap() | ||||
|     } | ||||
|   | ||||
| @@ -1,8 +1,12 @@ | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use enum_iterator::Sequence; | ||||
|  | ||||
| use crate::progress::Step; | ||||
|  | ||||
| #[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)] | ||||
| #[repr(u16)] | ||||
| pub enum Step { | ||||
| #[repr(u8)] | ||||
| pub enum IndexingStep { | ||||
|     PreparingPayloads, | ||||
|     ExtractingDocuments, | ||||
|     ExtractingFacets, | ||||
| @@ -11,37 +15,38 @@ pub enum Step { | ||||
|     ExtractingEmbeddings, | ||||
|     WritingGeoPoints, | ||||
|     WritingToDatabase, | ||||
|     WritingEmbeddingsToDatabase, | ||||
|     WaitingForExtractors, | ||||
|     WritingEmbeddingsToDatabase, | ||||
|     PostProcessingFacets, | ||||
|     PostProcessingWords, | ||||
|     Finalizing, | ||||
| } | ||||
|  | ||||
| impl Step { | ||||
|     pub fn name(&self) -> &'static str { | ||||
| impl Step for IndexingStep { | ||||
|     fn name(&self) -> Cow<'static, str> { | ||||
|         match self { | ||||
|             Step::PreparingPayloads => "preparing update file", | ||||
|             Step::ExtractingDocuments => "extracting documents", | ||||
|             Step::ExtractingFacets => "extracting facets", | ||||
|             Step::ExtractingWords => "extracting words", | ||||
|             Step::ExtractingWordProximity => "extracting word proximity", | ||||
|             Step::ExtractingEmbeddings => "extracting embeddings", | ||||
|             Step::WritingGeoPoints => "writing geo points", | ||||
|             Step::WritingToDatabase => "writing to database", | ||||
|             Step::WritingEmbeddingsToDatabase => "writing embeddings to database", | ||||
|             Step::WaitingForExtractors => "waiting for extractors", | ||||
|             Step::PostProcessingFacets => "post-processing facets", | ||||
|             Step::PostProcessingWords => "post-processing words", | ||||
|             Step::Finalizing => "finalizing", | ||||
|             IndexingStep::PreparingPayloads => "preparing update file", | ||||
|             IndexingStep::ExtractingDocuments => "extracting documents", | ||||
|             IndexingStep::ExtractingFacets => "extracting facets", | ||||
|             IndexingStep::ExtractingWords => "extracting words", | ||||
|             IndexingStep::ExtractingWordProximity => "extracting word proximity", | ||||
|             IndexingStep::ExtractingEmbeddings => "extracting embeddings", | ||||
|             IndexingStep::WritingGeoPoints => "writing geo points", | ||||
|             IndexingStep::WritingToDatabase => "writing to database", | ||||
|             IndexingStep::WaitingForExtractors => "waiting for extractors", | ||||
|             IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database", | ||||
|             IndexingStep::PostProcessingFacets => "post-processing facets", | ||||
|             IndexingStep::PostProcessingWords => "post-processing words", | ||||
|             IndexingStep::Finalizing => "finalizing", | ||||
|         } | ||||
|         .into() | ||||
|     } | ||||
|  | ||||
|     pub fn finished_steps(self) -> u16 { | ||||
|         self as u16 | ||||
|     fn current(&self) -> u32 { | ||||
|         *self as u32 | ||||
|     } | ||||
|  | ||||
|     pub const fn total_steps() -> u16 { | ||||
|         Self::CARDINALITY as u16 | ||||
|     fn total(&self) -> u32 { | ||||
|         Self::CARDINALITY as u32 | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,9 +1,10 @@ | ||||
| use std::collections::BTreeSet; | ||||
|  | ||||
| use bumpalo::Bump; | ||||
| use bumparaw_collections::RawMap; | ||||
| use deserr::{Deserr, IntoValue}; | ||||
| use heed::RoTxn; | ||||
| use raw_collections::RawMap; | ||||
| use rustc_hash::FxBuildHasher; | ||||
| use serde::Serialize; | ||||
| use serde_json::value::RawValue; | ||||
|  | ||||
| @@ -84,7 +85,7 @@ pub struct VectorDocumentFromDb<'t> { | ||||
|     docid: DocumentId, | ||||
|     embedding_config: Vec<IndexEmbeddingConfig>, | ||||
|     index: &'t Index, | ||||
|     vectors_field: Option<RawMap<'t>>, | ||||
|     vectors_field: Option<RawMap<'t, FxBuildHasher>>, | ||||
|     rtxn: &'t RoTxn<'t>, | ||||
|     doc_alloc: &'t Bump, | ||||
| } | ||||
| @@ -102,9 +103,10 @@ impl<'t> VectorDocumentFromDb<'t> { | ||||
|         }; | ||||
|         let vectors = document.vectors_field()?; | ||||
|         let vectors_field = match vectors { | ||||
|             Some(vectors) => { | ||||
|                 Some(RawMap::from_raw_value(vectors, doc_alloc).map_err(InternalError::SerdeJson)?) | ||||
|             } | ||||
|             Some(vectors) => Some( | ||||
|                 RawMap::from_raw_value_and_hasher(vectors, FxBuildHasher, doc_alloc) | ||||
|                     .map_err(InternalError::SerdeJson)?, | ||||
|             ), | ||||
|             None => None, | ||||
|         }; | ||||
|  | ||||
| @@ -220,7 +222,7 @@ fn entry_from_raw_value( | ||||
|  | ||||
| pub struct VectorDocumentFromVersions<'doc> { | ||||
|     external_document_id: &'doc str, | ||||
|     vectors: RawMap<'doc>, | ||||
|     vectors: RawMap<'doc, FxBuildHasher>, | ||||
|     embedders: &'doc EmbeddingConfigs, | ||||
| } | ||||
|  | ||||
| @@ -233,8 +235,8 @@ impl<'doc> VectorDocumentFromVersions<'doc> { | ||||
|     ) -> Result<Option<Self>> { | ||||
|         let document = DocumentFromVersions::new(versions); | ||||
|         if let Some(vectors_field) = document.vectors_field()? { | ||||
|             let vectors = | ||||
|                 RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?; | ||||
|             let vectors = RawMap::from_raw_value_and_hasher(vectors_field, FxBuildHasher, bump) | ||||
|                 .map_err(UserError::SerdeJson)?; | ||||
|             Ok(Some(Self { external_document_id, vectors, embedders })) | ||||
|         } else { | ||||
|             Ok(None) | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| use std::collections::HashSet; | ||||
| use std::collections::BTreeSet; | ||||
| use std::io::BufWriter; | ||||
|  | ||||
| use fst::{Set, SetBuilder, Streamer}; | ||||
| @@ -75,18 +75,18 @@ pub struct PrefixData { | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct PrefixDelta { | ||||
|     pub modified: HashSet<Prefix>, | ||||
|     pub deleted: HashSet<Prefix>, | ||||
|     pub modified: BTreeSet<Prefix>, | ||||
|     pub deleted: BTreeSet<Prefix>, | ||||
| } | ||||
|  | ||||
| struct PrefixFstBuilder { | ||||
|     prefix_count_threshold: u64, | ||||
|     prefix_count_threshold: usize, | ||||
|     max_prefix_length: usize, | ||||
|     /// TODO: Replace the full memory allocation | ||||
|     prefix_fst_builders: Vec<SetBuilder<Vec<u8>>>, | ||||
|     current_prefix: Vec<Prefix>, | ||||
|     current_prefix_count: Vec<u64>, | ||||
|     modified_prefixes: HashSet<Prefix>, | ||||
|     current_prefix_count: Vec<usize>, | ||||
|     modified_prefixes: BTreeSet<Prefix>, | ||||
|     current_prefix_is_modified: Vec<bool>, | ||||
| } | ||||
|  | ||||
| @@ -95,7 +95,7 @@ impl PrefixFstBuilder { | ||||
|         let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } = | ||||
|             prefix_settings; | ||||
|  | ||||
|         if !compute_prefixes { | ||||
|         if compute_prefixes != crate::index::PrefixSearch::IndexingTime { | ||||
|             return None; | ||||
|         } | ||||
|  | ||||
| @@ -110,7 +110,7 @@ impl PrefixFstBuilder { | ||||
|             prefix_fst_builders, | ||||
|             current_prefix: vec![Prefix::new(); max_prefix_length], | ||||
|             current_prefix_count: vec![0; max_prefix_length], | ||||
|             modified_prefixes: HashSet::new(), | ||||
|             modified_prefixes: BTreeSet::new(), | ||||
|             current_prefix_is_modified: vec![false; max_prefix_length], | ||||
|         }) | ||||
|     } | ||||
| @@ -180,7 +180,7 @@ impl PrefixFstBuilder { | ||||
|         let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? }; | ||||
|         let new_prefix_fst = Set::new(&prefix_fst_mmap)?; | ||||
|         let old_prefix_fst = index.words_prefixes_fst(rtxn)?; | ||||
|         let mut deleted_prefixes = HashSet::new(); | ||||
|         let mut deleted_prefixes = BTreeSet::new(); | ||||
|         { | ||||
|             let mut deleted_prefixes_stream = old_prefix_fst.op().add(&new_prefix_fst).difference(); | ||||
|             while let Some(prefix) = deleted_prefixes_stream.next() { | ||||
|   | ||||
| @@ -1,5 +1,5 @@ | ||||
| use std::cell::RefCell; | ||||
| use std::collections::HashSet; | ||||
| use std::collections::BTreeSet; | ||||
| use std::io::{BufReader, BufWriter, Read, Seek, Write}; | ||||
|  | ||||
| use hashbrown::HashMap; | ||||
| @@ -37,8 +37,8 @@ impl WordPrefixDocids { | ||||
|     fn execute( | ||||
|         self, | ||||
|         wtxn: &mut heed::RwTxn, | ||||
|         prefix_to_compute: &HashSet<Prefix>, | ||||
|         prefix_to_delete: &HashSet<Prefix>, | ||||
|         prefix_to_compute: &BTreeSet<Prefix>, | ||||
|         prefix_to_delete: &BTreeSet<Prefix>, | ||||
|     ) -> Result<()> { | ||||
|         delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?; | ||||
|         self.recompute_modified_prefixes(wtxn, prefix_to_compute) | ||||
| @@ -48,7 +48,7 @@ impl WordPrefixDocids { | ||||
|     fn recompute_modified_prefixes( | ||||
|         &self, | ||||
|         wtxn: &mut RwTxn, | ||||
|         prefixes: &HashSet<Prefix>, | ||||
|         prefixes: &BTreeSet<Prefix>, | ||||
|     ) -> Result<()> { | ||||
|         // We fetch the docids associated with the newly added word prefix fst only, | ||||
|         // and collect the CboRoaringBitmap pointers in a HashMap. | ||||
| @@ -76,7 +76,7 @@ impl WordPrefixDocids { | ||||
|                 .union()?; | ||||
|  | ||||
|             buffer.clear(); | ||||
|             CboRoaringBitmapCodec::serialize_into(&output, buffer); | ||||
|             CboRoaringBitmapCodec::serialize_into_vec(&output, buffer); | ||||
|             index.push(PrefixEntry { prefix, serialized_length: buffer.len() }); | ||||
|             file.write_all(buffer) | ||||
|         })?; | ||||
| @@ -127,7 +127,7 @@ impl<'a, 'rtxn> FrozenPrefixBitmaps<'a, 'rtxn> { | ||||
|     pub fn from_prefixes( | ||||
|         database: Database<Bytes, CboRoaringBitmapCodec>, | ||||
|         rtxn: &'rtxn RoTxn, | ||||
|         prefixes: &'a HashSet<Prefix>, | ||||
|         prefixes: &'a BTreeSet<Prefix>, | ||||
|     ) -> heed::Result<Self> { | ||||
|         let database = database.remap_data_type::<Bytes>(); | ||||
|  | ||||
| @@ -173,8 +173,8 @@ impl WordPrefixIntegerDocids { | ||||
|     fn execute( | ||||
|         self, | ||||
|         wtxn: &mut heed::RwTxn, | ||||
|         prefix_to_compute: &HashSet<Prefix>, | ||||
|         prefix_to_delete: &HashSet<Prefix>, | ||||
|         prefix_to_compute: &BTreeSet<Prefix>, | ||||
|         prefix_to_delete: &BTreeSet<Prefix>, | ||||
|     ) -> Result<()> { | ||||
|         delete_prefixes(wtxn, &self.prefix_database, prefix_to_delete)?; | ||||
|         self.recompute_modified_prefixes(wtxn, prefix_to_compute) | ||||
| @@ -184,7 +184,7 @@ impl WordPrefixIntegerDocids { | ||||
|     fn recompute_modified_prefixes( | ||||
|         &self, | ||||
|         wtxn: &mut RwTxn, | ||||
|         prefixes: &HashSet<Prefix>, | ||||
|         prefixes: &BTreeSet<Prefix>, | ||||
|     ) -> Result<()> { | ||||
|         // We fetch the docids associated with the newly added word prefix fst only, | ||||
|         // and collect the CboRoaringBitmap pointers in a HashMap. | ||||
| @@ -211,7 +211,7 @@ impl WordPrefixIntegerDocids { | ||||
|                     .union()?; | ||||
|  | ||||
|                 buffer.clear(); | ||||
|                 CboRoaringBitmapCodec::serialize_into(&output, buffer); | ||||
|                 CboRoaringBitmapCodec::serialize_into_vec(&output, buffer); | ||||
|                 index.push(PrefixIntegerEntry { prefix, pos, serialized_length: buffer.len() }); | ||||
|                 file.write_all(buffer)?; | ||||
|             } | ||||
| @@ -262,7 +262,7 @@ impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> { | ||||
|     pub fn from_prefixes( | ||||
|         database: Database<Bytes, CboRoaringBitmapCodec>, | ||||
|         rtxn: &'rtxn RoTxn, | ||||
|         prefixes: &'a HashSet<Prefix>, | ||||
|         prefixes: &'a BTreeSet<Prefix>, | ||||
|     ) -> heed::Result<Self> { | ||||
|         let database = database.remap_data_type::<Bytes>(); | ||||
|  | ||||
| @@ -291,7 +291,7 @@ unsafe impl<'a, 'rtxn> Sync for FrozenPrefixIntegerBitmaps<'a, 'rtxn> {} | ||||
| fn delete_prefixes( | ||||
|     wtxn: &mut RwTxn, | ||||
|     prefix_database: &Database<Bytes, CboRoaringBitmapCodec>, | ||||
|     prefixes: &HashSet<Prefix>, | ||||
|     prefixes: &BTreeSet<Prefix>, | ||||
| ) -> Result<()> { | ||||
|     // We remove all the entries that are no longer required in this word prefix docids database. | ||||
|     for prefix in prefixes { | ||||
| @@ -309,8 +309,8 @@ fn delete_prefixes( | ||||
| pub fn compute_word_prefix_docids( | ||||
|     wtxn: &mut RwTxn, | ||||
|     index: &Index, | ||||
|     prefix_to_compute: &HashSet<Prefix>, | ||||
|     prefix_to_delete: &HashSet<Prefix>, | ||||
|     prefix_to_compute: &BTreeSet<Prefix>, | ||||
|     prefix_to_delete: &BTreeSet<Prefix>, | ||||
|     grenad_parameters: GrenadParameters, | ||||
| ) -> Result<()> { | ||||
|     WordPrefixDocids::new( | ||||
| @@ -325,8 +325,8 @@ pub fn compute_word_prefix_docids( | ||||
| pub fn compute_exact_word_prefix_docids( | ||||
|     wtxn: &mut RwTxn, | ||||
|     index: &Index, | ||||
|     prefix_to_compute: &HashSet<Prefix>, | ||||
|     prefix_to_delete: &HashSet<Prefix>, | ||||
|     prefix_to_compute: &BTreeSet<Prefix>, | ||||
|     prefix_to_delete: &BTreeSet<Prefix>, | ||||
|     grenad_parameters: GrenadParameters, | ||||
| ) -> Result<()> { | ||||
|     WordPrefixDocids::new( | ||||
| @@ -341,8 +341,8 @@ pub fn compute_exact_word_prefix_docids( | ||||
| pub fn compute_word_prefix_fid_docids( | ||||
|     wtxn: &mut RwTxn, | ||||
|     index: &Index, | ||||
|     prefix_to_compute: &HashSet<Prefix>, | ||||
|     prefix_to_delete: &HashSet<Prefix>, | ||||
|     prefix_to_compute: &BTreeSet<Prefix>, | ||||
|     prefix_to_delete: &BTreeSet<Prefix>, | ||||
|     grenad_parameters: GrenadParameters, | ||||
| ) -> Result<()> { | ||||
|     WordPrefixIntegerDocids::new( | ||||
| @@ -357,8 +357,8 @@ pub fn compute_word_prefix_fid_docids( | ||||
| pub fn compute_word_prefix_position_docids( | ||||
|     wtxn: &mut RwTxn, | ||||
|     index: &Index, | ||||
|     prefix_to_compute: &HashSet<Prefix>, | ||||
|     prefix_to_delete: &HashSet<Prefix>, | ||||
|     prefix_to_compute: &BTreeSet<Prefix>, | ||||
|     prefix_to_delete: &BTreeSet<Prefix>, | ||||
|     grenad_parameters: GrenadParameters, | ||||
| ) -> Result<()> { | ||||
|     WordPrefixIntegerDocids::new( | ||||
|   | ||||
| @@ -17,7 +17,8 @@ use super::IndexerConfig; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::error::UserError; | ||||
| use crate::index::{ | ||||
|     IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, | ||||
|     IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO, | ||||
|     DEFAULT_MIN_WORD_LEN_TWO_TYPOS, | ||||
| }; | ||||
| use crate::order_by_map::OrderByMap; | ||||
| use crate::prompt::default_max_bytes; | ||||
| @@ -177,6 +178,8 @@ pub struct Settings<'a, 't, 'i> { | ||||
|     embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>, | ||||
|     search_cutoff: Setting<u64>, | ||||
|     localized_attributes_rules: Setting<Vec<LocalizedAttributesRule>>, | ||||
|     prefix_search: Setting<PrefixSearch>, | ||||
|     facet_search: Setting<bool>, | ||||
| } | ||||
|  | ||||
| impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
| @@ -212,6 +215,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|             embedder_settings: Setting::NotSet, | ||||
|             search_cutoff: Setting::NotSet, | ||||
|             localized_attributes_rules: Setting::NotSet, | ||||
|             prefix_search: Setting::NotSet, | ||||
|             facet_search: Setting::NotSet, | ||||
|             indexer_config, | ||||
|         } | ||||
|     } | ||||
| @@ -418,6 +423,22 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|         self.localized_attributes_rules = Setting::Reset; | ||||
|     } | ||||
|  | ||||
|     pub fn set_prefix_search(&mut self, value: PrefixSearch) { | ||||
|         self.prefix_search = Setting::Set(value); | ||||
|     } | ||||
|  | ||||
|     pub fn reset_prefix_search(&mut self) { | ||||
|         self.prefix_search = Setting::Reset; | ||||
|     } | ||||
|  | ||||
|     pub fn set_facet_search(&mut self, value: bool) { | ||||
|         self.facet_search = Setting::Set(value); | ||||
|     } | ||||
|  | ||||
|     pub fn reset_facet_search(&mut self) { | ||||
|         self.facet_search = Setting::Reset; | ||||
|     } | ||||
|  | ||||
|     #[tracing::instrument( | ||||
|         level = "trace", | ||||
|         skip(self, progress_callback, should_abort, settings_diff), | ||||
| @@ -944,7 +965,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|                     false | ||||
|                 } else { | ||||
|                     self.index.put_proximity_precision(self.wtxn, new)?; | ||||
|                     true | ||||
|                     old.is_some() || new != ProximityPrecision::default() | ||||
|                 } | ||||
|             } | ||||
|             Setting::Reset => self.index.delete_proximity_precision(self.wtxn)?, | ||||
| @@ -954,6 +975,42 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|         Ok(changed) | ||||
|     } | ||||
|  | ||||
|     fn update_prefix_search(&mut self) -> Result<bool> { | ||||
|         let changed = match self.prefix_search { | ||||
|             Setting::Set(new) => { | ||||
|                 let old = self.index.prefix_search(self.wtxn)?; | ||||
|                 if old == Some(new) { | ||||
|                     false | ||||
|                 } else { | ||||
|                     self.index.put_prefix_search(self.wtxn, new)?; | ||||
|                     old.is_some() || new != PrefixSearch::default() | ||||
|                 } | ||||
|             } | ||||
|             Setting::Reset => self.index.delete_prefix_search(self.wtxn)?, | ||||
|             Setting::NotSet => false, | ||||
|         }; | ||||
|  | ||||
|         Ok(changed) | ||||
|     } | ||||
|  | ||||
|     fn update_facet_search(&mut self) -> Result<bool> { | ||||
|         let changed = match self.facet_search { | ||||
|             Setting::Set(new) => { | ||||
|                 let old = self.index.facet_search(self.wtxn)?; | ||||
|                 if old == new { | ||||
|                     false | ||||
|                 } else { | ||||
|                     self.index.put_facet_search(self.wtxn, new)?; | ||||
|                     true | ||||
|                 } | ||||
|             } | ||||
|             Setting::Reset => self.index.delete_facet_search(self.wtxn)?, | ||||
|             Setting::NotSet => false, | ||||
|         }; | ||||
|  | ||||
|         Ok(changed) | ||||
|     } | ||||
|  | ||||
|     fn update_embedding_configs(&mut self) -> Result<BTreeMap<String, EmbedderAction>> { | ||||
|         match std::mem::take(&mut self.embedder_settings) { | ||||
|             Setting::Set(configs) => self.update_embedding_configs_set(configs), | ||||
| @@ -1203,6 +1260,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|         self.update_searchable()?; | ||||
|         self.update_exact_attributes()?; | ||||
|         self.update_proximity_precision()?; | ||||
|         self.update_prefix_search()?; | ||||
|         self.update_facet_search()?; | ||||
|         self.update_localized_attributes_rules()?; | ||||
|  | ||||
|         let embedding_config_updates = self.update_embedding_configs()?; | ||||
| @@ -1282,6 +1341,7 @@ impl InnerIndexSettingsDiff { | ||||
|                 || old_settings.allowed_separators != new_settings.allowed_separators | ||||
|                 || old_settings.dictionary != new_settings.dictionary | ||||
|                 || old_settings.proximity_precision != new_settings.proximity_precision | ||||
|                 || old_settings.prefix_search != new_settings.prefix_search | ||||
|                 || old_settings.localized_searchable_fields_ids | ||||
|                     != new_settings.localized_searchable_fields_ids | ||||
|         }; | ||||
| @@ -1372,7 +1432,7 @@ impl InnerIndexSettingsDiff { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn reindex_facets(&self) -> bool { | ||||
|     pub fn facet_fids_changed(&self) -> bool { | ||||
|         let existing_fields = &self.new.existing_fields; | ||||
|         if existing_fields.iter().any(|field| field.contains('.')) { | ||||
|             return true; | ||||
| @@ -1392,7 +1452,15 @@ impl InnerIndexSettingsDiff { | ||||
|         } | ||||
|  | ||||
|         (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) | ||||
|             || self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids | ||||
|     } | ||||
|  | ||||
|     pub fn global_facet_settings_changed(&self) -> bool { | ||||
|         self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids | ||||
|             || self.old.facet_search != self.new.facet_search | ||||
|     } | ||||
|  | ||||
|     pub fn reindex_facets(&self) -> bool { | ||||
|         self.facet_fids_changed() || self.global_facet_settings_changed() | ||||
|     } | ||||
|  | ||||
|     pub fn reindex_vectors(&self) -> bool { | ||||
| @@ -1432,6 +1500,8 @@ pub(crate) struct InnerIndexSettings { | ||||
|     pub non_faceted_fields_ids: Vec<FieldId>, | ||||
|     pub localized_searchable_fields_ids: LocalizedFieldIds, | ||||
|     pub localized_faceted_fields_ids: LocalizedFieldIds, | ||||
|     pub prefix_search: PrefixSearch, | ||||
|     pub facet_search: bool, | ||||
| } | ||||
|  | ||||
| impl InnerIndexSettings { | ||||
| @@ -1457,6 +1527,8 @@ impl InnerIndexSettings { | ||||
|             Some(embedding_configs) => embedding_configs, | ||||
|             None => embedders(index.embedding_configs(rtxn)?)?, | ||||
|         }; | ||||
|         let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default(); | ||||
|         let facet_search = index.facet_search(rtxn)?; | ||||
|         let existing_fields: HashSet<_> = index | ||||
|             .field_distribution(rtxn)? | ||||
|             .into_iter() | ||||
| @@ -1514,6 +1586,8 @@ impl InnerIndexSettings { | ||||
|             non_faceted_fields_ids: vectors_fids.clone(), | ||||
|             localized_searchable_fields_ids, | ||||
|             localized_faceted_fields_ids, | ||||
|             prefix_search, | ||||
|             facet_search, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
| @@ -2721,6 +2795,8 @@ mod tests { | ||||
|                     embedder_settings, | ||||
|                     search_cutoff, | ||||
|                     localized_attributes_rules, | ||||
|                     prefix_search, | ||||
|                     facet_search, | ||||
|                 } = settings; | ||||
|                 assert!(matches!(searchable_fields, Setting::NotSet)); | ||||
|                 assert!(matches!(displayed_fields, Setting::NotSet)); | ||||
| @@ -2746,6 +2822,8 @@ mod tests { | ||||
|                 assert!(matches!(embedder_settings, Setting::NotSet)); | ||||
|                 assert!(matches!(search_cutoff, Setting::NotSet)); | ||||
|                 assert!(matches!(localized_attributes_rules, Setting::NotSet)); | ||||
|                 assert!(matches!(prefix_search, Setting::NotSet)); | ||||
|                 assert!(matches!(facet_search, Setting::NotSet)); | ||||
|             }) | ||||
|             .unwrap(); | ||||
|     } | ||||
|   | ||||
| @@ -9,7 +9,7 @@ use crate::{Index, Result, SmallString32}; | ||||
| pub struct WordsPrefixesFst<'t, 'i> { | ||||
|     wtxn: &'t mut RwTxn<'i>, | ||||
|     index: &'i Index, | ||||
|     threshold: u32, | ||||
|     threshold: usize, | ||||
|     max_prefix_length: usize, | ||||
| } | ||||
|  | ||||
| @@ -24,8 +24,8 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> { | ||||
|     /// | ||||
|     /// Default value is 100. This value must be higher than 50 and will be clamped | ||||
|     /// to this bound otherwise. | ||||
|     pub fn threshold(&mut self, value: u32) -> &mut Self { | ||||
|         self.threshold = value.max(50); | ||||
|     pub fn threshold(&mut self, value: usize) -> &mut Self { | ||||
|         self.threshold = value; | ||||
|         self | ||||
|     } | ||||
|  | ||||
| @@ -34,7 +34,7 @@ impl<'t, 'i> WordsPrefixesFst<'t, 'i> { | ||||
|     /// Default value is `4` bytes. This value must be between 1 and 25 and will be clamped | ||||
|     /// to these bounds otherwise. | ||||
|     pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { | ||||
|         self.max_prefix_length = value.clamp(1, 25); | ||||
|         self.max_prefix_length = value; | ||||
|         self | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -475,7 +475,7 @@ impl<F> Embeddings<F> { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     /// Append a flat vector of embeddings a the end of the embeddings. | ||||
|     /// Append a flat vector of embeddings at the end of the embeddings. | ||||
|     /// | ||||
|     /// If `embeddings.len() % self.dimension != 0`, then the append operation fails. | ||||
|     pub fn append(&mut self, mut embeddings: Vec<F>) -> Result<(), Vec<F>> { | ||||
|   | ||||
| @@ -3,6 +3,7 @@ use bumpalo::Bump; | ||||
| use heed::EnvOpenOptions; | ||||
| use maplit::hashset; | ||||
| use milli::documents::mmap_from_objects; | ||||
| use milli::progress::Progress; | ||||
| use milli::update::new::indexer; | ||||
| use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; | ||||
| use milli::vector::EmbeddingConfigs; | ||||
| @@ -57,13 +58,14 @@ fn test_facet_distribution_with_no_facet_values() { | ||||
|             None, | ||||
|             &mut new_fields_ids_map, | ||||
|             &|| false, | ||||
|             &|_progress| (), | ||||
|             Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|  | ||||
|     indexer::index( | ||||
|         &mut wtxn, | ||||
|         &index, | ||||
|         &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|         config.grenad_parameters(), | ||||
|         &db_fields_ids_map, | ||||
|         new_fields_ids_map, | ||||
| @@ -71,7 +73,7 @@ fn test_facet_distribution_with_no_facet_values() { | ||||
|         &document_changes, | ||||
|         embedders, | ||||
|         &|| false, | ||||
|         &|_| (), | ||||
|         &Progress::default(), | ||||
|     ) | ||||
|     .unwrap(); | ||||
|  | ||||
|   | ||||
| @@ -7,6 +7,7 @@ use bumpalo::Bump; | ||||
| use either::{Either, Left, Right}; | ||||
| use heed::EnvOpenOptions; | ||||
| use maplit::{btreemap, hashset}; | ||||
| use milli::progress::Progress; | ||||
| use milli::update::new::indexer; | ||||
| use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; | ||||
| use milli::vector::EmbeddingConfigs; | ||||
| @@ -90,7 +91,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | ||||
|             None, | ||||
|             &mut new_fields_ids_map, | ||||
|             &|| false, | ||||
|             &|_progress| (), | ||||
|             Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|  | ||||
| @@ -101,6 +102,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | ||||
|     indexer::index( | ||||
|         &mut wtxn, | ||||
|         &index, | ||||
|         &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|         config.grenad_parameters(), | ||||
|         &db_fields_ids_map, | ||||
|         new_fields_ids_map, | ||||
| @@ -108,7 +110,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { | ||||
|         &document_changes, | ||||
|         embedders, | ||||
|         &|| false, | ||||
|         &|_| (), | ||||
|         &Progress::default(), | ||||
|     ) | ||||
|     .unwrap(); | ||||
|  | ||||
|   | ||||
| @@ -5,6 +5,7 @@ use bumpalo::Bump; | ||||
| use heed::EnvOpenOptions; | ||||
| use itertools::Itertools; | ||||
| use maplit::hashset; | ||||
| use milli::progress::Progress; | ||||
| use milli::update::new::indexer; | ||||
| use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; | ||||
| use milli::vector::EmbeddingConfigs; | ||||
| @@ -326,13 +327,14 @@ fn criteria_ascdesc() { | ||||
|             None, | ||||
|             &mut new_fields_ids_map, | ||||
|             &|| false, | ||||
|             &|_progress| (), | ||||
|             Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|  | ||||
|     indexer::index( | ||||
|         &mut wtxn, | ||||
|         &index, | ||||
|         &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|         config.grenad_parameters(), | ||||
|         &db_fields_ids_map, | ||||
|         new_fields_ids_map, | ||||
| @@ -340,7 +342,7 @@ fn criteria_ascdesc() { | ||||
|         &document_changes, | ||||
|         embedders, | ||||
|         &|| false, | ||||
|         &|_| (), | ||||
|         &Progress::default(), | ||||
|     ) | ||||
|     .unwrap(); | ||||
|  | ||||
|   | ||||
| @@ -3,6 +3,7 @@ use std::collections::BTreeSet; | ||||
| use bumpalo::Bump; | ||||
| use heed::EnvOpenOptions; | ||||
| use milli::documents::mmap_from_objects; | ||||
| use milli::progress::Progress; | ||||
| use milli::update::new::indexer; | ||||
| use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; | ||||
| use milli::vector::EmbeddingConfigs; | ||||
| @@ -135,13 +136,14 @@ fn test_typo_disabled_on_word() { | ||||
|             None, | ||||
|             &mut new_fields_ids_map, | ||||
|             &|| false, | ||||
|             &|_progress| (), | ||||
|             Progress::default(), | ||||
|         ) | ||||
|         .unwrap(); | ||||
|  | ||||
|     indexer::index( | ||||
|         &mut wtxn, | ||||
|         &index, | ||||
|         &milli::ThreadPoolNoAbortBuilder::new().build().unwrap(), | ||||
|         config.grenad_parameters(), | ||||
|         &db_fields_ids_map, | ||||
|         new_fields_ids_map, | ||||
| @@ -149,7 +151,7 @@ fn test_typo_disabled_on_word() { | ||||
|         &document_changes, | ||||
|         embedders, | ||||
|         &|| false, | ||||
|         &|_| (), | ||||
|         &Progress::default(), | ||||
|     ) | ||||
|     .unwrap(); | ||||
|  | ||||
|   | ||||