mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Merge remote-tracking branch 'origin/main' into tmp-release-v1.5.1
This commit is contained in:
		| @@ -1,5 +1,6 @@ | ||||
| mod builder; | ||||
| mod enriched; | ||||
| mod primary_key; | ||||
| mod reader; | ||||
| mod serde_impl; | ||||
|  | ||||
| @@ -11,6 +12,7 @@ use bimap::BiHashMap; | ||||
| pub use builder::DocumentsBatchBuilder; | ||||
| pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; | ||||
| use obkv::KvReader; | ||||
| pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY}; | ||||
| pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; | ||||
| use serde::{Deserialize, Serialize}; | ||||
|  | ||||
| @@ -87,6 +89,12 @@ impl DocumentsBatchIndex { | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl FieldIdMapper for DocumentsBatchIndex { | ||||
|     fn id(&self, name: &str) -> Option<FieldId> { | ||||
|         self.id(name) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, thiserror::Error)] | ||||
| pub enum Error { | ||||
|     #[error("Error parsing number {value:?} at line {line}: {error}")] | ||||
|   | ||||
							
								
								
									
										172
									
								
								milli/src/documents/primary_key.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										172
									
								
								milli/src/documents/primary_key.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,172 @@ | ||||
| use std::iter; | ||||
| use std::result::Result as StdResult; | ||||
|  | ||||
| use serde_json::Value; | ||||
|  | ||||
| use crate::{FieldId, InternalError, Object, Result, UserError}; | ||||
|  | ||||
| /// The symbol used to define levels in a nested primary key. | ||||
| const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; | ||||
|  | ||||
| /// The default primary key that is used when not specified. | ||||
| pub const DEFAULT_PRIMARY_KEY: &str = "id"; | ||||
|  | ||||
| /// Trait for objects that can map the name of a field to its [`FieldId`]. | ||||
| pub trait FieldIdMapper { | ||||
|     /// Attempts to map the passed field name to its [`FieldId`]. | ||||
|     /// | ||||
|     /// Returns `None` if no field with this name was found. | ||||
|     fn id(&self, name: &str) -> Option<FieldId>; | ||||
| } | ||||
|  | ||||
| /// A type that represents the kind of primary key that has been set for this index: `Flat` holds the field `name` | ||||
| /// together with its resolved `field_id`, while `Nested` holds only the full `name` of a key whose path contains the split symbol. | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub enum PrimaryKey<'a> { | ||||
|     Flat { name: &'a str, field_id: FieldId }, | ||||
|     Nested { name: &'a str }, | ||||
| } | ||||
|  | ||||
| pub enum DocumentIdExtractionError { | ||||
|     InvalidDocumentId(UserError), | ||||
|     MissingDocumentId, | ||||
|     TooManyDocumentIds(usize), | ||||
| } | ||||
|  | ||||
| impl<'a> PrimaryKey<'a> { | ||||
|     pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> { | ||||
|         Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) { | ||||
|             Self::Nested { name: path } | ||||
|         } else { | ||||
|             let field_id = fields.id(path)?; | ||||
|             Self::Flat { name: path, field_id } | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     pub fn name(&self) -> &str { | ||||
|         match self { | ||||
|             PrimaryKey::Flat { name, .. } => name, | ||||
|             PrimaryKey::Nested { name } => name, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn document_id( | ||||
|         &self, | ||||
|         document: &obkv::KvReader<FieldId>, | ||||
|         fields: &impl FieldIdMapper, | ||||
|     ) -> Result<StdResult<String, DocumentIdExtractionError>> { | ||||
|         match self { | ||||
|             PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) { | ||||
|                 Some(document_id_bytes) => { | ||||
|                     let document_id = serde_json::from_slice(document_id_bytes) | ||||
|                         .map_err(InternalError::SerdeJson)?; | ||||
|                     match validate_document_id_value(document_id)? { | ||||
|                         Ok(document_id) => Ok(Ok(document_id)), | ||||
|                         Err(user_error) => { | ||||
|                             Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error))) | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), | ||||
|             }, | ||||
|             nested @ PrimaryKey::Nested { .. } => { | ||||
|                 let mut matching_documents_ids = Vec::new(); | ||||
|                 for (first_level_name, right) in nested.possible_level_names() { | ||||
|                     if let Some(field_id) = fields.id(first_level_name) { | ||||
|                         if let Some(value_bytes) = document.get(field_id) { | ||||
|                             let object = serde_json::from_slice(value_bytes) | ||||
|                                 .map_err(InternalError::SerdeJson)?; | ||||
|                             fetch_matching_values(object, right, &mut matching_documents_ids); | ||||
|  | ||||
|                             if matching_documents_ids.len() >= 2 { | ||||
|                                 return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds( | ||||
|                                     matching_documents_ids.len(), | ||||
|                                 ))); | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 match matching_documents_ids.pop() { | ||||
|                     Some(document_id) => match validate_document_id_value(document_id)? { | ||||
|                         Ok(document_id) => Ok(Ok(document_id)), | ||||
|                         Err(user_error) => { | ||||
|                             Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error))) | ||||
|                         } | ||||
|                     }, | ||||
|                     None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)), | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Returns an `Iterator` that gives all the possible fields names the primary key | ||||
|     /// can have depending of the first level name and depth of the objects. | ||||
|     pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ { | ||||
|         let name = self.name(); | ||||
|         name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) | ||||
|             .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) | ||||
|             .chain(iter::once((name, ""))) | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) { | ||||
|     match value { | ||||
|         Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), | ||||
|         otherwise => output.push(otherwise), | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn fetch_matching_values_in_object( | ||||
|     object: Object, | ||||
|     selector: &str, | ||||
|     base_key: &str, | ||||
|     output: &mut Vec<Value>, | ||||
| ) { | ||||
|     for (key, value) in object { | ||||
|         let base_key = if base_key.is_empty() { | ||||
|             key.to_string() | ||||
|         } else { | ||||
|             format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) | ||||
|         }; | ||||
|  | ||||
|         if starts_with(selector, &base_key) { | ||||
|             match value { | ||||
|                 Value::Object(object) => { | ||||
|                     fetch_matching_values_in_object(object, selector, &base_key, output) | ||||
|                 } | ||||
|                 value => output.push(value), | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn starts_with(selector: &str, key: &str) -> bool { | ||||
|     selector.strip_prefix(key).map_or(false, |tail| { | ||||
|         tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) | ||||
|     }) | ||||
| } | ||||
|  | ||||
| // FIXME: move to a DocumentId struct | ||||
|  | ||||
/// Returns `document_id` unchanged when it is a valid document id, `None` otherwise.
///
/// A valid id is non-empty and only made of ASCII letters, ASCII digits,
/// hyphens (`-`) and underscores (`_`).
fn validate_document_id(document_id: &str) -> Option<&str> {
    let is_allowed = |c: char| c.is_ascii_alphanumeric() || matches!(c, '-' | '_');
    Some(document_id).filter(|id| !id.is_empty() && id.chars().all(is_allowed))
}
|  | ||||
| pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> { | ||||
|     match document_id { | ||||
|         Value::String(string) => match validate_document_id(&string) { | ||||
|             Some(s) if s.len() == string.len() => Ok(Ok(string)), | ||||
|             Some(s) => Ok(Ok(s.to_string())), | ||||
|             None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), | ||||
|         }, | ||||
|         Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), | ||||
|         content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), | ||||
|     } | ||||
| } | ||||
| @@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry { | ||||
|  | ||||
| #[derive(Error, Debug)] | ||||
| pub enum UserError { | ||||
|     #[error("A soft deleted internal document id have been used: `{document_id}`.")] | ||||
|     AccessingSoftDeletedDocument { document_id: DocumentId }, | ||||
|     #[error("A document cannot contain more than 65,535 fields.")] | ||||
|     AttributeLimitReached, | ||||
|     #[error(transparent)] | ||||
| @@ -154,7 +152,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco | ||||
|         valid_fields: BTreeSet<String>, | ||||
|         hidden_fields: bool, | ||||
|     }, | ||||
|     #[error("{}", HeedError::BadOpenOptions)] | ||||
|     #[error("an environment is already opened with different options")] | ||||
|     InvalidLmdbOpenOptions, | ||||
|     #[error("You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.")] | ||||
|     SortRankingRuleMissing, | ||||
| @@ -328,11 +326,12 @@ impl From<HeedError> for Error { | ||||
|             HeedError::Mdb(MdbError::MapFull) => UserError(MaxDatabaseSizeReached), | ||||
|             HeedError::Mdb(MdbError::Invalid) => UserError(InvalidStoreFile), | ||||
|             HeedError::Mdb(error) => InternalError(Store(error)), | ||||
|             HeedError::Encoding => InternalError(Serialization(Encoding { db_name: None })), | ||||
|             HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })), | ||||
|             // TODO use the encoding | ||||
|             HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })), | ||||
|             HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })), | ||||
|             HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), | ||||
|             HeedError::DatabaseClosing => InternalError(DatabaseClosing), | ||||
|             HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions), | ||||
|             HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,159 +1,75 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::HashMap; | ||||
| use std::convert::TryInto; | ||||
| use std::{fmt, str}; | ||||
|  | ||||
| use fst::map::IndexedValue; | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| use roaring::RoaringBitmap; | ||||
| use heed::types::Str; | ||||
| use heed::{Database, RoIter, RoTxn, RwTxn}; | ||||
|  | ||||
| const DELETED_ID: u64 = u64::MAX; | ||||
| use crate::{DocumentId, BEU32}; | ||||
|  | ||||
| pub struct ExternalDocumentsIds<'a> { | ||||
|     pub(crate) hard: fst::Map<Cow<'a, [u8]>>, | ||||
|     pub(crate) soft: fst::Map<Cow<'a, [u8]>>, | ||||
|     soft_deleted_docids: RoaringBitmap, | ||||
| pub enum DocumentOperationKind { | ||||
|     Create, | ||||
|     Delete, | ||||
| } | ||||
|  | ||||
| impl<'a> ExternalDocumentsIds<'a> { | ||||
|     pub fn new( | ||||
|         hard: fst::Map<Cow<'a, [u8]>>, | ||||
|         soft: fst::Map<Cow<'a, [u8]>>, | ||||
|         soft_deleted_docids: RoaringBitmap, | ||||
|     ) -> ExternalDocumentsIds<'a> { | ||||
|         ExternalDocumentsIds { hard, soft, soft_deleted_docids } | ||||
|     } | ||||
| pub struct DocumentOperation { | ||||
|     pub external_id: String, | ||||
|     pub internal_id: DocumentId, | ||||
|     pub kind: DocumentOperationKind, | ||||
| } | ||||
|  | ||||
|     pub fn into_static(self) -> ExternalDocumentsIds<'static> { | ||||
|         ExternalDocumentsIds { | ||||
|             hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), | ||||
|             soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), | ||||
|             soft_deleted_docids: self.soft_deleted_docids, | ||||
|         } | ||||
| pub struct ExternalDocumentsIds(Database<Str, BEU32>); | ||||
|  | ||||
| impl ExternalDocumentsIds { | ||||
|     pub fn new(db: Database<Str, BEU32>) -> ExternalDocumentsIds { | ||||
|         ExternalDocumentsIds(db) | ||||
|     } | ||||
|  | ||||
|     /// Returns `true` if hard and soft external documents lists are empty. | ||||
|     pub fn is_empty(&self) -> bool { | ||||
|         self.hard.is_empty() && self.soft.is_empty() | ||||
|     pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> { | ||||
|         self.0.is_empty(rtxn).map_err(Into::into) | ||||
|     } | ||||
|  | ||||
|     pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> { | ||||
|         let external_id = external_id.as_ref(); | ||||
|         match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { | ||||
|             Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => { | ||||
|                 Some(id.try_into().unwrap()) | ||||
|             } | ||||
|             _otherwise => None, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they | ||||
|     /// don't contain any soft deleted document id. | ||||
|     pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> { | ||||
|         let mut new_hard_builder = fst::MapBuilder::memory(); | ||||
|  | ||||
|         let union_op = self.hard.op().add(&self.soft).r#union(); | ||||
|         let mut iter = union_op.into_stream(); | ||||
|         while let Some((external_id, docids)) = iter.next() { | ||||
|             // prefer selecting the ids from soft, always | ||||
|             let id = indexed_last_value(docids).unwrap(); | ||||
|             if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) { | ||||
|                 new_hard_builder.insert(external_id, id)?; | ||||
|             } | ||||
|         } | ||||
|         drop(iter); | ||||
|  | ||||
|         // Delete soft map completely | ||||
|         self.soft = fst::Map::default().map_data(Cow::Owned)?; | ||||
|         // We save the new map as the new hard map. | ||||
|         self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> { | ||||
|         let union_op = self.soft.op().add(other).r#union(); | ||||
|  | ||||
|         let mut new_soft_builder = fst::MapBuilder::memory(); | ||||
|         let mut iter = union_op.into_stream(); | ||||
|         while let Some((external_id, marked_docids)) = iter.next() { | ||||
|             let id = indexed_last_value(marked_docids).unwrap(); | ||||
|             new_soft_builder.insert(external_id, id)?; | ||||
|         } | ||||
|  | ||||
|         drop(iter); | ||||
|  | ||||
|         // We save the new map as the new soft map. | ||||
|         self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?; | ||||
|         self.merge_soft_into_hard() | ||||
|     pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> { | ||||
|         self.0.get(rtxn, external_id.as_ref()) | ||||
|     } | ||||
|  | ||||
|     /// An helper function to debug this type, returns an `HashMap` of both, | ||||
|     /// soft and hard fst maps, combined. | ||||
|     pub fn to_hash_map(&self) -> HashMap<String, u32> { | ||||
|         let mut map = HashMap::new(); | ||||
|  | ||||
|         let union_op = self.hard.op().add(&self.soft).r#union(); | ||||
|         let mut iter = union_op.into_stream(); | ||||
|         while let Some((external_id, marked_docids)) = iter.next() { | ||||
|             let id = indexed_last_value(marked_docids).unwrap(); | ||||
|             if id != DELETED_ID { | ||||
|                 let external_id = str::from_utf8(external_id).unwrap(); | ||||
|                 map.insert(external_id.to_owned(), id.try_into().unwrap()); | ||||
|             } | ||||
|     pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, u32>> { | ||||
|         let mut map = HashMap::default(); | ||||
|         for result in self.0.iter(rtxn)? { | ||||
|             let (external, internal) = result?; | ||||
|             map.insert(external.to_owned(), internal); | ||||
|         } | ||||
|  | ||||
|         map | ||||
|         Ok(map) | ||||
|     } | ||||
|  | ||||
|     /// Return an fst of the combined hard and soft deleted ID. | ||||
|     pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> { | ||||
|         if self.soft.is_empty() { | ||||
|             return Ok(Cow::Borrowed(&self.hard)); | ||||
|         } | ||||
|         let union_op = self.hard.op().add(&self.soft).r#union(); | ||||
|  | ||||
|         let mut iter = union_op.into_stream(); | ||||
|         let mut new_hard_builder = fst::MapBuilder::memory(); | ||||
|         while let Some((external_id, marked_docids)) = iter.next() { | ||||
|             let value = indexed_last_value(marked_docids).unwrap(); | ||||
|             if value != DELETED_ID { | ||||
|                 new_hard_builder.insert(external_id, value)?; | ||||
|     /// Applies the list of operations passed as argument, modifying the current external to internal id mapping. | ||||
|     /// | ||||
|     /// If the list contains multiple operations on the same external id, then the result is unspecified. | ||||
|     /// | ||||
|     /// # Panics | ||||
|     /// | ||||
|     /// - If attempting to delete a document that doesn't exist | ||||
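|     /// - NOTE(review): creating a document that already exists does not appear to panic: the `Create` arm only calls `put`, with no existence check — confirm the intended contract | ||||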
|     pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> { | ||||
|         for DocumentOperation { external_id, internal_id, kind } in operations { | ||||
|             match kind { | ||||
|                 DocumentOperationKind::Create => { | ||||
|                     self.0.put(wtxn, &external_id, &internal_id)?; | ||||
|                 } | ||||
|                 DocumentOperationKind::Delete => { | ||||
|                     if !self.0.delete(wtxn, &external_id)? { | ||||
|                         panic!("Attempting to delete a non-existing document") | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         drop(iter); | ||||
|  | ||||
|         Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?)) | ||||
|     } | ||||
|  | ||||
|     fn merge_soft_into_hard(&mut self) -> fst::Result<()> { | ||||
|         if self.soft.len() >= self.hard.len() / 2 { | ||||
|             self.hard = self.to_fst()?.into_owned(); | ||||
|             self.soft = fst::Map::default().map_data(Cow::Owned)?; | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl fmt::Debug for ExternalDocumentsIds<'_> { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish() | ||||
|     /// Returns an iterator over all the external ids. | ||||
|     pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, BEU32>> { | ||||
|         self.0.iter(rtxn) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Default for ExternalDocumentsIds<'static> { | ||||
|     fn default() -> Self { | ||||
|         ExternalDocumentsIds { | ||||
|             hard: fst::Map::default().map_data(Cow::Owned).unwrap(), | ||||
|             soft: fst::Map::default().map_data(Cow::Owned).unwrap(), | ||||
|             soft_deleted_docids: RoaringBitmap::new(), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Returns the value of the `IndexedValue` with the highest _index_. | ||||
| fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> { | ||||
|     indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value) | ||||
| } | ||||
|   | ||||
| @@ -81,6 +81,12 @@ impl Default for FieldsIdsMap { | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl crate::documents::FieldIdMapper for FieldsIdsMap { | ||||
|     fn id(&self, name: &str) -> Option<FieldId> { | ||||
|         self.id(name) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use super::*; | ||||
|   | ||||
| @@ -2,26 +2,28 @@ use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
| use std::str; | ||||
|  | ||||
| use heed::BoxedError; | ||||
|  | ||||
| pub struct BEU16StrCodec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for BEU16StrCodec { | ||||
|     type DItem = (u16, &'a str); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let (n_bytes, str_bytes) = bytes.split_at(2); | ||||
|         let n = n_bytes.try_into().map(u16::from_be_bytes).ok()?; | ||||
|         let s = str::from_utf8(str_bytes).ok()?; | ||||
|         Some((n, s)) | ||||
|         let n = n_bytes.try_into().map(u16::from_be_bytes)?; | ||||
|         let s = str::from_utf8(str_bytes)?; | ||||
|         Ok((n, s)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for BEU16StrCodec { | ||||
|     type EItem = (u16, &'a str); | ||||
|  | ||||
|     fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode((n, s): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let mut bytes = Vec::with_capacity(s.len() + 2); | ||||
|         bytes.extend_from_slice(&n.to_be_bytes()); | ||||
|         bytes.extend_from_slice(s.as_bytes()); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|         Ok(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -2,26 +2,28 @@ use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
| use std::str; | ||||
|  | ||||
| use heed::BoxedError; | ||||
|  | ||||
| pub struct BEU32StrCodec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for BEU32StrCodec { | ||||
|     type DItem = (u32, &'a str); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let (n_bytes, str_bytes) = bytes.split_at(4); | ||||
|         let n = n_bytes.try_into().map(u32::from_be_bytes).ok()?; | ||||
|         let s = str::from_utf8(str_bytes).ok()?; | ||||
|         Some((n, s)) | ||||
|         let n = n_bytes.try_into().map(u32::from_be_bytes)?; | ||||
|         let s = str::from_utf8(str_bytes)?; | ||||
|         Ok((n, s)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for BEU32StrCodec { | ||||
|     type EItem = (u32, &'a str); | ||||
|  | ||||
|     fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode((n, s): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let mut bytes = Vec::with_capacity(s.len() + 4); | ||||
|         bytes.extend_from_slice(&n.to_be_bytes()); | ||||
|         bytes.extend_from_slice(s.as_bytes()); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|         Ok(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,23 +1,23 @@ | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use heed::{BytesDecode, BytesEncode}; | ||||
| use heed::{BoxedError, BytesDecode, BytesEncode}; | ||||
|  | ||||
| /// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated | ||||
| /// A codec for values of type `&[u8]`. Unlike `Bytes`, its `EItem` and `DItem` associated | ||||
| /// types are equivalent (= `&'a [u8]`) and these values can reside within another structure. | ||||
| pub struct ByteSliceRefCodec; | ||||
| pub struct BytesRefCodec; | ||||
|  | ||||
| impl<'a> BytesEncode<'a> for ByteSliceRefCodec { | ||||
| impl<'a> BytesEncode<'a> for BytesRefCodec { | ||||
|     type EItem = &'a [u8]; | ||||
|  | ||||
|     fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { | ||||
|         Some(Cow::Borrowed(item)) | ||||
|     fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> { | ||||
|         Ok(Cow::Borrowed(item)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> BytesDecode<'a> for ByteSliceRefCodec { | ||||
| impl<'a> BytesDecode<'a> for BytesRefCodec { | ||||
|     type DItem = &'a [u8]; | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         Some(bytes) | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         Ok(bytes) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,8 +1,9 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::marker::PhantomData; | ||||
|  | ||||
| use heed::{BytesDecode, BytesEncode}; | ||||
| use heed::{BoxedError, BytesDecode, BytesEncode}; | ||||
|  | ||||
| use crate::heed_codec::SliceTooShortError; | ||||
| use crate::{try_split_array_at, DocumentId, FieldId}; | ||||
|  | ||||
| pub struct FieldDocIdFacetCodec<C>(PhantomData<C>); | ||||
| @@ -13,16 +14,16 @@ where | ||||
| { | ||||
|     type DItem = (FieldId, DocumentId, C::DItem); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (field_id_bytes, bytes) = try_split_array_at(bytes)?; | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?; | ||||
|         let field_id = u16::from_be_bytes(field_id_bytes); | ||||
|  | ||||
|         let (document_id_bytes, bytes) = try_split_array_at(bytes)?; | ||||
|         let (document_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?; | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|  | ||||
|         let value = C::bytes_decode(bytes)?; | ||||
|  | ||||
|         Some((field_id, document_id, value)) | ||||
|         Ok((field_id, document_id, value)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -32,13 +33,15 @@ where | ||||
| { | ||||
|     type EItem = (FieldId, DocumentId, C::EItem); | ||||
|  | ||||
|     fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode( | ||||
|         (field_id, document_id, value): &'a Self::EItem, | ||||
|     ) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let mut bytes = Vec::with_capacity(32); | ||||
|         bytes.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes | ||||
|         bytes.extend_from_slice(&document_id.to_be_bytes()); // 4 bytes | ||||
|         let value_bytes = C::bytes_encode(value)?; | ||||
|         // variable length, if f64 -> 16 bytes, if string -> large, potentially | ||||
|         bytes.extend_from_slice(&value_bytes); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|         Ok(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -5,8 +5,8 @@ use std::borrow::Cow; | ||||
| use std::convert::TryFrom; | ||||
| use std::marker::PhantomData; | ||||
|  | ||||
| use heed::types::{DecodeIgnore, OwnedType}; | ||||
| use heed::{BytesDecode, BytesEncode}; | ||||
| use heed::types::DecodeIgnore; | ||||
| use heed::{BoxedError, BytesDecode, BytesEncode}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec; | ||||
| @@ -18,7 +18,7 @@ pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>; | ||||
| pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>; | ||||
| pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec<DecodeIgnore>; | ||||
|  | ||||
| pub type FieldIdCodec = OwnedType<BEU16>; | ||||
| pub type FieldIdCodec = BEU16; | ||||
|  | ||||
| /// Tries to split a slice in half at the given middle point, | ||||
| /// `None` if the slice is too short. | ||||
| @@ -58,7 +58,7 @@ where | ||||
| { | ||||
|     type EItem = FacetGroupKey<T::EItem>; | ||||
|  | ||||
|     fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { | ||||
|     fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> { | ||||
|         let mut v = vec![]; | ||||
|         v.extend_from_slice(&value.field_id.to_be_bytes()); | ||||
|         v.extend_from_slice(&[value.level]); | ||||
| @@ -66,7 +66,7 @@ where | ||||
|         let bound = T::bytes_encode(&value.left_bound)?; | ||||
|         v.extend_from_slice(&bound); | ||||
|  | ||||
|         Some(Cow::Owned(v)) | ||||
|         Ok(Cow::Owned(v)) | ||||
|     } | ||||
| } | ||||
| impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T> | ||||
| @@ -75,11 +75,11 @@ where | ||||
| { | ||||
|     type DItem = FacetGroupKey<T::DItem>; | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?); | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1])?); | ||||
|         let level = bytes[2]; | ||||
|         let bound = T::bytes_decode(&bytes[3..])?; | ||||
|         Some(FacetGroupKey { field_id: fid, level, left_bound: bound }) | ||||
|         Ok(FacetGroupKey { field_id: fid, level, left_bound: bound }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -87,17 +87,17 @@ pub struct FacetGroupValueCodec; | ||||
| impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { | ||||
|     type EItem = FacetGroupValue; | ||||
|  | ||||
|     fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { | ||||
|     fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> { | ||||
|         let mut v = vec![value.size]; | ||||
|         CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); | ||||
|         Some(Cow::Owned(v)) | ||||
|         Ok(Cow::Owned(v)) | ||||
|     } | ||||
| } | ||||
| impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { | ||||
|     type DItem = FacetGroupValue; | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let size = bytes[0]; | ||||
|         let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?; | ||||
|         Some(FacetGroupValue { size, bitmap }) | ||||
|         let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?; | ||||
|         Ok(FacetGroupValue { size, bitmap }) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,37 +1,45 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
|  | ||||
| use heed::BytesDecode; | ||||
| use heed::{BoxedError, BytesDecode}; | ||||
| use thiserror::Error; | ||||
|  | ||||
| use crate::facet::value_encoding::f64_into_bytes; | ||||
| use crate::heed_codec::SliceTooShortError; | ||||
|  | ||||
| pub struct OrderedF64Codec; | ||||
|  | ||||
| impl<'a> BytesDecode<'a> for OrderedF64Codec { | ||||
|     type DItem = f64; | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         if bytes.len() < 16 { | ||||
|             return None; | ||||
|             Err(SliceTooShortError.into()) | ||||
|         } else { | ||||
|             bytes[8..].try_into().map(f64::from_be_bytes).map_err(Into::into) | ||||
|         } | ||||
|         let f = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; | ||||
|         Some(f) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesEncode<'_> for OrderedF64Codec { | ||||
|     type EItem = f64; | ||||
|  | ||||
|     fn bytes_encode(f: &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode(f: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let mut buffer = [0u8; 16]; | ||||
|  | ||||
|         // write the globally ordered float | ||||
|         let bytes = f64_into_bytes(*f)?; | ||||
|         let bytes = f64_into_bytes(*f).ok_or(InvalidGloballyOrderedFloatError { float: *f })?; | ||||
|         buffer[..8].copy_from_slice(&bytes[..]); | ||||
|         // Then the f64 value just to be able to read it back | ||||
|         let bytes = f.to_be_bytes(); | ||||
|         buffer[8..16].copy_from_slice(&bytes[..]); | ||||
|  | ||||
|         Some(Cow::Owned(buffer.to_vec())) | ||||
|         Ok(Cow::Owned(buffer.to_vec())) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Error, Debug)] | ||||
| #[error("the float {float} cannot be converted to a globally ordered representation")] | ||||
| pub struct InvalidGloballyOrderedFloatError { | ||||
|     float: f64, | ||||
| } | ||||
|   | ||||
| @@ -1,5 +1,8 @@ | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use heed::BoxedError; | ||||
|  | ||||
| use super::SliceTooShortError; | ||||
| use crate::{try_split_array_at, FieldId}; | ||||
|  | ||||
| pub struct FieldIdWordCountCodec; | ||||
| @@ -7,21 +10,21 @@ pub struct FieldIdWordCountCodec; | ||||
| impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec { | ||||
|     type DItem = (FieldId, u8); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (field_id_bytes, bytes) = try_split_array_at(bytes)?; | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?; | ||||
|         let field_id = u16::from_be_bytes(field_id_bytes); | ||||
|         let ([word_count], _nothing) = try_split_array_at(bytes)?; | ||||
|         Some((field_id, word_count)) | ||||
|         let ([word_count], _nothing) = try_split_array_at(bytes).ok_or(SliceTooShortError)?; | ||||
|         Ok((field_id, word_count)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec { | ||||
|     type EItem = (FieldId, u8); | ||||
|  | ||||
|     fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode((field_id, word_count): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let mut bytes = Vec::with_capacity(2 + 1); | ||||
|         bytes.extend_from_slice(&field_id.to_be_bytes()); | ||||
|         bytes.push(*word_count); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|         Ok(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,7 +1,7 @@ | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use fst::Set; | ||||
| use heed::{BytesDecode, BytesEncode}; | ||||
| use heed::{BoxedError, BytesDecode, BytesEncode}; | ||||
|  | ||||
| /// A codec for values of type `Set<&[u8]>`. | ||||
| pub struct FstSetCodec; | ||||
| @@ -9,15 +9,15 @@ pub struct FstSetCodec; | ||||
| impl<'a> BytesEncode<'a> for FstSetCodec { | ||||
|     type EItem = Set<Vec<u8>>; | ||||
|  | ||||
|     fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> { | ||||
|         Some(Cow::Borrowed(item.as_fst().as_bytes())) | ||||
|     fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> { | ||||
|         Ok(Cow::Borrowed(item.as_fst().as_bytes())) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> BytesDecode<'a> for FstSetCodec { | ||||
|     type DItem = Set<&'a [u8]>; | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         Set::new(bytes).ok() | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         Set::new(bytes).map_err(Into::into) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -12,8 +12,10 @@ mod str_beu32_codec; | ||||
| mod str_ref; | ||||
| mod str_str_u8_codec; | ||||
|  | ||||
| pub use byte_slice_ref::ByteSliceRefCodec; | ||||
| pub use byte_slice_ref::BytesRefCodec; | ||||
| use heed::BoxedError; | ||||
| pub use str_ref::StrRefCodec; | ||||
| use thiserror::Error; | ||||
|  | ||||
| pub use self::beu16_str_codec::BEU16StrCodec; | ||||
| pub use self::beu32_str_codec::BEU32StrCodec; | ||||
| @@ -31,5 +33,9 @@ pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; | ||||
| pub trait BytesDecodeOwned { | ||||
|     type DItem; | ||||
|  | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem>; | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError>; | ||||
| } | ||||
|  | ||||
| #[derive(Error, Debug)] | ||||
| #[error("the slice is too short")] | ||||
| pub struct SliceTooShortError; | ||||
|   | ||||
| @@ -1,5 +1,6 @@ | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use heed::BoxedError; | ||||
| use obkv::{KvReaderU16, KvWriterU16}; | ||||
|  | ||||
| pub struct ObkvCodec; | ||||
| @@ -7,15 +8,15 @@ pub struct ObkvCodec; | ||||
| impl<'a> heed::BytesDecode<'a> for ObkvCodec { | ||||
|     type DItem = KvReaderU16<'a>; | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         Some(KvReaderU16::new(bytes)) | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         Ok(KvReaderU16::new(bytes)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesEncode<'_> for ObkvCodec { | ||||
|     type EItem = KvWriterU16<Vec<u8>>; | ||||
|  | ||||
|     fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         item.clone().into_inner().map(Cow::Owned).ok() | ||||
|     fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         item.clone().into_inner().map(Cow::Owned).map_err(Into::into) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -2,7 +2,7 @@ use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
| use std::mem::size_of; | ||||
|  | ||||
| use heed::BytesDecode; | ||||
| use heed::{BoxedError, BytesDecode}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::heed_codec::BytesDecodeOwned; | ||||
| @@ -19,22 +19,22 @@ impl BoRoaringBitmapCodec { | ||||
| impl BytesDecode<'_> for BoRoaringBitmapCodec { | ||||
|     type DItem = RoaringBitmap; | ||||
|  | ||||
|     fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let mut bitmap = RoaringBitmap::new(); | ||||
|  | ||||
|         for chunk in bytes.chunks(size_of::<u32>()) { | ||||
|             let bytes = chunk.try_into().ok()?; | ||||
|             let bytes = chunk.try_into()?; | ||||
|             bitmap.push(u32::from_ne_bytes(bytes)); | ||||
|         } | ||||
|  | ||||
|         Some(bitmap) | ||||
|         Ok(bitmap) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl BytesDecodeOwned for BoRoaringBitmapCodec { | ||||
|     type DItem = RoaringBitmap; | ||||
|  | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         Self::bytes_decode(bytes) | ||||
|     } | ||||
| } | ||||
| @@ -42,9 +42,9 @@ impl BytesDecodeOwned for BoRoaringBitmapCodec { | ||||
| impl heed::BytesEncode<'_> for BoRoaringBitmapCodec { | ||||
|     type EItem = RoaringBitmap; | ||||
|  | ||||
|     fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let mut out = Vec::new(); | ||||
|         BoRoaringBitmapCodec::serialize_into(item, &mut out); | ||||
|         Some(Cow::Owned(out)) | ||||
|         Ok(Cow::Owned(out)) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -3,9 +3,11 @@ use std::io; | ||||
| use std::mem::size_of; | ||||
|  | ||||
| use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; | ||||
| use heed::BoxedError; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::heed_codec::BytesDecodeOwned; | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd}; | ||||
|  | ||||
| /// This is the limit where using a byteorder became less size efficient | ||||
| /// than using a direct roaring encoding, it is also the point where we are able | ||||
| @@ -60,12 +62,16 @@ impl CboRoaringBitmapCodec { | ||||
|     /// if the merged values length is under the threshold, values are directly | ||||
|     /// serialized in the buffer else a RoaringBitmap is created from the | ||||
|     /// values and is serialized in the buffer. | ||||
|     pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> { | ||||
|     pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()> | ||||
|     where | ||||
|         I: IntoIterator<Item = A>, | ||||
|         A: AsRef<[u8]>, | ||||
|     { | ||||
|         let mut roaring = RoaringBitmap::new(); | ||||
|         let mut vec = Vec::new(); | ||||
|  | ||||
|         for bytes in slices { | ||||
|             if bytes.len() <= THRESHOLD * size_of::<u32>() { | ||||
|             if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() { | ||||
|                 let mut reader = bytes.as_ref(); | ||||
|                 while let Ok(integer) = reader.read_u32::<NativeEndian>() { | ||||
|                     vec.push(integer); | ||||
| @@ -85,7 +91,7 @@ impl CboRoaringBitmapCodec { | ||||
|                 } | ||||
|             } else { | ||||
|                 // We can unwrap safely because the vector is sorted upper. | ||||
|                 let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap(); | ||||
|                 let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap(); | ||||
|                 roaring.serialize_into(buffer)?; | ||||
|             } | ||||
|         } else { | ||||
| @@ -95,31 +101,58 @@ impl CboRoaringBitmapCodec { | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     /// Merges a DelAdd delta into a CboRoaringBitmap. | ||||
|     pub fn merge_deladd_into<'a>( | ||||
|         deladd: KvReaderDelAdd<'_>, | ||||
|         previous: &[u8], | ||||
|         buffer: &'a mut Vec<u8>, | ||||
|     ) -> io::Result<Option<&'a [u8]>> { | ||||
|         // Deserialize the bitmap that is already there | ||||
|         let mut previous = Self::deserialize_from(previous)?; | ||||
|  | ||||
|         // Remove integers we no more want in the previous bitmap | ||||
|         if let Some(value) = deladd.get(DelAdd::Deletion) { | ||||
|             previous -= Self::deserialize_from(value)?; | ||||
|         } | ||||
|  | ||||
|         // Insert the new integers we want in the previous bitmap | ||||
|         if let Some(value) = deladd.get(DelAdd::Addition) { | ||||
|             previous |= Self::deserialize_from(value)?; | ||||
|         } | ||||
|  | ||||
|         if previous.is_empty() { | ||||
|             return Ok(None); | ||||
|         } | ||||
|  | ||||
|         Self::serialize_into(&previous, buffer); | ||||
|         Ok(Some(&buffer[..])) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { | ||||
|     type DItem = RoaringBitmap; | ||||
|  | ||||
|     fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|         Self::deserialize_from(bytes).ok() | ||||
|     fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         Self::deserialize_from(bytes).map_err(Into::into) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl BytesDecodeOwned for CboRoaringBitmapCodec { | ||||
|     type DItem = RoaringBitmap; | ||||
|  | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|         Self::deserialize_from(bytes).ok() | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         Self::deserialize_from(bytes).map_err(Into::into) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesEncode<'_> for CboRoaringBitmapCodec { | ||||
|     type EItem = RoaringBitmap; | ||||
|  | ||||
|     fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let mut vec = Vec::with_capacity(Self::serialized_size(item)); | ||||
|         Self::serialize_into(item, &mut vec); | ||||
|         Some(Cow::Owned(vec)) | ||||
|         Ok(Cow::Owned(vec)) | ||||
|     } | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -1,5 +1,6 @@ | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use heed::BoxedError; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::heed_codec::BytesDecodeOwned; | ||||
| @@ -9,25 +10,25 @@ pub struct RoaringBitmapCodec; | ||||
| impl heed::BytesDecode<'_> for RoaringBitmapCodec { | ||||
|     type DItem = RoaringBitmap; | ||||
|  | ||||
|     fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|         RoaringBitmap::deserialize_unchecked_from(bytes).ok() | ||||
|     fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         RoaringBitmap::deserialize_unchecked_from(bytes).map_err(Into::into) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl BytesDecodeOwned for RoaringBitmapCodec { | ||||
|     type DItem = RoaringBitmap; | ||||
|  | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|         RoaringBitmap::deserialize_from(bytes).ok() | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         RoaringBitmap::deserialize_from(bytes).map_err(Into::into) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl heed::BytesEncode<'_> for RoaringBitmapCodec { | ||||
|     type EItem = RoaringBitmap; | ||||
|  | ||||
|     fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let mut bytes = Vec::with_capacity(item.serialized_size()); | ||||
|         item.serialize_into(&mut bytes).ok()?; | ||||
|         Some(Cow::Owned(bytes)) | ||||
|         item.serialize_into(&mut bytes)?; | ||||
|         Ok(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| use std::mem; | ||||
|  | ||||
| use heed::BytesDecode; | ||||
| use heed::{BoxedError, BytesDecode}; | ||||
|  | ||||
| use crate::heed_codec::BytesDecodeOwned; | ||||
|  | ||||
| @@ -9,15 +9,15 @@ pub struct BoRoaringBitmapLenCodec; | ||||
| impl BytesDecode<'_> for BoRoaringBitmapLenCodec { | ||||
|     type DItem = u64; | ||||
|  | ||||
|     fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|         Some((bytes.len() / mem::size_of::<u32>()) as u64) | ||||
|     fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         Ok((bytes.len() / mem::size_of::<u32>()) as u64) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl BytesDecodeOwned for BoRoaringBitmapLenCodec { | ||||
|     type DItem = u64; | ||||
|  | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         Self::bytes_decode(bytes) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| use std::mem; | ||||
|  | ||||
| use heed::BytesDecode; | ||||
| use heed::{BoxedError, BytesDecode}; | ||||
|  | ||||
| use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec}; | ||||
| use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD; | ||||
| @@ -11,7 +11,7 @@ pub struct CboRoaringBitmapLenCodec; | ||||
| impl BytesDecode<'_> for CboRoaringBitmapLenCodec { | ||||
|     type DItem = u64; | ||||
|  | ||||
|     fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         if bytes.len() <= THRESHOLD * mem::size_of::<u32>() { | ||||
|             // If there is threshold or less than threshold integers that can fit into this array | ||||
|             // of bytes it means that we used the ByteOrder codec serializer. | ||||
| @@ -27,7 +27,7 @@ impl BytesDecode<'_> for CboRoaringBitmapLenCodec { | ||||
| impl BytesDecodeOwned for CboRoaringBitmapLenCodec { | ||||
|     type DItem = u64; | ||||
|  | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         Self::bytes_decode(bytes) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -2,6 +2,7 @@ use std::io::{self, BufRead, Read}; | ||||
| use std::mem; | ||||
|  | ||||
| use byteorder::{LittleEndian, ReadBytesExt}; | ||||
| use heed::BoxedError; | ||||
|  | ||||
| use crate::heed_codec::BytesDecodeOwned; | ||||
|  | ||||
| @@ -56,16 +57,16 @@ impl RoaringBitmapLenCodec { | ||||
| impl heed::BytesDecode<'_> for RoaringBitmapLenCodec { | ||||
|     type DItem = u64; | ||||
|  | ||||
|     fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|         RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok() | ||||
|     fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl BytesDecodeOwned for RoaringBitmapLenCodec { | ||||
|     type DItem = u64; | ||||
|  | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> { | ||||
|         RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok() | ||||
|     fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into) | ||||
|     } | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -1,30 +1,31 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::ffi::CStr; | ||||
| use std::str; | ||||
|  | ||||
| use charabia::{Language, Script}; | ||||
| use heed::BoxedError; | ||||
|  | ||||
| pub struct ScriptLanguageCodec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec { | ||||
|     type DItem = (Script, Language); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let sep = bytes.iter().position(|b| *b == 0)?; | ||||
|         let (s_bytes, l_bytes) = bytes.split_at(sep); | ||||
|         let script = str::from_utf8(s_bytes).ok()?; | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let cstr = CStr::from_bytes_until_nul(bytes)?; | ||||
|         let script = cstr.to_str()?; | ||||
|         let script_name = Script::from_name(script); | ||||
|         let lan = str::from_utf8(l_bytes).ok()?; | ||||
|         // skip '\0' byte between the two strings. | ||||
|         let lan_name = Language::from_name(&lan[1..]); | ||||
|         let lan = str::from_utf8(&bytes[script.len() + 1..])?; | ||||
|         let lan_name = Language::from_name(lan); | ||||
|  | ||||
|         Some((script_name, lan_name)) | ||||
|         Ok((script_name, lan_name)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec { | ||||
|     type EItem = (Script, Language); | ||||
|  | ||||
|     fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode((script, lan): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let script_name = script.name().as_bytes(); | ||||
|         let lan_name = lan.name().as_bytes(); | ||||
|  | ||||
| @@ -33,6 +34,6 @@ impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec { | ||||
|         bytes.push(0); | ||||
|         bytes.extend_from_slice(lan_name); | ||||
|  | ||||
|         Some(Cow::Owned(bytes)) | ||||
|         Ok(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -3,37 +3,41 @@ use std::convert::TryInto; | ||||
| use std::mem::size_of; | ||||
| use std::str; | ||||
|  | ||||
| use heed::BoxedError; | ||||
|  | ||||
| use super::SliceTooShortError; | ||||
|  | ||||
| pub struct StrBEU32Codec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for StrBEU32Codec { | ||||
|     type DItem = (&'a str, u32); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let footer_len = size_of::<u32>(); | ||||
|  | ||||
|         if bytes.len() < footer_len { | ||||
|             return None; | ||||
|             return Err(SliceTooShortError.into()); | ||||
|         } | ||||
|  | ||||
|         let (word, bytes) = bytes.split_at(bytes.len() - footer_len); | ||||
|         let word = str::from_utf8(word).ok()?; | ||||
|         let pos = bytes.try_into().map(u32::from_be_bytes).ok()?; | ||||
|         let word = str::from_utf8(word)?; | ||||
|         let pos = bytes.try_into().map(u32::from_be_bytes)?; | ||||
|  | ||||
|         Some((word, pos)) | ||||
|         Ok((word, pos)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for StrBEU32Codec { | ||||
|     type EItem = (&'a str, u32); | ||||
|  | ||||
|     fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode((word, pos): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let pos = pos.to_be_bytes(); | ||||
|  | ||||
|         let mut bytes = Vec::with_capacity(word.len() + pos.len()); | ||||
|         bytes.extend_from_slice(word.as_bytes()); | ||||
|         bytes.extend_from_slice(&pos[..]); | ||||
|  | ||||
|         Some(Cow::Owned(bytes)) | ||||
|         Ok(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -42,26 +46,27 @@ pub struct StrBEU16Codec; | ||||
| impl<'a> heed::BytesDecode<'a> for StrBEU16Codec { | ||||
|     type DItem = (&'a str, u16); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let footer_len = size_of::<u16>(); | ||||
|  | ||||
|         if bytes.len() < footer_len + 1 { | ||||
|             return None; | ||||
|             return Err(SliceTooShortError.into()); | ||||
|         } | ||||
|  | ||||
|         let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len); | ||||
|         let (_, word) = word_plus_nul_byte.split_last()?; | ||||
|         let word = str::from_utf8(word).ok()?; | ||||
|         let pos = bytes.try_into().map(u16::from_be_bytes).ok()?; | ||||
|         // unwrap: we just checked the footer + 1 above. | ||||
|         let (_, word) = word_plus_nul_byte.split_last().unwrap(); | ||||
|         let word = str::from_utf8(word)?; | ||||
|         let pos = bytes.try_into().map(u16::from_be_bytes)?; | ||||
|  | ||||
|         Some((word, pos)) | ||||
|         Ok((word, pos)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for StrBEU16Codec { | ||||
|     type EItem = (&'a str, u16); | ||||
|  | ||||
|     fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode((word, pos): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let pos = pos.to_be_bytes(); | ||||
|  | ||||
|         let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len()); | ||||
| @@ -69,6 +74,6 @@ impl<'a> heed::BytesEncode<'a> for StrBEU16Codec { | ||||
|         bytes.push(0); | ||||
|         bytes.extend_from_slice(&pos[..]); | ||||
|  | ||||
|         Some(Cow::Owned(bytes)) | ||||
|         Ok(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,6 +1,6 @@ | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use heed::{BytesDecode, BytesEncode}; | ||||
| use heed::{BoxedError, BytesDecode, BytesEncode}; | ||||
|  | ||||
| /// A codec for values of type `&str`. Unlike `Str`, its `EItem` and `DItem` associated | ||||
| /// types are equivalent (= `&'a str`) and these values can reside within another structure. | ||||
| @@ -8,15 +8,14 @@ pub struct StrRefCodec; | ||||
| impl<'a> BytesEncode<'a> for StrRefCodec { | ||||
|     type EItem = &'a str; | ||||
|  | ||||
|     fn bytes_encode(item: &'a &'a str) -> Option<Cow<'a, [u8]>> { | ||||
|         Some(Cow::Borrowed(item.as_bytes())) | ||||
|     fn bytes_encode(item: &'a &'a str) -> Result<Cow<'a, [u8]>, BoxedError> { | ||||
|         Ok(Cow::Borrowed(item.as_bytes())) | ||||
|     } | ||||
| } | ||||
| impl<'a> BytesDecode<'a> for StrRefCodec { | ||||
|     type DItem = &'a str; | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let s = std::str::from_utf8(bytes).ok()?; | ||||
|         Some(s) | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         std::str::from_utf8(bytes).map_err(Into::into) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,32 +1,36 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::ffi::CStr; | ||||
| use std::str; | ||||
|  | ||||
| use heed::BoxedError; | ||||
|  | ||||
| use super::SliceTooShortError; | ||||
|  | ||||
| pub struct U8StrStrCodec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for U8StrStrCodec { | ||||
|     type DItem = (u8, &'a str, &'a str); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (n, bytes) = bytes.split_first()?; | ||||
|         let s1_end = bytes.iter().position(|b| *b == 0)?; | ||||
|         let (s1_bytes, rest) = bytes.split_at(s1_end); | ||||
|         let s2_bytes = &rest[1..]; | ||||
|         let s1 = str::from_utf8(s1_bytes).ok()?; | ||||
|         let s2 = str::from_utf8(s2_bytes).ok()?; | ||||
|         Some((*n, s1, s2)) | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?; | ||||
|         let cstr = CStr::from_bytes_until_nul(bytes)?; | ||||
|         let s1 = cstr.to_str()?; | ||||
|         // skip '\0' byte between the two strings. | ||||
|         let s2 = str::from_utf8(&bytes[s1.len() + 1..])?; | ||||
|         Ok((*n, s1, s2)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for U8StrStrCodec { | ||||
|     type EItem = (u8, &'a str, &'a str); | ||||
|  | ||||
|     fn bytes_encode((n, s1, s2): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode((n, s1, s2): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); | ||||
|         bytes.push(*n); | ||||
|         bytes.extend_from_slice(s1.as_bytes()); | ||||
|         bytes.push(0); | ||||
|         bytes.extend_from_slice(s2.as_bytes()); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|         Ok(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
| pub struct UncheckedU8StrStrCodec; | ||||
| @@ -34,24 +38,25 @@ pub struct UncheckedU8StrStrCodec; | ||||
| impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec { | ||||
|     type DItem = (u8, &'a [u8], &'a [u8]); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let (n, bytes) = bytes.split_first()?; | ||||
|         let s1_end = bytes.iter().position(|b| *b == 0)?; | ||||
|         let (s1_bytes, rest) = bytes.split_at(s1_end); | ||||
|         let s2_bytes = &rest[1..]; | ||||
|         Some((*n, s1_bytes, s2_bytes)) | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { | ||||
|         let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?; | ||||
|         let cstr = CStr::from_bytes_until_nul(bytes)?; | ||||
|         let s1_bytes = cstr.to_bytes(); | ||||
|         // skip '\0' byte between the two strings. | ||||
|         let s2_bytes = &bytes[s1_bytes.len() + 1..]; | ||||
|         Ok((*n, s1_bytes, s2_bytes)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec { | ||||
|     type EItem = (u8, &'a [u8], &'a [u8]); | ||||
|  | ||||
|     fn bytes_encode((n, s1, s2): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|     fn bytes_encode((n, s1, s2): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> { | ||||
|         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); | ||||
|         bytes.push(*n); | ||||
|         bytes.extend_from_slice(s1); | ||||
|         bytes.push(0); | ||||
|         bytes.extend_from_slice(s2); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|         Ok(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
|   | ||||
							
								
								
									
										1083
									
								
								milli/src/index.rs
									
									
									
									
									
								
							
							
						
						
									
										1083
									
								
								milli/src/index.rs
									
									
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -66,9 +66,9 @@ pub use self::search::{ | ||||
| pub type Result<T> = std::result::Result<T, error::Error>; | ||||
|  | ||||
| pub type Attribute = u32; | ||||
| pub type BEU16 = heed::zerocopy::U16<heed::byteorder::BE>; | ||||
| pub type BEU32 = heed::zerocopy::U32<heed::byteorder::BE>; | ||||
| pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>; | ||||
| pub type BEU16 = heed::types::U16<heed::byteorder::BE>; | ||||
| pub type BEU32 = heed::types::U32<heed::byteorder::BE>; | ||||
| pub type BEU64 = heed::types::U64<heed::byteorder::BE>; | ||||
| pub type DocumentId = u32; | ||||
| pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; | ||||
| pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>; | ||||
|   | ||||
| @@ -1,5 +1,7 @@ | ||||
| use std::cmp; | ||||
|  | ||||
| use serde::{Deserialize, Serialize}; | ||||
|  | ||||
| use crate::{relative_from_absolute_position, Position}; | ||||
|  | ||||
| pub const MAX_DISTANCE: u32 = 4; | ||||
| @@ -25,3 +27,11 @@ pub fn positions_proximity(lhs: Position, rhs: Position) -> u32 { | ||||
| pub fn path_proximity(path: &[Position]) -> u32 { | ||||
|     path.windows(2).map(|w| positions_proximity(w[0], w[1])).sum::<u32>() | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| pub enum ProximityPrecision { | ||||
|     #[default] | ||||
|     WordScale, | ||||
|     AttributeScale, | ||||
| } | ||||
|   | ||||
| @@ -2,7 +2,7 @@ use std::collections::{BTreeMap, HashMap, HashSet}; | ||||
| use std::ops::ControlFlow; | ||||
| use std::{fmt, mem}; | ||||
|  | ||||
| use heed::types::ByteSlice; | ||||
| use heed::types::Bytes; | ||||
| use heed::BytesDecode; | ||||
| use indexmap::IndexMap; | ||||
| use roaring::RoaringBitmap; | ||||
| @@ -13,7 +13,7 @@ use crate::facet::FacetType; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec, | ||||
| }; | ||||
| use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec}; | ||||
| use crate::heed_codec::{BytesRefCodec, StrRefCodec}; | ||||
| use crate::search::facet::facet_distribution_iter::{ | ||||
|     count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution, | ||||
| }; | ||||
| @@ -105,7 +105,7 @@ impl<'a> FacetDistribution<'a> { | ||||
|                     key_buffer.truncate(mem::size_of::<FieldId>()); | ||||
|                     key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||
|                     let iter = db | ||||
|                         .remap_key_type::<ByteSlice>() | ||||
|                         .remap_key_type::<Bytes>() | ||||
|                         .prefix_iter(self.rtxn, &key_buffer)? | ||||
|                         .remap_key_type::<FieldDocIdFacetF64Codec>(); | ||||
|  | ||||
| @@ -129,7 +129,7 @@ impl<'a> FacetDistribution<'a> { | ||||
|                     key_buffer.truncate(mem::size_of::<FieldId>()); | ||||
|                     key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||
|                     let iter = db | ||||
|                         .remap_key_type::<ByteSlice>() | ||||
|                         .remap_key_type::<Bytes>() | ||||
|                         .prefix_iter(self.rtxn, &key_buffer)? | ||||
|                         .remap_key_type::<FieldDocIdFacetStringCodec>(); | ||||
|  | ||||
| @@ -172,9 +172,7 @@ impl<'a> FacetDistribution<'a> { | ||||
|  | ||||
|         search_function( | ||||
|             self.rtxn, | ||||
|             self.index | ||||
|                 .facet_id_f64_docids | ||||
|                 .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), | ||||
|             self.index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(), | ||||
|             field_id, | ||||
|             candidates, | ||||
|             |facet_key, nbr_docids, _| { | ||||
| @@ -203,9 +201,7 @@ impl<'a> FacetDistribution<'a> { | ||||
|  | ||||
|         search_function( | ||||
|             self.rtxn, | ||||
|             self.index | ||||
|                 .facet_id_string_docids | ||||
|                 .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), | ||||
|             self.index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(), | ||||
|             field_id, | ||||
|             candidates, | ||||
|             |facet_key, nbr_docids, any_docid| { | ||||
|   | ||||
| @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::{get_first_facet_value, get_highest_level}; | ||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; | ||||
| use crate::heed_codec::ByteSliceRefCodec; | ||||
| use crate::heed_codec::BytesRefCodec; | ||||
| use crate::DocumentId; | ||||
|  | ||||
| /// Call the given closure on the facet distribution of the candidate documents. | ||||
| @@ -23,7 +23,7 @@ use crate::DocumentId; | ||||
| /// keep iterating over the different facet values or stop. | ||||
| pub fn lexicographically_iterate_over_facet_distribution<'t, CB>( | ||||
|     rtxn: &'t heed::RoTxn<'t>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
|     candidates: &RoaringBitmap, | ||||
|     callback: CB, | ||||
| @@ -34,11 +34,11 @@ where | ||||
|     let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback }; | ||||
|     let highest_level = get_highest_level( | ||||
|         rtxn, | ||||
|         db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), | ||||
|         db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(), | ||||
|         field_id, | ||||
|     )?; | ||||
|  | ||||
|     if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? { | ||||
|     if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { | ||||
|         fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; | ||||
|         Ok(()) | ||||
|     } else { | ||||
| @@ -48,7 +48,7 @@ where | ||||
|  | ||||
| pub fn count_iterate_over_facet_distribution<'t, CB>( | ||||
|     rtxn: &'t heed::RoTxn<'t>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
|     candidates: &RoaringBitmap, | ||||
|     mut callback: CB, | ||||
| @@ -77,11 +77,11 @@ where | ||||
|     let mut heap = BinaryHeap::new(); | ||||
|     let highest_level = get_highest_level( | ||||
|         rtxn, | ||||
|         db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), | ||||
|         db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(), | ||||
|         field_id, | ||||
|     )?; | ||||
|  | ||||
|     if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? { | ||||
|     if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { | ||||
|         // We first fill the heap with values from the highest level | ||||
|         let starting_key = | ||||
|             FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; | ||||
| @@ -146,7 +146,7 @@ where | ||||
|     CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>, | ||||
| { | ||||
|     rtxn: &'t heed::RoTxn<'t>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
|     callback: CB, | ||||
| } | ||||
|   | ||||
| @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; | ||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; | ||||
| use crate::heed_codec::ByteSliceRefCodec; | ||||
| use crate::heed_codec::BytesRefCodec; | ||||
| use crate::Result; | ||||
|  | ||||
| /// Find all the document ids for which the given field contains a value contained within | ||||
| @@ -25,11 +25,11 @@ where | ||||
|     let inner; | ||||
|     let left = match left { | ||||
|         Bound::Included(left) => { | ||||
|             inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; | ||||
|             inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?; | ||||
|             Bound::Included(inner.as_ref()) | ||||
|         } | ||||
|         Bound::Excluded(left) => { | ||||
|             inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; | ||||
|             inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?; | ||||
|             Bound::Excluded(inner.as_ref()) | ||||
|         } | ||||
|         Bound::Unbounded => Bound::Unbounded, | ||||
| @@ -37,25 +37,22 @@ where | ||||
|     let inner; | ||||
|     let right = match right { | ||||
|         Bound::Included(right) => { | ||||
|             inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; | ||||
|             inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?; | ||||
|             Bound::Included(inner.as_ref()) | ||||
|         } | ||||
|         Bound::Excluded(right) => { | ||||
|             inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; | ||||
|             inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?; | ||||
|             Bound::Excluded(inner.as_ref()) | ||||
|         } | ||||
|         Bound::Unbounded => Bound::Unbounded, | ||||
|     }; | ||||
|     let db = db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(); | ||||
|     let db = db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(); | ||||
|     let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; | ||||
|     let highest_level = get_highest_level(rtxn, db, field_id)?; | ||||
|  | ||||
|     if let Some(starting_left_bound) = | ||||
|         get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? | ||||
|     { | ||||
|         let rightmost_bound = Bound::Included( | ||||
|             get_last_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)?.unwrap(), | ||||
|         ); // will not fail because get_first_facet_value succeeded | ||||
|     if let Some(starting_left_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { | ||||
|         let rightmost_bound = | ||||
|             Bound::Included(get_last_facet_value::<BytesRefCodec>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded | ||||
|         let group_size = usize::MAX; | ||||
|         f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; | ||||
|         Ok(()) | ||||
| @@ -67,7 +64,7 @@ where | ||||
| /// Fetch the document ids that have a facet with a value between the two given bounds | ||||
| struct FacetRangeSearch<'t, 'b, 'bitmap> { | ||||
|     rtxn: &'t heed::RoTxn<'t>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
|     left: Bound<&'b [u8]>, | ||||
|     right: Bound<&'b [u8]>, | ||||
|   | ||||
| @@ -5,7 +5,7 @@ use super::{get_first_facet_value, get_highest_level}; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, | ||||
| }; | ||||
| use crate::heed_codec::ByteSliceRefCodec; | ||||
| use crate::heed_codec::BytesRefCodec; | ||||
|  | ||||
| /// Return an iterator which iterates over the given candidate documents in | ||||
| /// ascending order of their facet value for the given field id. | ||||
| @@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec; | ||||
| /// The documents returned by the iterator are grouped by the facet values that | ||||
| /// determined their rank. For example, given the documents: | ||||
| /// | ||||
| /// ```ignore | ||||
| /// ```text | ||||
| /// 0: { "colour": ["blue", "green"] } | ||||
| /// 1: { "colour": ["blue", "red"] } | ||||
| /// 2: { "colour": ["orange", "red"] } | ||||
| @@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec; | ||||
| /// ``` | ||||
| /// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator | ||||
| /// over the following elements: | ||||
| /// ```ignore | ||||
| /// ```text | ||||
| /// [0, 4]  // corresponds to all the documents within the candidates that have the facet value "blue" | ||||
| /// [3]     // same for "green" | ||||
| /// [2]     // same for "orange" | ||||
| @@ -31,12 +31,12 @@ use crate::heed_codec::ByteSliceRefCodec; | ||||
| /// Note that once a document id is returned by the iterator, it is never returned again. | ||||
| pub fn ascending_facet_sort<'t>( | ||||
|     rtxn: &'t heed::RoTxn<'t>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
|     candidates: RoaringBitmap, | ||||
| ) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> { | ||||
|     let highest_level = get_highest_level(rtxn, db, field_id)?; | ||||
|     if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? { | ||||
|     if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { | ||||
|         let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; | ||||
|         let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); | ||||
|  | ||||
| @@ -53,14 +53,12 @@ pub fn ascending_facet_sort<'t>( | ||||
|  | ||||
| struct AscendingFacetSort<'t, 'e> { | ||||
|     rtxn: &'t heed::RoTxn<'e>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
|     #[allow(clippy::type_complexity)] | ||||
|     stack: Vec<( | ||||
|         RoaringBitmap, | ||||
|         std::iter::Take< | ||||
|             heed::RoRange<'t, FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|         >, | ||||
|         std::iter::Take<heed::RoRange<'t, FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>>, | ||||
|     )>, | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -7,21 +7,21 @@ use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, | ||||
| }; | ||||
| use crate::heed_codec::ByteSliceRefCodec; | ||||
| use crate::heed_codec::BytesRefCodec; | ||||
|  | ||||
| /// See documentation for [`ascending_facet_sort`](super::ascending_facet_sort). | ||||
| /// | ||||
| /// This function does the same thing, but in the opposite order. | ||||
| pub fn descending_facet_sort<'t>( | ||||
|     rtxn: &'t heed::RoTxn<'t>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
|     candidates: RoaringBitmap, | ||||
| ) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> { | ||||
|     let highest_level = get_highest_level(rtxn, db, field_id)?; | ||||
|     if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? { | ||||
|     if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { | ||||
|         let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; | ||||
|         let last_bound = get_last_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)?.unwrap(); | ||||
|         let last_bound = get_last_facet_value::<BytesRefCodec>(rtxn, db, field_id)?.unwrap(); | ||||
|         let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; | ||||
|         let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); | ||||
|         Ok(itertools::Either::Left(DescendingFacetSort { | ||||
| @@ -37,13 +37,13 @@ pub fn descending_facet_sort<'t>( | ||||
|  | ||||
| struct DescendingFacetSort<'t> { | ||||
|     rtxn: &'t heed::RoTxn<'t>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
|     #[allow(clippy::type_complexity)] | ||||
|     stack: Vec<( | ||||
|         RoaringBitmap, | ||||
|         std::iter::Take< | ||||
|             heed::RoRevRange<'t, FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|             heed::RoRevRange<'t, FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|         >, | ||||
|         Bound<&'t [u8]>, | ||||
|     )>, | ||||
| @@ -100,7 +100,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> { | ||||
|                     *right_bound = Bound::Excluded(left_bound); | ||||
|                     let iter = match self | ||||
|                         .db | ||||
|                         .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>() | ||||
|                         .remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>() | ||||
|                         .rev_range(self.rtxn, &(Bound::Included(starting_key_below), end_key_kelow)) | ||||
|                     { | ||||
|                         Ok(iter) => iter, | ||||
| @@ -123,7 +123,7 @@ mod tests { | ||||
|     use roaring::RoaringBitmap; | ||||
|  | ||||
|     use crate::heed_codec::facet::FacetGroupKeyCodec; | ||||
|     use crate::heed_codec::ByteSliceRefCodec; | ||||
|     use crate::heed_codec::BytesRefCodec; | ||||
|     use crate::milli_snap; | ||||
|     use crate::search::facet::facet_sort_descending::descending_facet_sort; | ||||
|     use crate::search::facet::tests::{ | ||||
| @@ -144,7 +144,7 @@ mod tests { | ||||
|             let txn = index.env.read_txn().unwrap(); | ||||
|             let candidates = (200..=300).collect::<RoaringBitmap>(); | ||||
|             let mut results = String::new(); | ||||
|             let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(); | ||||
|             let db = index.content.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(); | ||||
|             let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); | ||||
|             for el in iter { | ||||
|                 let (docids, _) = el.unwrap(); | ||||
| @@ -167,7 +167,7 @@ mod tests { | ||||
|             let txn = index.env.read_txn().unwrap(); | ||||
|             let candidates = (200..=300).collect::<RoaringBitmap>(); | ||||
|             let mut results = String::new(); | ||||
|             let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(); | ||||
|             let db = index.content.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(); | ||||
|             let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap(); | ||||
|             for el in iter { | ||||
|                 let (docids, _) = el.unwrap(); | ||||
|   | ||||
| @@ -223,12 +223,9 @@ impl<'a> Filter<'a> { | ||||
| impl<'a> Filter<'a> { | ||||
|     pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> { | ||||
|         // to avoid doing this for each recursive call we're going to do it ONCE ahead of time | ||||
|         let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; | ||||
|         let filterable_fields = index.filterable_fields(rtxn)?; | ||||
|  | ||||
|         // and finally we delete all the soft_deleted_documents, again, only once at the very end | ||||
|         self.inner_evaluate(rtxn, index, &filterable_fields) | ||||
|             .map(|result| result - soft_deleted_documents) | ||||
|     } | ||||
|  | ||||
|     fn evaluate_operator( | ||||
|   | ||||
| @@ -1,13 +1,13 @@ | ||||
| pub use facet_sort_ascending::ascending_facet_sort; | ||||
| pub use facet_sort_descending::descending_facet_sort; | ||||
| use heed::types::{ByteSlice, DecodeIgnore}; | ||||
| use heed::types::{Bytes, DecodeIgnore}; | ||||
| use heed::{BytesDecode, RoTxn}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; | ||||
| pub use self::filter::{BadGeoError, Filter}; | ||||
| use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; | ||||
| use crate::heed_codec::ByteSliceRefCodec; | ||||
| use crate::heed_codec::BytesRefCodec; | ||||
| use crate::{Index, Result}; | ||||
| mod facet_distribution; | ||||
| mod facet_distribution_iter; | ||||
| @@ -22,8 +22,10 @@ fn facet_extreme_value<'t>( | ||||
|     let extreme_value = | ||||
|         if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) }; | ||||
|     let (_, extreme_value) = extreme_value?; | ||||
|  | ||||
|     Ok(OrderedF64Codec::bytes_decode(extreme_value)) | ||||
|     OrderedF64Codec::bytes_decode(extreme_value) | ||||
|         .map(Some) | ||||
|         .map_err(heed::Error::Decoding) | ||||
|         .map_err(Into::into) | ||||
| } | ||||
|  | ||||
| pub fn facet_min_value<'t>( | ||||
| @@ -32,7 +34,7 @@ pub fn facet_min_value<'t>( | ||||
|     field_id: u16, | ||||
|     candidates: RoaringBitmap, | ||||
| ) -> Result<Option<f64>> { | ||||
|     let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(); | ||||
|     let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(); | ||||
|     let it = ascending_facet_sort(rtxn, db, field_id, candidates)?; | ||||
|     facet_extreme_value(it) | ||||
| } | ||||
| @@ -43,7 +45,7 @@ pub fn facet_max_value<'t>( | ||||
|     field_id: u16, | ||||
|     candidates: RoaringBitmap, | ||||
| ) -> Result<Option<f64>> { | ||||
|     let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(); | ||||
|     let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(); | ||||
|     let it = descending_facet_sort(rtxn, db, field_id, candidates)?; | ||||
|     facet_extreme_value(it) | ||||
| } | ||||
| @@ -51,7 +53,7 @@ pub fn facet_max_value<'t>( | ||||
| /// Get the first facet value in the facet database | ||||
| pub(crate) fn get_first_facet_value<'t, BoundCodec>( | ||||
|     txn: &'t RoTxn, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
| ) -> heed::Result<Option<BoundCodec::DItem>> | ||||
| where | ||||
| @@ -60,13 +62,12 @@ where | ||||
|     let mut level0prefix = vec![]; | ||||
|     level0prefix.extend_from_slice(&field_id.to_be_bytes()); | ||||
|     level0prefix.push(0); | ||||
|     let mut level0_iter_forward = db | ||||
|         .as_polymorph() | ||||
|         .prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?; | ||||
|     let mut level0_iter_forward = | ||||
|         db.remap_types::<Bytes, DecodeIgnore>().prefix_iter(txn, level0prefix.as_slice())?; | ||||
|     if let Some(first) = level0_iter_forward.next() { | ||||
|         let (first_key, _) = first?; | ||||
|         let first_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(first_key) | ||||
|             .ok_or(heed::Error::Encoding)?; | ||||
|             .map_err(heed::Error::Decoding)?; | ||||
|         Ok(Some(first_key.left_bound)) | ||||
|     } else { | ||||
|         Ok(None) | ||||
| @@ -76,7 +77,7 @@ where | ||||
| /// Get the last facet value in the facet database | ||||
| pub(crate) fn get_last_facet_value<'t, BoundCodec>( | ||||
|     txn: &'t RoTxn, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
| ) -> heed::Result<Option<BoundCodec::DItem>> | ||||
| where | ||||
| @@ -85,13 +86,12 @@ where | ||||
|     let mut level0prefix = vec![]; | ||||
|     level0prefix.extend_from_slice(&field_id.to_be_bytes()); | ||||
|     level0prefix.push(0); | ||||
|     let mut level0_iter_backward = db | ||||
|         .as_polymorph() | ||||
|         .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?; | ||||
|     let mut level0_iter_backward = | ||||
|         db.remap_types::<Bytes, DecodeIgnore>().rev_prefix_iter(txn, level0prefix.as_slice())?; | ||||
|     if let Some(last) = level0_iter_backward.next() { | ||||
|         let (last_key, _) = last?; | ||||
|         let last_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(last_key) | ||||
|             .ok_or(heed::Error::Encoding)?; | ||||
|             .map_err(heed::Error::Decoding)?; | ||||
|         Ok(Some(last_key.left_bound)) | ||||
|     } else { | ||||
|         Ok(None) | ||||
| @@ -101,17 +101,17 @@ where | ||||
| /// Get the height of the highest level in the facet database | ||||
| pub(crate) fn get_highest_level<'t>( | ||||
|     txn: &'t RoTxn<'t>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     field_id: u16, | ||||
| ) -> heed::Result<u8> { | ||||
|     let field_id_prefix = &field_id.to_be_bytes(); | ||||
|     Ok(db | ||||
|         .as_polymorph() | ||||
|         .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, field_id_prefix)? | ||||
|         .remap_types::<Bytes, DecodeIgnore>() | ||||
|         .rev_prefix_iter(txn, field_id_prefix)? | ||||
|         .next() | ||||
|         .map(|el| { | ||||
|             let (key, _) = el.unwrap(); | ||||
|             let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key).unwrap(); | ||||
|             let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key).unwrap(); | ||||
|             key.level | ||||
|         }) | ||||
|         .unwrap_or(0)) | ||||
|   | ||||
| @@ -17,8 +17,7 @@ use crate::error::UserError; | ||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; | ||||
| use crate::score_details::{ScoreDetails, ScoringStrategy}; | ||||
| use crate::{ | ||||
|     execute_search, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, Result, | ||||
|     SearchContext, BEU16, | ||||
|     execute_search, AscDesc, DefaultSearchLogger, DocumentId, FieldId, Index, Result, SearchContext, | ||||
| }; | ||||
|  | ||||
| // Building these factories is not free. | ||||
| @@ -299,7 +298,7 @@ impl<'a> SearchForFacetValues<'a> { | ||||
|             None => return Ok(Vec::new()), | ||||
|         }; | ||||
|  | ||||
|         let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &BEU16::new(fid))? { | ||||
|         let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? { | ||||
|             Some(fst) => fst, | ||||
|             None => return Ok(vec![]), | ||||
|         }; | ||||
|   | ||||
| @@ -3,16 +3,17 @@ use std::collections::hash_map::Entry; | ||||
| use std::hash::Hash; | ||||
|  | ||||
| use fxhash::FxHashMap; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::types::Bytes; | ||||
| use heed::{BytesEncode, Database, RoTxn}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::interner::Interned; | ||||
| use super::Word; | ||||
| use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; | ||||
| use crate::proximity::ProximityPrecision; | ||||
| use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; | ||||
| use crate::{ | ||||
|     CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext, | ||||
|     CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, | ||||
| }; | ||||
|  | ||||
| /// A cache storing pointers to values in the LMDB databases. | ||||
| @@ -25,7 +26,7 @@ pub struct DatabaseCache<'ctx> { | ||||
|     pub word_pair_proximity_docids: | ||||
|         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, | ||||
|     pub word_prefix_pair_proximity_docids: | ||||
|         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, | ||||
|         FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>, | ||||
|     pub prefix_word_pair_proximity_docids: | ||||
|         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>, | ||||
|     pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>, | ||||
| @@ -50,7 +51,7 @@ impl<'ctx> DatabaseCache<'ctx> { | ||||
|         cache_key: K1, | ||||
|         db_key: &'v KC::EItem, | ||||
|         cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>, | ||||
|         db: Database<KC, ByteSlice>, | ||||
|         db: Database<KC, Bytes>, | ||||
|     ) -> Result<Option<DC::DItem>> | ||||
|     where | ||||
|         K1: Copy + Eq + Hash, | ||||
| @@ -63,12 +64,14 @@ impl<'ctx> DatabaseCache<'ctx> { | ||||
|         } | ||||
|  | ||||
|         match cache.get(&cache_key).unwrap() { | ||||
|             Some(Cow::Borrowed(bytes)) => { | ||||
|                 DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) | ||||
|             } | ||||
|             Some(Cow::Owned(bytes)) => { | ||||
|                 DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) | ||||
|             } | ||||
|             Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes) | ||||
|                 .map(Some) | ||||
|                 .map_err(heed::Error::Decoding) | ||||
|                 .map_err(Into::into), | ||||
|             Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes) | ||||
|                 .map(Some) | ||||
|                 .map_err(heed::Error::Decoding) | ||||
|                 .map_err(Into::into), | ||||
|             None => Ok(None), | ||||
|         } | ||||
|     } | ||||
| @@ -78,7 +81,7 @@ impl<'ctx> DatabaseCache<'ctx> { | ||||
|         cache_key: K1, | ||||
|         db_keys: &'v [KC::EItem], | ||||
|         cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>, | ||||
|         db: Database<KC, ByteSlice>, | ||||
|         db: Database<KC, Bytes>, | ||||
|         merger: MergeFn, | ||||
|     ) -> Result<Option<DC::DItem>> | ||||
|     where | ||||
| @@ -110,12 +113,14 @@ impl<'ctx> DatabaseCache<'ctx> { | ||||
|         } | ||||
|  | ||||
|         match cache.get(&cache_key).unwrap() { | ||||
|             Some(Cow::Borrowed(bytes)) => { | ||||
|                 DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) | ||||
|             } | ||||
|             Some(Cow::Owned(bytes)) => { | ||||
|                 DC::bytes_decode_owned(bytes).ok_or(heed::Error::Decoding.into()).map(Some) | ||||
|             } | ||||
|             Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes) | ||||
|                 .map(Some) | ||||
|                 .map_err(heed::Error::Decoding) | ||||
|                 .map_err(Into::into), | ||||
|             Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes) | ||||
|                 .map(Some) | ||||
|                 .map_err(heed::Error::Decoding) | ||||
|                 .map_err(Into::into), | ||||
|             None => Ok(None), | ||||
|         } | ||||
|     } | ||||
| @@ -165,16 +170,16 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|                     word, | ||||
|                     &keys[..], | ||||
|                     &mut self.db_cache.word_docids, | ||||
|                     self.index.word_fid_docids.remap_data_type::<ByteSlice>(), | ||||
|                     self.index.word_fid_docids.remap_data_type::<Bytes>(), | ||||
|                     merge_cbo_roaring_bitmaps, | ||||
|                 ) | ||||
|             } | ||||
|             None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( | ||||
|             None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||
|                 self.txn, | ||||
|                 word, | ||||
|                 self.word_interner.get(word).as_str(), | ||||
|                 &mut self.db_cache.word_docids, | ||||
|                 self.index.word_docids.remap_data_type::<ByteSlice>(), | ||||
|                 self.index.word_docids.remap_data_type::<Bytes>(), | ||||
|             ), | ||||
|         } | ||||
|     } | ||||
| @@ -194,16 +199,16 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|                     word, | ||||
|                     &keys[..], | ||||
|                     &mut self.db_cache.exact_word_docids, | ||||
|                     self.index.word_fid_docids.remap_data_type::<ByteSlice>(), | ||||
|                     self.index.word_fid_docids.remap_data_type::<Bytes>(), | ||||
|                     merge_cbo_roaring_bitmaps, | ||||
|                 ) | ||||
|             } | ||||
|             None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( | ||||
|             None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||
|                 self.txn, | ||||
|                 word, | ||||
|                 self.word_interner.get(word).as_str(), | ||||
|                 &mut self.db_cache.exact_word_docids, | ||||
|                 self.index.exact_word_docids.remap_data_type::<ByteSlice>(), | ||||
|                 self.index.exact_word_docids.remap_data_type::<Bytes>(), | ||||
|             ), | ||||
|         } | ||||
|     } | ||||
| @@ -244,16 +249,16 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|                     prefix, | ||||
|                     &keys[..], | ||||
|                     &mut self.db_cache.word_prefix_docids, | ||||
|                     self.index.word_prefix_fid_docids.remap_data_type::<ByteSlice>(), | ||||
|                     self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(), | ||||
|                     merge_cbo_roaring_bitmaps, | ||||
|                 ) | ||||
|             } | ||||
|             None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( | ||||
|             None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||
|                 self.txn, | ||||
|                 prefix, | ||||
|                 self.word_interner.get(prefix).as_str(), | ||||
|                 &mut self.db_cache.word_prefix_docids, | ||||
|                 self.index.word_prefix_docids.remap_data_type::<ByteSlice>(), | ||||
|                 self.index.word_prefix_docids.remap_data_type::<Bytes>(), | ||||
|             ), | ||||
|         } | ||||
|     } | ||||
| @@ -273,16 +278,16 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|                     prefix, | ||||
|                     &keys[..], | ||||
|                     &mut self.db_cache.exact_word_prefix_docids, | ||||
|                     self.index.word_prefix_fid_docids.remap_data_type::<ByteSlice>(), | ||||
|                     self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(), | ||||
|                     merge_cbo_roaring_bitmaps, | ||||
|                 ) | ||||
|             } | ||||
|             None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( | ||||
|             None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||
|                 self.txn, | ||||
|                 prefix, | ||||
|                 self.word_interner.get(prefix).as_str(), | ||||
|                 &mut self.db_cache.exact_word_prefix_docids, | ||||
|                 self.index.exact_word_prefix_docids.remap_data_type::<ByteSlice>(), | ||||
|                 self.index.exact_word_prefix_docids.remap_data_type::<Bytes>(), | ||||
|             ), | ||||
|         } | ||||
|     } | ||||
| @@ -293,17 +298,67 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|         word2: Interned<String>, | ||||
|         proximity: u8, | ||||
|     ) -> Result<Option<RoaringBitmap>> { | ||||
|         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||
|             self.txn, | ||||
|             (proximity, word1, word2), | ||||
|             &( | ||||
|                 proximity, | ||||
|                 self.word_interner.get(word1).as_str(), | ||||
|                 self.word_interner.get(word2).as_str(), | ||||
|             ), | ||||
|             &mut self.db_cache.word_pair_proximity_docids, | ||||
|             self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(), | ||||
|         ) | ||||
|         match self.index.proximity_precision(self.txn)?.unwrap_or_default() { | ||||
|             ProximityPrecision::AttributeScale => { | ||||
|                 // Force proximity to 0 because: | ||||
|                 // in AttributeScale, there are only 2 possible distances: | ||||
|                 // 1. words in same attribute: in that case the DB contains (0, word1, word2) | ||||
|                 // 2. words in different attributes: no DB entry for these two words. | ||||
|                 let proximity = 0; | ||||
|                 let docids = if let Some(docids) = | ||||
|                     self.db_cache.word_pair_proximity_docids.get(&(proximity, word1, word2)) | ||||
|                 { | ||||
|                     docids | ||||
|                         .as_ref() | ||||
|                         .map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d)) | ||||
|                         .transpose() | ||||
|                         .map_err(heed::Error::Decoding)? | ||||
|                 } else { | ||||
|                     // Compute the distance at the attribute level and store it in the cache. | ||||
|                     let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { | ||||
|                         fids | ||||
|                     } else { | ||||
|                         self.index.fields_ids_map(self.txn)?.ids().collect() | ||||
|                     }; | ||||
|                     let mut docids = RoaringBitmap::new(); | ||||
|                     for fid in fids { | ||||
|                         // for each field, intersect left word bitmap and right word bitmap, | ||||
|                         // then merge the result in a global bitmap before storing it in the cache. | ||||
|                         let word1_docids = self.get_db_word_fid_docids(word1, fid)?; | ||||
|                         let word2_docids = self.get_db_word_fid_docids(word2, fid)?; | ||||
|                         if let (Some(word1_docids), Some(word2_docids)) = | ||||
|                             (word1_docids, word2_docids) | ||||
|                         { | ||||
|                             docids |= word1_docids & word2_docids; | ||||
|                         } | ||||
|                     } | ||||
|                     let encoded = CboRoaringBitmapCodec::bytes_encode(&docids) | ||||
|                         .map(Cow::into_owned) | ||||
|                         .map(Cow::Owned) | ||||
|                         .map(Some) | ||||
|                         .map_err(heed::Error::Decoding)?; | ||||
|                     self.db_cache | ||||
|                         .word_pair_proximity_docids | ||||
|                         .insert((proximity, word1, word2), encoded); | ||||
|                     Some(docids) | ||||
|                 }; | ||||
|  | ||||
|                 Ok(docids) | ||||
|             } | ||||
|             ProximityPrecision::WordScale => { | ||||
|                 DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||
|                     self.txn, | ||||
|                     (proximity, word1, word2), | ||||
|                     &( | ||||
|                         proximity, | ||||
|                         self.word_interner.get(word1).as_str(), | ||||
|                         self.word_interner.get(word2).as_str(), | ||||
|                     ), | ||||
|                     &mut self.db_cache.word_pair_proximity_docids, | ||||
|                     self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(), | ||||
|                 ) | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn get_db_word_pair_proximity_docids_len( | ||||
| @@ -312,54 +367,107 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|         word2: Interned<String>, | ||||
|         proximity: u8, | ||||
|     ) -> Result<Option<u64>> { | ||||
|         DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( | ||||
|             self.txn, | ||||
|             (proximity, word1, word2), | ||||
|             &( | ||||
|                 proximity, | ||||
|                 self.word_interner.get(word1).as_str(), | ||||
|                 self.word_interner.get(word2).as_str(), | ||||
|             ), | ||||
|             &mut self.db_cache.word_pair_proximity_docids, | ||||
|             self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(), | ||||
|         ) | ||||
|         match self.index.proximity_precision(self.txn)?.unwrap_or_default() { | ||||
|             ProximityPrecision::AttributeScale => Ok(self | ||||
|                 .get_db_word_pair_proximity_docids(word1, word2, proximity)? | ||||
|                 .map(|d| d.len())), | ||||
|             ProximityPrecision::WordScale => { | ||||
|                 DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( | ||||
|                     self.txn, | ||||
|                     (proximity, word1, word2), | ||||
|                     &( | ||||
|                         proximity, | ||||
|                         self.word_interner.get(word1).as_str(), | ||||
|                         self.word_interner.get(word2).as_str(), | ||||
|                     ), | ||||
|                     &mut self.db_cache.word_pair_proximity_docids, | ||||
|                     self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(), | ||||
|                 ) | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn get_db_word_prefix_pair_proximity_docids( | ||||
|         &mut self, | ||||
|         word1: Interned<String>, | ||||
|         prefix2: Interned<String>, | ||||
|         proximity: u8, | ||||
|         mut proximity: u8, | ||||
|     ) -> Result<Option<RoaringBitmap>> { | ||||
|         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||
|             self.txn, | ||||
|             (proximity, word1, prefix2), | ||||
|             &( | ||||
|                 proximity, | ||||
|                 self.word_interner.get(word1).as_str(), | ||||
|                 self.word_interner.get(prefix2).as_str(), | ||||
|             ), | ||||
|             &mut self.db_cache.word_prefix_pair_proximity_docids, | ||||
|             self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(), | ||||
|         ) | ||||
|         let proximity_precision = self.index.proximity_precision(self.txn)?.unwrap_or_default(); | ||||
|         if proximity_precision == ProximityPrecision::AttributeScale { | ||||
|             // Force proximity to 0 because: | ||||
|             // in AttributeScale, there are only 2 possible distances: | ||||
|             // 1. words in same attribute: in that case the DB contains (0, word1, word2) | ||||
|             // 2. words in different attributes: no DB entry for these two words. | ||||
|             proximity = 0; | ||||
|         } | ||||
|  | ||||
|         let docids = if let Some(docids) = | ||||
|             self.db_cache.word_prefix_pair_proximity_docids.get(&(proximity, word1, prefix2)) | ||||
|         { | ||||
|             docids.clone() | ||||
|         } else { | ||||
|             let prefix_docids = match proximity_precision { | ||||
|                 ProximityPrecision::AttributeScale => { | ||||
|                     // Compute the distance at the attribute level and store it in the cache. | ||||
|                     let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? { | ||||
|                         fids | ||||
|                     } else { | ||||
|                         self.index.fields_ids_map(self.txn)?.ids().collect() | ||||
|                     }; | ||||
|                     let mut prefix_docids = RoaringBitmap::new(); | ||||
|                     // for each field, intersect left word bitmap and right word bitmap, | ||||
|                     // then merge the result in a global bitmap before storing it in the cache. | ||||
|                     for fid in fids { | ||||
|                         let word1_docids = self.get_db_word_fid_docids(word1, fid)?; | ||||
|                         let prefix2_docids = self.get_db_word_prefix_fid_docids(prefix2, fid)?; | ||||
|                         if let (Some(word1_docids), Some(prefix2_docids)) = | ||||
|                             (word1_docids, prefix2_docids) | ||||
|                         { | ||||
|                             prefix_docids |= word1_docids & prefix2_docids; | ||||
|                         } | ||||
|                     } | ||||
|                     prefix_docids | ||||
|                 } | ||||
|                 ProximityPrecision::WordScale => { | ||||
|                     // compute docids using prefix iter and store the result in the cache. | ||||
|                     let key = U8StrStrCodec::bytes_encode(&( | ||||
|                         proximity, | ||||
|                         self.word_interner.get(word1).as_str(), | ||||
|                         self.word_interner.get(prefix2).as_str(), | ||||
|                     )) | ||||
|                     .unwrap() | ||||
|                     .into_owned(); | ||||
|                     let mut prefix_docids = RoaringBitmap::new(); | ||||
|                     let remap_key_type = self | ||||
|                         .index | ||||
|                         .word_pair_proximity_docids | ||||
|                         .remap_key_type::<Bytes>() | ||||
|                         .prefix_iter(self.txn, &key)?; | ||||
|                     for result in remap_key_type { | ||||
|                         let (_, docids) = result?; | ||||
|  | ||||
|                         prefix_docids |= docids; | ||||
|                     } | ||||
|                     prefix_docids | ||||
|                 } | ||||
|             }; | ||||
|             self.db_cache | ||||
|                 .word_prefix_pair_proximity_docids | ||||
|                 .insert((proximity, word1, prefix2), Some(prefix_docids.clone())); | ||||
|             Some(prefix_docids) | ||||
|         }; | ||||
|         Ok(docids) | ||||
|     } | ||||
|  | ||||
|     pub fn get_db_prefix_word_pair_proximity_docids( | ||||
|         &mut self, | ||||
|         left_prefix: Interned<String>, | ||||
|         right: Interned<String>, | ||||
|         proximity: u8, | ||||
|     ) -> Result<Option<RoaringBitmap>> { | ||||
|         DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( | ||||
|             self.txn, | ||||
|             (proximity, left_prefix, right), | ||||
|             &( | ||||
|                 proximity, | ||||
|                 self.word_interner.get(left_prefix).as_str(), | ||||
|                 self.word_interner.get(right).as_str(), | ||||
|             ), | ||||
|             &mut self.db_cache.prefix_word_pair_proximity_docids, | ||||
|             self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(), | ||||
|         ) | ||||
|         // only accept exact matches on reversed positions | ||||
|         self.get_db_word_pair_proximity_docids(left_prefix, right, proximity) | ||||
|     } | ||||
|  | ||||
|     pub fn get_db_word_fid_docids( | ||||
| @@ -377,7 +485,7 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|             (word, fid), | ||||
|             &(self.word_interner.get(word).as_str(), fid), | ||||
|             &mut self.db_cache.word_fid_docids, | ||||
|             self.index.word_fid_docids.remap_data_type::<ByteSlice>(), | ||||
|             self.index.word_fid_docids.remap_data_type::<Bytes>(), | ||||
|         ) | ||||
|     } | ||||
|  | ||||
| @@ -396,7 +504,7 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|             (word_prefix, fid), | ||||
|             &(self.word_interner.get(word_prefix).as_str(), fid), | ||||
|             &mut self.db_cache.word_prefix_fid_docids, | ||||
|             self.index.word_prefix_fid_docids.remap_data_type::<ByteSlice>(), | ||||
|             self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(), | ||||
|         ) | ||||
|     } | ||||
|  | ||||
| @@ -410,7 +518,7 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|                 let remap_key_type = self | ||||
|                     .index | ||||
|                     .word_fid_docids | ||||
|                     .remap_types::<ByteSlice, ByteSlice>() | ||||
|                     .remap_types::<Bytes, Bytes>() | ||||
|                     .prefix_iter(self.txn, &key)? | ||||
|                     .remap_key_type::<StrBEU16Codec>(); | ||||
|                 for result in remap_key_type { | ||||
| @@ -436,7 +544,7 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|                 let remap_key_type = self | ||||
|                     .index | ||||
|                     .word_prefix_fid_docids | ||||
|                     .remap_types::<ByteSlice, ByteSlice>() | ||||
|                     .remap_types::<Bytes, Bytes>() | ||||
|                     .prefix_iter(self.txn, &key)? | ||||
|                     .remap_key_type::<StrBEU16Codec>(); | ||||
|                 for result in remap_key_type { | ||||
| @@ -464,7 +572,7 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|             (word, position), | ||||
|             &(self.word_interner.get(word).as_str(), position), | ||||
|             &mut self.db_cache.word_position_docids, | ||||
|             self.index.word_position_docids.remap_data_type::<ByteSlice>(), | ||||
|             self.index.word_position_docids.remap_data_type::<Bytes>(), | ||||
|         ) | ||||
|     } | ||||
|  | ||||
| @@ -478,7 +586,7 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|             (word_prefix, position), | ||||
|             &(self.word_interner.get(word_prefix).as_str(), position), | ||||
|             &mut self.db_cache.word_prefix_position_docids, | ||||
|             self.index.word_prefix_position_docids.remap_data_type::<ByteSlice>(), | ||||
|             self.index.word_prefix_position_docids.remap_data_type::<Bytes>(), | ||||
|         ) | ||||
|     } | ||||
|  | ||||
| @@ -492,7 +600,7 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|                 let remap_key_type = self | ||||
|                     .index | ||||
|                     .word_position_docids | ||||
|                     .remap_types::<ByteSlice, ByteSlice>() | ||||
|                     .remap_types::<Bytes, Bytes>() | ||||
|                     .prefix_iter(self.txn, &key)? | ||||
|                     .remap_key_type::<StrBEU16Codec>(); | ||||
|                 for result in remap_key_type { | ||||
| @@ -523,7 +631,7 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|                 let remap_key_type = self | ||||
|                     .index | ||||
|                     .word_prefix_position_docids | ||||
|                     .remap_types::<ByteSlice, ByteSlice>() | ||||
|                     .remap_types::<Bytes, Bytes>() | ||||
|                     .prefix_iter(self.txn, &key)? | ||||
|                     .remap_key_type::<StrBEU16Codec>(); | ||||
|                 for result in remap_key_type { | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| use heed::types::{ByteSlice, Str, Unit}; | ||||
| use heed::types::{Bytes, Str, Unit}; | ||||
| use heed::{Database, RoPrefix, RoTxn}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| @@ -8,7 +8,7 @@ const DOCID_SIZE: usize = 4; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec, | ||||
| }; | ||||
| use crate::heed_codec::ByteSliceRefCodec; | ||||
| use crate::heed_codec::BytesRefCodec; | ||||
| use crate::{Index, Result, SearchContext}; | ||||
|  | ||||
| pub struct DistinctOutput { | ||||
| @@ -71,7 +71,7 @@ pub fn distinct_single_docid( | ||||
|  | ||||
| /// Return all the docids containing the given value in the given field | ||||
| fn facet_value_docids( | ||||
|     database: Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     database: Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     txn: &RoTxn, | ||||
|     field_id: u16, | ||||
|     facet_value: &[u8], | ||||
| @@ -87,12 +87,12 @@ fn facet_number_values<'a>( | ||||
|     field_id: u16, | ||||
|     index: &Index, | ||||
|     txn: &'a RoTxn, | ||||
| ) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<ByteSliceRefCodec>, Unit>> { | ||||
| ) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<BytesRefCodec>, Unit>> { | ||||
|     let key = facet_values_prefix_key(field_id, docid); | ||||
|  | ||||
|     let iter = index | ||||
|         .field_id_docid_facet_f64s | ||||
|         .remap_key_type::<ByteSlice>() | ||||
|         .remap_key_type::<Bytes>() | ||||
|         .prefix_iter(txn, &key)? | ||||
|         .remap_key_type(); | ||||
|  | ||||
| @@ -105,12 +105,12 @@ pub fn facet_string_values<'a>( | ||||
|     field_id: u16, | ||||
|     index: &Index, | ||||
|     txn: &'a RoTxn, | ||||
| ) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<ByteSliceRefCodec>, Str>> { | ||||
| ) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<BytesRefCodec>, Str>> { | ||||
|     let key = facet_values_prefix_key(field_id, docid); | ||||
|  | ||||
|     let iter = index | ||||
|         .field_id_docid_facet_strings | ||||
|         .remap_key_type::<ByteSlice>() | ||||
|         .remap_key_type::<Bytes>() | ||||
|         .prefix_iter(txn, &key)? | ||||
|         .remap_types(); | ||||
|  | ||||
|   | ||||
| @@ -1,7 +1,7 @@ | ||||
| use std::collections::VecDeque; | ||||
| use std::iter::FromIterator; | ||||
|  | ||||
| use heed::types::{ByteSlice, Unit}; | ||||
| use heed::types::{Bytes, Unit}; | ||||
| use heed::{RoPrefix, RoTxn}; | ||||
| use roaring::RoaringBitmap; | ||||
| use rstar::RTree; | ||||
| @@ -34,7 +34,7 @@ fn facet_number_values<'a>( | ||||
|  | ||||
|     let iter = index | ||||
|         .field_id_docid_facet_f64s | ||||
|         .remap_key_type::<ByteSlice>() | ||||
|         .remap_key_type::<Bytes>() | ||||
|         .prefix_iter(txn, &key)? | ||||
|         .remap_key_type(); | ||||
|  | ||||
| @@ -163,7 +163,7 @@ impl<Q: RankingRuleQueryTrait> GeoSort<Q> { | ||||
|             // computing the distance between two points is expensive thus we cache the result | ||||
|             documents | ||||
|                 .sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize); | ||||
|             self.cached_sorted_docids.extend(documents.into_iter()); | ||||
|             self.cached_sorted_docids.extend(documents); | ||||
|         }; | ||||
|  | ||||
|         Ok(()) | ||||
|   | ||||
| @@ -228,7 +228,7 @@ impl<T> Ord for Interned<T> { | ||||
|  | ||||
| impl<T> PartialOrd for Interned<T> { | ||||
|     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> { | ||||
|         self.idx.partial_cmp(&other.idx) | ||||
|         Some(self.cmp(other)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -241,7 +241,7 @@ impl<T> PartialEq for Interned<T> { | ||||
| } | ||||
| impl<T> Clone for Interned<T> { | ||||
|     fn clone(&self) -> Self { | ||||
|         Self { idx: self.idx, _phantom: PhantomData } | ||||
|         *self | ||||
|     } | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -52,7 +52,6 @@ use crate::score_details::{ScoreDetails, ScoringStrategy}; | ||||
| use crate::search::new::distinct::apply_distinct_rule; | ||||
| use crate::{ | ||||
|     AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, | ||||
|     BEU32, | ||||
| }; | ||||
|  | ||||
| /// A structure used throughout the execution of a search query. | ||||
| @@ -469,8 +468,8 @@ pub fn execute_search( | ||||
|                 let mut docids = Vec::new(); | ||||
|                 let mut uniq_docids = RoaringBitmap::new(); | ||||
|                 for instant_distance::Item { distance: _, pid, point: _ } in neighbors { | ||||
|                     let index = BEU32::new(pid.into_inner()); | ||||
|                     let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap().get(); | ||||
|                     let index = pid.into_inner(); | ||||
|                     let docid = ctx.index.vector_id_docid.get(ctx.txn, &index)?.unwrap(); | ||||
|                     if universe.contains(docid) && uniq_docids.insert(docid) { | ||||
|                         docids.push(docid); | ||||
|                         if docids.len() == (from + length) { | ||||
| @@ -627,7 +626,8 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>> | ||||
|                     field: field.to_string(), | ||||
|                     valid_fields, | ||||
|                     hidden_fields, | ||||
|                 })?; | ||||
|                 } | ||||
|                 .into()); | ||||
|             } | ||||
|             Member::Geo(_) if !sortable_fields.contains("_geo") => { | ||||
|                 let (valid_fields, hidden_fields) = | ||||
| @@ -637,7 +637,8 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>> | ||||
|                     field: "_geo".to_string(), | ||||
|                     valid_fields, | ||||
|                     hidden_fields, | ||||
|                 })?; | ||||
|                 } | ||||
|                 .into()); | ||||
|             } | ||||
|             _ => (), | ||||
|         } | ||||
|   | ||||
| @@ -175,7 +175,7 @@ impl QueryTermSubset { | ||||
|  | ||||
|     pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Word> { | ||||
|         let original = ctx.term_interner.get(self.original); | ||||
|         let Some(use_prefix_db) = original.zero_typo.use_prefix_db else { return None }; | ||||
|         let use_prefix_db = original.zero_typo.use_prefix_db?; | ||||
|         let word = match &self.zero_typo_subset { | ||||
|             NTypoTermSubset::All => Some(use_prefix_db), | ||||
|             NTypoTermSubset::Subset { words, phrases: _ } => { | ||||
|   | ||||
| @@ -4,7 +4,7 @@ use roaring::RoaringBitmap; | ||||
| use super::logger::SearchLogger; | ||||
| use super::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait, SearchContext}; | ||||
| use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; | ||||
| use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec}; | ||||
| use crate::heed_codec::{BytesRefCodec, StrRefCodec}; | ||||
| use crate::score_details::{self, ScoreDetails}; | ||||
| use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; | ||||
| use crate::{FieldId, Index, Result}; | ||||
| @@ -100,11 +100,11 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, | ||||
|                 let number_db = ctx | ||||
|                     .index | ||||
|                     .facet_id_f64_docids | ||||
|                     .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(); | ||||
|                     .remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(); | ||||
|                 let string_db = ctx | ||||
|                     .index | ||||
|                     .facet_id_string_docids | ||||
|                     .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(); | ||||
|                     .remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(); | ||||
|  | ||||
|                 let (number_iter, string_iter) = if self.is_ascending { | ||||
|                     let number_iter = ascending_facet_sort( | ||||
|   | ||||
| @@ -124,8 +124,7 @@ fn test_attribute_fid_simple() { | ||||
|     s.query("the quick brown fox jumps over the lazy dog"); | ||||
|     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
| } | ||||
|  | ||||
| @@ -142,7 +141,6 @@ fn test_attribute_fid_ngrams() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
| } | ||||
|   | ||||
| @@ -141,8 +141,7 @@ fn test_attribute_position_simple() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
| } | ||||
| #[test] | ||||
| @@ -158,8 +157,7 @@ fn test_attribute_position_repeated() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
| } | ||||
|  | ||||
| @@ -176,8 +174,7 @@ fn test_attribute_position_different_fields() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
| } | ||||
|  | ||||
| @@ -194,7 +191,6 @@ fn test_attribute_position_ngrams() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
| } | ||||
|   | ||||
| @@ -478,8 +478,7 @@ fn test_exactness_simple_ordered() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|  | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -511,8 +510,7 @@ fn test_exactness_simple_reversed() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|  | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -535,8 +533,7 @@ fn test_exactness_simple_reversed() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|  | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -566,8 +563,7 @@ fn test_exactness_simple_random() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|  | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -596,8 +592,7 @@ fn test_exactness_attribute_starts_with_simple() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|  | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -623,8 +618,7 @@ fn test_exactness_attribute_starts_with_phrase() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|  | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -644,8 +638,7 @@ fn test_exactness_attribute_starts_with_phrase() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|  | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -674,8 +667,7 @@ fn test_exactness_all_candidates_with_typo() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|  | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -711,8 +703,7 @@ fn test_exactness_after_words() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|  | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -760,8 +751,7 @@ fn test_words_after_exactness() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 9, 18, 8, 17, 16, 6, 7, 15, 5, 14, 4, 13, 3, 12, 2, 1, 11]"); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -809,8 +799,7 @@ fn test_proximity_after_exactness() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|  | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 4, 5, 8, 7, 3, 6]"); | ||||
| @@ -847,8 +836,7 @@ fn test_proximity_after_exactness() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2]"); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -881,8 +869,7 @@ fn test_exactness_followed_by_typo_prefer_no_typo_prefix() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 4, 3]"); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
| @@ -917,8 +904,7 @@ fn test_typo_followed_by_exactness() { | ||||
|  | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     let document_ids_scores: Vec<_> = | ||||
|         documents_ids.iter().zip(document_scores.into_iter()).collect(); | ||||
|     let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect(); | ||||
|     insta::assert_snapshot!(format!("{document_ids_scores:#?}")); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 4, 3]"); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
|   | ||||
| @@ -371,7 +371,7 @@ fn test_proximity_prefix_db() { | ||||
|     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); | ||||
|     s.query("best s"); | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]"); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]"); | ||||
|     insta::assert_snapshot!(format!("{document_scores:#?}")); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
|  | ||||
| @@ -379,13 +379,13 @@ fn test_proximity_prefix_db() { | ||||
|     insta::assert_debug_snapshot!(texts, @r###" | ||||
|     [ | ||||
|         "\"this is the best summer meal\"", | ||||
|         "\"summer best\"", | ||||
|         "\"this is the best meal of summer\"", | ||||
|         "\"summer x best\"", | ||||
|         "\"this is the best meal I have ever had in such a beautiful summer day\"", | ||||
|         "\"this is the best cooked meal of the summer\"", | ||||
|         "\"this is the best meal of the summer\"", | ||||
|         "\"summer x y best\"", | ||||
|         "\"summer x best\"", | ||||
|         "\"summer best\"", | ||||
|         "\"this is the best meal I have ever had in such a beautiful winter day\"", | ||||
|     ] | ||||
|     "###); | ||||
| @@ -423,17 +423,17 @@ fn test_proximity_prefix_db() { | ||||
|     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); | ||||
|     s.query("best win"); | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]"); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); | ||||
|     insta::assert_snapshot!(format!("{document_scores:#?}")); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
|  | ||||
|     insta::assert_debug_snapshot!(texts, @r###" | ||||
|     [ | ||||
|         "\"this is the best winter meal\"", | ||||
|         "\"this is the best meal of winter\"", | ||||
|         "\"this is the best meal I have ever had in such a beautiful winter day\"", | ||||
|         "\"this is the best cooked meal of the winter\"", | ||||
|         "\"this is the best meal of the winter\"", | ||||
|         "\"this is the best meal of winter\"", | ||||
|         "\"this is the best winter meal\"", | ||||
|         "\"winter x y best\"", | ||||
|         "\"winter x best\"", | ||||
|         "\"winter best\"", | ||||
| @@ -471,20 +471,20 @@ fn test_proximity_prefix_db() { | ||||
|     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); | ||||
|     s.query("best wi"); | ||||
|     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]"); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); | ||||
|     insta::assert_snapshot!(format!("{document_scores:#?}")); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
|  | ||||
|     insta::assert_debug_snapshot!(texts, @r###" | ||||
|     [ | ||||
|         "\"this is the best winter meal\"", | ||||
|         "\"winter best\"", | ||||
|         "\"this is the best meal of winter\"", | ||||
|         "\"winter x best\"", | ||||
|         "\"this is the best meal I have ever had in such a beautiful winter day\"", | ||||
|         "\"this is the best cooked meal of the winter\"", | ||||
|         "\"this is the best meal of the winter\"", | ||||
|         "\"winter x y best\"", | ||||
|         "\"winter x best\"", | ||||
|         "\"winter best\"", | ||||
|     ] | ||||
|     "###); | ||||
| } | ||||
|   | ||||
| @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
|     [ | ||||
|         Proximity( | ||||
|             Rank { | ||||
|                 rank: 3, | ||||
|                 max_rank: 4, | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
|     [ | ||||
|         Proximity( | ||||
|             Rank { | ||||
| @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" | ||||
|     [ | ||||
|         Proximity( | ||||
|             Rank { | ||||
|                 rank: 2, | ||||
|                 rank: 1, | ||||
|                 max_rank: 4, | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
|     [ | ||||
|         Proximity( | ||||
|             Rank { | ||||
|                 rank: 1, | ||||
|                 max_rank: 4, | ||||
|             }, | ||||
|         ), | ||||
|   | ||||
| @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
|     [ | ||||
|         Proximity( | ||||
|             Rank { | ||||
|                 rank: 3, | ||||
|                 max_rank: 4, | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
|     [ | ||||
|         Proximity( | ||||
|             Rank { | ||||
| @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" | ||||
|     [ | ||||
|         Proximity( | ||||
|             Rank { | ||||
|                 rank: 2, | ||||
|                 rank: 1, | ||||
|                 max_rank: 4, | ||||
|             }, | ||||
|         ), | ||||
|     ], | ||||
|     [ | ||||
|         Proximity( | ||||
|             Rank { | ||||
|                 rank: 1, | ||||
|                 max_rank: 4, | ||||
|             }, | ||||
|         ), | ||||
|   | ||||
| @@ -6,7 +6,7 @@ expression: "format!(\"{document_scores:#?}\")" | ||||
|     [ | ||||
|         Proximity( | ||||
|             Rank { | ||||
|                 rank: 1, | ||||
|                 rank: 4, | ||||
|                 max_rank: 4, | ||||
|             }, | ||||
|         ), | ||||
| @@ -14,7 +14,7 @@ expression: "format!(\"{document_scores:#?}\")" | ||||
|     [ | ||||
|         Proximity( | ||||
|             Rank { | ||||
|                 rank: 1, | ||||
|                 rank: 2, | ||||
|                 max_rank: 4, | ||||
|             }, | ||||
|         ), | ||||
|   | ||||
| @@ -13,6 +13,7 @@ This module tests the `sort` ranking rule: | ||||
|  | ||||
| use big_s::S; | ||||
| use maplit::hashset; | ||||
| use meili_snap::insta; | ||||
|  | ||||
| use crate::index::tests::TempIndex; | ||||
| use crate::search::new::tests::collect_field_values; | ||||
|   | ||||
| @@ -4,9 +4,8 @@ use std::path::Path; | ||||
|  | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::facet::FacetType; | ||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; | ||||
| use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index}; | ||||
| use crate::{make_db_snap_from_iter, obkv_to_json, Index}; | ||||
|  | ||||
| #[track_caller] | ||||
| pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) { | ||||
| @@ -98,7 +97,6 @@ Create a snapshot test of the given database. | ||||
|     - `facet_id_string_docids` | ||||
|     - `documents_ids` | ||||
|     - `stop_words` | ||||
|     - `soft_deleted_documents_ids` | ||||
|     - `field_distribution` | ||||
|     - `fields_ids_map` | ||||
|     - `geo_faceted_documents_ids` | ||||
| @@ -221,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String { | ||||
|         &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) | ||||
|     }) | ||||
| } | ||||
| pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { | ||||
|     make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( | ||||
|         (proximity, word1, prefix), | ||||
|         b, | ||||
|     )| { | ||||
|         &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) | ||||
|     }) | ||||
| } | ||||
| pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String { | ||||
|     make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |( | ||||
|         (proximity, prefix, word2), | ||||
|         b, | ||||
|     )| { | ||||
|         &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b)) | ||||
|     }) | ||||
| } | ||||
| pub fn snap_word_position_docids(index: &Index) -> String { | ||||
|     make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { | ||||
|         &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) | ||||
| @@ -308,12 +290,6 @@ pub fn snap_stop_words(index: &Index) -> String { | ||||
|     let snap = format!("{stop_words:?}"); | ||||
|     snap | ||||
| } | ||||
| pub fn snap_soft_deleted_documents_ids(index: &Index) -> String { | ||||
|     let rtxn = index.read_txn().unwrap(); | ||||
|     let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap(); | ||||
|  | ||||
|     display_bitmap(&soft_deleted_documents_ids) | ||||
| } | ||||
| pub fn snap_field_distributions(index: &Index) -> String { | ||||
|     let rtxn = index.read_txn().unwrap(); | ||||
|     let mut snap = String::new(); | ||||
| @@ -340,50 +316,21 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String { | ||||
| } | ||||
| pub fn snap_external_documents_ids(index: &Index) -> String { | ||||
|     let rtxn = index.read_txn().unwrap(); | ||||
|     let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap(); | ||||
|     let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap(); | ||||
|     // ensure fixed order (not guaranteed by hashmap) | ||||
|     let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect(); | ||||
|     external_ids.sort_by(|(l, _), (r, _)| l.cmp(r)); | ||||
|  | ||||
|     let mut snap = String::new(); | ||||
|  | ||||
|     writeln!(&mut snap, "soft:").unwrap(); | ||||
|     let stream_soft = soft.stream(); | ||||
|     let soft_external_ids = stream_soft.into_str_vec().unwrap(); | ||||
|     for (key, id) in soft_external_ids { | ||||
|         writeln!(&mut snap, "{key:<24} {id}").unwrap(); | ||||
|     } | ||||
|     writeln!(&mut snap, "hard:").unwrap(); | ||||
|     let stream_hard = hard.stream(); | ||||
|     let hard_external_ids = stream_hard.into_str_vec().unwrap(); | ||||
|     for (key, id) in hard_external_ids { | ||||
|     writeln!(&mut snap, "docids:").unwrap(); | ||||
|     for (key, id) in external_ids { | ||||
|         writeln!(&mut snap, "{key:<24} {id}").unwrap(); | ||||
|     } | ||||
|  | ||||
|     snap | ||||
| } | ||||
| pub fn snap_number_faceted_documents_ids(index: &Index) -> String { | ||||
|     let rtxn = index.read_txn().unwrap(); | ||||
|     let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); | ||||
|     let mut snap = String::new(); | ||||
|     for field_id in fields_ids_map.ids() { | ||||
|         let number_faceted_documents_ids = | ||||
|             index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap(); | ||||
|         writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) | ||||
|             .unwrap(); | ||||
|     } | ||||
|     snap | ||||
| } | ||||
| pub fn snap_string_faceted_documents_ids(index: &Index) -> String { | ||||
|     let rtxn = index.read_txn().unwrap(); | ||||
|     let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); | ||||
|  | ||||
|     let mut snap = String::new(); | ||||
|     for field_id in fields_ids_map.ids() { | ||||
|         let string_faceted_documents_ids = | ||||
|             index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap(); | ||||
|         writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) | ||||
|             .unwrap(); | ||||
|     } | ||||
|     snap | ||||
| } | ||||
| pub fn snap_words_fst(index: &Index) -> String { | ||||
|     let rtxn = index.read_txn().unwrap(); | ||||
|     let words_fst = index.words_fst(&rtxn).unwrap(); | ||||
| @@ -516,9 +463,6 @@ macro_rules! full_snap_of_db { | ||||
|     ($index:ident, stop_words) => {{ | ||||
|         $crate::snapshot_tests::snap_stop_words(&$index) | ||||
|     }}; | ||||
|     ($index:ident, soft_deleted_documents_ids) => {{ | ||||
|         $crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index) | ||||
|     }}; | ||||
|     ($index:ident, field_distribution) => {{ | ||||
|         $crate::snapshot_tests::snap_field_distributions(&$index) | ||||
|     }}; | ||||
| @@ -531,12 +475,6 @@ macro_rules! full_snap_of_db { | ||||
|     ($index:ident, external_documents_ids) => {{ | ||||
|         $crate::snapshot_tests::snap_external_documents_ids(&$index) | ||||
|     }}; | ||||
|     ($index:ident, number_faceted_documents_ids) => {{ | ||||
|         $crate::snapshot_tests::snap_number_faceted_documents_ids(&$index) | ||||
|     }}; | ||||
|     ($index:ident, string_faceted_documents_ids) => {{ | ||||
|         $crate::snapshot_tests::snap_string_faceted_documents_ids(&$index) | ||||
|     }}; | ||||
|     ($index:ident, words_fst) => {{ | ||||
|         $crate::snapshot_tests::snap_words_fst(&$index) | ||||
|     }}; | ||||
|   | ||||
| @@ -8,16 +8,11 @@ pub struct AvailableDocumentsIds { | ||||
| } | ||||
|  | ||||
| impl AvailableDocumentsIds { | ||||
|     pub fn from_documents_ids( | ||||
|         docids: &RoaringBitmap, | ||||
|         soft_deleted_docids: &RoaringBitmap, | ||||
|     ) -> AvailableDocumentsIds { | ||||
|         let used_docids = docids | soft_deleted_docids; | ||||
|  | ||||
|         match used_docids.max() { | ||||
|     pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds { | ||||
|         match docids.max() { | ||||
|             Some(last_id) => { | ||||
|                 let mut available = RoaringBitmap::from_iter(0..last_id); | ||||
|                 available -= used_docids; | ||||
|                 available -= docids; | ||||
|  | ||||
|                 let iter = match last_id.checked_add(1) { | ||||
|                     Some(id) => id..=u32::max_value(), | ||||
| @@ -50,7 +45,7 @@ mod tests { | ||||
|     #[test] | ||||
|     fn empty() { | ||||
|         let base = RoaringBitmap::new(); | ||||
|         let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); | ||||
|         let left = AvailableDocumentsIds::from_documents_ids(&base); | ||||
|         let right = 0..=u32::max_value(); | ||||
|         left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); | ||||
|     } | ||||
| @@ -63,28 +58,8 @@ mod tests { | ||||
|         base.insert(100); | ||||
|         base.insert(405); | ||||
|  | ||||
|         let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new()); | ||||
|         let left = AvailableDocumentsIds::from_documents_ids(&base); | ||||
|         let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); | ||||
|         left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn soft_deleted() { | ||||
|         let mut base = RoaringBitmap::new(); | ||||
|         base.insert(0); | ||||
|         base.insert(10); | ||||
|         base.insert(100); | ||||
|         base.insert(405); | ||||
|  | ||||
|         let mut soft_deleted = RoaringBitmap::new(); | ||||
|         soft_deleted.insert(1); | ||||
|         soft_deleted.insert(11); | ||||
|         soft_deleted.insert(101); | ||||
|         soft_deleted.insert(406); | ||||
|  | ||||
|         let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted); | ||||
|         let right = | ||||
|             (0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n)); | ||||
|         left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,16 +1,16 @@ | ||||
| use heed::RwTxn; | ||||
| use roaring::RoaringBitmap; | ||||
| use time::OffsetDateTime; | ||||
|  | ||||
| use crate::facet::FacetType; | ||||
| use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; | ||||
| use crate::{FieldDistribution, Index, Result}; | ||||
|  | ||||
| pub struct ClearDocuments<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
| pub struct ClearDocuments<'t, 'i> { | ||||
|     wtxn: &'t mut RwTxn<'i>, | ||||
|     index: &'i Index, | ||||
| } | ||||
|  | ||||
| impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|     pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> ClearDocuments<'t, 'u, 'i> { | ||||
| impl<'t, 'i> ClearDocuments<'t, 'i> { | ||||
|     pub fn new(wtxn: &'t mut RwTxn<'i>, index: &'i Index) -> ClearDocuments<'t, 'i> { | ||||
|         ClearDocuments { wtxn, index } | ||||
|     } | ||||
|  | ||||
| @@ -21,13 +21,12 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|         let Index { | ||||
|             env: _env, | ||||
|             main: _main, | ||||
|             external_documents_ids, | ||||
|             word_docids, | ||||
|             exact_word_docids, | ||||
|             word_prefix_docids, | ||||
|             exact_word_prefix_docids, | ||||
|             word_pair_proximity_docids, | ||||
|             word_prefix_pair_proximity_docids, | ||||
|             prefix_word_pair_proximity_docids, | ||||
|             word_position_docids, | ||||
|             word_fid_docids, | ||||
|             field_id_word_count_docids, | ||||
| @@ -51,43 +50,23 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|  | ||||
|         // We retrieve the number of documents ids that we are deleting. | ||||
|         let number_of_documents = self.index.number_of_documents(self.wtxn)?; | ||||
|         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; | ||||
|  | ||||
|         // We clean some of the main engine datastructures. | ||||
|         self.index.put_words_fst(self.wtxn, &fst::Set::default())?; | ||||
|         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; | ||||
|         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; | ||||
|         self.index.put_documents_ids(self.wtxn, &empty_roaring)?; | ||||
|         self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?; | ||||
|         self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; | ||||
|         self.index.delete_geo_rtree(self.wtxn)?; | ||||
|         self.index.delete_geo_faceted_documents_ids(self.wtxn)?; | ||||
|         self.index.delete_vector_hnsw(self.wtxn)?; | ||||
|  | ||||
|         // We clean all the faceted documents ids. | ||||
|         for field_id in faceted_fields { | ||||
|             self.index.put_faceted_documents_ids( | ||||
|                 self.wtxn, | ||||
|                 field_id, | ||||
|                 FacetType::Number, | ||||
|                 &empty_roaring, | ||||
|             )?; | ||||
|             self.index.put_faceted_documents_ids( | ||||
|                 self.wtxn, | ||||
|                 field_id, | ||||
|                 FacetType::String, | ||||
|                 &empty_roaring, | ||||
|             )?; | ||||
|         } | ||||
|  | ||||
|         // Clear the other databases. | ||||
|         external_documents_ids.clear(self.wtxn)?; | ||||
|         word_docids.clear(self.wtxn)?; | ||||
|         exact_word_docids.clear(self.wtxn)?; | ||||
|         word_prefix_docids.clear(self.wtxn)?; | ||||
|         exact_word_prefix_docids.clear(self.wtxn)?; | ||||
|         word_pair_proximity_docids.clear(self.wtxn)?; | ||||
|         word_prefix_pair_proximity_docids.clear(self.wtxn)?; | ||||
|         prefix_word_pair_proximity_docids.clear(self.wtxn)?; | ||||
|         word_position_docids.clear(self.wtxn)?; | ||||
|         word_fid_docids.clear(self.wtxn)?; | ||||
|         field_id_word_count_docids.clear(self.wtxn)?; | ||||
| @@ -140,7 +119,7 @@ mod tests { | ||||
|  | ||||
|         assert!(index.words_fst(&rtxn).unwrap().is_empty()); | ||||
|         assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); | ||||
|         assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); | ||||
|         assert!(index.external_documents_ids().is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.documents_ids(&rtxn).unwrap().is_empty()); | ||||
|         assert!(index.field_distribution(&rtxn).unwrap().is_empty()); | ||||
|         assert!(index.geo_rtree(&rtxn).unwrap().is_none()); | ||||
| @@ -150,7 +129,6 @@ mod tests { | ||||
|         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); | ||||
|   | ||||
							
								
								
									
										125
									
								
								milli/src/update/del_add.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										125
									
								
								milli/src/update/del_add.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,125 @@ | ||||
| use obkv::Key; | ||||
|  | ||||
| pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>; | ||||
| pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>; | ||||
|  | ||||
| /// DelAdd tags a value as the old value to delete from the database or the new value to add to it. | ||||
| /// | ||||
| /// It is used as the key of an OBKV that is serialized in grenad files. | ||||
| #[repr(u8)] | ||||
| #[derive(Clone, Copy, PartialOrd, PartialEq, Debug)] | ||||
| pub enum DelAdd { | ||||
|     Deletion = 0, | ||||
|     Addition = 1, | ||||
| } | ||||
|  | ||||
| impl Key for DelAdd { | ||||
|     const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>(); | ||||
|     type BYTES = [u8; Self::BYTES_SIZE]; | ||||
|  | ||||
|     /// Encodes the variant as the bytes of its `u8` discriminant. | ||||
|     fn to_be_bytes(&self) -> Self::BYTES { | ||||
|         let discriminant = *self as u8; | ||||
|         discriminant.to_be_bytes() | ||||
|     } | ||||
|  | ||||
|     /// Decodes a variant from the bytes of its `u8` discriminant. | ||||
|     /// | ||||
|     /// Any discriminant other than the two declared ones is treated as unreachable. | ||||
|     fn from_be_bytes(array: Self::BYTES) -> Self { | ||||
|         let discriminant = u8::from_be_bytes(array); | ||||
|         if discriminant == Self::Deletion as u8 { | ||||
|             Self::Deletion | ||||
|         } else if discriminant == Self::Addition as u8 { | ||||
|             Self::Addition | ||||
|         } else { | ||||
|             unreachable!("DelAdd has only 2 variants, unknown variant: {}", discriminant) | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value> | ||||
| /// | ||||
| /// Each value of `reader` is re-encoded as a DelAdd obkv according to `operation`: | ||||
| /// - Deletion: the value is put under DelAdd::Deletion, | ||||
| /// - Addition: the value is put under DelAdd::Addition, | ||||
| /// - DeletionAndAddition: the value is put under both DelAdd::Deletion and DelAdd::Addition. | ||||
| /// | ||||
| /// The resulting obkv is written into `buffer`. | ||||
| pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>( | ||||
|     reader: obkv::KvReader<K>, | ||||
|     operation: DelAddOperation, | ||||
|     buffer: &mut Vec<u8>, | ||||
| ) -> Result<(), std::io::Error> { | ||||
|     // The operation is the same for every entry, so decide once which sides get written. | ||||
|     let (write_deletion, write_addition) = match operation { | ||||
|         DelAddOperation::Deletion => (true, false), | ||||
|         DelAddOperation::Addition => (false, true), | ||||
|         DelAddOperation::DeletionAndAddition => (true, true), | ||||
|     }; | ||||
|  | ||||
|     let mut outer = obkv::KvWriter::new(buffer); | ||||
|     let mut inner_buffer = Vec::new(); | ||||
|     for (key, value) in reader.iter() { | ||||
|         // The inner buffer is reused for every entry. | ||||
|         inner_buffer.clear(); | ||||
|         let mut inner = KvWriterDelAdd::new(&mut inner_buffer); | ||||
|         if write_deletion { | ||||
|             inner.insert(DelAdd::Deletion, value)?; | ||||
|         } | ||||
|         if write_addition { | ||||
|             inner.insert(DelAdd::Addition, value)?; | ||||
|         } | ||||
|         inner.finish()?; | ||||
|         outer.insert(key, &inner_buffer)?; | ||||
|     } | ||||
|  | ||||
|     outer.finish() | ||||
| } | ||||
|  | ||||
| /// Selects the side(s) of the DelAdd obkv, deletion, addition or both, that receive the provided value. | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub enum DelAddOperation { | ||||
|     Deletion, | ||||
|     Addition, | ||||
|     DeletionAndAddition, | ||||
| } | ||||
|  | ||||
| /// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value> | ||||
| /// | ||||
| /// Every key of the deletion obkv has its value put under DelAdd::Deletion | ||||
| /// and every key of the addition obkv has its value put under DelAdd::Addition; | ||||
| /// a key present in both obkvs gets both sides. | ||||
| /// | ||||
| /// # Errors | ||||
| /// | ||||
| /// Returns the `std::io::Error` reported by the obkv writers instead of panicking. | ||||
| pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>( | ||||
|     deletion: obkv::KvReader<K>, | ||||
|     addition: obkv::KvReader<K>, | ||||
|     buffer: &mut Vec<u8>, | ||||
| ) -> Result<(), std::io::Error> { | ||||
|     use itertools::merge_join_by; | ||||
|     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||
|  | ||||
|     let mut writer = obkv::KvWriter::new(buffer); | ||||
|     let mut value_buffer = Vec::new(); | ||||
|  | ||||
|     // Entries of both readers are joined on equal keys; keys found on one side only | ||||
|     // produce a DelAdd obkv with that single side. | ||||
|     for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) { | ||||
|         value_buffer.clear(); | ||||
|         let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||
|         let key = match eob { | ||||
|             Left((k, v)) => { | ||||
|                 value_writer.insert(DelAdd::Deletion, v)?; | ||||
|                 k | ||||
|             } | ||||
|             Right((k, v)) => { | ||||
|                 value_writer.insert(DelAdd::Addition, v)?; | ||||
|                 k | ||||
|             } | ||||
|             Both((k, deletion), (_, addition)) => { | ||||
|                 value_writer.insert(DelAdd::Deletion, deletion)?; | ||||
|                 value_writer.insert(DelAdd::Addition, addition)?; | ||||
|                 k | ||||
|             } | ||||
|         }; | ||||
|         // Errors are propagated to the caller, as the signature promises. | ||||
|         writer.insert(key, value_writer.into_inner()?)?; | ||||
|     } | ||||
|  | ||||
|     writer.finish() | ||||
| } | ||||
|  | ||||
| /// Tells whether a DelAdd obkv is a no-op: its Deletion and Addition sides | ||||
| /// hold the same bytes, or are both absent. | ||||
| pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool { | ||||
|     let deletion = del_add.get(DelAdd::Deletion); | ||||
|     let addition = del_add.get(DelAdd::Addition); | ||||
|     deletion == addition | ||||
| } | ||||
|  | ||||
| /// Extracts and returns the Add side of a DelAdd obkv. | ||||
| /// | ||||
| /// This is useful when there is no previous value in the database, so no diff | ||||
| /// against existing content is needed. | ||||
| /// | ||||
| /// When the obkv has no Add side, an empty slice is returned, | ||||
| /// which is a valid CboRoaringBitmap. The `_buffer` argument is unused. | ||||
| #[allow(clippy::ptr_arg)] // required to avoid signature mismatch | ||||
| pub fn deladd_serialize_add_side<'a>( | ||||
|     obkv: &'a [u8], | ||||
|     _buffer: &mut Vec<u8>, | ||||
| ) -> crate::Result<&'a [u8]> { | ||||
|     let reader = KvReaderDelAdd::new(obkv); | ||||
|     let add_side = reader.get(DelAdd::Addition); | ||||
|     Ok(add_side.unwrap_or_default()) | ||||
| } | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,10 +1,9 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::fs::File; | ||||
| use std::io::BufReader; | ||||
|  | ||||
| use grenad::CompressionType; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::{BytesEncode, Error, RoTxn, RwTxn}; | ||||
| use heed::types::Bytes; | ||||
| use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn}; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; | ||||
| @@ -12,18 +11,16 @@ use crate::facet::FacetType; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, | ||||
| }; | ||||
| use crate::heed_codec::ByteSliceRefCodec; | ||||
| use crate::heed_codec::BytesRefCodec; | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd}; | ||||
| use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader}; | ||||
| use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; | ||||
| use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result}; | ||||
|  | ||||
| /// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases | ||||
| /// by rebuilding the database "from scratch". | ||||
| /// | ||||
| /// First, the new elements are inserted into the level 0 of the database. Then, the | ||||
| /// higher levels are cleared and recomputed from the content of level 0. | ||||
| /// | ||||
| /// Finally, the `faceted_documents_ids` value in the main database of `Index` | ||||
| /// is updated to contain the new set of faceted documents. | ||||
| pub struct FacetsUpdateBulk<'i> { | ||||
|     index: &'i Index, | ||||
|     group_size: u8, | ||||
| @@ -31,7 +28,7 @@ pub struct FacetsUpdateBulk<'i> { | ||||
|     facet_type: FacetType, | ||||
|     field_ids: Vec<FieldId>, | ||||
|     // None if level 0 does not need to be updated | ||||
|     new_data: Option<grenad::Reader<BufReader<File>>>, | ||||
|     delta_data: Option<grenad::Reader<BufReader<File>>>, | ||||
| } | ||||
|  | ||||
| impl<'i> FacetsUpdateBulk<'i> { | ||||
| @@ -39,7 +36,7 @@ impl<'i> FacetsUpdateBulk<'i> { | ||||
|         index: &'i Index, | ||||
|         field_ids: Vec<FieldId>, | ||||
|         facet_type: FacetType, | ||||
|         new_data: grenad::Reader<BufReader<File>>, | ||||
|         delta_data: grenad::Reader<BufReader<File>>, | ||||
|         group_size: u8, | ||||
|         min_level_size: u8, | ||||
|     ) -> FacetsUpdateBulk<'i> { | ||||
| @@ -49,7 +46,7 @@ impl<'i> FacetsUpdateBulk<'i> { | ||||
|             group_size, | ||||
|             min_level_size, | ||||
|             facet_type, | ||||
|             new_data: Some(new_data), | ||||
|             delta_data: Some(delta_data), | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -64,29 +61,26 @@ impl<'i> FacetsUpdateBulk<'i> { | ||||
|             group_size: FACET_GROUP_SIZE, | ||||
|             min_level_size: FACET_MIN_LEVEL_SIZE, | ||||
|             facet_type, | ||||
|             new_data: None, | ||||
|             delta_data: None, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("FacetsUpdateBulk::{}")] | ||||
|     pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { | ||||
|         let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; | ||||
|         let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self; | ||||
|  | ||||
|         let db = match facet_type { | ||||
|             FacetType::String => index | ||||
|                 .facet_id_string_docids | ||||
|                 .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), | ||||
|             FacetType::String => { | ||||
|                 index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>() | ||||
|             } | ||||
|             FacetType::Number => { | ||||
|                 index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>() | ||||
|                 index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>() | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; | ||||
|         let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size }; | ||||
|  | ||||
|         inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { | ||||
|             index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; | ||||
|             Ok(()) | ||||
|         })?; | ||||
|         inner.update(wtxn, &field_ids)?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| @@ -94,32 +88,25 @@ impl<'i> FacetsUpdateBulk<'i> { | ||||
|  | ||||
| /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type | ||||
| pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> { | ||||
|     pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     pub new_data: Option<grenad::Reader<R>>, | ||||
|     pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     pub delta_data: Option<grenad::Reader<R>>, | ||||
|     pub group_size: u8, | ||||
|     pub min_level_size: u8, | ||||
| } | ||||
| impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | ||||
|     pub fn update( | ||||
|         mut self, | ||||
|         wtxn: &mut RwTxn, | ||||
|         field_ids: &[u16], | ||||
|         mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>, | ||||
|     ) -> Result<()> { | ||||
|     pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> { | ||||
|         self.update_level0(wtxn)?; | ||||
|         for &field_id in field_ids.iter() { | ||||
|             self.clear_levels(wtxn, field_id)?; | ||||
|         } | ||||
|  | ||||
|         for &field_id in field_ids.iter() { | ||||
|             let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?; | ||||
|  | ||||
|             handle_all_docids(wtxn, field_id, all_docids)?; | ||||
|             let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?; | ||||
|  | ||||
|             for level_reader in level_readers { | ||||
|                 let mut cursor = level_reader.into_cursor()?; | ||||
|                 while let Some((k, v)) = cursor.move_on_next()? { | ||||
|                     self.db.remap_types::<ByteSlice, ByteSlice>().put(wtxn, k, v)?; | ||||
|                     self.db.remap_types::<Bytes, Bytes>().put(wtxn, k, v)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
| @@ -133,35 +120,48 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | ||||
|         self.db.delete_range(wtxn, &range).map(drop)?; | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { | ||||
|         let new_data = match self.new_data.take() { | ||||
|         let delta_data = match self.delta_data.take() { | ||||
|             Some(x) => x, | ||||
|             None => return Ok(()), | ||||
|         }; | ||||
|         if self.db.is_empty(wtxn)? { | ||||
|             let mut buffer = Vec::new(); | ||||
|             let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>(); | ||||
|             let mut cursor = new_data.into_cursor()?; | ||||
|             let mut database = self.db.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>(); | ||||
|             let mut cursor = delta_data.into_cursor()?; | ||||
|             while let Some((key, value)) = cursor.move_on_next()? { | ||||
|                 if !valid_lmdb_key(key) { | ||||
|                     continue; | ||||
|                 } | ||||
|                 let value = KvReaderDelAdd::new(value); | ||||
|  | ||||
|                 // DB is empty, it is safe to ignore Del operations | ||||
|                 let Some(value) = value.get(DelAdd::Addition) else { | ||||
|                     continue; | ||||
|                 }; | ||||
|  | ||||
|                 buffer.clear(); | ||||
|                 // the group size for level 0 | ||||
|                 buffer.push(1); | ||||
|                 // then we extend the buffer with the docids bitmap | ||||
|                 buffer.extend_from_slice(value); | ||||
|                 unsafe { database.append(key, &buffer)? }; | ||||
|                 unsafe { | ||||
|                     database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, &buffer)? | ||||
|                 }; | ||||
|             } | ||||
|         } else { | ||||
|             let mut buffer = Vec::new(); | ||||
|             let database = self.db.remap_types::<ByteSlice, ByteSlice>(); | ||||
|             let database = self.db.remap_types::<Bytes, Bytes>(); | ||||
|  | ||||
|             let mut cursor = new_data.into_cursor()?; | ||||
|             let mut cursor = delta_data.into_cursor()?; | ||||
|             while let Some((key, value)) = cursor.move_on_next()? { | ||||
|                 if !valid_lmdb_key(key) { | ||||
|                     continue; | ||||
|                 } | ||||
|  | ||||
|                 let value = KvReaderDelAdd::new(value); | ||||
|  | ||||
|                 // the value is a CboRoaringBitmap, but I still need to prepend the | ||||
|                 // group size for level 0 (= 1) to it | ||||
|                 buffer.clear(); | ||||
| @@ -169,17 +169,27 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | ||||
|                 // then we extend the buffer with the docids bitmap | ||||
|                 match database.get(wtxn, key)? { | ||||
|                     Some(prev_value) => { | ||||
|                         // prev_value is the group size for level 0, followed by the previous bitmap. | ||||
|                         let old_bitmap = &prev_value[1..]; | ||||
|                         CboRoaringBitmapCodec::merge_into( | ||||
|                             &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], | ||||
|                             &mut buffer, | ||||
|                         )?; | ||||
|                         CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?; | ||||
|                     } | ||||
|                     None => { | ||||
|                         // it is safe to ignore the del in that case. | ||||
|                         let Some(value) = value.get(DelAdd::Addition) else { | ||||
|                             // won't put the key in DB as the value would be empty | ||||
|                             continue; | ||||
|                         }; | ||||
|  | ||||
|                         buffer.extend_from_slice(value); | ||||
|                     } | ||||
|                 }; | ||||
|                 database.put(wtxn, key, &buffer)?; | ||||
|                 let new_bitmap = &buffer[1..]; | ||||
|                 // if the new bitmap is empty, let's remove it | ||||
|                 if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 { | ||||
|                     database.delete(wtxn, key)?; | ||||
|                 } else { | ||||
|                     database.put(wtxn, key, &buffer)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|         Ok(()) | ||||
| @@ -188,16 +198,10 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | ||||
|         &self, | ||||
|         field_id: FieldId, | ||||
|         txn: &RoTxn, | ||||
|     ) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> { | ||||
|         let mut all_docids = RoaringBitmap::new(); | ||||
|         let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { | ||||
|             for bitmap in bitmaps { | ||||
|                 all_docids |= bitmap; | ||||
|             } | ||||
|             Ok(()) | ||||
|         })?; | ||||
|     ) -> Result<Vec<grenad::Reader<BufReader<File>>>> { | ||||
|         let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?; | ||||
|  | ||||
|         Ok((subwriters, all_docids)) | ||||
|         Ok(subwriters) | ||||
|     } | ||||
|     #[allow(clippy::type_complexity)] | ||||
|     fn read_level_0<'t>( | ||||
| @@ -217,9 +221,9 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | ||||
|  | ||||
|         let level_0_iter = self | ||||
|             .db | ||||
|             .as_polymorph() | ||||
|             .prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, level_0_prefix.as_slice())? | ||||
|             .remap_types::<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>(); | ||||
|             .remap_types::<Bytes, Bytes>() | ||||
|             .prefix_iter(rtxn, level_0_prefix.as_slice())? | ||||
|             .remap_types::<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>(); | ||||
|  | ||||
|         let mut left_bound: &[u8] = &[]; | ||||
|         let mut first_iteration_for_new_group = true; | ||||
| @@ -305,11 +309,11 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | ||||
|                     bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) | ||||
|                 { | ||||
|                     let key = FacetGroupKey { field_id, level, left_bound }; | ||||
|                     let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key) | ||||
|                         .ok_or(Error::Encoding)?; | ||||
|                     let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key) | ||||
|                         .map_err(Error::Encoding)?; | ||||
|                     let value = FacetGroupValue { size: group_size, bitmap }; | ||||
|                     let value = | ||||
|                         FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; | ||||
|                         FacetGroupValueCodec::bytes_encode(&value).map_err(Error::Encoding)?; | ||||
|                     cur_writer.insert(key, value)?; | ||||
|                     cur_writer_len += 1; | ||||
|                 } | ||||
| @@ -334,10 +338,10 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | ||||
|                 bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) | ||||
|             { | ||||
|                 let key = FacetGroupKey { field_id, level, left_bound }; | ||||
|                 let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key) | ||||
|                     .ok_or(Error::Encoding)?; | ||||
|                 let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key) | ||||
|                     .map_err(Error::Encoding)?; | ||||
|                 let value = FacetGroupValue { size: group_size, bitmap }; | ||||
|                 let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; | ||||
|                 let value = FacetGroupValueCodec::bytes_encode(&value).map_err(Error::Encoding)?; | ||||
|                 cur_writer.insert(key, value)?; | ||||
|                 cur_writer_len += 1; | ||||
|             } | ||||
| @@ -491,7 +495,6 @@ mod tests { | ||||
|         index.add_documents(documents).unwrap(); | ||||
|  | ||||
|         db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); | ||||
|         db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|   | ||||
| @@ -1,360 +0,0 @@ | ||||
| use std::collections::{HashMap, HashSet}; | ||||
|  | ||||
| use heed::RwTxn; | ||||
| use log::debug; | ||||
| use roaring::RoaringBitmap; | ||||
| use time::OffsetDateTime; | ||||
|  | ||||
| use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; | ||||
| use crate::facet::FacetType; | ||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; | ||||
| use crate::heed_codec::ByteSliceRefCodec; | ||||
| use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}; | ||||
| use crate::{FieldId, Index, Result}; | ||||
|  | ||||
| /// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases. | ||||
| /// | ||||
| /// Depending on the number of removed elements and the existing size of the database, we use either | ||||
| /// a bulk delete method or an incremental delete method. | ||||
| pub struct FacetsDelete<'i, 'b> { | ||||
|     /// Index owning the facet databases; also used to refresh its `updated_at` date. | ||||
|     index: &'i Index, | ||||
|     /// The string or number facet database (chosen from `facet_type`), with its key | ||||
|     /// type remapped to `FacetGroupKeyCodec<ByteSliceRefCodec>`. | ||||
|     database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     /// Which kind of facet (string or number) is being updated. | ||||
|     facet_type: FacetType, | ||||
|     /// For each field id, the facet values (used as level-0 left bounds) to update. | ||||
|     affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>, | ||||
|     /// Document ids to subtract from the bitmaps of the affected facet values. | ||||
|     docids_to_delete: &'b RoaringBitmap, | ||||
|     /// Initialised to `FACET_GROUP_SIZE`; forwarded to the incremental indexer. | ||||
|     group_size: u8, | ||||
|     /// Initialised to `FACET_MAX_GROUP_SIZE`; forwarded to the incremental indexer. | ||||
|     max_group_size: u8, | ||||
|     /// Initialised to `FACET_MIN_LEVEL_SIZE`; forwarded to the incremental indexer. | ||||
|     min_level_size: u8, | ||||
| } | ||||
| impl<'i, 'b> FacetsDelete<'i, 'b> { | ||||
|     /// Prepares the removal of `docids_to_delete` from the facet database of `facet_type`. | ||||
|     /// | ||||
|     /// `affected_facet_values` lists, for each field id, the facet values to update. | ||||
|     /// The group and level sizes are taken from the `FACET_*` constants. | ||||
|     pub fn new( | ||||
|         index: &'i Index, | ||||
|         facet_type: FacetType, | ||||
|         affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>, | ||||
|         docids_to_delete: &'b RoaringBitmap, | ||||
|     ) -> Self { | ||||
|         // Both databases are accessed through the same byte-slice key codec. | ||||
|         let db = match facet_type { | ||||
|             FacetType::Number => { | ||||
|                 index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>() | ||||
|             } | ||||
|             FacetType::String => index | ||||
|                 .facet_id_string_docids | ||||
|                 .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), | ||||
|         }; | ||||
|         Self { | ||||
|             index, | ||||
|             database: db, | ||||
|             facet_type, | ||||
|             affected_facet_values, | ||||
|             docids_to_delete, | ||||
|             group_size: FACET_GROUP_SIZE, | ||||
|             max_group_size: FACET_MAX_GROUP_SIZE, | ||||
|             min_level_size: FACET_MIN_LEVEL_SIZE, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Removes the document ids from every affected facet value, field by field. | ||||
|     /// | ||||
|     /// The index's `updated_at` date is refreshed first. Each field is then handled either | ||||
|     /// in bulk (level 0 is edited directly, then `FacetsUpdateBulk` is run for that field | ||||
|     /// if anything changed) or incrementally (one `delete` call per facet value). | ||||
|     /// | ||||
|     /// # Panics | ||||
|     /// | ||||
|     /// In the bulk path, panics if an affected facet value has no level-0 entry. | ||||
|     pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { | ||||
|         let Self { | ||||
|             index, | ||||
|             database, | ||||
|             facet_type, | ||||
|             affected_facet_values, | ||||
|             docids_to_delete, | ||||
|             group_size, | ||||
|             max_group_size, | ||||
|             min_level_size, | ||||
|         } = self; | ||||
|  | ||||
|         debug!("Computing and writing the facet values levels docids into LMDB on disk..."); | ||||
|         index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; | ||||
|  | ||||
|         for (field_id, facet_values) in affected_facet_values { | ||||
|             // This condition is not exact: it treats the length of the whole database as the | ||||
|             // number of facet values of this field id, so the incremental indexer may sometimes | ||||
|             // be chosen where the bulk one would be better. That only hurts when a large ratio | ||||
|             // of all facet values of each field id is fully deleted, which almost never happens. | ||||
|             // To stay cautious, the incremental indexer gets a 2x penalty: a 150x worst-case | ||||
|             // slowdown is assumed instead of 70x. | ||||
|             if facet_values.len() < (database.len(wtxn)? / 150) { | ||||
|                 // Incremental | ||||
|                 let incremental = FacetsUpdateIncrementalInner { | ||||
|                     db: database, | ||||
|                     group_size, | ||||
|                     min_level_size, | ||||
|                     max_group_size, | ||||
|                 }; | ||||
|                 for facet_value in facet_values { | ||||
|                     incremental.delete( | ||||
|                         wtxn, | ||||
|                         field_id, | ||||
|                         facet_value.as_slice(), | ||||
|                         docids_to_delete, | ||||
|                     )?; | ||||
|                 } | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             // Bulk delete | ||||
|             let mut level_0_changed = false; | ||||
|             for facet_value in facet_values { | ||||
|                 let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() }; | ||||
|                 let mut group = database.get(wtxn, &key)?.unwrap(); | ||||
|                 let len_before = group.bitmap.len(); | ||||
|                 group.bitmap -= docids_to_delete; | ||||
|                 if group.bitmap.is_empty() { | ||||
|                     // Nothing left for this facet value: drop the entry. | ||||
|                     level_0_changed = true; | ||||
|                     database.delete(wtxn, &key)?; | ||||
|                 } else if group.bitmap.len() != len_before { | ||||
|                     // Some documents were removed: store the shrunk bitmap. | ||||
|                     level_0_changed = true; | ||||
|                     database.put(wtxn, &key, &group)?; | ||||
|                 } | ||||
|             } | ||||
|             if level_0_changed { | ||||
|                 FacetsUpdateBulk::new_not_updating_level_0(index, vec![field_id], facet_type) | ||||
|                     .execute(wtxn)?; | ||||
|             } | ||||
|         } | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use std::iter::FromIterator; | ||||
|  | ||||
|     use big_s::S; | ||||
|     use maplit::hashset; | ||||
|     use rand::seq::SliceRandom; | ||||
|     use rand::SeedableRng; | ||||
|     use roaring::RoaringBitmap; | ||||
|  | ||||
|     use crate::db_snap; | ||||
|     use crate::documents::documents_batch_reader_from_objects; | ||||
|     use crate::index::tests::TempIndex; | ||||
|     use crate::update::facet::test_helpers::ordered_string; | ||||
|     use crate::update::{DeleteDocuments, DeletionStrategy}; | ||||
|  | ||||
|     /// Indexes 1000 documents with numeric facets, hard-deletes documents 0..100 in a | ||||
|     /// single transaction, and snapshots the number facet database before and after. | ||||
|     #[test] | ||||
|     fn delete_mixed_incremental_and_bulk() { | ||||
|         // The point of this test is to create an index populated with documents | ||||
|         // containing different filterable attributes. Then, we delete a bunch of documents | ||||
|         // such that a mix of the incremental and bulk indexer is used (depending on the field id) | ||||
|         let index = TempIndex::new_with_map_size(4096 * 1000 * 100); | ||||
|  | ||||
|         index | ||||
|             .update_settings(|settings| { | ||||
|                 settings.set_filterable_fields( | ||||
|                     hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, | ||||
|                 ); | ||||
|             }) | ||||
|             .unwrap(); | ||||
|  | ||||
|         let mut documents = vec![]; | ||||
|         for i in 0..1000 { | ||||
|             documents.push( | ||||
|                 serde_json::json! { | ||||
|                     { | ||||
|                         "id": i, | ||||
|                         "label": i / 10, | ||||
|                         "colour": i / 100, | ||||
|                         "timestamp": i / 2, | ||||
|                     } | ||||
|                 } | ||||
|                 .as_object() | ||||
|                 .unwrap() | ||||
|                 .clone(), | ||||
|             ); | ||||
|         } | ||||
|  | ||||
|         let documents = documents_batch_reader_from_objects(documents); | ||||
|         index.add_documents(documents).unwrap(); | ||||
|  | ||||
|         db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576"); | ||||
|         db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf"); | ||||
|  | ||||
|         let mut wtxn = index.env.write_txn().unwrap(); | ||||
|  | ||||
|         let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); | ||||
|         builder.strategy(DeletionStrategy::AlwaysHard); | ||||
|         builder.delete_documents(&RoaringBitmap::from_iter(0..100)); | ||||
|         // by deleting the first 100 documents, we expect that: | ||||
|         // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) | ||||
|         // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 | ||||
|         // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 | ||||
|         // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 | ||||
|         // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         db_snap!(index, soft_deleted_documents_ids, @"[]"); | ||||
|         db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6"); | ||||
|         db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56"); | ||||
|     } | ||||
|  | ||||
|     // Same test as above but working with string values for the facets | ||||
|     #[test] | ||||
|     fn delete_mixed_incremental_and_bulk_string() { | ||||
|         // The point of this test is to create an index populated with documents | ||||
|         // containing different filterable attributes. Then, we delete a bunch of documents | ||||
|         // such that a mix of the incremental and bulk indexer is used (depending on the field id) | ||||
|         let index = TempIndex::new_with_map_size(4096 * 1000 * 100); | ||||
|  | ||||
|         index | ||||
|             .update_settings(|settings| { | ||||
|                 settings.set_filterable_fields( | ||||
|                     hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, | ||||
|                 ); | ||||
|             }) | ||||
|             .unwrap(); | ||||
|  | ||||
|         let mut documents = vec![]; | ||||
|         for i in 0..1000 { | ||||
|             documents.push( | ||||
|                 serde_json::json! { | ||||
|                     { | ||||
|                         "id": i, | ||||
|                         "label": ordered_string(i / 10), | ||||
|                         "colour": ordered_string(i / 100), | ||||
|                         "timestamp": ordered_string(i / 2), | ||||
|                     } | ||||
|                 } | ||||
|                 .as_object() | ||||
|                 .unwrap() | ||||
|                 .clone(), | ||||
|             ); | ||||
|         } | ||||
|  | ||||
|         let documents = documents_batch_reader_from_objects(documents); | ||||
|         index.add_documents(documents).unwrap(); | ||||
|  | ||||
|         // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) | ||||
|         db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); | ||||
|         db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); | ||||
|  | ||||
|         let mut wtxn = index.env.write_txn().unwrap(); | ||||
|  | ||||
|         let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); | ||||
|         builder.strategy(DeletionStrategy::AlwaysHard); | ||||
|         builder.delete_documents(&RoaringBitmap::from_iter(0..100)); | ||||
|         // by deleting the first 100 documents, we expect that: | ||||
|         // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) | ||||
|         // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 | ||||
|         // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 | ||||
|         // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 | ||||
|         // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test | ||||
|         builder.execute().unwrap(); | ||||
|         wtxn.commit().unwrap(); | ||||
|  | ||||
|         db_snap!(index, soft_deleted_documents_ids, @"[]"); | ||||
|         db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc"); | ||||
|         db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f"); | ||||
|     } | ||||
|  | ||||
|     /// Indexes 1000 documents with string facets, then hard-deletes 990 of them one by one | ||||
|     /// (shuffled with a fixed seed, one write transaction per document) and snapshots the result. | ||||
|     // NOTE(review): presumably every single-document deletion takes the incremental path, | ||||
|     // as the test name suggests; this is not asserted by the test itself. | ||||
|     #[test] | ||||
|     fn delete_almost_all_incrementally_string() { | ||||
|         let index = TempIndex::new_with_map_size(4096 * 1000 * 100); | ||||
|  | ||||
|         index | ||||
|             .update_settings(|settings| { | ||||
|                 settings.set_filterable_fields( | ||||
|                     hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, | ||||
|                 ); | ||||
|             }) | ||||
|             .unwrap(); | ||||
|  | ||||
|         let mut documents = vec![]; | ||||
|         for i in 0..1000 { | ||||
|             documents.push( | ||||
|                 serde_json::json! { | ||||
|                     { | ||||
|                         "id": i, | ||||
|                         "label": ordered_string(i / 10), | ||||
|                         "colour": ordered_string(i / 100), | ||||
|                         "timestamp": ordered_string(i / 2), | ||||
|                     } | ||||
|                 } | ||||
|                 .as_object() | ||||
|                 .unwrap() | ||||
|                 .clone(), | ||||
|             ); | ||||
|         } | ||||
|  | ||||
|         let documents = documents_batch_reader_from_objects(documents); | ||||
|         index.add_documents(documents).unwrap(); | ||||
|  | ||||
|         // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022) | ||||
|         db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503"); | ||||
|         db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5"); | ||||
|  | ||||
|         let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); | ||||
|  | ||||
|         let mut docids_to_delete = (0..1000).collect::<Vec<_>>(); | ||||
|         docids_to_delete.shuffle(&mut rng); | ||||
|         for docid in docids_to_delete.into_iter().take(990) { | ||||
|             let mut wtxn = index.env.write_txn().unwrap(); | ||||
|             let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); | ||||
|             builder.strategy(DeletionStrategy::AlwaysHard); | ||||
|             builder.delete_documents(&RoaringBitmap::from_iter([docid])); | ||||
|             builder.execute().unwrap(); | ||||
|             wtxn.commit().unwrap(); | ||||
|         } | ||||
|  | ||||
|         db_snap!(index, soft_deleted_documents_ids, @"[]"); | ||||
|         db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d"); | ||||
|         db_snap!(index, string_faceted_documents_ids, 2, @r###" | ||||
|         0   [] | ||||
|         1   [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] | ||||
|         2   [292, 324, 358, 381, 493, 839, 852, ] | ||||
|         3   [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ] | ||||
|         "###); | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[allow(unused)] | ||||
| #[cfg(test)] | ||||
| mod comparison_bench { | ||||
|     use std::iter::once; | ||||
|  | ||||
|     use rand::Rng; | ||||
|     use roaring::RoaringBitmap; | ||||
|  | ||||
|     use crate::heed_codec::facet::OrderedF64Codec; | ||||
|     use crate::update::facet::test_helpers::FacetIndex; | ||||
|  | ||||
|     // This is a simple test to get an intuition on the relative speed | ||||
|     // of the incremental vs. bulk indexer. | ||||
|     // | ||||
|     // The benchmark shows the worst-case scenario for the incremental indexer, since | ||||
|     // each facet value contains only one document ID. | ||||
|     // | ||||
|     // In that scenario, it appears that the incremental indexer is about 70 times slower than the | ||||
|     // bulk indexer. | ||||
|     // #[test] | ||||
|     fn benchmark_facet_indexing_delete() { | ||||
|         let mut r = rand::thread_rng(); | ||||
|  | ||||
|         for i in 1..=20 { | ||||
|             let size = 50_000 * i; | ||||
|             let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5); | ||||
|  | ||||
|             let mut txn = index.env.write_txn().unwrap(); | ||||
|             let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); | ||||
|             for i in 0..size { | ||||
|                 // field id = 0, left_bound = i, docids = [i] | ||||
|                 elements.push(((0, i as f64), once(i).collect())); | ||||
|             } | ||||
|             let timer = std::time::Instant::now(); | ||||
|             index.bulk_insert(&mut txn, &[0], elements.iter()); | ||||
|             let time_spent = timer.elapsed().as_millis(); | ||||
|             println!("bulk {size} : {time_spent}ms"); | ||||
|  | ||||
|             txn.commit().unwrap(); | ||||
|  | ||||
|             for nbr_doc in [1, 100, 1000, 10_000] { | ||||
|                 let mut txn = index.env.write_txn().unwrap(); | ||||
|                 let timer = std::time::Instant::now(); | ||||
|                 // | ||||
|                 // delete `nbr_doc` randomly chosen documents (the same docid may be drawn more than once) | ||||
|                 // | ||||
|                 for _ in 0..nbr_doc { | ||||
|                     let deleted_u32 = r.gen::<u32>() % size; | ||||
|                     let deleted_f64 = deleted_u32 as f64; | ||||
|                     index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32) | ||||
|                 } | ||||
|                 let time_spent = timer.elapsed().as_millis(); | ||||
|                 println!("    delete {nbr_doc} : {time_spent}ms"); | ||||
|                 txn.abort().unwrap(); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
| @@ -1,19 +1,20 @@ | ||||
| use std::collections::HashMap; | ||||
| use std::fs::File; | ||||
| use std::io::BufReader; | ||||
|  | ||||
| use heed::types::{ByteSlice, DecodeIgnore}; | ||||
| use heed::types::{Bytes, DecodeIgnore}; | ||||
| use heed::{BytesDecode, Error, RoTxn, RwTxn}; | ||||
| use obkv::KvReader; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::facet::FacetType; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, | ||||
| }; | ||||
| use crate::heed_codec::ByteSliceRefCodec; | ||||
| use crate::heed_codec::BytesRefCodec; | ||||
| use crate::search::facet::get_highest_level; | ||||
| use crate::update::del_add::DelAdd; | ||||
| use crate::update::index_documents::valid_lmdb_key; | ||||
| use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; | ||||
| use crate::{CboRoaringBitmapCodec, Index, Result}; | ||||
|  | ||||
| enum InsertionResult { | ||||
|     InPlace, | ||||
| @@ -28,72 +29,76 @@ enum DeletionResult { | ||||
|  | ||||
| /// Algorithm to incrementally insert and delete elements into the | ||||
| /// `facet_id_(string/f64)_docids` databases. | ||||
| /// | ||||
| /// The `faceted_documents_ids` value in the main database of `Index` | ||||
| /// is also updated to contain the new set of faceted documents. | ||||
| pub struct FacetsUpdateIncremental<'i> { | ||||
|     index: &'i Index, | ||||
| pub struct FacetsUpdateIncremental { | ||||
|     inner: FacetsUpdateIncrementalInner, | ||||
|     facet_type: FacetType, | ||||
|     new_data: grenad::Reader<BufReader<File>>, | ||||
|     delta_data: grenad::Reader<BufReader<File>>, | ||||
| } | ||||
|  | ||||
| impl<'i> FacetsUpdateIncremental<'i> { | ||||
| impl FacetsUpdateIncremental { | ||||
|     pub fn new( | ||||
|         index: &'i Index, | ||||
|         index: &Index, | ||||
|         facet_type: FacetType, | ||||
|         new_data: grenad::Reader<BufReader<File>>, | ||||
|         delta_data: grenad::Reader<BufReader<File>>, | ||||
|         group_size: u8, | ||||
|         min_level_size: u8, | ||||
|         max_group_size: u8, | ||||
|     ) -> Self { | ||||
|         FacetsUpdateIncremental { | ||||
|             index, | ||||
|             inner: FacetsUpdateIncrementalInner { | ||||
|                 db: match facet_type { | ||||
|                     FacetType::String => index | ||||
|                         .facet_id_string_docids | ||||
|                         .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), | ||||
|                         .remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(), | ||||
|                     FacetType::Number => index | ||||
|                         .facet_id_f64_docids | ||||
|                         .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), | ||||
|                         .remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(), | ||||
|                 }, | ||||
|                 group_size, | ||||
|                 max_group_size, | ||||
|                 min_level_size, | ||||
|             }, | ||||
|             facet_type, | ||||
|             new_data, | ||||
|             delta_data, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { | ||||
|         let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default(); | ||||
|  | ||||
|         let mut cursor = self.new_data.into_cursor()?; | ||||
|     pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> { | ||||
|         let mut cursor = self.delta_data.into_cursor()?; | ||||
|         while let Some((key, value)) = cursor.move_on_next()? { | ||||
|             if !valid_lmdb_key(key) { | ||||
|                 continue; | ||||
|             } | ||||
|             let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key) | ||||
|                 .ok_or(heed::Error::Encoding)?; | ||||
|             let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; | ||||
|             self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; | ||||
|             *new_faceted_docids.entry(key.field_id).or_default() |= docids; | ||||
|             let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key) | ||||
|                 .map_err(heed::Error::Encoding)?; | ||||
|             let value = KvReader::new(value); | ||||
|  | ||||
|             let docids_to_delete = value | ||||
|                 .get(DelAdd::Deletion) | ||||
|                 .map(CboRoaringBitmapCodec::bytes_decode) | ||||
|                 .map(|o| o.map_err(heed::Error::Encoding)); | ||||
|  | ||||
|             let docids_to_add = value | ||||
|                 .get(DelAdd::Addition) | ||||
|                 .map(CboRoaringBitmapCodec::bytes_decode) | ||||
|                 .map(|o| o.map_err(heed::Error::Encoding)); | ||||
|  | ||||
|             if let Some(docids_to_delete) = docids_to_delete { | ||||
|                 let docids_to_delete = docids_to_delete?; | ||||
|                 self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?; | ||||
|             } | ||||
|  | ||||
|             if let Some(docids_to_add) = docids_to_add { | ||||
|                 let docids_to_add = docids_to_add?; | ||||
|                 self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         for (field_id, new_docids) in new_faceted_docids { | ||||
|             let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; | ||||
|             docids |= new_docids; | ||||
|             self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; | ||||
|         } | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type | ||||
| pub struct FacetsUpdateIncrementalInner { | ||||
|     pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     pub group_size: u8, | ||||
|     pub min_level_size: u8, | ||||
|     pub max_group_size: u8, | ||||
| @@ -129,15 +134,14 @@ impl FacetsUpdateIncrementalInner { | ||||
|                     prefix.extend_from_slice(&field_id.to_be_bytes()); | ||||
|                     prefix.push(level); | ||||
|  | ||||
|                     let mut iter = | ||||
|                         self.db.as_polymorph().prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( | ||||
|                             txn, | ||||
|                             prefix.as_slice(), | ||||
|                         )?; | ||||
|                     let mut iter = self | ||||
|                         .db | ||||
|                         .remap_types::<Bytes, FacetGroupValueCodec>() | ||||
|                         .prefix_iter(txn, prefix.as_slice())?; | ||||
|                     let (key_bytes, value) = iter.next().unwrap()?; | ||||
|                     Ok(( | ||||
|                         FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key_bytes) | ||||
|                             .ok_or(Error::Encoding)? | ||||
|                         FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes) | ||||
|                             .map_err(Error::Encoding)? | ||||
|                             .into_owned(), | ||||
|                         value, | ||||
|                     )) | ||||
| @@ -172,10 +176,8 @@ impl FacetsUpdateIncrementalInner { | ||||
|         level0_prefix.extend_from_slice(&field_id.to_be_bytes()); | ||||
|         level0_prefix.push(0); | ||||
|  | ||||
|         let mut iter = self | ||||
|             .db | ||||
|             .as_polymorph() | ||||
|             .prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, &level0_prefix)?; | ||||
|         let mut iter = | ||||
|             self.db.remap_types::<Bytes, DecodeIgnore>().prefix_iter(txn, &level0_prefix)?; | ||||
|  | ||||
|         if iter.next().is_none() { | ||||
|             drop(iter); | ||||
| @@ -377,11 +379,8 @@ impl FacetsUpdateIncrementalInner { | ||||
|         highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); | ||||
|         highest_level_prefix.push(highest_level); | ||||
|  | ||||
|         let size_highest_level = self | ||||
|             .db | ||||
|             .as_polymorph() | ||||
|             .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)? | ||||
|             .count(); | ||||
|         let size_highest_level = | ||||
|             self.db.remap_types::<Bytes, Bytes>().prefix_iter(txn, &highest_level_prefix)?.count(); | ||||
|  | ||||
|         if size_highest_level < self.group_size as usize * self.min_level_size as usize { | ||||
|             return Ok(()); | ||||
| @@ -389,8 +388,8 @@ impl FacetsUpdateIncrementalInner { | ||||
|  | ||||
|         let mut groups_iter = self | ||||
|             .db | ||||
|             .as_polymorph() | ||||
|             .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &highest_level_prefix)?; | ||||
|             .remap_types::<Bytes, FacetGroupValueCodec>() | ||||
|             .prefix_iter(txn, &highest_level_prefix)?; | ||||
|  | ||||
|         let nbr_new_groups = size_highest_level / self.group_size as usize; | ||||
|         let nbr_leftover_elements = size_highest_level % self.group_size as usize; | ||||
| @@ -401,8 +400,8 @@ impl FacetsUpdateIncrementalInner { | ||||
|             let mut values = RoaringBitmap::new(); | ||||
|             for _ in 0..group_size { | ||||
|                 let (key_bytes, value_i) = groups_iter.next().unwrap()?; | ||||
|                 let key_i = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key_bytes) | ||||
|                     .ok_or(Error::Encoding)?; | ||||
|                 let key_i = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes) | ||||
|                     .map_err(Error::Encoding)?; | ||||
|  | ||||
|                 if first_key.is_none() { | ||||
|                     first_key = Some(key_i); | ||||
| @@ -424,8 +423,8 @@ impl FacetsUpdateIncrementalInner { | ||||
|             let mut values = RoaringBitmap::new(); | ||||
|             for _ in 0..nbr_leftover_elements { | ||||
|                 let (key_bytes, value_i) = groups_iter.next().unwrap()?; | ||||
|                 let key_i = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key_bytes) | ||||
|                     .ok_or(Error::Encoding)?; | ||||
|                 let key_i = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes) | ||||
|                     .map_err(Error::Encoding)?; | ||||
|  | ||||
|                 if first_key.is_none() { | ||||
|                     first_key = Some(key_i); | ||||
| @@ -592,23 +591,21 @@ impl FacetsUpdateIncrementalInner { | ||||
|         if highest_level == 0 | ||||
|             || self | ||||
|                 .db | ||||
|                 .as_polymorph() | ||||
|                 .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)? | ||||
|                 .remap_types::<Bytes, Bytes>() | ||||
|                 .prefix_iter(txn, &highest_level_prefix)? | ||||
|                 .count() | ||||
|                 >= self.min_level_size as usize | ||||
|         { | ||||
|             return Ok(()); | ||||
|         } | ||||
|         let mut to_delete = vec![]; | ||||
|         let mut iter = self | ||||
|             .db | ||||
|             .as_polymorph() | ||||
|             .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)?; | ||||
|         let mut iter = | ||||
|             self.db.remap_types::<Bytes, Bytes>().prefix_iter(txn, &highest_level_prefix)?; | ||||
|         for el in iter.by_ref() { | ||||
|             let (k, _) = el?; | ||||
|             to_delete.push( | ||||
|                 FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(k) | ||||
|                     .ok_or(Error::Encoding)? | ||||
|                 FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(k) | ||||
|                     .map_err(Error::Encoding)? | ||||
|                     .into_owned(), | ||||
|             ); | ||||
|         } | ||||
| @@ -1116,7 +1113,7 @@ mod fuzz { | ||||
|  | ||||
|     #[no_coverage] | ||||
|     fn compare_with_trivial_database(tempdir: Rc<TempDir>, operations: &[Operation]) { | ||||
|         let index = FacetIndex::<ByteSliceRefCodec>::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten | ||||
|         let index = FacetIndex::<BytesRefCodec>::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten | ||||
|         let mut txn = index.env.write_txn().unwrap(); | ||||
|  | ||||
|         let mut trivial_db = TrivialDatabase::<Vec<u8>>::default(); | ||||
| @@ -1162,16 +1159,13 @@ mod fuzz { | ||||
|             let level0iter = index | ||||
|                 .content | ||||
|                 .as_polymorph() | ||||
|                 .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( | ||||
|                     &mut txn, | ||||
|                     &field_id.to_be_bytes(), | ||||
|                 ) | ||||
|                 .prefix_iter::<_, Bytes, FacetGroupValueCodec>(&mut txn, &field_id.to_be_bytes()) | ||||
|                 .unwrap(); | ||||
|  | ||||
|             for ((key, values), group) in values_field_id.iter().zip(level0iter) { | ||||
|                 let (group_key, group_values) = group.unwrap(); | ||||
|                 let group_key = | ||||
|                     FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(group_key).unwrap(); | ||||
|                     FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(group_key).unwrap(); | ||||
|                 assert_eq!(key, &group_key.left_bound); | ||||
|                 assert_eq!(values, &group_values.bitmap); | ||||
|             } | ||||
| @@ -1181,13 +1175,13 @@ mod fuzz { | ||||
|             let level0iter = index | ||||
|                 .content | ||||
|                 .as_polymorph() | ||||
|                 .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) | ||||
|                 .prefix_iter::<_, Bytes, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) | ||||
|                 .unwrap(); | ||||
|  | ||||
|             for ((key, values), group) in values_field_id.iter().zip(level0iter) { | ||||
|                 let (group_key, group_values) = group.unwrap(); | ||||
|                 let group_key = | ||||
|                     FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(group_key).unwrap(); | ||||
|                     FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(group_key).unwrap(); | ||||
|                 assert_eq!(key, &group_key.left_bound); | ||||
|                 assert_eq!(values, &group_values.bitmap); | ||||
|             } | ||||
|   | ||||
| @@ -14,7 +14,7 @@ The databases must be able to return results for queries such as: | ||||
| The algorithms that implement these queries are found in the `src/search/facet` folder. | ||||
|  | ||||
| To make these queries fast to compute, the database adopts a tree structure: | ||||
| ```ignore | ||||
| ```text | ||||
|             ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ | ||||
| ┌───────┐   │           "ab" (2)            │           "gaf" (2)           │   "woz" (1)   │ | ||||
| │Level 2│   │                               │                               │               │ | ||||
| @@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`. | ||||
| In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a | ||||
| [`FacetGroupValue`], which have the following format: | ||||
|  | ||||
| ```ignore | ||||
| ```text | ||||
| FacetGroupKey: | ||||
| - field id  : u16 | ||||
| - level     : u8 | ||||
| @@ -83,7 +83,7 @@ use std::iter::FromIterator; | ||||
|  | ||||
| use charabia::normalizer::{Normalize, NormalizerOption}; | ||||
| use grenad::{CompressionType, SortAlgorithm}; | ||||
| use heed::types::{ByteSlice, DecodeIgnore, SerdeJson}; | ||||
| use heed::types::{Bytes, DecodeIgnore, SerdeJson}; | ||||
| use heed::BytesEncode; | ||||
| use log::debug; | ||||
| use time::OffsetDateTime; | ||||
| @@ -92,13 +92,12 @@ use self::incremental::FacetsUpdateIncremental; | ||||
| use super::FacetsUpdateBulk; | ||||
| use crate::facet::FacetType; | ||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; | ||||
| use crate::heed_codec::ByteSliceRefCodec; | ||||
| use crate::heed_codec::BytesRefCodec; | ||||
| use crate::update::index_documents::create_sorter; | ||||
| use crate::update::merge_btreeset_string; | ||||
| use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH}; | ||||
| use crate::{BEU16StrCodec, Index, Result, MAX_FACET_VALUE_LENGTH}; | ||||
|  | ||||
| pub mod bulk; | ||||
| pub mod delete; | ||||
| pub mod incremental; | ||||
|  | ||||
| /// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. | ||||
| @@ -107,9 +106,9 @@ pub mod incremental; | ||||
| /// a bulk update method or an incremental update method. | ||||
| pub struct FacetsUpdate<'i> { | ||||
|     index: &'i Index, | ||||
|     database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|     database: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|     facet_type: FacetType, | ||||
|     new_data: grenad::Reader<BufReader<File>>, | ||||
|     delta_data: grenad::Reader<BufReader<File>>, | ||||
|     group_size: u8, | ||||
|     max_group_size: u8, | ||||
|     min_level_size: u8, | ||||
| @@ -118,14 +117,14 @@ impl<'i> FacetsUpdate<'i> { | ||||
|     pub fn new( | ||||
|         index: &'i Index, | ||||
|         facet_type: FacetType, | ||||
|         new_data: grenad::Reader<BufReader<File>>, | ||||
|         delta_data: grenad::Reader<BufReader<File>>, | ||||
|     ) -> Self { | ||||
|         let database = match facet_type { | ||||
|             FacetType::String => index | ||||
|                 .facet_id_string_docids | ||||
|                 .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(), | ||||
|             FacetType::String => { | ||||
|                 index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>() | ||||
|             } | ||||
|             FacetType::Number => { | ||||
|                 index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>() | ||||
|                 index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>() | ||||
|             } | ||||
|         }; | ||||
|         Self { | ||||
| @@ -135,26 +134,26 @@ impl<'i> FacetsUpdate<'i> { | ||||
|             max_group_size: FACET_MAX_GROUP_SIZE, | ||||
|             min_level_size: FACET_MIN_LEVEL_SIZE, | ||||
|             facet_type, | ||||
|             new_data, | ||||
|             delta_data, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { | ||||
|         if self.new_data.is_empty() { | ||||
|         if self.delta_data.is_empty() { | ||||
|             return Ok(()); | ||||
|         } | ||||
|         debug!("Computing and writing the facet values levels docids into LMDB on disk..."); | ||||
|         self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; | ||||
|  | ||||
|         // See self::comparison_bench::benchmark_facet_indexing | ||||
|         if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { | ||||
|         if self.delta_data.len() >= (self.database.len(wtxn)? / 50) { | ||||
|             let field_ids = | ||||
|                 self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>(); | ||||
|             let bulk_update = FacetsUpdateBulk::new( | ||||
|                 self.index, | ||||
|                 field_ids, | ||||
|                 self.facet_type, | ||||
|                 self.new_data, | ||||
|                 self.delta_data, | ||||
|                 self.group_size, | ||||
|                 self.min_level_size, | ||||
|             ); | ||||
| @@ -163,7 +162,7 @@ impl<'i> FacetsUpdate<'i> { | ||||
|             let incremental_update = FacetsUpdateIncremental::new( | ||||
|                 self.index, | ||||
|                 self.facet_type, | ||||
|                 self.new_data, | ||||
|                 self.delta_data, | ||||
|                 self.group_size, | ||||
|                 self.min_level_size, | ||||
|                 self.max_group_size, | ||||
| @@ -208,8 +207,8 @@ impl<'i> FacetsUpdate<'i> { | ||||
|                 } | ||||
|                 let set = BTreeSet::from_iter(std::iter::once(left_bound)); | ||||
|                 let key = (field_id, normalized_facet.as_ref()); | ||||
|                 let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?; | ||||
|                 let val = SerdeJson::bytes_encode(&set).ok_or(heed::Error::Encoding)?; | ||||
|                 let key = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; | ||||
|                 let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; | ||||
|                 sorter.insert(key, val)?; | ||||
|             } | ||||
|         } | ||||
| @@ -218,10 +217,11 @@ impl<'i> FacetsUpdate<'i> { | ||||
|         // as the grenad sorter already merged them for us. | ||||
|         let mut merger_iter = sorter.into_stream_merger_iter()?; | ||||
|         while let Some((key_bytes, btreeset_bytes)) = merger_iter.next()? { | ||||
|             self.index | ||||
|                 .facet_id_normalized_string_strings | ||||
|                 .remap_types::<ByteSlice, ByteSlice>() | ||||
|                 .put(wtxn, key_bytes, btreeset_bytes)?; | ||||
|             self.index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>().put( | ||||
|                 wtxn, | ||||
|                 key_bytes, | ||||
|                 btreeset_bytes, | ||||
|             )?; | ||||
|         } | ||||
|  | ||||
|         // We compute one FST by string facet | ||||
| @@ -253,7 +253,7 @@ impl<'i> FacetsUpdate<'i> { | ||||
|  | ||||
|         // We write those FSTs in LMDB now | ||||
|         for (field_id, fst) in text_fsts { | ||||
|             self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?; | ||||
|             self.index.facet_id_string_fst.put(wtxn, &field_id, &fst)?; | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
| @@ -268,7 +268,7 @@ pub(crate) mod test_helpers { | ||||
|     use std::marker::PhantomData; | ||||
|     use std::rc::Rc; | ||||
|  | ||||
|     use heed::types::ByteSlice; | ||||
|     use heed::types::Bytes; | ||||
|     use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; | ||||
|     use roaring::RoaringBitmap; | ||||
|  | ||||
| @@ -276,9 +276,10 @@ pub(crate) mod test_helpers { | ||||
|     use crate::heed_codec::facet::{ | ||||
|         FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, | ||||
|     }; | ||||
|     use crate::heed_codec::ByteSliceRefCodec; | ||||
|     use crate::heed_codec::BytesRefCodec; | ||||
|     use crate::search::facet::get_highest_level; | ||||
|     use crate::snapshot_tests::display_bitmap; | ||||
|     use crate::update::del_add::{DelAdd, KvWriterDelAdd}; | ||||
|     use crate::update::FacetsUpdateIncrementalInner; | ||||
|     use crate::CboRoaringBitmapCodec; | ||||
|  | ||||
| @@ -306,7 +307,7 @@ pub(crate) mod test_helpers { | ||||
|             BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>, | ||||
|     { | ||||
|         pub env: Env, | ||||
|         pub content: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, | ||||
|         pub content: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, | ||||
|         pub group_size: Cell<u8>, | ||||
|         pub min_level_size: Cell<u8>, | ||||
|         pub max_group_size: Cell<u8>, | ||||
| @@ -454,21 +455,23 @@ pub(crate) mod test_helpers { | ||||
|                 let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned(); | ||||
|                 let key: FacetGroupKey<&[u8]> = | ||||
|                     FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; | ||||
|                 let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key).unwrap(); | ||||
|                 let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key).unwrap(); | ||||
|                 let mut inner_writer = KvWriterDelAdd::memory(); | ||||
|                 let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap(); | ||||
|                 writer.insert(&key, &value).unwrap(); | ||||
|                 inner_writer.insert(DelAdd::Addition, value).unwrap(); | ||||
|                 writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap(); | ||||
|             } | ||||
|             writer.finish().unwrap(); | ||||
|             let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); | ||||
|  | ||||
|             let update = FacetsUpdateBulkInner { | ||||
|                 db: self.content, | ||||
|                 new_data: Some(reader), | ||||
|                 delta_data: Some(reader), | ||||
|                 group_size: self.group_size.get(), | ||||
|                 min_level_size: self.min_level_size.get(), | ||||
|             }; | ||||
|  | ||||
|             update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); | ||||
|             update.update(wtxn, field_ids).unwrap(); | ||||
|         } | ||||
|  | ||||
|         pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { | ||||
| @@ -484,12 +487,12 @@ pub(crate) mod test_helpers { | ||||
|  | ||||
|                 let iter = self | ||||
|                     .content | ||||
|                     .as_polymorph() | ||||
|                     .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &level_no_prefix) | ||||
|                     .remap_types::<Bytes, FacetGroupValueCodec>() | ||||
|                     .prefix_iter(txn, &level_no_prefix) | ||||
|                     .unwrap(); | ||||
|                 for el in iter { | ||||
|                     let (key, value) = el.unwrap(); | ||||
|                     let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key).unwrap(); | ||||
|                     let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key).unwrap(); | ||||
|  | ||||
|                     let mut prefix_start_below = vec![]; | ||||
|                     prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); | ||||
| @@ -499,14 +502,11 @@ pub(crate) mod test_helpers { | ||||
|                     let start_below = { | ||||
|                         let mut start_below_iter = self | ||||
|                             .content | ||||
|                             .as_polymorph() | ||||
|                             .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( | ||||
|                                 txn, | ||||
|                                 &prefix_start_below, | ||||
|                             ) | ||||
|                             .remap_types::<Bytes, FacetGroupValueCodec>() | ||||
|                             .prefix_iter(txn, &prefix_start_below) | ||||
|                             .unwrap(); | ||||
|                         let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); | ||||
|                         FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key_bytes).unwrap() | ||||
|                         FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes).unwrap() | ||||
|                     }; | ||||
|  | ||||
|                     assert!(value.size > 0); | ||||
| @@ -556,101 +556,6 @@ pub(crate) mod test_helpers { | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use big_s::S; | ||||
|     use maplit::hashset; | ||||
|  | ||||
|     use crate::db_snap; | ||||
|     use crate::documents::documents_batch_reader_from_objects; | ||||
|     use crate::index::tests::TempIndex; | ||||
|     use crate::update::DeletionStrategy; | ||||
|  | ||||
|     /// Indexes 1000 documents, re-adds the first 999 with an extra field while the | ||||
|     /// deletion strategy is `AlwaysSoft`, then re-adds the last one while it is | ||||
|     /// `AlwaysHard`. The facet databases and the soft-deleted ids are snapshotted | ||||
|     /// after each of the three steps. | ||||
|     #[test] | ||||
|     fn replace_all_identical_soft_deletion_then_hard_deletion() { | ||||
|         let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100); | ||||
|  | ||||
|         index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft; | ||||
|  | ||||
|         // `id` is the primary key and `size` is the only filterable (faceted) field. | ||||
|         index | ||||
|             .update_settings(|settings| { | ||||
|                 settings.set_primary_key("id".to_owned()); | ||||
|                 settings.set_filterable_fields(hashset! { S("size") }); | ||||
|             }) | ||||
|             .unwrap(); | ||||
|  | ||||
|         // Step 1: initial import of ids 0..1000, with `size` cycling through 0..250. | ||||
|         let mut documents = vec![]; | ||||
|         for i in 0..1000 { | ||||
|             documents.push( | ||||
|                 serde_json::json! { | ||||
|                     { | ||||
|                         "id": i, | ||||
|                         "size": i % 250, | ||||
|                     } | ||||
|                 } | ||||
|                 .as_object() | ||||
|                 .unwrap() | ||||
|                 .clone(), | ||||
|             ); | ||||
|         } | ||||
|  | ||||
|         let documents = documents_batch_reader_from_objects(documents); | ||||
|         index.add_documents(documents).unwrap(); | ||||
|  | ||||
|         db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b"); | ||||
|         db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9"); | ||||
|         db_snap!(index, soft_deleted_documents_ids, "initial", @"[]"); | ||||
|  | ||||
|         // Step 2: re-add ids 0..999 with the same `id`/`size` plus a new `other` field, | ||||
|         // still under the `AlwaysSoft` strategy. | ||||
|         let mut documents = vec![]; | ||||
|         for i in 0..999 { | ||||
|             documents.push( | ||||
|                 serde_json::json! { | ||||
|                     { | ||||
|                         "id": i, | ||||
|                         "size": i % 250, | ||||
|                         "other": 0, | ||||
|                     } | ||||
|                 } | ||||
|                 .as_object() | ||||
|                 .unwrap() | ||||
|                 .clone(), | ||||
|             ); | ||||
|         } | ||||
|  | ||||
|         let documents = documents_batch_reader_from_objects(documents); | ||||
|         index.add_documents(documents).unwrap(); | ||||
|  | ||||
|         db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f"); | ||||
|         db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06"); | ||||
|         db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123"); | ||||
|  | ||||
|         // Step 3: replace the last document (id 999) after switching to `AlwaysHard`. | ||||
|         // The final snapshot expects the soft-deleted set to be empty again. | ||||
|         index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard; | ||||
|         let mut documents = vec![]; | ||||
|         for i in 999..1000 { | ||||
|             documents.push( | ||||
|                 serde_json::json! { | ||||
|                     { | ||||
|                         "id": i, | ||||
|                         "size": i % 250, | ||||
|                         "other": 0, | ||||
|                     } | ||||
|                 } | ||||
|                 .as_object() | ||||
|                 .unwrap() | ||||
|                 .clone(), | ||||
|             ); | ||||
|         } | ||||
|  | ||||
|         let documents = documents_batch_reader_from_objects(documents); | ||||
|         index.add_documents(documents).unwrap(); | ||||
|  | ||||
|         db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6"); | ||||
|         db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028"); | ||||
|         db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]"); | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[allow(unused)] | ||||
| #[cfg(test)] | ||||
| mod comparison_bench { | ||||
| @@ -705,7 +610,7 @@ mod comparison_bench { | ||||
|                 } | ||||
|                 let time_spent = timer.elapsed().as_millis(); | ||||
|                 println!("    add {nbr_doc} : {time_spent}ms"); | ||||
|                 txn.abort().unwrap(); | ||||
|                 txn.abort(); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|   | ||||
| @@ -1,20 +1,17 @@ | ||||
| use std::fmt; | ||||
| use std::io::{BufWriter, Read, Seek}; | ||||
| use std::result::Result as StdResult; | ||||
| use std::{fmt, iter}; | ||||
|  | ||||
| use serde::{Deserialize, Serialize}; | ||||
| use serde_json::Value; | ||||
|  | ||||
| use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; | ||||
| use crate::documents::{ | ||||
|     DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader, | ||||
|     EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY, | ||||
| }; | ||||
| use crate::error::{GeoError, InternalError, UserError}; | ||||
| use crate::update::index_documents::{obkv_to_object, writer_into_reader}; | ||||
| use crate::{FieldId, Index, Object, Result}; | ||||
|  | ||||
| /// The symbol used to define levels in a nested primary key. | ||||
| const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; | ||||
|  | ||||
| /// The default primary key that is used when none is specified. | ||||
| const DEFAULT_PRIMARY_KEY: &str = "id"; | ||||
| use crate::{FieldId, Index, Result}; | ||||
|  | ||||
| /// This function validates and enriches the documents by checking that: | ||||
| ///  - we can infer a primary key, | ||||
| @@ -41,14 +38,12 @@ pub fn enrich_documents_batch<R: Read + Seek>( | ||||
|     // The primary key *field id* that has already been set for this index or the one | ||||
|     // we will guess by searching for the first key that contains "id" as a substring. | ||||
|     let primary_key = match index.primary_key(rtxn)? { | ||||
|         Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { | ||||
|             PrimaryKey::nested(primary_key) | ||||
|         } | ||||
|         Some(primary_key) => match documents_batch_index.id(primary_key) { | ||||
|             Some(id) => PrimaryKey::flat(primary_key, id), | ||||
|             None if autogenerate_docids => { | ||||
|                 PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) | ||||
|             } | ||||
|         Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) { | ||||
|             Some(primary_key) => primary_key, | ||||
|             None if autogenerate_docids => PrimaryKey::Flat { | ||||
|                 name: primary_key, | ||||
|                 field_id: documents_batch_index.insert(primary_key), | ||||
|             }, | ||||
|             None => { | ||||
|                 return match cursor.next_document()? { | ||||
|                     Some(first_document) => Ok(Err(UserError::MissingDocumentId { | ||||
| @@ -76,14 +71,14 @@ pub fn enrich_documents_batch<R: Read + Seek>( | ||||
|             }); | ||||
|  | ||||
|             match guesses.as_slice() { | ||||
|                 [] if autogenerate_docids => PrimaryKey::flat( | ||||
|                     DEFAULT_PRIMARY_KEY, | ||||
|                     documents_batch_index.insert(DEFAULT_PRIMARY_KEY), | ||||
|                 ), | ||||
|                 [] if autogenerate_docids => PrimaryKey::Flat { | ||||
|                     name: DEFAULT_PRIMARY_KEY, | ||||
|                     field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY), | ||||
|                 }, | ||||
|                 [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)), | ||||
|                 [(field_id, name)] => { | ||||
|                     log::info!("Primary key was not specified in index. Inferred to '{name}'"); | ||||
|                     PrimaryKey::flat(name, *field_id) | ||||
|                     PrimaryKey::Flat { name, field_id: *field_id } | ||||
|                 } | ||||
|                 multiple => { | ||||
|                     return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { | ||||
| @@ -156,92 +151,24 @@ fn fetch_or_generate_document_id( | ||||
|     uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], | ||||
|     count: u32, | ||||
| ) -> Result<StdResult<DocumentId, UserError>> { | ||||
|     match primary_key { | ||||
|         PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { | ||||
|             match document.get(primary_key_id) { | ||||
|                 Some(document_id_bytes) => { | ||||
|                     let document_id = serde_json::from_slice(document_id_bytes) | ||||
|                         .map_err(InternalError::SerdeJson)?; | ||||
|                     match validate_document_id_value(document_id)? { | ||||
|                         Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), | ||||
|                         Err(user_error) => Ok(Err(user_error)), | ||||
|                     } | ||||
|                 } | ||||
|                 None if autogenerate_docids => { | ||||
|                     let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); | ||||
|                     Ok(Ok(DocumentId::generated(uuid.to_string(), count))) | ||||
|                 } | ||||
|                 None => Ok(Err(UserError::MissingDocumentId { | ||||
|                     primary_key: primary_key.to_string(), | ||||
|                     document: obkv_to_object(document, documents_batch_index)?, | ||||
|                 })), | ||||
|             } | ||||
|     Ok(match primary_key.document_id(document, documents_batch_index)? { | ||||
|         Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }), | ||||
|         Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error), | ||||
|         Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => { | ||||
|             let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); | ||||
|             Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count }) | ||||
|         } | ||||
|         nested @ PrimaryKey::Nested { .. } => { | ||||
|             let mut matching_documents_ids = Vec::new(); | ||||
|             for (first_level_name, right) in nested.possible_level_names() { | ||||
|                 if let Some(field_id) = documents_batch_index.id(first_level_name) { | ||||
|                     if let Some(value_bytes) = document.get(field_id) { | ||||
|                         let object = serde_json::from_slice(value_bytes) | ||||
|                             .map_err(InternalError::SerdeJson)?; | ||||
|                         fetch_matching_values(object, right, &mut matching_documents_ids); | ||||
|  | ||||
|                         if matching_documents_ids.len() >= 2 { | ||||
|                             return Ok(Err(UserError::TooManyDocumentIds { | ||||
|                                 primary_key: nested.name().to_string(), | ||||
|                                 document: obkv_to_object(document, documents_batch_index)?, | ||||
|                             })); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             match matching_documents_ids.pop() { | ||||
|                 Some(document_id) => match validate_document_id_value(document_id)? { | ||||
|                     Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), | ||||
|                     Err(user_error) => Ok(Err(user_error)), | ||||
|                 }, | ||||
|                 None => Ok(Err(UserError::MissingDocumentId { | ||||
|                     primary_key: nested.name().to_string(), | ||||
|                     document: obkv_to_object(document, documents_batch_index)?, | ||||
|                 })), | ||||
|             } | ||||
|         Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId { | ||||
|             primary_key: primary_key.name().to_string(), | ||||
|             document: obkv_to_object(document, documents_batch_index)?, | ||||
|         }), | ||||
|         Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => { | ||||
|             Err(UserError::TooManyDocumentIds { | ||||
|                 primary_key: primary_key.name().to_string(), | ||||
|                 document: obkv_to_object(document, documents_batch_index)?, | ||||
|             }) | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// A type that represents the kind of primary key that has been set | ||||
| /// for this index: a classic flat one or a nested one. | ||||
| /// | ||||
| /// `Flat` carries the key name together with its field id, while `Nested` | ||||
| /// carries only the name. | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| enum PrimaryKey<'a> { | ||||
|     Flat { name: &'a str, field_id: FieldId }, | ||||
|     Nested { name: &'a str }, | ||||
| } | ||||
|  | ||||
| impl PrimaryKey<'_> { | ||||
|     fn flat(name: &str, field_id: FieldId) -> PrimaryKey { | ||||
|         PrimaryKey::Flat { name, field_id } | ||||
|     } | ||||
|  | ||||
|     fn nested(name: &str) -> PrimaryKey { | ||||
|         PrimaryKey::Nested { name } | ||||
|     } | ||||
|  | ||||
|     fn name(&self) -> &str { | ||||
|         match self { | ||||
|             PrimaryKey::Flat { name, .. } => name, | ||||
|             PrimaryKey::Nested { name } => name, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Returns an `Iterator` that gives all the possible field names the primary key | ||||
|     /// can have depending on the first level name and the depth of the objects. | ||||
|     fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ { | ||||
|         let name = self.name(); | ||||
|         name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) | ||||
|             .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) | ||||
|             .chain(iter::once((name, ""))) | ||||
|     } | ||||
|     }) | ||||
| } | ||||
|  | ||||
| /// A type that represents a document id that has been retrieved from a document or auto-generated. | ||||
| @@ -255,14 +182,6 @@ pub enum DocumentId { | ||||
| } | ||||
|  | ||||
| impl DocumentId { | ||||
|     fn retrieved(value: String) -> DocumentId { | ||||
|         DocumentId::Retrieved { value } | ||||
|     } | ||||
|  | ||||
|     fn generated(value: String, document_nth: u32) -> DocumentId { | ||||
|         DocumentId::Generated { value, document_nth } | ||||
|     } | ||||
|  | ||||
|     fn debug(&self) -> String { | ||||
|         format!("{:?}", self) | ||||
|     } | ||||
| @@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId { | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Returns `true` when `selector` begins with `key` and what follows `key` is either | ||||
| /// nothing at all or starts with `PRIMARY_KEY_SPLIT_SYMBOL`. | ||||
| /// | ||||
| /// Returns `false` when `key` is not a prefix of `selector`, or when the character | ||||
| /// right after the prefix is anything other than the split symbol. | ||||
| fn starts_with(selector: &str, key: &str) -> bool { | ||||
|     selector.strip_prefix(key).map_or(false, |tail| { | ||||
|         // An empty tail (exact match) counts as a match, hence `unwrap_or(true)`. | ||||
|         tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) | ||||
|     }) | ||||
| } | ||||
|  | ||||
| /// Collects into `output` the values of `value` that are reachable through `selector`. | ||||
| /// | ||||
| /// When `value` is a JSON object, the search is delegated to | ||||
| /// `fetch_matching_values_in_object` with an empty base key. Any other kind of | ||||
| /// value is pushed to `output` unchanged, without looking at `selector`. | ||||
| pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) { | ||||
|     match value { | ||||
|         Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output), | ||||
|         otherwise => output.push(otherwise), | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Walks `object` and pushes into `output` every non-object value whose full key | ||||
| /// matches `selector`. | ||||
| /// | ||||
| /// The full key of an entry is its own key when `base_key` is empty; otherwise it is | ||||
| /// `base_key`, `PRIMARY_KEY_SPLIT_SYMBOL` and the entry key joined together. | ||||
| /// Entries whose full key does not satisfy `starts_with(selector, ..)` are ignored. | ||||
| /// Matching entries that are objects are explored recursively with the full key as | ||||
| /// the new base key. | ||||
| pub fn fetch_matching_values_in_object( | ||||
|     object: Object, | ||||
|     selector: &str, | ||||
|     base_key: &str, | ||||
|     output: &mut Vec<Value>, | ||||
| ) { | ||||
|     for (key, value) in object { | ||||
|         // Build the full path of this entry from the root object. | ||||
|         let base_key = if base_key.is_empty() { | ||||
|             key.to_string() | ||||
|         } else { | ||||
|             format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key) | ||||
|         }; | ||||
|  | ||||
|         if starts_with(selector, &base_key) { | ||||
|             match value { | ||||
|                 Value::Object(object) => { | ||||
|                     fetch_matching_values_in_object(object, selector, &base_key, output) | ||||
|                 } | ||||
|                 value => output.push(value), | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Returns `Some(document_id)` when the id is non-empty and made only of ASCII | ||||
| /// letters (`a-z`, `A-Z`), digits (`0-9`), hyphens (`-`) and underscores (`_`). | ||||
| /// Returns `None` otherwise. | ||||
| pub fn validate_document_id(document_id: &str) -> Option<&str> { | ||||
|     if !document_id.is_empty() | ||||
|         && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')) | ||||
|     { | ||||
|         Some(document_id) | ||||
|     } else { | ||||
|         None | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Validates a JSON-encoded document id and converts it to its `String` form. | ||||
| /// | ||||
| /// - A string is accepted when `validate_document_id` accepts it. | ||||
| /// - A number is accepted when `is_i64()` holds, and is rendered with `to_string()`. | ||||
| /// - Anything else yields `UserError::InvalidDocumentId` carrying the offending value. | ||||
| /// | ||||
| /// The user error is returned in the inner `Result`; every path visible here returns | ||||
| /// `Ok(..)` for the outer one. | ||||
| pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> { | ||||
|     match document_id { | ||||
|         Value::String(string) => match validate_document_id(&string) { | ||||
|             // Same length as the input: hand back the owned string instead of copying it. | ||||
|             Some(s) if s.len() == string.len() => Ok(Ok(string)), | ||||
|             Some(s) => Ok(Ok(s.to_string())), | ||||
|             None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })), | ||||
|         }, | ||||
|         Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), | ||||
|         content => Ok(Err(UserError::InvalidDocumentId { document_id: content })), | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Try to extract an `f64` from a JSON `Value` and return the `Value` | ||||
| /// in the `Err` variant if it failed. | ||||
| pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> { | ||||
|   | ||||
| @@ -5,18 +5,16 @@ use std::io::BufReader; | ||||
| use std::{io, mem, str}; | ||||
|  | ||||
| use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; | ||||
| use obkv::KvReader; | ||||
| use obkv::{KvReader, KvWriterU16}; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::Value; | ||||
|  | ||||
| use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters}; | ||||
| use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; | ||||
| use crate::error::{InternalError, SerializationError}; | ||||
| use crate::update::index_documents::MergeFn; | ||||
| use crate::{ | ||||
|     absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH, | ||||
| }; | ||||
| use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; | ||||
| use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; | ||||
|  | ||||
| pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>; | ||||
| pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>; | ||||
|  | ||||
| /// Extracts the word and positions where this word appear and | ||||
| /// prefixes it by the document id. | ||||
| @@ -32,25 +30,162 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | ||||
|     allowed_separators: Option<&[&str]>, | ||||
|     dictionary: Option<&[&str]>, | ||||
|     max_positions_per_attributes: Option<u32>, | ||||
| ) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> { | ||||
| ) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> { | ||||
|     puffin::profile_function!(); | ||||
|  | ||||
|     let max_positions_per_attributes = max_positions_per_attributes | ||||
|         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     // initialize destination values. | ||||
|     let mut documents_ids = RoaringBitmap::new(); | ||||
|     let mut script_language_docids = HashMap::new(); | ||||
|     let mut docid_word_positions_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Stable, | ||||
|         concat_u32s_array, | ||||
|         keep_latest_obkv, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     let mut buffers = Buffers::default(); | ||||
|     // initialize buffers. | ||||
|     let mut del_buffers = Buffers::default(); | ||||
|     let mut add_buffers = Buffers::default(); | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     let mut value_buffer = Vec::new(); | ||||
|  | ||||
|     // initialize tokenizer. | ||||
|     let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None); | ||||
|     let tokenizer = builder.build(); | ||||
|  | ||||
|     // iterate over documents. | ||||
|     let mut cursor = obkv_documents.into_cursor()?; | ||||
|     while let Some((key, value)) = cursor.move_on_next()? { | ||||
|         let document_id = key | ||||
|             .try_into() | ||||
|             .map(u32::from_be_bytes) | ||||
|             .map_err(|_| SerializationError::InvalidNumberSerialization)?; | ||||
|         let obkv = KvReader::<FieldId>::new(value); | ||||
|  | ||||
|         // if the searchable fields didn't change, skip the searchable indexing for this document. | ||||
|         if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) { | ||||
|             continue; | ||||
|         } | ||||
|  | ||||
|         documents_ids.push(document_id); | ||||
|  | ||||
|         // Update key buffer prefix. | ||||
|         key_buffer.clear(); | ||||
|         key_buffer.extend_from_slice(&document_id.to_be_bytes()); | ||||
|  | ||||
|         // Tokenize deletions and additions in 2 different threads. | ||||
|         let (del, add): (Result<_>, Result<_>) = rayon::join( | ||||
|             || { | ||||
|                 // deletions | ||||
|                 lang_safe_tokens_from_document( | ||||
|                     &obkv, | ||||
|                     searchable_fields, | ||||
|                     &tokenizer, | ||||
|                     stop_words, | ||||
|                     allowed_separators, | ||||
|                     dictionary, | ||||
|                     max_positions_per_attributes, | ||||
|                     DelAdd::Deletion, | ||||
|                     &mut del_buffers, | ||||
|                 ) | ||||
|             }, | ||||
|             || { | ||||
|                 // additions | ||||
|                 lang_safe_tokens_from_document( | ||||
|                     &obkv, | ||||
|                     searchable_fields, | ||||
|                     &tokenizer, | ||||
|                     stop_words, | ||||
|                     allowed_separators, | ||||
|                     dictionary, | ||||
|                     max_positions_per_attributes, | ||||
|                     DelAdd::Addition, | ||||
|                     &mut add_buffers, | ||||
|                 ) | ||||
|             }, | ||||
|         ); | ||||
|  | ||||
|         let (del_obkv, del_script_language_word_count) = del?; | ||||
|         let (add_obkv, add_script_language_word_count) = add?; | ||||
|  | ||||
|         // merge deletions and additions. | ||||
|         // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>> | ||||
|         value_buffer.clear(); | ||||
|         del_add_from_two_obkvs( | ||||
|             KvReader::<FieldId>::new(del_obkv), | ||||
|             KvReader::<FieldId>::new(add_obkv), | ||||
|             &mut value_buffer, | ||||
|         )?; | ||||
|  | ||||
|         // write each KV<DelAdd, KV<u16, String>> into the sorter, field by field. | ||||
|         let obkv = KvReader::<FieldId>::new(&value_buffer); | ||||
|         for (field_id, value) in obkv.iter() { | ||||
|             key_buffer.truncate(mem::size_of::<u32>()); | ||||
|             key_buffer.extend_from_slice(&field_id.to_be_bytes()); | ||||
|             docid_word_positions_sorter.insert(&key_buffer, value)?; | ||||
|         } | ||||
|  | ||||
|         // update script_language_docids deletions. | ||||
|         for (script, languages_frequency) in del_script_language_word_count { | ||||
|             for (language, _) in languages_frequency { | ||||
|                 let entry = script_language_docids | ||||
|                     .entry((script, language)) | ||||
|                     .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); | ||||
|                 entry.0.push(document_id); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // update script_language_docids additions. | ||||
|         for (script, languages_frequency) in add_script_language_word_count { | ||||
|             for (language, _) in languages_frequency { | ||||
|                 let entry = script_language_docids | ||||
|                     .entry((script, language)) | ||||
|                     .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); | ||||
|                 entry.1.push(document_id); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>. | ||||
|     sorter_into_reader(docid_word_positions_sorter, indexer) | ||||
|         .map(|reader| (reader, script_language_docids)) | ||||
| } | ||||
|  | ||||
| /// Checks whether any searchable field of a document changed. | ||||
| /// | ||||
| /// Each entry of `obkv` is read as a `KvReaderDelAdd`. A field is inspected only | ||||
| /// when `searchable_fields` is `None` or contains its id. | ||||
| /// | ||||
| /// Returns `true` as soon as one inspected field has a `Deletion` side and an | ||||
| /// `Addition` side that differ: only one of them is present, or both are present | ||||
| /// with different bytes. Returns `false` when no inspected field differs. | ||||
| fn searchable_fields_changed( | ||||
|     obkv: &KvReader<FieldId>, | ||||
|     searchable_fields: &Option<HashSet<FieldId>>, | ||||
| ) -> bool { | ||||
|     for (field_id, field_bytes) in obkv.iter() { | ||||
|         if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { | ||||
|             let del_add = KvReaderDelAdd::new(field_bytes); | ||||
|             match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { | ||||
|                 // if both fields are None, check the next field. | ||||
|                 (None, None) => (), | ||||
|                 // if both contain a value and the values are the same, check the next field. | ||||
|                 (Some(del), Some(add)) if del == add => (), | ||||
|                 // otherwise the fields are different, return true. | ||||
|                 _otherwise => return true, | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     false | ||||
| } | ||||
|  | ||||
| /// Factorize tokenizer building. | ||||
| fn tokenizer_builder<'a>( | ||||
|     stop_words: Option<&'a fst::Set<&[u8]>>, | ||||
|     allowed_separators: Option<&'a [&str]>, | ||||
|     dictionary: Option<&'a [&str]>, | ||||
|     script_language: Option<&'a HashMap<Script, Vec<Language>>>, | ||||
| ) -> TokenizerBuilder<'a, &'a [u8]> { | ||||
|     let mut tokenizer_builder = TokenizerBuilder::new(); | ||||
|     if let Some(stop_words) = stop_words { | ||||
|         tokenizer_builder.stop_words(stop_words); | ||||
| @@ -61,130 +196,146 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | ||||
|     if let Some(separators) = allowed_separators { | ||||
|         tokenizer_builder.separators(separators); | ||||
|     } | ||||
|     let tokenizer = tokenizer_builder.build(); | ||||
|  | ||||
|     let mut cursor = obkv_documents.into_cursor()?; | ||||
|     while let Some((key, value)) = cursor.move_on_next()? { | ||||
|         let document_id = key | ||||
|             .try_into() | ||||
|             .map(u32::from_be_bytes) | ||||
|             .map_err(|_| SerializationError::InvalidNumberSerialization)?; | ||||
|         let obkv = KvReader::<FieldId>::new(value); | ||||
|     if let Some(script_language) = script_language { | ||||
|         tokenizer_builder.allow_list(script_language); | ||||
|     } | ||||
|  | ||||
|         documents_ids.push(document_id); | ||||
|         buffers.key_buffer.clear(); | ||||
|         buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes()); | ||||
|     tokenizer_builder | ||||
| } | ||||
|  | ||||
|         let mut script_language_word_count = HashMap::new(); | ||||
| /// Extract words mapped with their positions of a document, | ||||
| /// ensuring no Language detection mistake was made. | ||||
| #[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct | ||||
| fn lang_safe_tokens_from_document<'a>( | ||||
|     obkv: &KvReader<FieldId>, | ||||
|     searchable_fields: &Option<HashSet<FieldId>>, | ||||
|     tokenizer: &Tokenizer, | ||||
|     stop_words: Option<&fst::Set<&[u8]>>, | ||||
|     allowed_separators: Option<&[&str]>, | ||||
|     dictionary: Option<&[&str]>, | ||||
|     max_positions_per_attributes: u32, | ||||
|     del_add: DelAdd, | ||||
|     buffers: &'a mut Buffers, | ||||
| ) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> { | ||||
|     let mut script_language_word_count = HashMap::new(); | ||||
|  | ||||
|         extract_tokens_from_document( | ||||
|             &obkv, | ||||
|             searchable_fields, | ||||
|             &tokenizer, | ||||
|             max_positions_per_attributes, | ||||
|             &mut buffers, | ||||
|             &mut script_language_word_count, | ||||
|             &mut docid_word_positions_sorter, | ||||
|         )?; | ||||
|     tokens_from_document( | ||||
|         obkv, | ||||
|         searchable_fields, | ||||
|         tokenizer, | ||||
|         max_positions_per_attributes, | ||||
|         del_add, | ||||
|         buffers, | ||||
|         &mut script_language_word_count, | ||||
|     )?; | ||||
|  | ||||
|         // if we detect a potential mistake in the language detection, | ||||
|         // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. | ||||
|         // context: https://github.com/meilisearch/meilisearch/issues/3565 | ||||
|         if script_language_word_count | ||||
|             .values() | ||||
|             .map(Vec::as_slice) | ||||
|             .any(potential_language_detection_error) | ||||
|         { | ||||
|             // build an allow list with the most frequent detected languages in the document. | ||||
|             let script_language: HashMap<_, _> = | ||||
|                 script_language_word_count.iter().filter_map(most_frequent_languages).collect(); | ||||
|     // if we detect a potential mistake in the language detection, | ||||
|     // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. | ||||
|     // context: https://github.com/meilisearch/meilisearch/issues/3565 | ||||
|     if script_language_word_count | ||||
|         .values() | ||||
|         .map(Vec::as_slice) | ||||
|         .any(potential_language_detection_error) | ||||
|     { | ||||
|         // build an allow list with the most frequent detected languages in the document. | ||||
|         let script_language: HashMap<_, _> = | ||||
|             script_language_word_count.iter().filter_map(most_frequent_languages).collect(); | ||||
|  | ||||
|             // if the allow list is empty, meaning that no Language is considered frequent, | ||||
|             // then we don't rerun the extraction. | ||||
|             if !script_language.is_empty() { | ||||
|                 // build a new temporary tokenizer including the allow list. | ||||
|                 let mut tokenizer_builder = TokenizerBuilder::new(); | ||||
|                 if let Some(stop_words) = stop_words { | ||||
|                     tokenizer_builder.stop_words(stop_words); | ||||
|                 } | ||||
|                 tokenizer_builder.allow_list(&script_language); | ||||
|                 let tokenizer = tokenizer_builder.build(); | ||||
|         // if the allow list is empty, meaning that no Language is considered frequent, | ||||
|         // then we don't rerun the extraction. | ||||
|         if !script_language.is_empty() { | ||||
|             // build a new temporary tokenizer including the allow list. | ||||
|             let mut builder = tokenizer_builder( | ||||
|                 stop_words, | ||||
|                 allowed_separators, | ||||
|                 dictionary, | ||||
|                 Some(&script_language), | ||||
|             ); | ||||
|             let tokenizer = builder.build(); | ||||
|  | ||||
|                 script_language_word_count.clear(); | ||||
|             script_language_word_count.clear(); | ||||
|  | ||||
|                 // rerun the extraction. | ||||
|                 extract_tokens_from_document( | ||||
|                     &obkv, | ||||
|                     searchable_fields, | ||||
|                     &tokenizer, | ||||
|                     max_positions_per_attributes, | ||||
|                     &mut buffers, | ||||
|                     &mut script_language_word_count, | ||||
|                     &mut docid_word_positions_sorter, | ||||
|                 )?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         for (script, languages_frequency) in script_language_word_count { | ||||
|             for (language, _) in languages_frequency { | ||||
|                 let entry = script_language_docids | ||||
|                     .entry((script, language)) | ||||
|                     .or_insert_with(RoaringBitmap::new); | ||||
|                 entry.push(document_id); | ||||
|             } | ||||
|             // rerun the extraction. | ||||
|             tokens_from_document( | ||||
|                 obkv, | ||||
|                 searchable_fields, | ||||
|                 &tokenizer, | ||||
|                 max_positions_per_attributes, | ||||
|                 del_add, | ||||
|                 buffers, | ||||
|                 &mut script_language_word_count, | ||||
|             )?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(docid_word_positions_sorter, indexer) | ||||
|         .map(|reader| (documents_ids, reader, script_language_docids)) | ||||
|     // returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>) | ||||
|     Ok((&buffers.obkv_buffer, script_language_word_count)) | ||||
| } | ||||
|  | ||||
| fn extract_tokens_from_document( | ||||
| /// Extract words mapped with their positions of a document. | ||||
| fn tokens_from_document<'a>( | ||||
|     obkv: &KvReader<FieldId>, | ||||
|     searchable_fields: &Option<HashSet<FieldId>>, | ||||
|     tokenizer: &Tokenizer, | ||||
|     max_positions_per_attributes: u32, | ||||
|     buffers: &mut Buffers, | ||||
|     del_add: DelAdd, | ||||
|     buffers: &'a mut Buffers, | ||||
|     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>, | ||||
|     docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>, | ||||
| ) -> Result<()> { | ||||
| ) -> Result<&'a [u8]> { | ||||
|     buffers.obkv_buffer.clear(); | ||||
|     let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); | ||||
|     for (field_id, field_bytes) in obkv.iter() { | ||||
|         // if field is searchable. | ||||
|         if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) { | ||||
|             let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; | ||||
|             buffers.field_buffer.clear(); | ||||
|             if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { | ||||
|                 let tokens = process_tokens(tokenizer.tokenize(field)) | ||||
|                     .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); | ||||
|             // extract deletion or addition only. | ||||
|             if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { | ||||
|                 // parse json. | ||||
|                 let value = | ||||
|                     serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; | ||||
|  | ||||
|                 for (index, token) in tokens { | ||||
|                     // if a language has been detected for the token, we update the counter. | ||||
|                     if let Some(language) = token.language { | ||||
|                         let script = token.script; | ||||
|                         let entry = | ||||
|                             script_language_word_count.entry(script).or_insert_with(Vec::new); | ||||
|                         match entry.iter_mut().find(|(l, _)| *l == language) { | ||||
|                             Some((_, n)) => *n += 1, | ||||
|                             None => entry.push((language, 1)), | ||||
|                 // prepare writing destination. | ||||
|                 buffers.obkv_positions_buffer.clear(); | ||||
|                 let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer); | ||||
|  | ||||
|                 // convert json into a unique string. | ||||
|                 buffers.field_buffer.clear(); | ||||
|                 if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { | ||||
|                     // create an iterator of token with their positions. | ||||
|                     let tokens = process_tokens(tokenizer.tokenize(field)) | ||||
|                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); | ||||
|  | ||||
|                     for (index, token) in tokens { | ||||
|                         // if a language has been detected for the token, we update the counter. | ||||
|                         if let Some(language) = token.language { | ||||
|                             let script = token.script; | ||||
|                             let entry = script_language_word_count.entry(script).or_default(); | ||||
|                             match entry.iter_mut().find(|(l, _)| *l == language) { | ||||
|                                 Some((_, n)) => *n += 1, | ||||
|                                 None => entry.push((language, 1)), | ||||
|                             } | ||||
|                         } | ||||
|  | ||||
|                         // keep a word only if it is not empty and fits in an LMDB key. | ||||
|                         let token = token.lemma().trim(); | ||||
|                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { | ||||
|                             let position: u16 = index | ||||
|                                 .try_into() | ||||
|                                 .map_err(|_| SerializationError::InvalidNumberSerialization)?; | ||||
|                             writer.insert(position, token.as_bytes())?; | ||||
|                         } | ||||
|                     } | ||||
|                     let token = token.lemma().trim(); | ||||
|                     if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { | ||||
|                         buffers.key_buffer.truncate(mem::size_of::<u32>()); | ||||
|                         buffers.key_buffer.extend_from_slice(token.as_bytes()); | ||||
|  | ||||
|                         let position: u16 = index | ||||
|                             .try_into() | ||||
|                             .map_err(|_| SerializationError::InvalidNumberSerialization)?; | ||||
|                         let position = absolute_from_relative_position(field_id, position); | ||||
|                         docid_word_positions_sorter | ||||
|                             .insert(&buffers.key_buffer, position.to_ne_bytes())?; | ||||
|                     } | ||||
|                     // write positions into document. | ||||
|                     let positions = writer.into_inner()?; | ||||
|                     document_writer.insert(field_id, positions)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
|     // returns a KV<FieldId, KV<u16, String>> | ||||
|     Ok(document_writer.into_inner().map(|v| v.as_slice())?) | ||||
| } | ||||
|  | ||||
| /// Transform a JSON value into a string that can be indexed. | ||||
| @@ -287,10 +438,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize) | ||||
|  | ||||
| #[derive(Default)] | ||||
| struct Buffers { | ||||
|     // the key buffer is the concatenation of the internal document id with the field id. | ||||
|     // The buffer has to be completely cleared between documents, | ||||
|     // and the field id part must be cleared between each field. | ||||
|     key_buffer: Vec<u8>, | ||||
|     // the field buffer used for each field's deserialization; it must be cleared between each field. | ||||
|     field_buffer: String, | ||||
|     // buffer receiving the document-level obkv (one entry per field id); cleared at the start of each extraction. | ||||
|     obkv_buffer: Vec<u8>, | ||||
|     // buffer receiving, for a single field, the obkv mapping token positions to tokens; cleared before each field. | ||||
|     obkv_positions_buffer: Vec<u8>, | ||||
| } | ||||
|   | ||||
| @@ -4,11 +4,12 @@ use std::io::{self, BufReader}; | ||||
| use heed::{BytesDecode, BytesEncode}; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, | ||||
|     create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, | ||||
| }; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, | ||||
| }; | ||||
| use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; | ||||
| use crate::Result; | ||||
|  | ||||
| /// Extracts the facet number and the documents ids where this facet number appears. | ||||
| @@ -17,7 +18,7 @@ use crate::Result; | ||||
| /// documents ids from the given chunk of docid facet number positions. | ||||
| #[logging_timer::time] | ||||
| pub fn extract_facet_number_docids<R: io::Read + io::Seek>( | ||||
|     docid_fid_facet_number: grenad::Reader<R>, | ||||
|     fid_docid_facet_number: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<BufReader<File>>> { | ||||
|     puffin::profile_function!(); | ||||
| @@ -26,21 +27,30 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>( | ||||
|  | ||||
|     let mut facet_number_docids_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Unstable, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     let mut cursor = docid_fid_facet_number.into_cursor()?; | ||||
|     while let Some((key_bytes, _)) = cursor.move_on_next()? { | ||||
|     let mut buffer = Vec::new(); | ||||
|     let mut cursor = fid_docid_facet_number.into_cursor()?; | ||||
|     while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? { | ||||
|         let (field_id, document_id, number) = | ||||
|             FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); | ||||
|  | ||||
|         let key = FacetGroupKey { field_id, level: 0, left_bound: number }; | ||||
|         let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap(); | ||||
|         facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; | ||||
|  | ||||
|         buffer.clear(); | ||||
|         let mut obkv = KvWriterDelAdd::new(&mut buffer); | ||||
|         for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() { | ||||
|             obkv.insert(deladd_key, document_id.to_ne_bytes())?; | ||||
|         } | ||||
|         obkv.finish()?; | ||||
|  | ||||
|         facet_number_docids_sorter.insert(key_bytes, &buffer)?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(facet_number_docids_sorter, indexer) | ||||
|   | ||||
| @@ -1,13 +1,15 @@ | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufReader}; | ||||
| use std::io::BufReader; | ||||
| use std::{io, str}; | ||||
|  | ||||
| use heed::BytesEncode; | ||||
|  | ||||
| use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; | ||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; | ||||
| use crate::heed_codec::StrRefCodec; | ||||
| use crate::update::index_documents::merge_cbo_roaring_bitmaps; | ||||
| use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; | ||||
| use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd}; | ||||
| use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps; | ||||
| use crate::{FieldId, Result}; | ||||
|  | ||||
| /// Extracts the facet string and the documents ids where this facet string appears. | ||||
| /// | ||||
| @@ -24,15 +26,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | ||||
|  | ||||
|     let mut facet_string_docids_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Stable, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     let mut buffer = Vec::new(); | ||||
|     let mut cursor = docid_fid_facet_string.into_cursor()?; | ||||
|     while let Some((key, _original_value_bytes)) = cursor.move_on_next()? { | ||||
|     while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { | ||||
|         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); | ||||
|         let field_id = FieldId::from_be_bytes(field_id_bytes); | ||||
|  | ||||
| @@ -40,21 +43,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | ||||
|             try_split_array_at::<_, 4>(bytes).unwrap(); | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|  | ||||
|         let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?; | ||||
|  | ||||
|         let normalised_truncated_value: String; | ||||
|         if normalised_value.len() > MAX_FACET_VALUE_LENGTH { | ||||
|             normalised_truncated_value = normalised_value | ||||
|                 .char_indices() | ||||
|                 .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) | ||||
|                 .map(|(_, c)| c) | ||||
|                 .collect(); | ||||
|             normalised_value = normalised_truncated_value.as_str(); | ||||
|         } | ||||
|         let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; | ||||
|         let normalized_value = str::from_utf8(normalized_value_bytes)?; | ||||
|         let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; | ||||
|         let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap(); | ||||
|         // document id is encoded in native-endian because of the CBO roaring bitmap codec | ||||
|         facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?; | ||||
|  | ||||
|         buffer.clear(); | ||||
|         let mut obkv = KvWriterDelAdd::new(&mut buffer); | ||||
|         for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() { | ||||
|             obkv.insert(deladd_key, document_id.to_ne_bytes())?; | ||||
|         } | ||||
|         obkv.finish()?; | ||||
|         facet_string_docids_sorter.insert(&key_bytes, &buffer)?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(facet_string_docids_sorter, indexer) | ||||
|   | ||||
| @@ -1,24 +1,34 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::{BTreeMap, HashSet}; | ||||
| use std::convert::TryInto; | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufReader}; | ||||
| use std::mem::size_of; | ||||
| use std::result::Result as StdResult; | ||||
|  | ||||
| use heed::zerocopy::AsBytes; | ||||
| use bytemuck::bytes_of; | ||||
| use grenad::Sorter; | ||||
| use heed::BytesEncode; | ||||
| use itertools::EitherOrBoth; | ||||
| use ordered_float::OrderedFloat; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::{from_slice, Value}; | ||||
| use FilterableValues::{Empty, Null, Values}; | ||||
|  | ||||
| use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; | ||||
| use crate::error::InternalError; | ||||
| use crate::facet::value_encoding::f64_into_bytes; | ||||
| use crate::update::del_add::{DelAdd, KvWriterDelAdd}; | ||||
| use crate::update::index_documents::{create_writer, writer_into_reader}; | ||||
| use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; | ||||
| use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH}; | ||||
|  | ||||
| /// The length of the elements that are always in the buffer when inserting new values. | ||||
| const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>(); | ||||
|  | ||||
| /// The extracted facet values stored in grenad files by type. | ||||
| pub struct ExtractedFacetValues { | ||||
|     pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>, | ||||
|     pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>, | ||||
|     pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>, | ||||
|     pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>, | ||||
|     pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>, | ||||
|     pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>, | ||||
|     pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>, | ||||
| @@ -58,71 +68,150 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | ||||
|         max_memory.map(|m| m / 2), | ||||
|     ); | ||||
|  | ||||
|     let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); | ||||
|     let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); | ||||
|     let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); | ||||
|     // The tuples represent the Del and Add sides of a bitmap | ||||
|     let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new(); | ||||
|     let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new(); | ||||
|     let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new(); | ||||
|  | ||||
|     // We create two buffers for mutable ref issues with closures. | ||||
|     let mut numbers_key_buffer = Vec::new(); | ||||
|     let mut strings_key_buffer = Vec::new(); | ||||
|  | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     let mut cursor = obkv_documents.into_cursor()?; | ||||
|     while let Some((docid_bytes, value)) = cursor.move_on_next()? { | ||||
|         let obkv = obkv::KvReader::new(value); | ||||
|  | ||||
|         for (field_id, field_bytes) in obkv.iter() { | ||||
|             if faceted_fields.contains(&field_id) { | ||||
|                 key_buffer.clear(); | ||||
|                 numbers_key_buffer.clear(); | ||||
|                 strings_key_buffer.clear(); | ||||
|  | ||||
|                 // Set key to the field_id | ||||
|                 // Note: this encoding is consistent with FieldIdCodec | ||||
|                 key_buffer.extend_from_slice(&field_id.to_be_bytes()); | ||||
|                 numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes()); | ||||
|                 strings_key_buffer.extend_from_slice(&field_id.to_be_bytes()); | ||||
|  | ||||
|                 // Here, we know already that the document must be added to the “field id exists” database | ||||
|                 let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap(); | ||||
|                 let document = BEU32::from(document).get(); | ||||
|  | ||||
|                 facet_exists_docids.entry(field_id).or_default().insert(document); | ||||
|                 let document = DocumentId::from_be_bytes(document); | ||||
|  | ||||
|                 // For the other extraction tasks, prefix the key with the field_id and the document_id | ||||
|                 key_buffer.extend_from_slice(docid_bytes); | ||||
|                 numbers_key_buffer.extend_from_slice(docid_bytes); | ||||
|                 strings_key_buffer.extend_from_slice(docid_bytes); | ||||
|  | ||||
|                 let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?; | ||||
|                 let del_add_obkv = obkv::KvReader::new(field_bytes); | ||||
|                 let del_value = match del_add_obkv.get(DelAdd::Deletion) { | ||||
|                     Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), | ||||
|                     None => None, | ||||
|                 }; | ||||
|                 let add_value = match del_add_obkv.get(DelAdd::Addition) { | ||||
|                     Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?), | ||||
|                     None => None, | ||||
|                 }; | ||||
|  | ||||
|                 match extract_facet_values( | ||||
|                     &value, | ||||
|                     geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng), | ||||
|                 ) { | ||||
|                     FilterableValues::Null => { | ||||
|                         facet_is_null_docids.entry(field_id).or_default().insert(document); | ||||
|                     } | ||||
|                     FilterableValues::Empty => { | ||||
|                         facet_is_empty_docids.entry(field_id).or_default().insert(document); | ||||
|                     } | ||||
|                     FilterableValues::Values { numbers, strings } => { | ||||
|                         // insert facet numbers in sorter | ||||
|                         for number in numbers { | ||||
|                             key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); | ||||
|                             if let Some(value_bytes) = f64_into_bytes(number) { | ||||
|                                 key_buffer.extend_from_slice(&value_bytes); | ||||
|                                 key_buffer.extend_from_slice(&number.to_be_bytes()); | ||||
|                 // We insert the document id on the Del and the Add side if the field exists. | ||||
|                 let (ref mut del_exists, ref mut add_exists) = | ||||
|                     facet_exists_docids.entry(field_id).or_default(); | ||||
|                 let (ref mut del_is_null, ref mut add_is_null) = | ||||
|                     facet_is_null_docids.entry(field_id).or_default(); | ||||
|                 let (ref mut del_is_empty, ref mut add_is_empty) = | ||||
|                     facet_is_empty_docids.entry(field_id).or_default(); | ||||
|  | ||||
|                                 fid_docid_facet_numbers_sorter | ||||
|                                     .insert(&key_buffer, ().as_bytes())?; | ||||
|                             } | ||||
|                 if del_value.is_some() { | ||||
|                     del_exists.insert(document); | ||||
|                 } | ||||
|                 if add_value.is_some() { | ||||
|                     add_exists.insert(document); | ||||
|                 } | ||||
|  | ||||
|                 let geo_support = | ||||
|                     geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng); | ||||
|                 let del_filterable_values = | ||||
|                     del_value.map(|value| extract_facet_values(&value, geo_support)); | ||||
|                 let add_filterable_values = | ||||
|                     add_value.map(|value| extract_facet_values(&value, geo_support)); | ||||
|  | ||||
|                 // Those closures are just here to simplify things a bit. | ||||
|                 let mut insert_numbers_diff = |del_numbers, add_numbers| { | ||||
|                     insert_numbers_diff( | ||||
|                         &mut fid_docid_facet_numbers_sorter, | ||||
|                         &mut numbers_key_buffer, | ||||
|                         del_numbers, | ||||
|                         add_numbers, | ||||
|                     ) | ||||
|                 }; | ||||
|                 let mut insert_strings_diff = |del_strings, add_strings| { | ||||
|                     insert_strings_diff( | ||||
|                         &mut fid_docid_facet_strings_sorter, | ||||
|                         &mut strings_key_buffer, | ||||
|                         del_strings, | ||||
|                         add_strings, | ||||
|                     ) | ||||
|                 }; | ||||
|  | ||||
|                 match (del_filterable_values, add_filterable_values) { | ||||
|                     (None, None) => (), | ||||
|                     (Some(del_filterable_values), None) => match del_filterable_values { | ||||
|                         Null => { | ||||
|                             del_is_null.insert(document); | ||||
|                         } | ||||
|  | ||||
|                         // insert normalized and original facet string in sorter | ||||
|                         for (normalized, original) in | ||||
|                             strings.into_iter().filter(|(n, _)| !n.is_empty()) | ||||
|                         { | ||||
|                             let normalized_truncated_value: String = normalized | ||||
|                                 .char_indices() | ||||
|                                 .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) | ||||
|                                 .map(|(_, c)| c) | ||||
|                                 .collect(); | ||||
|  | ||||
|                             key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); | ||||
|                             key_buffer.extend_from_slice(normalized_truncated_value.as_bytes()); | ||||
|                             fid_docid_facet_strings_sorter | ||||
|                                 .insert(&key_buffer, original.as_bytes())?; | ||||
|                         Empty => { | ||||
|                             del_is_empty.insert(document); | ||||
|                         } | ||||
|                         Values { numbers, strings } => { | ||||
|                             insert_numbers_diff(numbers, vec![])?; | ||||
|                             insert_strings_diff(strings, vec![])?; | ||||
|                         } | ||||
|                     }, | ||||
|                     (None, Some(add_filterable_values)) => match add_filterable_values { | ||||
|                         Null => { | ||||
|                             add_is_null.insert(document); | ||||
|                         } | ||||
|                         Empty => { | ||||
|                             add_is_empty.insert(document); | ||||
|                         } | ||||
|                         Values { numbers, strings } => { | ||||
|                             insert_numbers_diff(vec![], numbers)?; | ||||
|                             insert_strings_diff(vec![], strings)?; | ||||
|                         } | ||||
|                     }, | ||||
|                     (Some(del_filterable_values), Some(add_filterable_values)) => { | ||||
|                         match (del_filterable_values, add_filterable_values) { | ||||
|                             (Null, Null) | (Empty, Empty) => (), | ||||
|                             (Null, Empty) => { | ||||
|                                 del_is_null.insert(document); | ||||
|                                 add_is_empty.insert(document); | ||||
|                             } | ||||
|                             (Empty, Null) => { | ||||
|                                 del_is_empty.insert(document); | ||||
|                                 add_is_null.insert(document); | ||||
|                             } | ||||
|                             (Null, Values { numbers, strings }) => { | ||||
|                                 insert_numbers_diff(vec![], numbers)?; | ||||
|                                 insert_strings_diff(vec![], strings)?; | ||||
|                                 del_is_null.insert(document); | ||||
|                             } | ||||
|                             (Empty, Values { numbers, strings }) => { | ||||
|                                 insert_numbers_diff(vec![], numbers)?; | ||||
|                                 insert_strings_diff(vec![], strings)?; | ||||
|                                 del_is_empty.insert(document); | ||||
|                             } | ||||
|                             (Values { numbers, strings }, Null) => { | ||||
|                                 add_is_null.insert(document); | ||||
|                                 insert_numbers_diff(numbers, vec![])?; | ||||
|                                 insert_strings_diff(strings, vec![])?; | ||||
|                             } | ||||
|                             (Values { numbers, strings }, Empty) => { | ||||
|                                 add_is_empty.insert(document); | ||||
|                                 insert_numbers_diff(numbers, vec![])?; | ||||
|                                 insert_strings_diff(strings, vec![])?; | ||||
|                             } | ||||
|                             ( | ||||
|                                 Values { numbers: del_numbers, strings: del_strings }, | ||||
|                                 Values { numbers: add_numbers, strings: add_strings }, | ||||
|                             ) => { | ||||
|                                 insert_numbers_diff(del_numbers, add_numbers)?; | ||||
|                                 insert_strings_diff(del_strings, add_strings)?; | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
| @@ -130,14 +219,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     let mut buffer = Vec::new(); | ||||
|     let mut facet_exists_docids_writer = create_writer( | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         tempfile::tempfile()?, | ||||
|     ); | ||||
|     for (fid, bitmap) in facet_exists_docids.into_iter() { | ||||
|         let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); | ||||
|         facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; | ||||
|     for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() { | ||||
|         deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; | ||||
|         facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?; | ||||
|     } | ||||
|     let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; | ||||
|  | ||||
| @@ -146,9 +236,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | ||||
|         indexer.chunk_compression_level, | ||||
|         tempfile::tempfile()?, | ||||
|     ); | ||||
|     for (fid, bitmap) in facet_is_null_docids.into_iter() { | ||||
|         let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); | ||||
|         facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; | ||||
|     for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() { | ||||
|         deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; | ||||
|         facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?; | ||||
|     } | ||||
|     let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; | ||||
|  | ||||
| @@ -157,21 +247,156 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | ||||
|         indexer.chunk_compression_level, | ||||
|         tempfile::tempfile()?, | ||||
|     ); | ||||
|     for (fid, bitmap) in facet_is_empty_docids.into_iter() { | ||||
|         let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); | ||||
|         facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; | ||||
|     for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() { | ||||
|         deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?; | ||||
|         facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?; | ||||
|     } | ||||
|     let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?; | ||||
|  | ||||
|     Ok(ExtractedFacetValues { | ||||
|         docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, | ||||
|         docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, | ||||
|         fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, | ||||
|         fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, | ||||
|         fid_facet_is_null_docids_chunk: facet_is_null_docids_reader, | ||||
|         fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader, | ||||
|         fid_facet_exists_docids_chunk: facet_exists_docids_reader, | ||||
|     }) | ||||
| } | ||||
|  | ||||
| /// Generates a vector of bytes containing a DelAdd obkv with two bitmaps. | ||||
| fn deladd_obkv_cbo_roaring_bitmaps( | ||||
|     buffer: &mut Vec<u8>, | ||||
|     del_bitmap: &RoaringBitmap, | ||||
|     add_bitmap: &RoaringBitmap, | ||||
| ) -> io::Result<()> { | ||||
|     buffer.clear(); | ||||
|     let mut obkv = KvWriterDelAdd::new(buffer); | ||||
|     let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap(); | ||||
|     let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap(); | ||||
|     obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?; | ||||
|     obkv.insert(DelAdd::Addition, add_bitmap_bytes)?; | ||||
|     obkv.finish() | ||||
| } | ||||
|  | ||||
| /// Truncates a string to the biggest valid LMDB key size. | ||||
| fn truncate_string(s: String) -> String { | ||||
|     s.char_indices() | ||||
|         .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) | ||||
|         .map(|(_, c)| c) | ||||
|         .collect() | ||||
| } | ||||
|  | ||||
| /// Computes the diff between both Del and Add numbers and | ||||
| /// only inserts the parts that differ in the sorter. | ||||
| fn insert_numbers_diff<MF>( | ||||
|     fid_docid_facet_numbers_sorter: &mut Sorter<MF>, | ||||
|     key_buffer: &mut Vec<u8>, | ||||
|     mut del_numbers: Vec<f64>, | ||||
|     mut add_numbers: Vec<f64>, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>, | ||||
| { | ||||
|     // We sort and dedup the float numbers | ||||
|     del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); | ||||
|     add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f)); | ||||
|     del_numbers.dedup_by_key(|f| OrderedFloat(*f)); | ||||
|     add_numbers.dedup_by_key(|f| OrderedFloat(*f)); | ||||
|  | ||||
|     let merged_numbers_iter = itertools::merge_join_by( | ||||
|         del_numbers.into_iter().map(OrderedFloat), | ||||
|         add_numbers.into_iter().map(OrderedFloat), | ||||
|         |del, add| del.cmp(add), | ||||
|     ); | ||||
|  | ||||
|     // insert facet numbers in sorter | ||||
|     for eob in merged_numbers_iter { | ||||
|         key_buffer.truncate(TRUNCATE_SIZE); | ||||
|         match eob { | ||||
|             EitherOrBoth::Both(_, _) => (), // no need to touch anything | ||||
|             EitherOrBoth::Left(OrderedFloat(number)) => { | ||||
|                 if let Some(value_bytes) = f64_into_bytes(number) { | ||||
|                     key_buffer.extend_from_slice(&value_bytes); | ||||
|                     key_buffer.extend_from_slice(&number.to_be_bytes()); | ||||
|  | ||||
|                     // We insert only the Del part of the Obkv to inform | ||||
|                     // that we only want to remove all those numbers. | ||||
|                     let mut obkv = KvWriterDelAdd::memory(); | ||||
|                     obkv.insert(DelAdd::Deletion, bytes_of(&()))?; | ||||
|                     let bytes = obkv.into_inner()?; | ||||
|                     fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?; | ||||
|                 } | ||||
|             } | ||||
|             EitherOrBoth::Right(OrderedFloat(number)) => { | ||||
|                 if let Some(value_bytes) = f64_into_bytes(number) { | ||||
|                     key_buffer.extend_from_slice(&value_bytes); | ||||
|                     key_buffer.extend_from_slice(&number.to_be_bytes()); | ||||
|  | ||||
|                     // We insert only the Add part of the Obkv to inform | ||||
|                     // that we only want to remove all those numbers. | ||||
|                     let mut obkv = KvWriterDelAdd::memory(); | ||||
|                     obkv.insert(DelAdd::Addition, bytes_of(&()))?; | ||||
|                     let bytes = obkv.into_inner()?; | ||||
|                     fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// Computes the diff between both Del and Add strings and | ||||
| /// only inserts the parts that differ in the sorter. | ||||
| fn insert_strings_diff<MF>( | ||||
|     fid_docid_facet_strings_sorter: &mut Sorter<MF>, | ||||
|     key_buffer: &mut Vec<u8>, | ||||
|     mut del_strings: Vec<(String, String)>, | ||||
|     mut add_strings: Vec<(String, String)>, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>, | ||||
| { | ||||
|     // We sort and dedup the normalized and original strings | ||||
|     del_strings.sort_unstable(); | ||||
|     add_strings.sort_unstable(); | ||||
|     del_strings.dedup(); | ||||
|     add_strings.dedup(); | ||||
|  | ||||
|     let merged_strings_iter = itertools::merge_join_by( | ||||
|         del_strings.into_iter().filter(|(n, _)| !n.is_empty()), | ||||
|         add_strings.into_iter().filter(|(n, _)| !n.is_empty()), | ||||
|         |del, add| del.cmp(add), | ||||
|     ); | ||||
|  | ||||
|     // insert normalized and original facet string in sorter | ||||
|     for eob in merged_strings_iter { | ||||
|         key_buffer.truncate(TRUNCATE_SIZE); | ||||
|         match eob { | ||||
|             EitherOrBoth::Both(_, _) => (), // no need to touch anything | ||||
|             EitherOrBoth::Left((normalized, original)) => { | ||||
|                 let truncated = truncate_string(normalized); | ||||
|                 key_buffer.extend_from_slice(truncated.as_bytes()); | ||||
|  | ||||
|                 let mut obkv = KvWriterDelAdd::memory(); | ||||
|                 obkv.insert(DelAdd::Deletion, original)?; | ||||
|                 let bytes = obkv.into_inner()?; | ||||
|                 fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; | ||||
|             } | ||||
|             EitherOrBoth::Right((normalized, original)) => { | ||||
|                 let truncated = truncate_string(normalized); | ||||
|                 key_buffer.extend_from_slice(truncated.as_bytes()); | ||||
|  | ||||
|                 let mut obkv = KvWriterDelAdd::memory(); | ||||
|                 obkv.insert(DelAdd::Addition, original)?; | ||||
|                 let bytes = obkv.into_inner()?; | ||||
|                 fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// Represent what a document field contains. | ||||
| enum FilterableValues { | ||||
|     /// Corresponds to the JSON `null` value. | ||||
| @@ -182,6 +407,7 @@ enum FilterableValues { | ||||
|     Values { numbers: Vec<f64>, strings: Vec<(String, String)> }, | ||||
| } | ||||
|  | ||||
| /// Extracts the facet values of a JSON field. | ||||
| fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues { | ||||
|     fn inner_extract_facet_values( | ||||
|         value: &Value, | ||||
|   | ||||
| @@ -1,16 +1,18 @@ | ||||
| use std::collections::HashMap; | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufReader}; | ||||
|  | ||||
| use grenad::Sorter; | ||||
| use obkv::KvReaderU16; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, MergeFn, | ||||
|     create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, | ||||
|     GrenadParameters, | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::{relative_from_absolute_position, DocumentId, FieldId, Result}; | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||
| use crate::Result; | ||||
|  | ||||
| const MAX_COUNTED_WORDS: usize = 30; | ||||
|  | ||||
| /// Extracts the field id word count and the documents ids where | ||||
| /// this field id with this amount of words appear. | ||||
| @@ -28,70 +30,62 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>( | ||||
|  | ||||
|     let mut fid_word_count_docids_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Unstable, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     // This map is assumed to not consume a lot of memory. | ||||
|     let mut document_fid_wordcount = HashMap::new(); | ||||
|     let mut current_document_id = None; | ||||
|  | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     let mut value_buffer = Vec::new(); | ||||
|     let mut cursor = docid_word_positions.into_cursor()?; | ||||
|     while let Some((key, value)) = cursor.move_on_next()? { | ||||
|         let (document_id_bytes, _word_bytes) = try_split_array_at(key) | ||||
|         let (document_id_bytes, fid_bytes) = try_split_array_at(key) | ||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|  | ||||
|         let curr_document_id = *current_document_id.get_or_insert(document_id); | ||||
|         if curr_document_id != document_id { | ||||
|             drain_document_fid_wordcount_into_sorter( | ||||
|                 &mut fid_word_count_docids_sorter, | ||||
|                 &mut document_fid_wordcount, | ||||
|                 curr_document_id, | ||||
|             )?; | ||||
|             current_document_id = Some(document_id); | ||||
|         let del_add_reader = KvReaderDelAdd::new(value); | ||||
|         let deletion = del_add_reader | ||||
|             // get deleted words | ||||
|             .get(DelAdd::Deletion) | ||||
|             // count deleted words | ||||
|             .map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count()) | ||||
|             // keep the count if under or equal to MAX_COUNTED_WORDS | ||||
|             .filter(|&word_count| word_count <= MAX_COUNTED_WORDS); | ||||
|         let addition = del_add_reader | ||||
|             // get added words | ||||
|             .get(DelAdd::Addition) | ||||
|             // count added words | ||||
|             .map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count()) | ||||
|             // keep the count if under or equal to MAX_COUNTED_WORDS | ||||
|             .filter(|&word_count| word_count <= MAX_COUNTED_WORDS); | ||||
|  | ||||
|         if deletion != addition { | ||||
|             // Insert deleted word count in sorter if exist. | ||||
|             if let Some(word_count) = deletion { | ||||
|                 value_buffer.clear(); | ||||
|                 let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||
|                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||
|                 key_buffer.clear(); | ||||
|                 key_buffer.extend_from_slice(fid_bytes); | ||||
|                 key_buffer.push(word_count as u8); | ||||
|                 fid_word_count_docids_sorter | ||||
|                     .insert(&key_buffer, value_writer.into_inner().unwrap())?; | ||||
|             } | ||||
|             // Insert added word count in sorter if exist. | ||||
|             if let Some(word_count) = addition { | ||||
|                 value_buffer.clear(); | ||||
|                 let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||
|                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||
|                 key_buffer.clear(); | ||||
|                 key_buffer.extend_from_slice(fid_bytes); | ||||
|                 key_buffer.push(word_count as u8); | ||||
|                 fid_word_count_docids_sorter | ||||
|                     .insert(&key_buffer, value_writer.into_inner().unwrap())?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         for position in read_u32_ne_bytes(value) { | ||||
|             let (field_id, _) = relative_from_absolute_position(position); | ||||
|  | ||||
|             let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0); | ||||
|             *value += 1; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if let Some(document_id) = current_document_id { | ||||
|         // We must make sure that don't lose the current document field id | ||||
|         // word count map if we break because we reached the end of the chunk. | ||||
|         drain_document_fid_wordcount_into_sorter( | ||||
|             &mut fid_word_count_docids_sorter, | ||||
|             &mut document_fid_wordcount, | ||||
|             document_id, | ||||
|         )?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(fid_word_count_docids_sorter, indexer) | ||||
| } | ||||
|  | ||||
| fn drain_document_fid_wordcount_into_sorter( | ||||
|     fid_word_count_docids_sorter: &mut Sorter<MergeFn>, | ||||
|     document_fid_wordcount: &mut HashMap<FieldId, u32>, | ||||
|     document_id: DocumentId, | ||||
| ) -> Result<()> { | ||||
|     let mut key_buffer = Vec::new(); | ||||
|  | ||||
|     for (fid, count) in document_fid_wordcount.drain() { | ||||
|         if count <= 30 { | ||||
|             key_buffer.clear(); | ||||
|             key_buffer.extend_from_slice(&fid.to_be_bytes()); | ||||
|             key_buffer.push(count as u8); | ||||
|  | ||||
|             fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|   | ||||
| @@ -6,6 +6,7 @@ use serde_json::Value; | ||||
|  | ||||
| use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | ||||
| use crate::error::GeoError; | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||
| use crate::update::index_documents::extract_finite_float_from_value; | ||||
| use crate::{FieldId, InternalError, Result}; | ||||
|  | ||||
| @@ -30,39 +31,71 @@ pub fn extract_geo_points<R: io::Read + io::Seek>( | ||||
|     let mut cursor = obkv_documents.into_cursor()?; | ||||
|     while let Some((docid_bytes, value)) = cursor.move_on_next()? { | ||||
|         let obkv = obkv::KvReader::new(value); | ||||
|         // since we only needs the primary key when we throw an error we create this getter to | ||||
|         // lazily get it when needed | ||||
|         // since we only need the primary key when we throw an error | ||||
|         // we create this getter to lazily get it when needed | ||||
|         let document_id = || -> Value { | ||||
|             let document_id = obkv.get(primary_key_id).unwrap(); | ||||
|             serde_json::from_slice(document_id).unwrap() | ||||
|         }; | ||||
|  | ||||
|         // first we get the two fields | ||||
|         let lat = obkv.get(lat_fid); | ||||
|         let lng = obkv.get(lng_fid); | ||||
|         match (obkv.get(lat_fid), obkv.get(lng_fid)) { | ||||
|             (Some(lat), Some(lng)) => { | ||||
|                 let deladd_lat_obkv = KvReaderDelAdd::new(lat); | ||||
|                 let deladd_lng_obkv = KvReaderDelAdd::new(lng); | ||||
|  | ||||
|         if let Some((lat, lng)) = lat.zip(lng) { | ||||
|             // then we extract the values | ||||
|             let lat = extract_finite_float_from_value( | ||||
|                 serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, | ||||
|             ) | ||||
|             .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; | ||||
|                 // then we extract the values | ||||
|                 let del_lat_lng = deladd_lat_obkv | ||||
|                     .get(DelAdd::Deletion) | ||||
|                     .zip(deladd_lng_obkv.get(DelAdd::Deletion)) | ||||
|                     .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id)) | ||||
|                     .transpose()?; | ||||
|                 let add_lat_lng = deladd_lat_obkv | ||||
|                     .get(DelAdd::Addition) | ||||
|                     .zip(deladd_lng_obkv.get(DelAdd::Addition)) | ||||
|                     .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id)) | ||||
|                     .transpose()?; | ||||
|  | ||||
|             let lng = extract_finite_float_from_value( | ||||
|                 serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, | ||||
|             ) | ||||
|             .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; | ||||
|  | ||||
|             #[allow(clippy::drop_non_drop)] | ||||
|             let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; | ||||
|             writer.insert(docid_bytes, bytes)?; | ||||
|         } else if lat.is_none() && lng.is_some() { | ||||
|             return Err(GeoError::MissingLatitude { document_id: document_id() })?; | ||||
|         } else if lat.is_some() && lng.is_none() { | ||||
|             return Err(GeoError::MissingLongitude { document_id: document_id() })?; | ||||
|                 if del_lat_lng != add_lat_lng { | ||||
|                     let mut obkv = KvWriterDelAdd::memory(); | ||||
|                     if let Some([lat, lng]) = del_lat_lng { | ||||
|                         #[allow(clippy::drop_non_drop)] | ||||
|                         let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; | ||||
|                         obkv.insert(DelAdd::Deletion, bytes)?; | ||||
|                     } | ||||
|                     if let Some([lat, lng]) = add_lat_lng { | ||||
|                         #[allow(clippy::drop_non_drop)] | ||||
|                         let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; | ||||
|                         obkv.insert(DelAdd::Addition, bytes)?; | ||||
|                     } | ||||
|                     let bytes = obkv.into_inner()?; | ||||
|                     writer.insert(docid_bytes, bytes)?; | ||||
|                 } | ||||
|             } | ||||
|             (None, Some(_)) => { | ||||
|                 return Err(GeoError::MissingLatitude { document_id: document_id() }.into()) | ||||
|             } | ||||
|             (Some(_), None) => { | ||||
|                 return Err(GeoError::MissingLongitude { document_id: document_id() }.into()) | ||||
|             } | ||||
|             (None, None) => (), | ||||
|         } | ||||
|         // else => the _geo object was `null`, there is nothing to do | ||||
|     } | ||||
|  | ||||
|     writer_into_reader(writer) | ||||
| } | ||||
|  | ||||
| /// Extract the finite floats lat and lng from two bytes slices. | ||||
| fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> { | ||||
|     let lat = extract_finite_float_from_value( | ||||
|         serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, | ||||
|     ) | ||||
|     .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; | ||||
|  | ||||
|     let lng = extract_finite_float_from_value( | ||||
|         serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, | ||||
|     ) | ||||
|     .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; | ||||
|  | ||||
|     Ok([lat, lng]) | ||||
| } | ||||
|   | ||||
| @@ -1,13 +1,24 @@ | ||||
| use std::cmp::Ordering; | ||||
| use std::convert::TryFrom; | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufReader}; | ||||
| use std::io::{self, BufReader, BufWriter}; | ||||
| use std::mem::size_of; | ||||
| use std::str::from_utf8; | ||||
|  | ||||
| use bytemuck::cast_slice; | ||||
| use grenad::Writer; | ||||
| use itertools::EitherOrBoth; | ||||
| use ordered_float::OrderedFloat; | ||||
| use serde_json::{from_slice, Value}; | ||||
|  | ||||
| use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; | ||||
| use crate::error::UserError; | ||||
| use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors}; | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||
| use crate::update::index_documents::helpers::try_split_at; | ||||
| use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors}; | ||||
|  | ||||
| /// The length of the elements that are always in the buffer when inserting new values. | ||||
| const TRUNCATE_SIZE: usize = size_of::<DocumentId>(); | ||||
|  | ||||
| /// Extracts the embedding vector contained in each document under the `_vectors` field. | ||||
| /// | ||||
| @@ -16,7 +27,6 @@ use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors}; | ||||
| pub fn extract_vector_points<R: io::Read + io::Seek>( | ||||
|     obkv_documents: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
|     primary_key_id: FieldId, | ||||
|     vectors_fid: FieldId, | ||||
| ) -> Result<grenad::Reader<BufReader<File>>> { | ||||
|     puffin::profile_function!(); | ||||
| @@ -27,43 +37,112 @@ pub fn extract_vector_points<R: io::Read + io::Seek>( | ||||
|         tempfile::tempfile()?, | ||||
|     ); | ||||
|  | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     let mut cursor = obkv_documents.into_cursor()?; | ||||
|     while let Some((docid_bytes, value)) = cursor.move_on_next()? { | ||||
|     while let Some((key, value)) = cursor.move_on_next()? { | ||||
|         // this must always be serialized as (docid, external_docid); | ||||
|         let (docid_bytes, external_id_bytes) = | ||||
|             try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap(); | ||||
|         debug_assert!(from_utf8(external_id_bytes).is_ok()); | ||||
|  | ||||
|         let obkv = obkv::KvReader::new(value); | ||||
|         key_buffer.clear(); | ||||
|         key_buffer.extend_from_slice(docid_bytes); | ||||
|  | ||||
|         // since we only needs the primary key when we throw an error we create this getter to | ||||
|         // lazily get it when needed | ||||
|         let document_id = || -> Value { | ||||
|             let document_id = obkv.get(primary_key_id).unwrap(); | ||||
|             from_slice(document_id).unwrap() | ||||
|         }; | ||||
|         let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; | ||||
|  | ||||
|         // first we retrieve the _vectors field | ||||
|         if let Some(vectors) = obkv.get(vectors_fid) { | ||||
|             // extract the vectors | ||||
|             let vectors = match from_slice(vectors) { | ||||
|                 Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors), | ||||
|                 Err(_) => { | ||||
|                     return Err(UserError::InvalidVectorsType { | ||||
|                         document_id: document_id(), | ||||
|                         value: from_slice(vectors).map_err(InternalError::SerdeJson)?, | ||||
|                     } | ||||
|                     .into()) | ||||
|                 } | ||||
|             }; | ||||
|         if let Some(value) = obkv.get(vectors_fid) { | ||||
|             let vectors_obkv = KvReaderDelAdd::new(value); | ||||
|  | ||||
|             if let Some(vectors) = vectors { | ||||
|                 for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) { | ||||
|                     let index = u16::try_from(i).unwrap(); | ||||
|                     let mut key = docid_bytes.to_vec(); | ||||
|                     key.extend_from_slice(&index.to_be_bytes()); | ||||
|                     let bytes = cast_slice(&vector); | ||||
|                     writer.insert(key, bytes)?; | ||||
|                 } | ||||
|             } | ||||
|             // then we extract the values | ||||
|             let del_vectors = vectors_obkv | ||||
|                 .get(DelAdd::Deletion) | ||||
|                 .map(|vectors| extract_vectors(vectors, document_id)) | ||||
|                 .transpose()? | ||||
|                 .flatten(); | ||||
|             let add_vectors = vectors_obkv | ||||
|                 .get(DelAdd::Addition) | ||||
|                 .map(|vectors| extract_vectors(vectors, document_id)) | ||||
|                 .transpose()? | ||||
|                 .flatten(); | ||||
|  | ||||
|             // and we finally push the unique vectors into the writer | ||||
|             push_vectors_diff( | ||||
|                 &mut writer, | ||||
|                 &mut key_buffer, | ||||
|                 del_vectors.unwrap_or_default(), | ||||
|                 add_vectors.unwrap_or_default(), | ||||
|             )?; | ||||
|         } | ||||
|         // else => the `_vectors` object was `null`, there is nothing to do | ||||
|     } | ||||
|  | ||||
|     writer_into_reader(writer) | ||||
| } | ||||
|  | ||||
| /// Computes the diff between both Del and Add numbers and | ||||
| /// only inserts the parts that differ in the sorter. | ||||
| fn push_vectors_diff( | ||||
|     writer: &mut Writer<BufWriter<File>>, | ||||
|     key_buffer: &mut Vec<u8>, | ||||
|     mut del_vectors: Vec<Vec<f32>>, | ||||
|     mut add_vectors: Vec<Vec<f32>>, | ||||
| ) -> Result<()> { | ||||
|     // We sort and dedup the vectors | ||||
|     del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); | ||||
|     add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); | ||||
|     del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); | ||||
|     add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); | ||||
|  | ||||
|     let merged_vectors_iter = | ||||
|         itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); | ||||
|  | ||||
|     // insert vectors into the writer | ||||
|     for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { | ||||
|         // Generate the key by extending the unique index to it. | ||||
|         key_buffer.truncate(TRUNCATE_SIZE); | ||||
|         let index = u16::try_from(i).unwrap(); | ||||
|         key_buffer.extend_from_slice(&index.to_be_bytes()); | ||||
|  | ||||
|         match eob { | ||||
|             EitherOrBoth::Both(_, _) => (), // no need to touch anything | ||||
|             EitherOrBoth::Left(vector) => { | ||||
|                 // We insert only the Del part of the Obkv to inform | ||||
|                 // that we only want to remove all those vectors. | ||||
|                 let mut obkv = KvWriterDelAdd::memory(); | ||||
|                 obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; | ||||
|                 let bytes = obkv.into_inner()?; | ||||
|                 writer.insert(&key_buffer, bytes)?; | ||||
|             } | ||||
|             EitherOrBoth::Right(vector) => { | ||||
|                 // We insert only the Add part of the Obkv to inform | ||||
|                 // that we only want to remove all those vectors. | ||||
|                 let mut obkv = KvWriterDelAdd::memory(); | ||||
|                 obkv.insert(DelAdd::Addition, cast_slice(&vector))?; | ||||
|                 let bytes = obkv.into_inner()?; | ||||
|                 writer.insert(&key_buffer, bytes)?; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// Compares two vectors by using the OrderingFloat helper. | ||||
| fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering { | ||||
|     a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat)) | ||||
| } | ||||
|  | ||||
| /// Extracts the vectors from a JSON value. | ||||
| fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Option<Vec<Vec<f32>>>> { | ||||
|     match from_slice(value) { | ||||
|         Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)), | ||||
|         Err(_) => Err(UserError::InvalidVectorsType { | ||||
|             document_id: document_id(), | ||||
|             value: from_slice(value).map_err(InternalError::SerdeJson)?, | ||||
|         } | ||||
|         .into()), | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,18 +1,20 @@ | ||||
| use std::collections::HashSet; | ||||
| use std::collections::{BTreeSet, HashSet}; | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufReader}; | ||||
| use std::iter::FromIterator; | ||||
|  | ||||
| use roaring::RoaringBitmap; | ||||
| use heed::BytesDecode; | ||||
| use obkv::KvReaderU16; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, | ||||
|     create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, | ||||
|     try_split_array_at, writer_into_reader, GrenadParameters, | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::heed_codec::StrBEU16Codec; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::update::index_documents::helpers::read_u32_ne_bytes; | ||||
| use crate::{relative_from_absolute_position, FieldId, Result}; | ||||
| use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||
| use crate::update::MergeFn; | ||||
| use crate::{DocumentId, FieldId, Result}; | ||||
|  | ||||
| /// Extracts the word and the documents ids where this word appear. | ||||
| /// | ||||
| @@ -26,65 +28,152 @@ pub fn extract_word_docids<R: io::Read + io::Seek>( | ||||
|     docid_word_positions: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
|     exact_attributes: &HashSet<FieldId>, | ||||
| ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> { | ||||
| ) -> Result<( | ||||
|     grenad::Reader<BufReader<File>>, | ||||
|     grenad::Reader<BufReader<File>>, | ||||
|     grenad::Reader<BufReader<File>>, | ||||
| )> { | ||||
|     puffin::profile_function!(); | ||||
|  | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut word_docids_sorter = create_sorter( | ||||
|     let mut word_fid_docids_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Unstable, | ||||
|         merge_roaring_bitmaps, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory.map(|x| x / 2), | ||||
|         max_memory.map(|x| x / 3), | ||||
|     ); | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     let mut del_words = BTreeSet::new(); | ||||
|     let mut add_words = BTreeSet::new(); | ||||
|     let mut cursor = docid_word_positions.into_cursor()?; | ||||
|     while let Some((key, value)) = cursor.move_on_next()? { | ||||
|         let (document_id_bytes, fid_bytes) = try_split_array_at(key) | ||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let (fid_bytes, _) = try_split_array_at(fid_bytes) | ||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|         let fid = u16::from_be_bytes(fid_bytes); | ||||
|  | ||||
|         let del_add_reader = KvReaderDelAdd::new(value); | ||||
|         // extract all unique words to remove. | ||||
|         if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { | ||||
|             for (_pos, word) in KvReaderU16::new(deletion).iter() { | ||||
|                 del_words.insert(word.to_vec()); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // extract all unique additional words. | ||||
|         if let Some(addition) = del_add_reader.get(DelAdd::Addition) { | ||||
|             for (_pos, word) in KvReaderU16::new(addition).iter() { | ||||
|                 add_words.insert(word.to_vec()); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         words_into_sorter( | ||||
|             document_id, | ||||
|             fid, | ||||
|             &mut key_buffer, | ||||
|             &del_words, | ||||
|             &add_words, | ||||
|             &mut word_fid_docids_sorter, | ||||
|         )?; | ||||
|  | ||||
|         del_words.clear(); | ||||
|         add_words.clear(); | ||||
|     } | ||||
|  | ||||
|     let mut word_docids_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Unstable, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory.map(|x| x / 3), | ||||
|     ); | ||||
|  | ||||
|     let mut exact_word_docids_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Unstable, | ||||
|         merge_roaring_bitmaps, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory.map(|x| x / 2), | ||||
|         max_memory.map(|x| x / 3), | ||||
|     ); | ||||
|  | ||||
|     let mut value_buffer = Vec::new(); | ||||
|     let mut cursor = docid_word_positions.into_cursor()?; | ||||
|     while let Some((key, positions)) = cursor.move_on_next()? { | ||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) | ||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|     let mut word_fid_docids_writer = create_writer( | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         tempfile::tempfile()?, | ||||
|     ); | ||||
|  | ||||
|         let bitmap = RoaringBitmap::from_iter(Some(document_id)); | ||||
|         serialize_roaring_bitmap(&bitmap, &mut value_buffer)?; | ||||
|     let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?; | ||||
|     // TODO: replace sorters by writers by accumulating values into a buffer before inserting them. | ||||
|     while let Some((key, value)) = iter.next()? { | ||||
|         // only keep the value if there is a change to apply in the DB. | ||||
|         if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) { | ||||
|             word_fid_docids_writer.insert(key, value)?; | ||||
|         } | ||||
|  | ||||
|         // If there are no exact attributes, we do not need to iterate over positions. | ||||
|         if exact_attributes.is_empty() { | ||||
|             word_docids_sorter.insert(word_bytes, &value_buffer)?; | ||||
|         let (word, fid) = StrBEU16Codec::bytes_decode(key) | ||||
|             .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|  | ||||
|         // every word contained in an attribute set to exact must be pushed in the exact_words list. | ||||
|         if exact_attributes.contains(&fid) { | ||||
|             exact_word_docids_sorter.insert(word.as_bytes(), value)?; | ||||
|         } else { | ||||
|             let mut added_to_exact = false; | ||||
|             let mut added_to_word_docids = false; | ||||
|             for position in read_u32_ne_bytes(positions) { | ||||
|                 // as soon as we know that this word had been to both readers, we don't need to | ||||
|                 // iterate over the positions. | ||||
|                 if added_to_exact && added_to_word_docids { | ||||
|                     break; | ||||
|                 } | ||||
|                 let (fid, _) = relative_from_absolute_position(position); | ||||
|                 if exact_attributes.contains(&fid) && !added_to_exact { | ||||
|                     exact_word_docids_sorter.insert(word_bytes, &value_buffer)?; | ||||
|                     added_to_exact = true; | ||||
|                 } else if !added_to_word_docids { | ||||
|                     word_docids_sorter.insert(word_bytes, &value_buffer)?; | ||||
|                     added_to_word_docids = true; | ||||
|                 } | ||||
|             } | ||||
|             word_docids_sorter.insert(word.as_bytes(), value)?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(( | ||||
|         sorter_into_reader(word_docids_sorter, indexer)?, | ||||
|         sorter_into_reader(exact_word_docids_sorter, indexer)?, | ||||
|         writer_into_reader(word_fid_docids_writer)?, | ||||
|     )) | ||||
| } | ||||
|  | ||||
| fn words_into_sorter( | ||||
|     document_id: DocumentId, | ||||
|     fid: FieldId, | ||||
|     key_buffer: &mut Vec<u8>, | ||||
|     del_words: &BTreeSet<Vec<u8>>, | ||||
|     add_words: &BTreeSet<Vec<u8>>, | ||||
|     word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>, | ||||
| ) -> Result<()> { | ||||
|     puffin::profile_function!(); | ||||
|  | ||||
|     use itertools::merge_join_by; | ||||
|     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||
|  | ||||
|     let mut buffer = Vec::new(); | ||||
|     for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) { | ||||
|         buffer.clear(); | ||||
|         let mut value_writer = KvWriterDelAdd::new(&mut buffer); | ||||
|         let word_bytes = match eob { | ||||
|             Left(word_bytes) => { | ||||
|                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||
|                 word_bytes | ||||
|             } | ||||
|             Right(word_bytes) => { | ||||
|                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||
|                 word_bytes | ||||
|             } | ||||
|             Both(word_bytes, _) => { | ||||
|                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||
|                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||
|                 word_bytes | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         key_buffer.clear(); | ||||
|         key_buffer.extend_from_slice(word_bytes); | ||||
|         key_buffer.push(0); | ||||
|         key_buffer.extend_from_slice(&fid.to_be_bytes()); | ||||
|         word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?; | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|   | ||||
| @@ -1,51 +0,0 @@ | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufReader}; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::{relative_from_absolute_position, DocumentId, Result}; | ||||
|  | ||||
| /// Extracts the word, field id, and the documents ids where this word appear at this field id. | ||||
| #[logging_timer::time] | ||||
| pub fn extract_word_fid_docids<R: io::Read + io::Seek>( | ||||
|     docid_word_positions: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<BufReader<File>>> { | ||||
|     puffin::profile_function!(); | ||||
|  | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut word_fid_docids_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Unstable, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     let mut cursor = docid_word_positions.into_cursor()?; | ||||
|     while let Some((key, value)) = cursor.move_on_next()? { | ||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) | ||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let document_id = DocumentId::from_be_bytes(document_id_bytes); | ||||
|  | ||||
|         for position in read_u32_ne_bytes(value) { | ||||
|             key_buffer.clear(); | ||||
|             key_buffer.extend_from_slice(word_bytes); | ||||
|             key_buffer.push(0); | ||||
|             let (fid, _) = relative_from_absolute_position(position); | ||||
|             key_buffer.extend_from_slice(&fid.to_be_bytes()); | ||||
|             word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; | ||||
|  | ||||
|     Ok(word_fid_docids_reader) | ||||
| } | ||||
| @@ -1,16 +1,18 @@ | ||||
| use std::cmp::Ordering; | ||||
| use std::collections::{BinaryHeap, HashMap}; | ||||
| use std::collections::{BTreeMap, VecDeque}; | ||||
| use std::fs::File; | ||||
| use std::io::BufReader; | ||||
| use std::{cmp, io, mem, str, vec}; | ||||
| use std::{cmp, io}; | ||||
|  | ||||
| use obkv::KvReaderU16; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, MergeFn, | ||||
|     create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at, | ||||
|     writer_into_reader, GrenadParameters, MergeFn, | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::proximity::{positions_proximity, MAX_DISTANCE}; | ||||
| use crate::proximity::{index_proximity, MAX_DISTANCE}; | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||
| use crate::{DocumentId, Result}; | ||||
|  | ||||
| /// Extracts the best proximity between pairs of words and the documents ids where this pair appears. | ||||
| @@ -26,58 +28,137 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>( | ||||
|  | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut word_pair_proximity_docids_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Unstable, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory.map(|m| m / 2), | ||||
|     ); | ||||
|     let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE) | ||||
|         .map(|_| { | ||||
|             create_sorter( | ||||
|                 grenad::SortAlgorithm::Unstable, | ||||
|                 merge_deladd_cbo_roaring_bitmaps, | ||||
|                 indexer.chunk_compression_type, | ||||
|                 indexer.chunk_compression_level, | ||||
|                 indexer.max_nb_chunks, | ||||
|                 max_memory.map(|m| m / MAX_DISTANCE as usize), | ||||
|             ) | ||||
|         }) | ||||
|         .collect(); | ||||
|  | ||||
|     // This map is assumed to not consume a lot of memory. | ||||
|     let mut document_word_positions_heap = BinaryHeap::new(); | ||||
|     let mut del_word_positions: VecDeque<(String, u16)> = | ||||
|         VecDeque::with_capacity(MAX_DISTANCE as usize); | ||||
|     let mut add_word_positions: VecDeque<(String, u16)> = | ||||
|         VecDeque::with_capacity(MAX_DISTANCE as usize); | ||||
|     let mut del_word_pair_proximity = BTreeMap::new(); | ||||
|     let mut add_word_pair_proximity = BTreeMap::new(); | ||||
|     let mut current_document_id = None; | ||||
|  | ||||
|     let mut cursor = docid_word_positions.into_cursor()?; | ||||
|     while let Some((key, value)) = cursor.move_on_next()? { | ||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) | ||||
|         let (document_id_bytes, _fid_bytes) = try_split_array_at(key) | ||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|         let word = str::from_utf8(word_bytes)?; | ||||
|  | ||||
|         let curr_document_id = *current_document_id.get_or_insert(document_id); | ||||
|         if curr_document_id != document_id { | ||||
|             let document_word_positions_heap = mem::take(&mut document_word_positions_heap); | ||||
|         // if we change document, we fill the sorter | ||||
|         if current_document_id.map_or(false, |id| id != document_id) { | ||||
|             puffin::profile_scope!("Document into sorter"); | ||||
|  | ||||
|             document_word_positions_into_sorter( | ||||
|                 curr_document_id, | ||||
|                 document_word_positions_heap, | ||||
|                 &mut word_pair_proximity_docids_sorter, | ||||
|                 current_document_id.unwrap(), | ||||
|                 &del_word_pair_proximity, | ||||
|                 &add_word_pair_proximity, | ||||
|                 &mut word_pair_proximity_docids_sorters, | ||||
|             )?; | ||||
|             current_document_id = Some(document_id); | ||||
|             del_word_pair_proximity.clear(); | ||||
|             add_word_pair_proximity.clear(); | ||||
|         } | ||||
|  | ||||
|         let word = word.to_string(); | ||||
|         let mut positions: Vec<_> = read_u32_ne_bytes(value).collect(); | ||||
|         positions.sort_unstable(); | ||||
|         let mut iter = positions.into_iter(); | ||||
|         if let Some(position) = iter.next() { | ||||
|             document_word_positions_heap.push(PeekedWordPosition { word, position, iter }); | ||||
|         } | ||||
|         current_document_id = Some(document_id); | ||||
|  | ||||
|         let (del, add): (Result<_>, Result<_>) = rayon::join( | ||||
|             || { | ||||
|                 // deletions | ||||
|                 if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) { | ||||
|                     for (position, word) in KvReaderU16::new(deletion).iter() { | ||||
|                         // drain the proximity window until the head word is considered close to the word we are inserting. | ||||
|                         while del_word_positions.get(0).map_or(false, |(_w, p)| { | ||||
|                             index_proximity(*p as u32, position as u32) >= MAX_DISTANCE | ||||
|                         }) { | ||||
|                             word_positions_into_word_pair_proximity( | ||||
|                                 &mut del_word_positions, | ||||
|                                 &mut del_word_pair_proximity, | ||||
|                             )?; | ||||
|                         } | ||||
|  | ||||
|                         // insert the new word. | ||||
|                         let word = std::str::from_utf8(word)?; | ||||
|                         del_word_positions.push_back((word.to_string(), position)); | ||||
|                     } | ||||
|  | ||||
|                     while !del_word_positions.is_empty() { | ||||
|                         word_positions_into_word_pair_proximity( | ||||
|                             &mut del_word_positions, | ||||
|                             &mut del_word_pair_proximity, | ||||
|                         )?; | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 Ok(()) | ||||
|             }, | ||||
|             || { | ||||
|                 // additions | ||||
|                 if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) { | ||||
|                     for (position, word) in KvReaderU16::new(addition).iter() { | ||||
|                         // drain the proximity window until the head word is considered close to the word we are inserting. | ||||
|                         while add_word_positions.get(0).map_or(false, |(_w, p)| { | ||||
|                             index_proximity(*p as u32, position as u32) >= MAX_DISTANCE | ||||
|                         }) { | ||||
|                             word_positions_into_word_pair_proximity( | ||||
|                                 &mut add_word_positions, | ||||
|                                 &mut add_word_pair_proximity, | ||||
|                             )?; | ||||
|                         } | ||||
|  | ||||
|                         // insert the new word. | ||||
|                         let word = std::str::from_utf8(word)?; | ||||
|                         add_word_positions.push_back((word.to_string(), position)); | ||||
|                     } | ||||
|  | ||||
|                     while !add_word_positions.is_empty() { | ||||
|                         word_positions_into_word_pair_proximity( | ||||
|                             &mut add_word_positions, | ||||
|                             &mut add_word_pair_proximity, | ||||
|                         )?; | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 Ok(()) | ||||
|             }, | ||||
|         ); | ||||
|  | ||||
|         del?; | ||||
|         add?; | ||||
|     } | ||||
|  | ||||
|     if let Some(document_id) = current_document_id { | ||||
|         // We must make sure that we don't lose the current document field id | ||||
|         // word count map if we break because we reached the end of the chunk. | ||||
|         let document_word_positions_heap = mem::take(&mut document_word_positions_heap); | ||||
|         puffin::profile_scope!("Final document into sorter"); | ||||
|         document_word_positions_into_sorter( | ||||
|             document_id, | ||||
|             document_word_positions_heap, | ||||
|             &mut word_pair_proximity_docids_sorter, | ||||
|             &del_word_pair_proximity, | ||||
|             &add_word_pair_proximity, | ||||
|             &mut word_pair_proximity_docids_sorters, | ||||
|         )?; | ||||
|     } | ||||
|     { | ||||
|         puffin::profile_scope!("sorter_into_reader"); | ||||
|         let mut writer = create_writer( | ||||
|             indexer.chunk_compression_type, | ||||
|             indexer.chunk_compression_level, | ||||
|             tempfile::tempfile()?, | ||||
|         ); | ||||
|  | ||||
|     sorter_into_reader(word_pair_proximity_docids_sorter, indexer) | ||||
|         for sorter in word_pair_proximity_docids_sorters { | ||||
|             sorter.write_into_stream_writer(&mut writer)?; | ||||
|         } | ||||
|  | ||||
|         writer_into_reader(writer) | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive. | ||||
| @@ -86,96 +167,66 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>( | ||||
| /// close to each other. | ||||
| fn document_word_positions_into_sorter( | ||||
|     document_id: DocumentId, | ||||
|     mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>, | ||||
|     word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>, | ||||
|     del_word_pair_proximity: &BTreeMap<(String, String), u8>, | ||||
|     add_word_pair_proximity: &BTreeMap<(String, String), u8>, | ||||
|     word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>], | ||||
| ) -> Result<()> { | ||||
|     let mut word_pair_proximity = HashMap::new(); | ||||
|     let mut ordered_peeked_word_positions = Vec::new(); | ||||
|     while !word_positions_heap.is_empty() { | ||||
|         while let Some(peeked_word_position) = word_positions_heap.pop() { | ||||
|             ordered_peeked_word_positions.push(peeked_word_position); | ||||
|             if ordered_peeked_word_positions.len() == 7 { | ||||
|                 break; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         if let Some((head, tail)) = ordered_peeked_word_positions.split_first() { | ||||
|             for PeekedWordPosition { word, position, .. } in tail { | ||||
|                 let prox = positions_proximity(head.position, *position); | ||||
|                 if prox > 0 && prox < MAX_DISTANCE { | ||||
|                     word_pair_proximity | ||||
|                         .entry((head.word.clone(), word.clone())) | ||||
|                         .and_modify(|p| { | ||||
|                             *p = cmp::min(*p, prox); | ||||
|                         }) | ||||
|                         .or_insert(prox); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             // Push the tail in the heap. | ||||
|             let tail_iter = ordered_peeked_word_positions.drain(1..); | ||||
|             word_positions_heap.extend(tail_iter); | ||||
|  | ||||
|             // Advance the head and push it in the heap. | ||||
|             if let Some(mut head) = ordered_peeked_word_positions.pop() { | ||||
|                 if let Some(next_position) = head.iter.next() { | ||||
|                     let prox = positions_proximity(head.position, next_position); | ||||
|  | ||||
|                     if prox > 0 && prox < MAX_DISTANCE { | ||||
|                         word_pair_proximity | ||||
|                             .entry((head.word.clone(), head.word.clone())) | ||||
|                             .and_modify(|p| { | ||||
|                                 *p = cmp::min(*p, prox); | ||||
|                             }) | ||||
|                             .or_insert(prox); | ||||
|                     } | ||||
|  | ||||
|                     word_positions_heap.push(PeekedWordPosition { | ||||
|                         word: head.word, | ||||
|                         position: next_position, | ||||
|                         iter: head.iter, | ||||
|                     }); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|     use itertools::merge_join_by; | ||||
|     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||
|  | ||||
|     let mut buffer = Vec::new(); | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     for ((w1, w2), prox) in word_pair_proximity { | ||||
|     for eob in | ||||
|         merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| { | ||||
|             d.cmp(a) | ||||
|         }) | ||||
|     { | ||||
|         buffer.clear(); | ||||
|         let mut value_writer = KvWriterDelAdd::new(&mut buffer); | ||||
|         let ((w1, w2), prox) = match eob { | ||||
|             Left(key_value) => { | ||||
|                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||
|                 key_value | ||||
|             } | ||||
|             Right(key_value) => { | ||||
|                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||
|                 key_value | ||||
|             } | ||||
|             Both(key_value, _) => { | ||||
|                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||
|                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||
|                 key_value | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         key_buffer.clear(); | ||||
|         key_buffer.push(prox as u8); | ||||
|         key_buffer.push(*prox); | ||||
|         key_buffer.extend_from_slice(w1.as_bytes()); | ||||
|         key_buffer.push(0); | ||||
|         key_buffer.extend_from_slice(w2.as_bytes()); | ||||
|  | ||||
|         word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; | ||||
|         word_pair_proximity_docids_sorters[*prox as usize - 1] | ||||
|             .insert(&key_buffer, value_writer.into_inner().unwrap())?; | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| struct PeekedWordPosition<I> { | ||||
|     word: String, | ||||
|     position: u32, | ||||
|     iter: I, | ||||
| } | ||||
|  | ||||
| impl<I> Ord for PeekedWordPosition<I> { | ||||
|     fn cmp(&self, other: &Self) -> Ordering { | ||||
|         self.position.cmp(&other.position).reverse() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<I> PartialOrd for PeekedWordPosition<I> { | ||||
|     fn partial_cmp(&self, other: &Self) -> Option<Ordering> { | ||||
|         Some(self.cmp(other)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<I> Eq for PeekedWordPosition<I> {} | ||||
|  | ||||
| impl<I> PartialEq for PeekedWordPosition<I> { | ||||
|     fn eq(&self, other: &Self) -> bool { | ||||
|         self.position == other.position | ||||
| fn word_positions_into_word_pair_proximity( | ||||
|     word_positions: &mut VecDeque<(String, u16)>, | ||||
|     word_pair_proximity: &mut BTreeMap<(String, String), u8>, | ||||
| ) -> Result<()> { | ||||
|     let (head_word, head_position) = word_positions.pop_front().unwrap(); | ||||
|     for (word, position) in word_positions.iter() { | ||||
|         let prox = index_proximity(head_position as u32, *position as u32) as u8; | ||||
|         if prox > 0 && prox < MAX_DISTANCE as u8 { | ||||
|             word_pair_proximity | ||||
|                 .entry((head_word.clone(), word.clone())) | ||||
|                 .and_modify(|p| { | ||||
|                     *p = cmp::min(*p, prox); | ||||
|                 }) | ||||
|                 .or_insert(prox); | ||||
|         } | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
|   | ||||
| @@ -1,13 +1,18 @@ | ||||
| use std::collections::BTreeSet; | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufReader}; | ||||
|  | ||||
| use obkv::KvReaderU16; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, | ||||
|     try_split_array_at, GrenadParameters, | ||||
|     create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at, | ||||
|     GrenadParameters, | ||||
| }; | ||||
| use crate::error::SerializationError; | ||||
| use crate::index::db_name::DOCID_WORD_POSITIONS; | ||||
| use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result}; | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||
| use crate::update::MergeFn; | ||||
| use crate::{bucketed_position, DocumentId, Result}; | ||||
|  | ||||
| /// Extracts the word positions and the documents ids where this word appears. | ||||
| /// | ||||
| @@ -24,32 +29,111 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>( | ||||
|  | ||||
|     let mut word_position_docids_sorter = create_sorter( | ||||
|         grenad::SortAlgorithm::Unstable, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         indexer.max_nb_chunks, | ||||
|         max_memory, | ||||
|     ); | ||||
|  | ||||
|     let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new(); | ||||
|     let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new(); | ||||
|     let mut current_document_id: Option<u32> = None; | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     let mut cursor = docid_word_positions.into_cursor()?; | ||||
|     while let Some((key, value)) = cursor.move_on_next()? { | ||||
|         let (document_id_bytes, word_bytes) = try_split_array_at(key) | ||||
|         let (document_id_bytes, _fid_bytes) = try_split_array_at(key) | ||||
|             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; | ||||
|         let document_id = DocumentId::from_be_bytes(document_id_bytes); | ||||
|  | ||||
|         for position in read_u32_ne_bytes(value) { | ||||
|             key_buffer.clear(); | ||||
|             key_buffer.extend_from_slice(word_bytes); | ||||
|             key_buffer.push(0); | ||||
|             let (_, position) = relative_from_absolute_position(position); | ||||
|             let position = bucketed_position(position); | ||||
|             key_buffer.extend_from_slice(&position.to_be_bytes()); | ||||
|             word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; | ||||
|         if current_document_id.map_or(false, |id| document_id != id) { | ||||
|             words_position_into_sorter( | ||||
|                 current_document_id.unwrap(), | ||||
|                 &mut key_buffer, | ||||
|                 &del_word_positions, | ||||
|                 &add_word_positions, | ||||
|                 &mut word_position_docids_sorter, | ||||
|             )?; | ||||
|             del_word_positions.clear(); | ||||
|             add_word_positions.clear(); | ||||
|         } | ||||
|  | ||||
|         current_document_id = Some(document_id); | ||||
|  | ||||
|         let del_add_reader = KvReaderDelAdd::new(value); | ||||
|         // extract all unique words to remove. | ||||
|         if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) { | ||||
|             for (position, word_bytes) in KvReaderU16::new(deletion).iter() { | ||||
|                 let position = bucketed_position(position); | ||||
|                 del_word_positions.insert((position, word_bytes.to_vec())); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // extract all unique additional words. | ||||
|         if let Some(addition) = del_add_reader.get(DelAdd::Addition) { | ||||
|             for (position, word_bytes) in KvReaderU16::new(addition).iter() { | ||||
|                 let position = bucketed_position(position); | ||||
|                 add_word_positions.insert((position, word_bytes.to_vec())); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if let Some(document_id) = current_document_id { | ||||
|         words_position_into_sorter( | ||||
|             document_id, | ||||
|             &mut key_buffer, | ||||
|             &del_word_positions, | ||||
|             &add_word_positions, | ||||
|             &mut word_position_docids_sorter, | ||||
|         )?; | ||||
|     } | ||||
|  | ||||
|     // TODO remove noop DelAdd OBKV | ||||
|     let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?; | ||||
|  | ||||
|     Ok(word_position_docids_reader) | ||||
| } | ||||
|  | ||||
| fn words_position_into_sorter( | ||||
|     document_id: DocumentId, | ||||
|     key_buffer: &mut Vec<u8>, | ||||
|     del_word_positions: &BTreeSet<(u16, Vec<u8>)>, | ||||
|     add_word_positions: &BTreeSet<(u16, Vec<u8>)>, | ||||
|     word_position_docids_sorter: &mut grenad::Sorter<MergeFn>, | ||||
| ) -> Result<()> { | ||||
|     puffin::profile_function!(); | ||||
|  | ||||
|     use itertools::merge_join_by; | ||||
|     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||
|  | ||||
|     let mut buffer = Vec::new(); | ||||
|     for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a)) | ||||
|     { | ||||
|         buffer.clear(); | ||||
|         let mut value_writer = KvWriterDelAdd::new(&mut buffer); | ||||
|         let (position, word_bytes) = match eob { | ||||
|             Left(key) => { | ||||
|                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||
|                 key | ||||
|             } | ||||
|             Right(key) => { | ||||
|                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||
|                 key | ||||
|             } | ||||
|             Both(key, _) => { | ||||
|                 // both values needs to be kept because it will be used in other extractors. | ||||
|                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap(); | ||||
|                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap(); | ||||
|                 key | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         key_buffer.clear(); | ||||
|         key_buffer.extend_from_slice(word_bytes); | ||||
|         key_buffer.push(0); | ||||
|         key_buffer.extend_from_slice(&position.to_be_bytes()); | ||||
|         word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?; | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|   | ||||
| @@ -6,7 +6,6 @@ mod extract_fid_word_count_docids; | ||||
| mod extract_geo_points; | ||||
| mod extract_vector_points; | ||||
| mod extract_word_docids; | ||||
| mod extract_word_fid_docids; | ||||
| mod extract_word_pair_proximity_docids; | ||||
| mod extract_word_position_docids; | ||||
|  | ||||
| @@ -26,14 +25,14 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids; | ||||
| use self::extract_geo_points::extract_geo_points; | ||||
| use self::extract_vector_points::extract_vector_points; | ||||
| use self::extract_word_docids::extract_word_docids; | ||||
| use self::extract_word_fid_docids::extract_word_fid_docids; | ||||
| use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; | ||||
| use self::extract_word_position_docids::extract_word_position_docids; | ||||
| use super::helpers::{ | ||||
|     as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, | ||||
|     GrenadParameters, MergeFn, MergeableReader, | ||||
|     as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, | ||||
|     MergeFn, MergeableReader, | ||||
| }; | ||||
| use super::{helpers, TypedChunk}; | ||||
| use crate::proximity::ProximityPrecision; | ||||
| use crate::{FieldId, Result}; | ||||
|  | ||||
| /// Extract data for each database from obkv documents in parallel. | ||||
| @@ -54,6 +53,7 @@ pub(crate) fn data_from_obkv_documents( | ||||
|     dictionary: Option<&[&str]>, | ||||
|     max_positions_per_attributes: Option<u32>, | ||||
|     exact_attributes: HashSet<FieldId>, | ||||
|     proximity_precision: ProximityPrecision, | ||||
| ) -> Result<()> { | ||||
|     puffin::profile_function!(); | ||||
|  | ||||
| @@ -65,7 +65,6 @@ pub(crate) fn data_from_obkv_documents( | ||||
|                 indexer, | ||||
|                 lmdb_writer_sx.clone(), | ||||
|                 vectors_field_id, | ||||
|                 primary_key_id, | ||||
|             ) | ||||
|         }) | ||||
|         .collect::<Result<()>>()?; | ||||
| @@ -94,9 +93,9 @@ pub(crate) fn data_from_obkv_documents( | ||||
|     let ( | ||||
|         docid_word_positions_chunks, | ||||
|         ( | ||||
|             docid_fid_facet_numbers_chunks, | ||||
|             fid_docid_facet_numbers_chunks, | ||||
|             ( | ||||
|                 docid_fid_facet_strings_chunks, | ||||
|                 fid_docid_facet_strings_chunks, | ||||
|                 ( | ||||
|                     facet_is_null_docids_chunks, | ||||
|                     (facet_is_empty_docids_chunks, facet_exists_docids_chunks), | ||||
| @@ -110,7 +109,7 @@ pub(crate) fn data_from_obkv_documents( | ||||
|         let lmdb_writer_sx = lmdb_writer_sx.clone(); | ||||
|         rayon::spawn(move || { | ||||
|             debug!("merge {} database", "facet-id-exists-docids"); | ||||
|             match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { | ||||
|             match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { | ||||
|                 Ok(reader) => { | ||||
|                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader))); | ||||
|                 } | ||||
| @@ -126,7 +125,7 @@ pub(crate) fn data_from_obkv_documents( | ||||
|         let lmdb_writer_sx = lmdb_writer_sx.clone(); | ||||
|         rayon::spawn(move || { | ||||
|             debug!("merge {} database", "facet-id-is-null-docids"); | ||||
|             match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { | ||||
|             match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { | ||||
|                 Ok(reader) => { | ||||
|                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); | ||||
|                 } | ||||
| @@ -142,7 +141,7 @@ pub(crate) fn data_from_obkv_documents( | ||||
|         let lmdb_writer_sx = lmdb_writer_sx.clone(); | ||||
|         rayon::spawn(move || { | ||||
|             debug!("merge {} database", "facet-id-is-empty-docids"); | ||||
|             match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { | ||||
|             match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) { | ||||
|                 Ok(reader) => { | ||||
|                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); | ||||
|                 } | ||||
| @@ -153,39 +152,48 @@ pub(crate) fn data_from_obkv_documents( | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|         indexer, | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_word_pair_proximity_docids, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         TypedChunk::WordPairProximityDocids, | ||||
|         "word-pair-proximity-docids", | ||||
|     ); | ||||
|     if proximity_precision == ProximityPrecision::WordScale { | ||||
|         spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||
|             docid_word_positions_chunks.clone(), | ||||
|             indexer, | ||||
|             lmdb_writer_sx.clone(), | ||||
|             extract_word_pair_proximity_docids, | ||||
|             merge_deladd_cbo_roaring_bitmaps, | ||||
|             TypedChunk::WordPairProximityDocids, | ||||
|             "word-pair-proximity-docids", | ||||
|         ); | ||||
|     } | ||||
|  | ||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|         indexer, | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_fid_word_count_docids, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         TypedChunk::FieldIdWordcountDocids, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         TypedChunk::FieldIdWordCountDocids, | ||||
|         "field-id-wordcount-docids", | ||||
|     ); | ||||
|  | ||||
|     spawn_extraction_task::< | ||||
|         _, | ||||
|         _, | ||||
|         Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)>, | ||||
|         Vec<( | ||||
|             grenad::Reader<BufReader<File>>, | ||||
|             grenad::Reader<BufReader<File>>, | ||||
|             grenad::Reader<BufReader<File>>, | ||||
|         )>, | ||||
|     >( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|         indexer, | ||||
|         lmdb_writer_sx.clone(), | ||||
|         move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), | ||||
|         merge_roaring_bitmaps, | ||||
|         |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { | ||||
|             word_docids_reader, | ||||
|             exact_word_docids_reader, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| { | ||||
|             TypedChunk::WordDocids { | ||||
|                 word_docids_reader, | ||||
|                 exact_word_docids_reader, | ||||
|                 word_fid_docids_reader, | ||||
|             } | ||||
|         }, | ||||
|         "word-docids", | ||||
|     ); | ||||
| @@ -195,36 +203,27 @@ pub(crate) fn data_from_obkv_documents( | ||||
|         indexer, | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_word_position_docids, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         TypedChunk::WordPositionDocids, | ||||
|         "word-position-docids", | ||||
|     ); | ||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||
|         docid_word_positions_chunks, | ||||
|         indexer, | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_word_fid_docids, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         TypedChunk::WordFidDocids, | ||||
|         "word-fid-docids", | ||||
|     ); | ||||
|  | ||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||
|         docid_fid_facet_strings_chunks, | ||||
|         fid_docid_facet_strings_chunks, | ||||
|         indexer, | ||||
|         lmdb_writer_sx.clone(), | ||||
|         extract_facet_string_docids, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         TypedChunk::FieldIdFacetStringDocids, | ||||
|         "field-id-facet-string-docids", | ||||
|     ); | ||||
|  | ||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>( | ||||
|         docid_fid_facet_numbers_chunks, | ||||
|         fid_docid_facet_numbers_chunks, | ||||
|         indexer, | ||||
|         lmdb_writer_sx, | ||||
|         extract_facet_number_docids, | ||||
|         merge_cbo_roaring_bitmaps, | ||||
|         merge_deladd_cbo_roaring_bitmaps, | ||||
|         TypedChunk::FieldIdFacetNumberDocids, | ||||
|         "field-id-facet-number-docids", | ||||
|     ); | ||||
| @@ -278,7 +277,6 @@ fn send_original_documents_data( | ||||
|     indexer: GrenadParameters, | ||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||
|     vectors_field_id: Option<FieldId>, | ||||
|     primary_key_id: FieldId, | ||||
| ) -> Result<()> { | ||||
|     let original_documents_chunk = | ||||
|         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; | ||||
| @@ -287,12 +285,7 @@ fn send_original_documents_data( | ||||
|         let documents_chunk_cloned = original_documents_chunk.clone(); | ||||
|         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); | ||||
|         rayon::spawn(move || { | ||||
|             let result = extract_vector_points( | ||||
|                 documents_chunk_cloned, | ||||
|                 indexer, | ||||
|                 primary_key_id, | ||||
|                 vectors_field_id, | ||||
|             ); | ||||
|             let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id); | ||||
|             let _ = match result { | ||||
|                 Ok(vector_points) => { | ||||
|                     lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points))) | ||||
| @@ -356,10 +349,10 @@ fn send_and_extract_flattened_documents_data( | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) = | ||||
|     let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) = | ||||
|         rayon::join( | ||||
|             || { | ||||
|                 let (documents_ids, docid_word_positions_chunk, script_language_pair) = | ||||
|                 let (docid_word_positions_chunk, script_language_pair) = | ||||
|                     extract_docid_word_positions( | ||||
|                         flattened_documents_chunk.clone(), | ||||
|                         indexer, | ||||
| @@ -370,9 +363,6 @@ fn send_and_extract_flattened_documents_data( | ||||
|                         max_positions_per_attributes, | ||||
|                     )?; | ||||
|  | ||||
|                 // send documents_ids to DB writer | ||||
|                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids))); | ||||
|  | ||||
|                 // send docid_word_positions_chunk to DB writer | ||||
|                 let docid_word_positions_chunk = | ||||
|                     unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? }; | ||||
| @@ -384,8 +374,8 @@ fn send_and_extract_flattened_documents_data( | ||||
|             }, | ||||
|             || { | ||||
|                 let ExtractedFacetValues { | ||||
|                     docid_fid_facet_numbers_chunk, | ||||
|                     docid_fid_facet_strings_chunk, | ||||
|                     fid_docid_facet_numbers_chunk, | ||||
|                     fid_docid_facet_strings_chunk, | ||||
|                     fid_facet_is_null_docids_chunk, | ||||
|                     fid_facet_is_empty_docids_chunk, | ||||
|                     fid_facet_exists_docids_chunk, | ||||
| @@ -396,26 +386,26 @@ fn send_and_extract_flattened_documents_data( | ||||
|                     geo_fields_ids, | ||||
|                 )?; | ||||
|  | ||||
|                 // send docid_fid_facet_numbers_chunk to DB writer | ||||
|                 let docid_fid_facet_numbers_chunk = | ||||
|                     unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? }; | ||||
|                 // send fid_docid_facet_numbers_chunk to DB writer | ||||
|                 let fid_docid_facet_numbers_chunk = | ||||
|                     unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? }; | ||||
|  | ||||
|                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers( | ||||
|                     docid_fid_facet_numbers_chunk.clone(), | ||||
|                     fid_docid_facet_numbers_chunk.clone(), | ||||
|                 ))); | ||||
|  | ||||
|                 // send docid_fid_facet_strings_chunk to DB writer | ||||
|                 let docid_fid_facet_strings_chunk = | ||||
|                     unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? }; | ||||
|                 // send fid_docid_facet_strings_chunk to DB writer | ||||
|                 let fid_docid_facet_strings_chunk = | ||||
|                     unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? }; | ||||
|  | ||||
|                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings( | ||||
|                     docid_fid_facet_strings_chunk.clone(), | ||||
|                     fid_docid_facet_strings_chunk.clone(), | ||||
|                 ))); | ||||
|  | ||||
|                 Ok(( | ||||
|                     docid_fid_facet_numbers_chunk, | ||||
|                     fid_docid_facet_numbers_chunk, | ||||
|                     ( | ||||
|                         docid_fid_facet_strings_chunk, | ||||
|                         fid_docid_facet_strings_chunk, | ||||
|                         ( | ||||
|                             fid_facet_is_null_docids_chunk, | ||||
|                             (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), | ||||
| @@ -425,5 +415,5 @@ fn send_and_extract_flattened_documents_data( | ||||
|             }, | ||||
|         ); | ||||
|  | ||||
|     Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?)) | ||||
|     Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?)) | ||||
| } | ||||
|   | ||||
| @@ -1,14 +1,12 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufReader, BufWriter, Seek}; | ||||
| use std::time::Instant; | ||||
|  | ||||
| use grenad::{CompressionType, Sorter}; | ||||
| use heed::types::ByteSlice; | ||||
| use log::debug; | ||||
| use heed::types::Bytes; | ||||
|  | ||||
| use super::{ClonableMmap, MergeFn}; | ||||
| use crate::error::InternalError; | ||||
| use crate::update::index_documents::valid_lmdb_key; | ||||
| use crate::Result; | ||||
|  | ||||
| pub type CursorClonableMmap = io::Cursor<ClonableMmap>; | ||||
| @@ -47,6 +45,7 @@ pub fn create_sorter( | ||||
|         builder.allow_realloc(false); | ||||
|     } | ||||
|     builder.sort_algorithm(sort_algorithm); | ||||
|     builder.sort_in_parallel(true); | ||||
|     builder.build() | ||||
| } | ||||
|  | ||||
| @@ -54,6 +53,7 @@ pub fn sorter_into_reader( | ||||
|     sorter: grenad::Sorter<MergeFn>, | ||||
|     indexer: GrenadParameters, | ||||
| ) -> Result<grenad::Reader<BufReader<File>>> { | ||||
|     puffin::profile_function!(); | ||||
|     let mut writer = create_writer( | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
| @@ -115,6 +115,32 @@ impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<Bu | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl MergeableReader | ||||
|     for Vec<( | ||||
|         grenad::Reader<BufReader<File>>, | ||||
|         grenad::Reader<BufReader<File>>, | ||||
|         grenad::Reader<BufReader<File>>, | ||||
|     )> | ||||
| { | ||||
|     type Output = ( | ||||
|         grenad::Reader<BufReader<File>>, | ||||
|         grenad::Reader<BufReader<File>>, | ||||
|         grenad::Reader<BufReader<File>>, | ||||
|     ); | ||||
|  | ||||
|     fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> { | ||||
|         let mut m1 = MergerBuilder::new(merge_fn); | ||||
|         let mut m2 = MergerBuilder::new(merge_fn); | ||||
|         let mut m3 = MergerBuilder::new(merge_fn); | ||||
|         for (r1, r2, r3) in self.into_iter() { | ||||
|             m1.push(r1)?; | ||||
|             m2.push(r2)?; | ||||
|             m3.push(r3)?; | ||||
|         } | ||||
|         Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>); | ||||
|  | ||||
| impl<R: io::Read + io::Seek> MergerBuilder<R> { | ||||
| @@ -195,11 +221,13 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>( | ||||
|         ); | ||||
|  | ||||
|         while let Some((document_id, obkv)) = cursor.move_on_next()? { | ||||
|             obkv_documents.insert(document_id, obkv)?; | ||||
|             current_chunk_size += document_id.len() as u64 + obkv.len() as u64; | ||||
|             if !obkv.is_empty() { | ||||
|                 obkv_documents.insert(document_id, obkv)?; | ||||
|                 current_chunk_size += document_id.len() as u64 + obkv.len() as u64; | ||||
|  | ||||
|             if current_chunk_size >= documents_chunk_size as u64 { | ||||
|                 return writer_into_reader(obkv_documents).map(Some); | ||||
|                 if current_chunk_size >= documents_chunk_size as u64 { | ||||
|                     return writer_into_reader(obkv_documents).map(Some); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
| @@ -210,45 +238,46 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>( | ||||
|     Ok(std::iter::from_fn(move || transposer().transpose())) | ||||
| } | ||||
|  | ||||
| pub fn sorter_into_lmdb_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
| /// Write provided sorter in database using serialize_value function. | ||||
| /// merge_values function is used if an entry already exists in the database. | ||||
| pub fn write_sorter_into_database<K, V, FS, FM>( | ||||
|     sorter: Sorter<MergeFn>, | ||||
|     merge: MergeFn, | ||||
| ) -> Result<()> { | ||||
|     database: &heed::Database<K, V>, | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     index_is_empty: bool, | ||||
|     serialize_value: FS, | ||||
|     merge_values: FM, | ||||
| ) -> Result<()> | ||||
| where | ||||
|     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, | ||||
|     FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>, | ||||
| { | ||||
|     puffin::profile_function!(); | ||||
|     debug!("Writing MTBL sorter..."); | ||||
|     let before = Instant::now(); | ||||
|  | ||||
|     let mut buffer = Vec::new(); | ||||
|     let database = database.remap_types::<Bytes, Bytes>(); | ||||
|  | ||||
|     let mut merger_iter = sorter.into_stream_merger_iter()?; | ||||
|     if database.is_empty(wtxn)? { | ||||
|         let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; | ||||
|         while let Some((k, v)) = merger_iter.next()? { | ||||
|             // safety: we don't keep references from inside the LMDB database. | ||||
|             unsafe { out_iter.append(k, v)? }; | ||||
|         } | ||||
|     } else { | ||||
|         while let Some((k, v)) = merger_iter.next()? { | ||||
|             let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; | ||||
|             match iter.next().transpose()? { | ||||
|                 Some((key, old_val)) if key == k => { | ||||
|                     let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; | ||||
|                     let val = merge(k, &vals).map_err(|_| { | ||||
|                         // TODO just wrap this error? | ||||
|                         InternalError::IndexingMergingKeys { process: "get-put-merge" } | ||||
|                     })?; | ||||
|                     // safety: we don't keep references from inside the LMDB database. | ||||
|                     unsafe { iter.put_current(k, &val)? }; | ||||
|     while let Some((key, value)) = merger_iter.next()? { | ||||
|         if valid_lmdb_key(key) { | ||||
|             buffer.clear(); | ||||
|             let value = if index_is_empty { | ||||
|                 Some(serialize_value(value, &mut buffer)?) | ||||
|             } else { | ||||
|                 match database.get(wtxn, key)? { | ||||
|                     Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, | ||||
|                     None => Some(serialize_value(value, &mut buffer)?), | ||||
|                 } | ||||
|                 _ => { | ||||
|                     drop(iter); | ||||
|                     database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; | ||||
|             }; | ||||
|             match value { | ||||
|                 Some(value) => database.put(wtxn, key, value)?, | ||||
|                 None => { | ||||
|                     database.delete(wtxn, key)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -6,22 +6,12 @@ use std::result::Result as StdResult; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; | ||||
| use crate::update::index_documents::transform::Operation; | ||||
| use crate::Result; | ||||
|  | ||||
| pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>; | ||||
|  | ||||
/// Concatenates the byte slices of `values`, in order, into a single value.
///
/// When `values` holds exactly one element, that element is returned as-is
/// (cloned `Cow`, so a borrowed value stays borrowed) and nothing is copied
/// into a new buffer. In every other case an owned buffer is built.
/// The `_key` argument is ignored.
pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    // A lone value needs no concatenation.
    if let [single] = values {
        return Ok(single.clone());
    }

    // Size the output once, from the summed lengths of all the inputs.
    let total_len: usize = values.iter().map(|bytes| bytes.len()).sum();
    let mut concatenated = Vec::with_capacity(total_len);
    for bytes in values {
        concatenated.extend_from_slice(bytes);
    }

    Ok(Cow::Owned(concatenated))
}
|  | ||||
| pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> { | ||||
|     buffer.clear(); | ||||
|     buffer.reserve(bitmap.serialized_size()); | ||||
| @@ -75,57 +65,123 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow< | ||||
|     Ok(obkvs.last().unwrap().clone()) | ||||
| } | ||||
|  | ||||
| pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) { | ||||
| pub fn merge_two_del_add_obkvs( | ||||
|     base: obkv::KvReaderU16, | ||||
|     update: obkv::KvReaderU16, | ||||
|     merge_additions: bool, | ||||
|     buffer: &mut Vec<u8>, | ||||
| ) { | ||||
|     use itertools::merge_join_by; | ||||
|     use itertools::EitherOrBoth::{Both, Left, Right}; | ||||
|  | ||||
|     buffer.clear(); | ||||
|  | ||||
|     let mut writer = obkv::KvWriter::new(buffer); | ||||
|     let mut value_buffer = Vec::new(); | ||||
|     for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) { | ||||
|         match eob { | ||||
|             Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(), | ||||
|             Left((k, v)) => { | ||||
|                 if merge_additions { | ||||
|                     writer.insert(k, v).unwrap() | ||||
|                 } else { | ||||
|                     // If merge_additions is false, recreate an obkv keeping the deletions only. | ||||
|                     value_buffer.clear(); | ||||
|                     let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||
|                     let base_reader = KvReaderDelAdd::new(v); | ||||
|  | ||||
|                     if let Some(deletion) = base_reader.get(DelAdd::Deletion) { | ||||
|                         value_writer.insert(DelAdd::Deletion, deletion).unwrap(); | ||||
|                         value_writer.finish().unwrap(); | ||||
|                         writer.insert(k, &value_buffer).unwrap() | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             Right((k, v)) => writer.insert(k, v).unwrap(), | ||||
|             Both((k, base), (_, update)) => { | ||||
|                 // merge deletions and additions. | ||||
|                 value_buffer.clear(); | ||||
|                 let mut value_writer = KvWriterDelAdd::new(&mut value_buffer); | ||||
|                 let base_reader = KvReaderDelAdd::new(base); | ||||
|                 let update_reader = KvReaderDelAdd::new(update); | ||||
|  | ||||
|                 // keep newest deletion. | ||||
|                 if let Some(deletion) = update_reader | ||||
|                     .get(DelAdd::Deletion) | ||||
|                     .or_else(|| base_reader.get(DelAdd::Deletion)) | ||||
|                 { | ||||
|                     value_writer.insert(DelAdd::Deletion, deletion).unwrap(); | ||||
|                 } | ||||
|  | ||||
|                 // keep base addition only if merge_additions is true. | ||||
|                 let base_addition = | ||||
|                     merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten(); | ||||
|                 // keep newest addition. | ||||
|                 // TODO use or_else | ||||
|                 if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) { | ||||
|                     value_writer.insert(DelAdd::Addition, addition).unwrap(); | ||||
|                 } | ||||
|  | ||||
|                 value_writer.finish().unwrap(); | ||||
|                 writer.insert(k, &value_buffer).unwrap() | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     writer.finish().unwrap(); | ||||
| } | ||||
|  | ||||
| /// Merge all the obks in the order we see them. | ||||
| pub fn merge_obkvs_and_operations<'a>( | ||||
| /// Merge all the obkvs from the newest to the oldest. | ||||
| fn inner_merge_del_add_obkvs<'a>( | ||||
|     obkvs: &[Cow<'a, [u8]>], | ||||
|     merge_additions: bool, | ||||
| ) -> Result<Cow<'a, [u8]>> { | ||||
|     // pop the newest operation from the list. | ||||
|     let (newest, obkvs) = obkvs.split_last().unwrap(); | ||||
|     // keep the operation type for the returned value. | ||||
|     let newest_operation_type = newest[0]; | ||||
|  | ||||
|     // treat the newest obkv as the starting point of the merge. | ||||
|     let mut acc_operation_type = newest_operation_type; | ||||
|     let mut acc = newest[1..].to_vec(); | ||||
|     let mut buffer = Vec::new(); | ||||
|     // reverse iter from the most recent to the oldest. | ||||
|     for current in obkvs.iter().rev() { | ||||
|         // if in the previous iteration there was a complete deletion, | ||||
|         // stop the merge process. | ||||
|         if acc_operation_type == Operation::Deletion as u8 { | ||||
|             break; | ||||
|         } | ||||
|  | ||||
|         let newest = obkv::KvReader::new(&acc); | ||||
|         let oldest = obkv::KvReader::new(¤t[1..]); | ||||
|         merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer); | ||||
|  | ||||
|         // we want the result of the merge into our accumulator. | ||||
|         std::mem::swap(&mut acc, &mut buffer); | ||||
|         acc_operation_type = current[0]; | ||||
|     } | ||||
|  | ||||
|     acc.insert(0, newest_operation_type); | ||||
|     Ok(Cow::from(acc)) | ||||
| } | ||||
|  | ||||
| /// Merge all the obkvs from the newest to the oldest. | ||||
| pub fn obkvs_merge_additions_and_deletions<'a>( | ||||
|     _key: &[u8], | ||||
|     obkvs: &[Cow<'a, [u8]>], | ||||
| ) -> Result<Cow<'a, [u8]>> { | ||||
|     // [add, add, delete, add, add] | ||||
|     // we can ignore everything that happened before the last delete. | ||||
|     let starting_position = | ||||
|         obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0); | ||||
|  | ||||
|     // [add, add, delete] | ||||
|     // if the last operation was a deletion then we simply return the deletion | ||||
|     if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8 | ||||
|     { | ||||
|         return Ok(obkvs[obkvs.len() - 1].clone()); | ||||
|     } | ||||
|     let mut buffer = Vec::new(); | ||||
|  | ||||
|     // (add, add, delete) [add, add] | ||||
|     // in the other case, no deletion will be encountered during the merge | ||||
|     let mut ret = | ||||
|         obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| { | ||||
|             let first = obkv::KvReader::new(&acc); | ||||
|             let second = obkv::KvReader::new(&current[1..]); | ||||
|             merge_two_obkvs(first, second, &mut buffer); | ||||
|  | ||||
|             // we want the result of the merge into our accumulator | ||||
|             std::mem::swap(&mut acc, &mut buffer); | ||||
|             acc | ||||
|         }); | ||||
|  | ||||
|     ret.insert(0, Operation::Addition as u8); | ||||
|     Ok(Cow::from(ret)) | ||||
|     inner_merge_del_add_obkvs(obkvs, true) | ||||
| } | ||||
|  | ||||
| /// Folds the deletions of all the obkvs together, going from the newest to the | ||||
| /// oldest, while retaining only the most recent additions. | ||||
| /// | ||||
| /// This is a thin wrapper around `inner_merge_del_add_obkvs` called with the | ||||
| /// merging of additions turned off. The `_key` argument is ignored. | ||||
| pub fn obkvs_keep_last_addition_merge_deletions<'a>( | ||||
|     _key: &[u8], | ||||
|     obkvs: &[Cow<'a, [u8]>], | ||||
| ) -> Result<Cow<'a, [u8]>> { | ||||
|     let merge_additions = false; | ||||
|     inner_merge_del_add_obkvs(obkvs, merge_additions) | ||||
| } | ||||
|  | ||||
| /// Do a union of all the CboRoaringBitmaps in the values. | ||||
| pub fn merge_cbo_roaring_bitmaps<'a>( | ||||
|     _key: &[u8], | ||||
|     values: &[Cow<'a, [u8]>], | ||||
| @@ -138,3 +194,52 @@ pub fn merge_cbo_roaring_bitmaps<'a>( | ||||
|         Ok(Cow::from(vec)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Unions the CboRoaringBitmaps stored on each side of the given DelAdd obkvs, | ||||
| /// deletions with deletions and additions with additions, and outputs a new | ||||
| /// DelAdd obkv holding both unions. | ||||
| /// | ||||
| /// When `values` contains exactly one entry, that entry is returned untouched. | ||||
| /// The `_key` argument is ignored. | ||||
| pub fn merge_deladd_cbo_roaring_bitmaps<'a>( | ||||
|     _key: &[u8], | ||||
|     values: &[Cow<'a, [u8]>], | ||||
| ) -> Result<Cow<'a, [u8]>> { | ||||
|     // Nothing to merge when there is a single entry. | ||||
|     if let [single] = values { | ||||
|         return Ok(single.clone()); | ||||
|     } | ||||
|  | ||||
|     // Gather the raw bitmaps of every entry, one list per side. | ||||
|     let mut deletions = Vec::new(); | ||||
|     let mut additions = Vec::new(); | ||||
|     for value in values { | ||||
|         let reader = KvReaderDelAdd::new(value); | ||||
|         deletions.extend(reader.get(DelAdd::Deletion)); | ||||
|         additions.extend(reader.get(DelAdd::Addition)); | ||||
|     } | ||||
|  | ||||
|     let mut writer = KvWriterDelAdd::memory(); | ||||
|     let mut merged = Vec::new(); | ||||
|  | ||||
|     // The deletion side is written first, then the addition side, | ||||
|     // reusing the same scratch buffer for both unions. | ||||
|     CboRoaringBitmapCodec::merge_into(deletions, &mut merged)?; | ||||
|     writer.insert(DelAdd::Deletion, &merged)?; | ||||
|     merged.clear(); | ||||
|     CboRoaringBitmapCodec::merge_into(additions, &mut merged)?; | ||||
|     writer.insert(DelAdd::Addition, &merged)?; | ||||
|  | ||||
|     writer.into_inner().map(Cow::from).map_err(Into::into) | ||||
| } | ||||
|  | ||||
| /// Merges a DelAdd obkv of bitmaps into an already existing bitmap. | ||||
| /// | ||||
| /// `deladd_obkv` is the DelAdd obkv of CboRoaringBitmaps and `previous` is the | ||||
| /// CboRoaringBitmap to merge into. `buffer` is handed over to | ||||
| /// `CboRoaringBitmapCodec::merge_deladd_into`, whose result is returned as is. | ||||
| pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>( | ||||
|     deladd_obkv: &[u8], | ||||
|     previous: &[u8], | ||||
|     buffer: &'a mut Vec<u8>, | ||||
| ) -> Result<Option<&'a [u8]>> { | ||||
|     let deladd = KvReaderDelAdd::new(deladd_obkv); | ||||
|     let merged = CboRoaringBitmapCodec::merge_deladd_into(deladd, previous, buffer)?; | ||||
|     Ok(merged) | ||||
| } | ||||
|   | ||||
| @@ -9,13 +9,14 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| pub use grenad_helpers::{ | ||||
|     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, | ||||
|     merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader, | ||||
|     merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader, | ||||
|     GrenadParameters, MergeableReader, | ||||
| }; | ||||
| pub use merge_functions::{ | ||||
|     concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string, | ||||
|     merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, | ||||
|     serialize_roaring_bitmap, MergeFn, | ||||
|     keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps, | ||||
|     merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||
|     merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, | ||||
|     obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn, | ||||
| }; | ||||
|  | ||||
| use crate::MAX_WORD_LENGTH; | ||||
| @@ -44,10 +45,6 @@ where | ||||
|     Some((head, tail)) | ||||
| } | ||||
|  | ||||
| pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ { | ||||
|     bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes) | ||||
| } | ||||
|  | ||||
| /// Converts an fst Stream into an HashSet of Strings. | ||||
| pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>> | ||||
| where | ||||
|   | ||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -0,0 +1,4 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| [] | ||||
| @@ -0,0 +1,4 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
|  | ||||
| @@ -0,0 +1,4 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
|  | ||||
| @@ -0,0 +1,4 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
|  | ||||
| @@ -0,0 +1,4 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| [2, ] | ||||
| @@ -0,0 +1,5 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| benoit           [2, ] | ||||
|  | ||||
| @@ -0,0 +1,4 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
|  | ||||
| @@ -1,5 +1,5 @@ | ||||
| --- | ||||
| source: milli/src/update/delete_documents.rs | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| 1   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] | ||||
| 2   [21, ] | ||||
| @@ -0,0 +1,5 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| 2   0  2.2    1  [21, ] | ||||
|  | ||||
| @@ -1,5 +1,5 @@ | ||||
| --- | ||||
| source: milli/src/update/delete_documents.rs | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| 1   0  abstract     1  [2, 6, 10, 13, 14, 15, 16, 17, ] | ||||
| 1   0  aquarium     1  [5, ] | ||||
| @@ -1,5 +1,5 @@ | ||||
| --- | ||||
| source: milli/src/update/delete_documents.rs | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| 1                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ] | ||||
| 2                [21, ] | ||||
| @@ -1,5 +1,5 @@ | ||||
| --- | ||||
| source: milli/src/update/delete_documents.rs | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| 1  1                36               [3, ] | ||||
| 1  1                37               [4, ] | ||||
| @@ -1,5 +1,5 @@ | ||||
| --- | ||||
| source: milli/src/update/delete_documents.rs | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| 3   0  48.9021 1  [19, ] | ||||
| 3   0  49.9314 1  [17, ] | ||||
| @@ -0,0 +1,4 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
|  | ||||
| @@ -1,60 +1,56 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| 0                [1, 7, ] | ||||
| 0                [1, ] | ||||
| 1                [2, ] | ||||
| 10               [1, 7, ] | ||||
| 12               [0, 8, ] | ||||
| 10               [1, ] | ||||
| 12               [0, ] | ||||
| 1344             [3, ] | ||||
| 1813             [8, ] | ||||
| 2                [0, 8, ] | ||||
| 1813             [0, ] | ||||
| 2                [0, ] | ||||
| 23               [5, ] | ||||
| 25               [2, ] | ||||
| 3                [0, 8, ] | ||||
| 3                [0, ] | ||||
| 35               [5, ] | ||||
| 4                [4, 6, ] | ||||
| 42               [0, 5, 8, ] | ||||
| 456              [1, 7, ] | ||||
| 5                [0, 8, ] | ||||
| 4                [4, ] | ||||
| 42               [0, 5, ] | ||||
| 456              [1, ] | ||||
| 5                [0, ] | ||||
| 99               [2, ] | ||||
| adams            [5, ] | ||||
| adventure        [1, 7, ] | ||||
| adventure        [1, ] | ||||
| alice            [2, ] | ||||
| and              [0, 4, 6, 8, ] | ||||
| antoine          [1, 7, ] | ||||
| austen           [8, ] | ||||
| austin           [0, ] | ||||
| blood            [4, 6, ] | ||||
| and              [0, 4, ] | ||||
| antoine          [1, ] | ||||
| austen           [0, ] | ||||
| blood            [4, ] | ||||
| carroll          [2, ] | ||||
| de               [1, 7, ] | ||||
| de               [1, ] | ||||
| douglas          [5, ] | ||||
| exupery          [1, 7, ] | ||||
| fantasy          [2, 3, 4, 6, ] | ||||
| exupery          [1, ] | ||||
| fantasy          [2, 3, 4, ] | ||||
| galaxy           [5, ] | ||||
| guide            [5, ] | ||||
| half             [4, 6, ] | ||||
| harry            [4, 6, ] | ||||
| half             [4, ] | ||||
| harry            [4, ] | ||||
| hitchhiker       [5, ] | ||||
| hobbit           [3, ] | ||||
| in               [2, ] | ||||
| j                [3, 4, 6, 8, ] | ||||
| jane             [0, ] | ||||
| k                [4, 6, ] | ||||
| le               [1, ] | ||||
| j                [0, 3, 4, ] | ||||
| k                [4, ] | ||||
| lewis            [2, ] | ||||
| little           [7, ] | ||||
| petit            [1, ] | ||||
| potter           [4, 6, ] | ||||
| prejudice        [0, 8, ] | ||||
| pride            [0, 8, ] | ||||
| prince           [1, 4, 7, ] | ||||
| princess         [6, ] | ||||
| little           [1, ] | ||||
| potter           [4, ] | ||||
| prejudice        [0, ] | ||||
| pride            [0, ] | ||||
| prince           [1, ] | ||||
| princess         [4, ] | ||||
| r                [3, ] | ||||
| romance          [0, 8, ] | ||||
| rowling          [4, 6, ] | ||||
| romance          [0, ] | ||||
| rowling          [4, ] | ||||
| s                [5, ] | ||||
| saint            [1, 7, ] | ||||
| the              [3, 4, 5, 6, 7, ] | ||||
| saint            [1, ] | ||||
| the              [1, 3, 4, 5, ] | ||||
| to               [5, ] | ||||
| tolkien          [3, ] | ||||
| wonderland       [2, ] | ||||
|   | ||||
| @@ -1,5 +1,6 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::hash_map::Entry; | ||||
| use std::collections::btree_map::Entry as BEntry; | ||||
| use std::collections::hash_map::Entry as HEntry; | ||||
| use std::collections::{HashMap, HashSet}; | ||||
| use std::fs::File; | ||||
| use std::io::{Read, Seek}; | ||||
| @@ -7,30 +8,28 @@ use std::io::{Read, Seek}; | ||||
| use fxhash::FxHashMap; | ||||
| use heed::RoTxn; | ||||
| use itertools::Itertools; | ||||
| use obkv::{KvReader, KvWriter}; | ||||
| use obkv::{KvReader, KvReaderU16, KvWriter}; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::Value; | ||||
| use smartstring::SmartString; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn, | ||||
|     create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions, | ||||
|     obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn, | ||||
| }; | ||||
| use super::{IndexDocumentsMethod, IndexerConfig}; | ||||
| use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; | ||||
| use crate::error::{Error, InternalError, UserError}; | ||||
| use crate::index::{db_name, main_key}; | ||||
| use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; | ||||
| use crate::update::index_documents::GrenadParameters; | ||||
| use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; | ||||
| use crate::{ | ||||
|     FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, | ||||
| }; | ||||
| use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; | ||||
|  | ||||
| pub struct TransformOutput { | ||||
|     pub primary_key: String, | ||||
|     pub fields_ids_map: FieldsIdsMap, | ||||
|     pub field_distribution: FieldDistribution, | ||||
|     pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>, | ||||
|     pub new_documents_ids: RoaringBitmap, | ||||
|     pub replaced_documents_ids: RoaringBitmap, | ||||
|     pub documents_count: usize, | ||||
|     pub original_documents: File, | ||||
|     pub flattened_documents: File, | ||||
| @@ -106,8 +105,8 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         // We must choose the appropriate merge function for when two or more documents | ||||
|         // with the same user id must be merged or fully replaced in the same batch. | ||||
|         let merge_function = match index_documents_method { | ||||
|             IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv, | ||||
|             IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations, | ||||
|             IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions, | ||||
|             IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions, | ||||
|         }; | ||||
|  | ||||
|         // We initialize the sorter with the user indexing settings. | ||||
| @@ -130,17 +129,13 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             indexer_settings.max_memory.map(|mem| mem / 2), | ||||
|         ); | ||||
|         let documents_ids = index.documents_ids(wtxn)?; | ||||
|         let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?; | ||||
|  | ||||
|         Ok(Transform { | ||||
|             index, | ||||
|             fields_ids_map: index.fields_ids_map(wtxn)?, | ||||
|             indexer_settings, | ||||
|             autogenerate_docids, | ||||
|             available_documents_ids: AvailableDocumentsIds::from_documents_ids( | ||||
|                 &documents_ids, | ||||
|                 &soft_deleted_documents_ids, | ||||
|             ), | ||||
|             available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), | ||||
|             original_sorter, | ||||
|             flattened_sorter, | ||||
|             index_documents_method, | ||||
| @@ -151,6 +146,7 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time] | ||||
|     pub fn read_documents<R, FP, FA>( | ||||
|         &mut self, | ||||
|         reader: EnrichedDocumentsBatchReader<R>, | ||||
| @@ -163,8 +159,10 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         FP: Fn(UpdateIndexingStep) + Sync, | ||||
|         FA: Fn() -> bool + Sync, | ||||
|     { | ||||
|         puffin::profile_function!(); | ||||
|  | ||||
|         let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); | ||||
|         let external_documents_ids = self.index.external_documents_ids(wtxn)?; | ||||
|         let external_documents_ids = self.index.external_documents_ids(); | ||||
|         let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; | ||||
|  | ||||
|         let primary_key = cursor.primary_key().to_string(); | ||||
| @@ -172,7 +170,8 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; | ||||
|  | ||||
|         let mut obkv_buffer = Vec::new(); | ||||
|         let mut document_sorter_buffer = Vec::new(); | ||||
|         let mut document_sorter_value_buffer = Vec::new(); | ||||
|         let mut document_sorter_key_buffer = Vec::new(); | ||||
|         let mut documents_count = 0; | ||||
|         let mut docid_buffer: Vec<u8> = Vec::new(); | ||||
|         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); | ||||
| @@ -213,29 +212,30 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2)); | ||||
|  | ||||
|             // Build the new obkv document. | ||||
|             let mut writer = obkv::KvWriter::new(&mut obkv_buffer); | ||||
|             let mut writer = KvWriter::new(&mut obkv_buffer); | ||||
|             for (k, v) in field_buffer_cache.iter() { | ||||
|                 writer.insert(*k, v)?; | ||||
|             } | ||||
|  | ||||
|             let mut original_docid = None; | ||||
|  | ||||
|             let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { | ||||
|                 Entry::Occupied(entry) => *entry.get() as u32, | ||||
|                 Entry::Vacant(entry) => { | ||||
|                     // If the document was already in the db we mark it as a replaced document. | ||||
|                     // It'll be deleted later. | ||||
|                     if let Some(docid) = external_documents_ids.get(entry.key()) { | ||||
|                         // If it was already in the list of replaced documents it means it was deleted | ||||
|                         // by the remove_document method. We should start as if it never existed. | ||||
|                         if self.replaced_documents_ids.insert(docid) { | ||||
|                             original_docid = Some(docid); | ||||
|                 HEntry::Occupied(entry) => *entry.get() as u32, | ||||
|                 HEntry::Vacant(entry) => { | ||||
|                     let docid = match external_documents_ids.get(wtxn, entry.key())? { | ||||
|                         Some(docid) => { | ||||
|                             // If it was already in the list of replaced documents it means it was deleted | ||||
|                             // by the remove_document method. We should start as if it never existed. | ||||
|                             if self.replaced_documents_ids.insert(docid) { | ||||
|                                 original_docid = Some(docid); | ||||
|                             } | ||||
|  | ||||
|                             docid | ||||
|                         } | ||||
|                     } | ||||
|                     let docid = self | ||||
|                         .available_documents_ids | ||||
|                         .next() | ||||
|                         .ok_or(UserError::DocumentLimitReached)?; | ||||
|                         None => self | ||||
|                             .available_documents_ids | ||||
|                             .next() | ||||
|                             .ok_or(UserError::DocumentLimitReached)?, | ||||
|                     }; | ||||
|                     entry.insert(docid as u64); | ||||
|                     docid | ||||
|                 } | ||||
| @@ -243,11 +243,11 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|  | ||||
|             let mut skip_insertion = false; | ||||
|             if let Some(original_docid) = original_docid { | ||||
|                 let original_key = BEU32::new(original_docid); | ||||
|                 let original_key = original_docid; | ||||
|                 let base_obkv = self | ||||
|                     .index | ||||
|                     .documents | ||||
|                     .remap_data_type::<heed::types::ByteSlice>() | ||||
|                     .remap_data_type::<heed::types::Bytes>() | ||||
|                     .get(wtxn, &original_key)? | ||||
|                     .ok_or(InternalError::DatabaseMissingEntry { | ||||
|                         db_name: db_name::DOCUMENTS, | ||||
| @@ -263,47 +263,68 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|                     skip_insertion = true; | ||||
|                 } else { | ||||
|                     // we associate the base document with the new key, everything will get merged later. | ||||
|                     document_sorter_buffer.clear(); | ||||
|                     document_sorter_buffer.push(Operation::Addition as u8); | ||||
|                     document_sorter_buffer.extend_from_slice(base_obkv); | ||||
|                     self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; | ||||
|                     match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? { | ||||
|                         Some(flattened_obkv) => { | ||||
|                             // we recreate our buffer with the flattened documents | ||||
|                             document_sorter_buffer.clear(); | ||||
|                             document_sorter_buffer.push(Operation::Addition as u8); | ||||
|                             document_sorter_buffer.extend_from_slice(&flattened_obkv); | ||||
|                             self.flattened_sorter | ||||
|                                 .insert(docid.to_be_bytes(), &document_sorter_buffer)? | ||||
|                     let deladd_operation = match self.index_documents_method { | ||||
|                         IndexDocumentsMethod::UpdateDocuments => { | ||||
|                             DelAddOperation::DeletionAndAddition | ||||
|                         } | ||||
|                         None => self | ||||
|                             .flattened_sorter | ||||
|                             .insert(docid.to_be_bytes(), &document_sorter_buffer)?, | ||||
|                         IndexDocumentsMethod::ReplaceDocuments => DelAddOperation::Deletion, | ||||
|                     }; | ||||
|                     document_sorter_key_buffer.clear(); | ||||
|                     document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||
|                     document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); | ||||
|                     document_sorter_value_buffer.clear(); | ||||
|                     document_sorter_value_buffer.push(Operation::Addition as u8); | ||||
|                     into_del_add_obkv( | ||||
|                         KvReaderU16::new(base_obkv), | ||||
|                         deladd_operation, | ||||
|                         &mut document_sorter_value_buffer, | ||||
|                     )?; | ||||
|                     self.original_sorter | ||||
|                         .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||
|                     let base_obkv = KvReader::new(base_obkv); | ||||
|                     if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? { | ||||
|                         // we recreate our buffer with the flattened documents | ||||
|                         document_sorter_value_buffer.clear(); | ||||
|                         document_sorter_value_buffer.push(Operation::Addition as u8); | ||||
|                         into_del_add_obkv( | ||||
|                             KvReaderU16::new(&flattened_obkv), | ||||
|                             deladd_operation, | ||||
|                             &mut document_sorter_value_buffer, | ||||
|                         )?; | ||||
|                     } | ||||
|                     self.flattened_sorter | ||||
|                         .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             if !skip_insertion { | ||||
|                 self.new_documents_ids.insert(docid); | ||||
|  | ||||
|                 document_sorter_buffer.clear(); | ||||
|                 document_sorter_buffer.push(Operation::Addition as u8); | ||||
|                 document_sorter_buffer.extend_from_slice(&obkv_buffer); | ||||
|                 document_sorter_key_buffer.clear(); | ||||
|                 document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||
|                 document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); | ||||
|                 document_sorter_value_buffer.clear(); | ||||
|                 document_sorter_value_buffer.push(Operation::Addition as u8); | ||||
|                 into_del_add_obkv( | ||||
|                     KvReaderU16::new(&obkv_buffer), | ||||
|                     DelAddOperation::Addition, | ||||
|                     &mut document_sorter_value_buffer, | ||||
|                 )?; | ||||
|                 // We use the extracted/generated user id as the key for this document. | ||||
|                 self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?; | ||||
|                 self.original_sorter | ||||
|                     .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||
|  | ||||
|                 match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { | ||||
|                     Some(flattened_obkv) => { | ||||
|                         document_sorter_buffer.clear(); | ||||
|                         document_sorter_buffer.push(Operation::Addition as u8); | ||||
|                         document_sorter_buffer.extend_from_slice(&flattened_obkv); | ||||
|                         self.flattened_sorter | ||||
|                             .insert(docid.to_be_bytes(), &document_sorter_buffer)? | ||||
|                     } | ||||
|                     None => self | ||||
|                         .flattened_sorter | ||||
|                         .insert(docid.to_be_bytes(), &document_sorter_buffer)?, | ||||
|                 let flattened_obkv = KvReader::new(&obkv_buffer); | ||||
|                 if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { | ||||
|                     document_sorter_value_buffer.clear(); | ||||
|                     document_sorter_value_buffer.push(Operation::Addition as u8); | ||||
|                     into_del_add_obkv( | ||||
|                         KvReaderU16::new(&obkv), | ||||
|                         DelAddOperation::Addition, | ||||
|                         &mut document_sorter_value_buffer, | ||||
|                     )? | ||||
|                 } | ||||
|                 self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; | ||||
|             } | ||||
|             documents_count += 1; | ||||
|  | ||||
| @@ -338,6 +359,7 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|     /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db, | ||||
|     ///   it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids. | ||||
|     /// - If the document to remove was not present in either the db or the transform we do nothing. | ||||
|     #[logging_timer::time] | ||||
|     pub fn remove_documents<FA>( | ||||
|         &mut self, | ||||
|         mut to_remove: Vec<String>, | ||||
| @@ -347,54 +369,176 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|     where | ||||
|         FA: Fn() -> bool + Sync, | ||||
|     { | ||||
|         puffin::profile_function!(); | ||||
|  | ||||
|         // there may be duplicates in the documents to remove. | ||||
|         to_remove.sort_unstable(); | ||||
|         to_remove.dedup(); | ||||
|  | ||||
|         let external_documents_ids = self.index.external_documents_ids(wtxn)?; | ||||
|         let external_documents_ids = self.index.external_documents_ids(); | ||||
|  | ||||
|         let mut documents_deleted = 0; | ||||
|         let mut document_sorter_value_buffer = Vec::new(); | ||||
|         let mut document_sorter_key_buffer = Vec::new(); | ||||
|         for to_remove in to_remove { | ||||
|             if should_abort() { | ||||
|                 return Err(Error::InternalError(InternalError::AbortedIndexation)); | ||||
|             } | ||||
|  | ||||
|             match self.new_external_documents_ids_builder.entry((*to_remove).into()) { | ||||
|                 // if the document was added in a previous iteration of the transform we mark it as deleted in the sorters. | ||||
|                 Entry::Occupied(entry) => { | ||||
|                     let doc_id = *entry.get() as u32; | ||||
|                     self.original_sorter | ||||
|                         .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?; | ||||
|                     self.flattened_sorter | ||||
|                         .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?; | ||||
|             // Check if the document has been added in the current indexing process. | ||||
|             let deleted_from_current = | ||||
|                 match self.new_external_documents_ids_builder.entry((*to_remove).into()) { | ||||
|                     // if the document was added in a previous iteration of the transform we mark it as deleted in the sorters. | ||||
|                     HEntry::Occupied(entry) => { | ||||
|                         let docid = *entry.get() as u32; | ||||
|                         // Key is the concatenation of the internal docid and the external one. | ||||
|                         document_sorter_key_buffer.clear(); | ||||
|                         document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||
|                         document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes()); | ||||
|                         document_sorter_value_buffer.clear(); | ||||
|                         document_sorter_value_buffer.push(Operation::Deletion as u8); | ||||
|                         obkv::KvWriterU16::new(&mut document_sorter_value_buffer).finish().unwrap(); | ||||
|                         self.original_sorter | ||||
|                             .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||
|                         self.flattened_sorter | ||||
|                             .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; | ||||
|  | ||||
|                     // we must NOT update the list of replaced_documents_ids | ||||
|                     // Either: | ||||
|                     // 1. It's already in it and there is nothing to do | ||||
|                     // 2. It wasn't in it because the document was created by a previous batch and since | ||||
|                     //    we're removing it there is nothing to do. | ||||
|                     self.new_documents_ids.remove(doc_id); | ||||
|                     entry.remove_entry(); | ||||
|                 } | ||||
|                 Entry::Vacant(entry) => { | ||||
|                     // If the document was already in the db we mark it as a `to_delete` document. | ||||
|                     // It'll be deleted later. We don't need to push anything to the sorters. | ||||
|                     if let Some(docid) = external_documents_ids.get(entry.key()) { | ||||
|                         self.replaced_documents_ids.insert(docid); | ||||
|                     } else { | ||||
|                         // if the document is nowhere to be found, there is nothing to do and we must NOT | ||||
|                         // increment the count of documents_deleted | ||||
|                         continue; | ||||
|                         // we must NOT update the list of replaced_documents_ids | ||||
|                         // Either: | ||||
|                         // 1. It's already in it and there is nothing to do | ||||
|                         // 2. It wasn't in it because the document was created by a previous batch and since | ||||
|                         //    we're removing it there is nothing to do. | ||||
|                         self.new_documents_ids.remove(docid); | ||||
|                         entry.remove_entry(); | ||||
|                         true | ||||
|                     } | ||||
|                     HEntry::Vacant(_) => false, | ||||
|                 }; | ||||
|  | ||||
|             // If the document was already in the db we mark it as a `to_delete` document. | ||||
|             // Then we push the document in sorters in deletion mode. | ||||
|             let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? { | ||||
|                 Some(docid) => { | ||||
|                     self.remove_document_from_db( | ||||
|                         docid, | ||||
|                         to_remove, | ||||
|                         wtxn, | ||||
|                         &mut document_sorter_key_buffer, | ||||
|                         &mut document_sorter_value_buffer, | ||||
|                     )?; | ||||
|                     true | ||||
|                 } | ||||
|                 None => false, | ||||
|             }; | ||||
|  | ||||
|             // increase counter only if the document existed somewhere before. | ||||
|             if deleted_from_current || deleted_from_db { | ||||
|                 documents_deleted += 1; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(documents_deleted) | ||||
|     } | ||||
|  | ||||
|     /// Removes documents from db using their internal document ids and returns how many were removed. | ||||
|     /// Fails with `InternalError::AbortedIndexation` if `should_abort` returns `true` before a removal. | ||||
|     /// # Warning | ||||
|     /// | ||||
|     /// This function is dangerous and will only work correctly if: | ||||
|     /// | ||||
|     /// - All the passed ids currently exist in the database | ||||
|     /// - No batching using the standard `remove_documents` and `add_documents` took place | ||||
|     /// | ||||
|     /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function. | ||||
|     #[logging_timer::time] | ||||
|     pub fn remove_documents_from_db_no_batch<FA>( | ||||
|         &mut self, | ||||
|         to_remove: &RoaringBitmap, | ||||
|         wtxn: &mut heed::RwTxn, | ||||
|         should_abort: FA, | ||||
|     ) -> Result<usize> | ||||
|     where | ||||
|         FA: Fn() -> bool + Sync, | ||||
|     { | ||||
|         puffin::profile_function!(); | ||||
|  | ||||
|         let mut documents_deleted = 0; | ||||
|         let mut document_sorter_value_buffer = Vec::new(); | ||||
|         let mut document_sorter_key_buffer = Vec::new(); | ||||
|         let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?; | ||||
|  | ||||
|         for (internal_docid, external_docid) in to_remove.iter().zip(external_ids) { | ||||
|             let external_docid = external_docid?; | ||||
|             if should_abort() { | ||||
|                 return Err(Error::InternalError(InternalError::AbortedIndexation)); | ||||
|             } | ||||
|             self.remove_document_from_db( | ||||
|                 internal_docid, | ||||
|                 external_docid, | ||||
|                 wtxn, | ||||
|                 &mut document_sorter_key_buffer, | ||||
|                 &mut document_sorter_value_buffer, | ||||
|             )?; | ||||
|  | ||||
|             documents_deleted += 1; | ||||
|         } | ||||
|  | ||||
|         Ok(documents_deleted) | ||||
|     } | ||||
|  | ||||
|     fn remove_document_from_db( | ||||
|         &mut self, | ||||
|         internal_docid: u32, | ||||
|         external_docid: String, | ||||
|         txn: &heed::RoTxn, | ||||
|         document_sorter_key_buffer: &mut Vec<u8>, | ||||
|         document_sorter_value_buffer: &mut Vec<u8>, | ||||
|     ) -> Result<()> { | ||||
|         self.replaced_documents_ids.insert(internal_docid); | ||||
|  | ||||
|         // Fetch the raw obkv bytes of the document; a missing entry is reported as an internal error. | ||||
|         let original_key = internal_docid; | ||||
|         let base_obkv = self | ||||
|             .index | ||||
|             .documents | ||||
|             .remap_data_type::<heed::types::Bytes>() | ||||
|             .get(txn, &original_key)? | ||||
|             .ok_or(InternalError::DatabaseMissingEntry { | ||||
|                 db_name: db_name::DOCUMENTS, | ||||
|                 key: None, | ||||
|             })?; | ||||
|  | ||||
|         // The key is the concatenation of the big-endian internal docid and the external docid bytes. | ||||
|         document_sorter_key_buffer.clear(); | ||||
|         document_sorter_key_buffer.extend_from_slice(&internal_docid.to_be_bytes()); | ||||
|         document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes()); | ||||
|         // Push it, tagged as a deletion, in the original_sorter. | ||||
|         document_sorter_value_buffer.clear(); | ||||
|         document_sorter_value_buffer.push(Operation::Deletion as u8); | ||||
|         into_del_add_obkv( | ||||
|             KvReaderU16::new(base_obkv), | ||||
|             DelAddOperation::Deletion, | ||||
|             document_sorter_value_buffer, | ||||
|         )?; | ||||
|         self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||
|  | ||||
|         // Flatten it and push it as a deletion in the flattened_sorter; when the document needs no flattening the buffer built above is reused. | ||||
|         let flattened_obkv = KvReader::new(base_obkv); | ||||
|         if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { | ||||
|             // Rebuild the value buffer from the flattened document. | ||||
|             document_sorter_value_buffer.clear(); | ||||
|             document_sorter_value_buffer.push(Operation::Deletion as u8); | ||||
|             into_del_add_obkv( | ||||
|                 KvReaderU16::new(&obkv), | ||||
|                 DelAddOperation::Deletion, | ||||
|                 document_sorter_value_buffer, | ||||
|             )?; | ||||
|         } | ||||
|         self.flattened_sorter | ||||
|             .insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?; | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     // Flatten a document from the fields ids map contained in self and insert the new | ||||
|     // created fields. Returns `None` if the document doesn't need to be flattened. | ||||
|     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> { | ||||
| @@ -514,42 +658,10 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn remove_deleted_documents_from_field_distribution( | ||||
|         &self, | ||||
|         rtxn: &RoTxn, | ||||
|         field_distribution: &mut FieldDistribution, | ||||
|     ) -> Result<()> { | ||||
|         for deleted_docid in self.replaced_documents_ids.iter() { | ||||
|             let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or( | ||||
|                 InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, | ||||
|             )?; | ||||
|  | ||||
|             for (key, _) in obkv.iter() { | ||||
|                 let name = | ||||
|                     self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { | ||||
|                         field_id: key, | ||||
|                         process: "Computing field distribution in transform.", | ||||
|                     })?; | ||||
|                 // We checked that the document was in the db earlier. If we can't find it, it means | ||||
|                 // there is an inconsistency between the field distribution and the field id map. | ||||
|                 let field = | ||||
|                     field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId { | ||||
|                         field_id: key, | ||||
|                         process: "Accessing field distribution in transform.", | ||||
|                     })?; | ||||
|                 *field -= 1; | ||||
|                 if *field == 0 { | ||||
|                     // Since we were able to get the field right before, it's safe to unwrap here. | ||||
|                     field_distribution.remove(name).unwrap(); | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     /// Generate the `TransformOutput` based on the given sorter that can be generated from any | ||||
|     /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document | ||||
|     /// id for the user side and the value must be an obkv where keys are valid fields ids. | ||||
|     #[logging_timer::time] | ||||
|     pub(crate) fn output_from_sorter<F>( | ||||
|         self, | ||||
|         wtxn: &mut heed::RwTxn, | ||||
| @@ -581,17 +693,13 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         // 2. Add all the new documents to the field distribution | ||||
|         let mut field_distribution = self.index.field_distribution(wtxn)?; | ||||
|  | ||||
|         self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?; | ||||
|  | ||||
|         // Here we are going to do the document count + field distribution + `write_into_stream_writer` | ||||
|         let mut iter = self.original_sorter.into_stream_merger_iter()?; | ||||
|         // used only for the callback | ||||
|         let mut documents_count = 0; | ||||
|  | ||||
|         while let Some((key, val)) = iter.next()? { | ||||
|             if val[0] == Operation::Deletion as u8 { | ||||
|                 continue; | ||||
|             } | ||||
|             // skip first byte corresponding to the operation type (Deletion or Addition). | ||||
|             let val = &val[1..]; | ||||
|  | ||||
|             // send a callback to show at which step we are | ||||
| @@ -601,16 +709,51 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|                 total_documents: self.documents_count, | ||||
|             }); | ||||
|  | ||||
|             // We increment all the field of the current document in the field distribution. | ||||
|             let obkv = KvReader::new(val); | ||||
|  | ||||
|             for (key, _) in obkv.iter() { | ||||
|                 let name = | ||||
|                     self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { | ||||
|                         field_id: key, | ||||
|                         process: "Computing field distribution in transform.", | ||||
|                     })?; | ||||
|                 *field_distribution.entry(name.to_string()).or_insert(0) += 1; | ||||
|             for (key, value) in KvReader::new(val) { | ||||
|                 let reader = KvReaderDelAdd::new(value); | ||||
|                 match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { | ||||
|                     (None, None) => {} | ||||
|                     (None, Some(_)) => { | ||||
|                         // New field | ||||
|                         let name = self.fields_ids_map.name(key).ok_or( | ||||
|                             FieldIdMapMissingEntry::FieldId { | ||||
|                                 field_id: key, | ||||
|                                 process: "Computing field distribution in transform.", | ||||
|                             }, | ||||
|                         )?; | ||||
|                         *field_distribution.entry(name.to_string()).or_insert(0) += 1; | ||||
|                     } | ||||
|                     (Some(_), None) => { | ||||
|                         // Field removed | ||||
|                         let name = self.fields_ids_map.name(key).ok_or( | ||||
|                             FieldIdMapMissingEntry::FieldId { | ||||
|                                 field_id: key, | ||||
|                                 process: "Computing field distribution in transform.", | ||||
|                             }, | ||||
|                         )?; | ||||
|                         match field_distribution.entry(name.to_string()) { | ||||
|                             BEntry::Vacant(_) => { /* Bug? trying to remove a non-existing field */ | ||||
|                             } | ||||
|                             BEntry::Occupied(mut entry) => { | ||||
|                                 // attempt to remove one | ||||
|                                 match entry.get_mut().checked_sub(1) { | ||||
|                                     Some(0) => { | ||||
|                                         entry.remove(); | ||||
|                                     } | ||||
|                                     Some(new_val) => { | ||||
|                                         *entry.get_mut() = new_val; | ||||
|                                     } | ||||
|                                     None => { | ||||
|                                         unreachable!("Attempting to remove a field that wasn't in the field distribution") | ||||
|                                     } | ||||
|                                 } | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                     (Some(_), Some(_)) => { | ||||
|                         // Value change, no field distribution change | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             writer.insert(key, val)?; | ||||
|         } | ||||
| @@ -631,9 +774,7 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         // We get rid of the `Operation` byte and skip the deleted documents as well. | ||||
|         let mut iter = self.flattened_sorter.into_stream_merger_iter()?; | ||||
|         while let Some((key, val)) = iter.next()? { | ||||
|             if val[0] == Operation::Deletion as u8 { | ||||
|                 continue; | ||||
|             } | ||||
|             // skip first byte corresponding to the operation type (Deletion or Addition). | ||||
|             let val = &val[1..]; | ||||
|             writer.insert(key, val)?; | ||||
|         } | ||||
| @@ -649,15 +790,11 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|         new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| { | ||||
|             fst_new_external_documents_ids_builder.insert(key, value) | ||||
|         })?; | ||||
|         let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map(); | ||||
|  | ||||
|         Ok(TransformOutput { | ||||
|             primary_key, | ||||
|             fields_ids_map: self.fields_ids_map, | ||||
|             field_distribution, | ||||
|             new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(), | ||||
|             new_documents_ids: self.new_documents_ids, | ||||
|             replaced_documents_ids: self.replaced_documents_ids, | ||||
|             documents_count: self.documents_count, | ||||
|             original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, | ||||
|             flattened_documents: flattened_documents | ||||
| @@ -672,7 +809,7 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|     // TODO this can be done in parallel by using the rayon `ThreadPool`. | ||||
|     pub fn prepare_for_documents_reindexing( | ||||
|         self, | ||||
|         wtxn: &mut heed::RwTxn<'i, '_>, | ||||
|         wtxn: &mut heed::RwTxn<'i>, | ||||
|         old_fields_ids_map: FieldsIdsMap, | ||||
|         mut new_fields_ids_map: FieldsIdsMap, | ||||
|     ) -> Result<TransformOutput> { | ||||
| @@ -687,37 +824,40 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             .to_string(); | ||||
|         let field_distribution = self.index.field_distribution(wtxn)?; | ||||
|  | ||||
|         // Delete the soft deleted document ids from the maps inside the external_document_ids structure | ||||
|         let new_external_documents_ids = { | ||||
|             let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; | ||||
|             external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; | ||||
|             // This call should be free and can't fail since the previous method merged both fsts. | ||||
|             external_documents_ids.into_static().to_fst()?.into_owned() | ||||
|         }; | ||||
|  | ||||
|         let documents_ids = self.index.documents_ids(wtxn)?; | ||||
|         let documents_count = documents_ids.len() as usize; | ||||
|  | ||||
|         // We create a final writer to write the new documents in order from the sorter. | ||||
|         let mut original_writer = create_writer( | ||||
|         // We initialize the sorter with the user indexing settings. | ||||
|         let mut original_sorter = create_sorter( | ||||
|             grenad::SortAlgorithm::Stable, | ||||
|             keep_first, | ||||
|             self.indexer_settings.chunk_compression_type, | ||||
|             self.indexer_settings.chunk_compression_level, | ||||
|             tempfile::tempfile()?, | ||||
|             self.indexer_settings.max_nb_chunks, | ||||
|             self.indexer_settings.max_memory.map(|mem| mem / 2), | ||||
|         ); | ||||
|  | ||||
|         // We create a final writer to write the new documents in order from the sorter. | ||||
|         let mut flattened_writer = create_writer( | ||||
|         // We initialize the sorter with the user indexing settings. | ||||
|         let mut flattened_sorter = create_sorter( | ||||
|             grenad::SortAlgorithm::Stable, | ||||
|             keep_first, | ||||
|             self.indexer_settings.chunk_compression_type, | ||||
|             self.indexer_settings.chunk_compression_level, | ||||
|             tempfile::tempfile()?, | ||||
|             self.indexer_settings.max_nb_chunks, | ||||
|             self.indexer_settings.max_memory.map(|mem| mem / 2), | ||||
|         ); | ||||
|  | ||||
|         let mut obkv_buffer = Vec::new(); | ||||
|         for result in self.index.all_documents(wtxn)? { | ||||
|             let (docid, obkv) = result?; | ||||
|         let mut document_sorter_key_buffer = Vec::new(); | ||||
|         let mut document_sorter_value_buffer = Vec::new(); | ||||
|         for result in self.index.external_documents_ids().iter(wtxn)? { | ||||
|             let (external_id, docid) = result?; | ||||
|             let obkv = self.index.documents.get(wtxn, &docid)?.ok_or( | ||||
|                 InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, | ||||
|             )?; | ||||
|  | ||||
|             obkv_buffer.clear(); | ||||
|             let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer); | ||||
|             let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer); | ||||
|  | ||||
|             // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. | ||||
|             for (id, name) in new_fields_ids_map.iter() { | ||||
| @@ -727,7 +867,17 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|             } | ||||
|  | ||||
|             let buffer = obkv_writer.into_inner()?; | ||||
|             original_writer.insert(docid.to_be_bytes(), &buffer)?; | ||||
|  | ||||
|             document_sorter_key_buffer.clear(); | ||||
|             document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||
|             document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); | ||||
|             document_sorter_value_buffer.clear(); | ||||
|             into_del_add_obkv( | ||||
|                 KvReaderU16::new(buffer), | ||||
|                 DelAddOperation::Addition, | ||||
|                 &mut document_sorter_value_buffer, | ||||
|             )?; | ||||
|             original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||
|  | ||||
|             // Once we have the document, we're going to flatten it | ||||
|             // and insert it in the flattened sorter. | ||||
| @@ -762,29 +912,34 @@ impl<'a, 'i> Transform<'a, 'i> { | ||||
|                 let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; | ||||
|                 writer.insert(fid, &value)?; | ||||
|             } | ||||
|             flattened_writer.insert(docid.to_be_bytes(), &buffer)?; | ||||
|             document_sorter_value_buffer.clear(); | ||||
|             into_del_add_obkv( | ||||
|                 KvReaderU16::new(&buffer), | ||||
|                 DelAddOperation::Addition, | ||||
|                 &mut document_sorter_value_buffer, | ||||
|             )?; | ||||
|             flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; | ||||
|         } | ||||
|  | ||||
|         // Once we have written all the documents, we extract | ||||
|         // the file and reset the seek to be able to read it again. | ||||
|         let mut original_documents = original_writer.into_inner()?; | ||||
|         original_documents.rewind()?; | ||||
|         let grenad_params = GrenadParameters { | ||||
|             chunk_compression_type: self.indexer_settings.chunk_compression_type, | ||||
|             chunk_compression_level: self.indexer_settings.chunk_compression_level, | ||||
|             max_memory: self.indexer_settings.max_memory, | ||||
|             max_nb_chunks: self.indexer_settings.max_nb_chunks, // default value, may be chosen. | ||||
|         }; | ||||
|  | ||||
|         let mut flattened_documents = flattened_writer.into_inner()?; | ||||
|         flattened_documents.rewind()?; | ||||
|         // Once we have written all the documents, we merge everything into a Reader. | ||||
|         let original_documents = sorter_into_reader(original_sorter, grenad_params)?; | ||||
|  | ||||
|         let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; | ||||
|  | ||||
|         let output = TransformOutput { | ||||
|             primary_key, | ||||
|             fields_ids_map: new_fields_ids_map, | ||||
|             field_distribution, | ||||
|             new_external_documents_ids, | ||||
|             new_documents_ids: documents_ids, | ||||
|             replaced_documents_ids: RoaringBitmap::default(), | ||||
|             documents_count, | ||||
|             original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, | ||||
|             flattened_documents: flattened_documents | ||||
|                 .into_inner() | ||||
|                 .map_err(|err| err.into_error())?, | ||||
|             original_documents: original_documents.into_inner().into_inner(), | ||||
|             flattened_documents: flattened_documents.into_inner().into_inner(), | ||||
|         }; | ||||
|  | ||||
|         let new_facets = output.compute_real_facets(wtxn, self.index)?; | ||||
| @@ -828,38 +983,111 @@ mod test { | ||||
|  | ||||
|     #[test] | ||||
|     fn merge_obkvs() { | ||||
|         let mut doc_0 = Vec::new(); | ||||
|         let mut kv_writer = KvWriter::new(&mut doc_0); | ||||
|         let mut additive_doc_0 = Vec::new(); | ||||
|         let mut deletive_doc_0 = Vec::new(); | ||||
|         let mut del_add_doc_0 = Vec::new(); | ||||
|         let mut kv_writer = KvWriter::memory(); | ||||
|         kv_writer.insert(0_u8, [0]).unwrap(); | ||||
|         kv_writer.finish().unwrap(); | ||||
|         doc_0.insert(0, Operation::Addition as u8); | ||||
|  | ||||
|         let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap(); | ||||
|         assert_eq!(*ret, doc_0); | ||||
|  | ||||
|         let ret = merge_obkvs_and_operations( | ||||
|             &[], | ||||
|             &[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())], | ||||
|         let buffer = kv_writer.into_inner().unwrap(); | ||||
|         into_del_add_obkv( | ||||
|             KvReaderU16::new(&buffer), | ||||
|             DelAddOperation::Addition, | ||||
|             &mut additive_doc_0, | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         assert_eq!(*ret, doc_0); | ||||
|  | ||||
|         let ret = merge_obkvs_and_operations( | ||||
|             &[], | ||||
|             &[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())], | ||||
|         additive_doc_0.insert(0, Operation::Addition as u8); | ||||
|         into_del_add_obkv( | ||||
|             KvReaderU16::new(&buffer), | ||||
|             DelAddOperation::Deletion, | ||||
|             &mut deletive_doc_0, | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         assert_eq!(*ret, [Operation::Deletion as u8]); | ||||
|         deletive_doc_0.insert(0, Operation::Deletion as u8); | ||||
|         into_del_add_obkv( | ||||
|             KvReaderU16::new(&buffer), | ||||
|             DelAddOperation::DeletionAndAddition, | ||||
|             &mut del_add_doc_0, | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         del_add_doc_0.insert(0, Operation::Addition as u8); | ||||
|  | ||||
|         let ret = merge_obkvs_and_operations( | ||||
|         let mut additive_doc_1 = Vec::new(); | ||||
|         let mut kv_writer = KvWriter::memory(); | ||||
|         kv_writer.insert(1_u8, [1]).unwrap(); | ||||
|         let buffer = kv_writer.into_inner().unwrap(); | ||||
|         into_del_add_obkv( | ||||
|             KvReaderU16::new(&buffer), | ||||
|             DelAddOperation::Addition, | ||||
|             &mut additive_doc_1, | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         additive_doc_1.insert(0, Operation::Addition as u8); | ||||
|  | ||||
|         let mut additive_doc_0_1 = Vec::new(); | ||||
|         let mut kv_writer = KvWriter::memory(); | ||||
|         kv_writer.insert(0_u8, [0]).unwrap(); | ||||
|         kv_writer.insert(1_u8, [1]).unwrap(); | ||||
|         let buffer = kv_writer.into_inner().unwrap(); | ||||
|         into_del_add_obkv( | ||||
|             KvReaderU16::new(&buffer), | ||||
|             DelAddOperation::Addition, | ||||
|             &mut additive_doc_0_1, | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         additive_doc_0_1.insert(0, Operation::Addition as u8); | ||||
|  | ||||
|         let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())]) | ||||
|             .unwrap(); | ||||
|         assert_eq!(*ret, additive_doc_0); | ||||
|  | ||||
|         let ret = obkvs_merge_additions_and_deletions( | ||||
|             &[], | ||||
|             &[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())], | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         assert_eq!(*ret, del_add_doc_0); | ||||
|  | ||||
|         let ret = obkvs_merge_additions_and_deletions( | ||||
|             &[], | ||||
|             &[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())], | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         assert_eq!(*ret, deletive_doc_0); | ||||
|  | ||||
|         let ret = obkvs_merge_additions_and_deletions( | ||||
|             &[], | ||||
|             &[ | ||||
|                 Cow::from([Operation::Addition as u8, 1].as_slice()), | ||||
|                 Cow::from([Operation::Deletion as u8].as_slice()), | ||||
|                 Cow::from(doc_0.as_slice()), | ||||
|                 Cow::from(additive_doc_1.as_slice()), | ||||
|                 Cow::from(deletive_doc_0.as_slice()), | ||||
|                 Cow::from(additive_doc_0.as_slice()), | ||||
|             ], | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         assert_eq!(*ret, doc_0); | ||||
|         assert_eq!(*ret, del_add_doc_0); | ||||
|  | ||||
|         let ret = obkvs_merge_additions_and_deletions( | ||||
|             &[], | ||||
|             &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         assert_eq!(*ret, additive_doc_0_1); | ||||
|  | ||||
|         let ret = obkvs_keep_last_addition_merge_deletions( | ||||
|             &[], | ||||
|             &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())], | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         assert_eq!(*ret, additive_doc_0); | ||||
|  | ||||
|         let ret = obkvs_keep_last_addition_merge_deletions( | ||||
|             &[], | ||||
|             &[ | ||||
|                 Cow::from(deletive_doc_0.as_slice()), | ||||
|                 Cow::from(additive_doc_1.as_slice()), | ||||
|                 Cow::from(additive_doc_0.as_slice()), | ||||
|             ], | ||||
|         ) | ||||
|         .unwrap(); | ||||
|         assert_eq!(*ret, del_add_doc_0); | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -1,5 +1,4 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::HashMap; | ||||
| use std::collections::{HashMap, HashSet}; | ||||
| use std::convert::TryInto; | ||||
| use std::fs::File; | ||||
| use std::io::{self, BufReader}; | ||||
| @@ -7,34 +6,40 @@ use std::io::{self, BufReader}; | ||||
| use bytemuck::allocation::pod_collect_to_vec; | ||||
| use charabia::{Language, Script}; | ||||
| use grenad::MergerBuilder; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::RwTxn; | ||||
| use heed::types::Bytes; | ||||
| use heed::{PutFlags, RwTxn}; | ||||
| use log::error; | ||||
| use obkv::{KvReader, KvWriter}; | ||||
| use ordered_float::OrderedFloat; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::helpers::{ | ||||
|     self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, | ||||
|     self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, | ||||
|     valid_lmdb_key, CursorClonableMmap, | ||||
| }; | ||||
| use super::{ClonableMmap, MergeFn}; | ||||
| use crate::distance::NDotProductPoint; | ||||
| use crate::error::UserError; | ||||
| use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; | ||||
| use crate::facet::FacetType; | ||||
| use crate::index::db_name::DOCUMENTS; | ||||
| use crate::index::Hnsw; | ||||
| use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; | ||||
| use crate::update::facet::FacetsUpdate; | ||||
| use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; | ||||
| use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32}; | ||||
| use crate::{lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError}; | ||||
|  | ||||
| pub(crate) enum TypedChunk { | ||||
|     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>), | ||||
|     FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>), | ||||
|     Documents(grenad::Reader<CursorClonableMmap>), | ||||
|     FieldIdWordcountDocids(grenad::Reader<BufReader<File>>), | ||||
|     NewDocumentsIds(RoaringBitmap), | ||||
|     FieldIdWordCountDocids(grenad::Reader<BufReader<File>>), | ||||
|     WordDocids { | ||||
|         word_docids_reader: grenad::Reader<BufReader<File>>, | ||||
|         exact_word_docids_reader: grenad::Reader<BufReader<File>>, | ||||
|         word_fid_docids_reader: grenad::Reader<BufReader<File>>, | ||||
|     }, | ||||
|     WordPositionDocids(grenad::Reader<BufReader<File>>), | ||||
|     WordFidDocids(grenad::Reader<BufReader<File>>), | ||||
|     WordPairProximityDocids(grenad::Reader<BufReader<File>>), | ||||
|     FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>), | ||||
|     FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>), | ||||
| @@ -43,7 +48,7 @@ pub(crate) enum TypedChunk { | ||||
|     FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>), | ||||
|     GeoPoints(grenad::Reader<BufReader<File>>), | ||||
|     VectorPoints(grenad::Reader<BufReader<File>>), | ||||
|     ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), | ||||
|     ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), | ||||
| } | ||||
|  | ||||
| impl TypedChunk { | ||||
| @@ -58,23 +63,22 @@ impl TypedChunk { | ||||
|             TypedChunk::Documents(grenad) => { | ||||
|                 format!("Documents {{ number_of_entries: {} }}", grenad.len()) | ||||
|             } | ||||
|             TypedChunk::FieldIdWordcountDocids(grenad) => { | ||||
|             TypedChunk::FieldIdWordCountDocids(grenad) => { | ||||
|                 format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len()) | ||||
|             } | ||||
|             TypedChunk::NewDocumentsIds(grenad) => { | ||||
|                 format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len()) | ||||
|             } | ||||
|             TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!( | ||||
|                 "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}", | ||||
|             TypedChunk::WordDocids { | ||||
|                 word_docids_reader, | ||||
|                 exact_word_docids_reader, | ||||
|                 word_fid_docids_reader, | ||||
|             } => format!( | ||||
|                 "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}", | ||||
|                 word_docids_reader.len(), | ||||
|                 exact_word_docids_reader.len() | ||||
|                 exact_word_docids_reader.len(), | ||||
|                 word_fid_docids_reader.len() | ||||
|             ), | ||||
|             TypedChunk::WordPositionDocids(grenad) => { | ||||
|                 format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len()) | ||||
|             } | ||||
|             TypedChunk::WordFidDocids(grenad) => { | ||||
|                 format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len()) | ||||
|             } | ||||
|             TypedChunk::WordPairProximityDocids(grenad) => { | ||||
|                 format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len()) | ||||
|             } | ||||
| @@ -99,8 +103,8 @@ impl TypedChunk { | ||||
|             TypedChunk::VectorPoints(grenad) => { | ||||
|                 format!("VectorPoints {{ number_of_entries: {} }}", grenad.len()) | ||||
|             } | ||||
|             TypedChunk::ScriptLanguageDocids(grenad) => { | ||||
|                 format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len()) | ||||
|             TypedChunk::ScriptLanguageDocids(sl_map) => { | ||||
|                 format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len()) | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| @@ -119,34 +123,75 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|     let mut is_merged_database = false; | ||||
|     match typed_chunk { | ||||
|         TypedChunk::Documents(obkv_documents_iter) => { | ||||
|             let mut operations: Vec<DocumentOperation> = Default::default(); | ||||
|  | ||||
|             let mut docids = index.documents_ids(wtxn)?; | ||||
|             let mut cursor = obkv_documents_iter.into_cursor()?; | ||||
|             while let Some((key, value)) = cursor.move_on_next()? { | ||||
|                 index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?; | ||||
|             while let Some((key, reader)) = cursor.move_on_next()? { | ||||
|                 let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); | ||||
|                 let reader: KvReader<FieldId> = KvReader::new(reader); | ||||
|  | ||||
|                 let (document_id_bytes, external_id_bytes) = try_split_array_at(key) | ||||
|                     .ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?; | ||||
|                 let docid = DocumentId::from_be_bytes(document_id_bytes); | ||||
|                 let external_id = std::str::from_utf8(external_id_bytes)?; | ||||
|  | ||||
|                 for (field_id, value) in reader.iter() { | ||||
|                     let del_add_reader = KvReaderDelAdd::new(value); | ||||
|  | ||||
|                     if let Some(addition) = del_add_reader.get(DelAdd::Addition) { | ||||
|                         writer.insert(field_id, addition)?; | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 let db = index.documents.remap_data_type::<Bytes>(); | ||||
|  | ||||
|                 if !writer.is_empty() { | ||||
|                     db.put(wtxn, &docid, &writer.into_inner().unwrap())?; | ||||
|                     operations.push(DocumentOperation { | ||||
|                         external_id: external_id.to_string(), | ||||
|                         internal_id: docid, | ||||
|                         kind: DocumentOperationKind::Create, | ||||
|                     }); | ||||
|                     docids.insert(docid); | ||||
|                 } else { | ||||
|                     db.delete(wtxn, &docid)?; | ||||
|                     operations.push(DocumentOperation { | ||||
|                         external_id: external_id.to_string(), | ||||
|                         internal_id: docid, | ||||
|                         kind: DocumentOperationKind::Delete, | ||||
|                     }); | ||||
|                     docids.remove(docid); | ||||
|                 } | ||||
|             } | ||||
|             let external_documents_docids = index.external_documents_ids(); | ||||
|             external_documents_docids.apply(wtxn, operations)?; | ||||
|             index.put_documents_ids(wtxn, &docids)?; | ||||
|         } | ||||
|         TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => { | ||||
|         TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { | ||||
|             append_entries_into_database( | ||||
|                 fid_word_count_docids_iter, | ||||
|                 &index.field_id_word_count_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|                 deladd_serialize_add_side, | ||||
|                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::NewDocumentsIds(documents_ids) => { | ||||
|             return Ok((documents_ids, is_merged_database)) | ||||
|         } | ||||
|         TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { | ||||
|         TypedChunk::WordDocids { | ||||
|             word_docids_reader, | ||||
|             exact_word_docids_reader, | ||||
|             word_fid_docids_reader, | ||||
|         } => { | ||||
|             let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; | ||||
|             append_entries_into_database( | ||||
|                 word_docids_iter.clone(), | ||||
|                 &index.word_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_roaring_bitmaps, | ||||
|                 deladd_serialize_add_side, | ||||
|                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||
|             )?; | ||||
|  | ||||
|             let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; | ||||
| @@ -155,8 +200,18 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|                 &index.exact_word_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_roaring_bitmaps, | ||||
|                 deladd_serialize_add_side, | ||||
|                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||
|             )?; | ||||
|  | ||||
|             let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; | ||||
|             append_entries_into_database( | ||||
|                 word_fid_docids_iter, | ||||
|                 &index.word_fid_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 deladd_serialize_add_side, | ||||
|                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||
|             )?; | ||||
|  | ||||
|             // create fst from word docids | ||||
| @@ -177,19 +232,8 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|                 &index.word_position_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::WordFidDocids(word_fid_docids_iter) => { | ||||
|             append_entries_into_database( | ||||
|                 word_fid_docids_iter, | ||||
|                 &index.word_fid_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|                 deladd_serialize_add_side, | ||||
|                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
| @@ -209,8 +253,8 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|                 &index.facet_id_exists_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|                 deladd_serialize_add_side, | ||||
|                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
| @@ -220,8 +264,8 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|                 &index.facet_id_is_null_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|                 deladd_serialize_add_side, | ||||
|                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
| @@ -231,8 +275,8 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|                 &index.facet_id_is_empty_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|                 deladd_serialize_add_side, | ||||
|                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
| @@ -242,28 +286,48 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|                 &index.word_pair_proximity_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|                 deladd_serialize_add_side, | ||||
|                 merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::FieldIdDocidFacetNumbers(fid_docid_facet_number) => { | ||||
|             let index_fid_docid_facet_numbers = | ||||
|                 index.field_id_docid_facet_f64s.remap_types::<ByteSlice, ByteSlice>(); | ||||
|                 index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>(); | ||||
|             let mut cursor = fid_docid_facet_number.into_cursor()?; | ||||
|             while let Some((key, value)) = cursor.move_on_next()? { | ||||
|                 let reader = KvReaderDelAdd::new(value); | ||||
|                 if valid_lmdb_key(key) { | ||||
|                     index_fid_docid_facet_numbers.put(wtxn, key, value)?; | ||||
|                     match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { | ||||
|                         (None, None) => {} | ||||
|                         (None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?, | ||||
|                         (Some(_), None) => { | ||||
|                             index_fid_docid_facet_numbers.delete(wtxn, key)?; | ||||
|                         } | ||||
|                         (Some(_), Some(new)) => { | ||||
|                             index_fid_docid_facet_numbers.put(wtxn, key, new)? | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|         TypedChunk::FieldIdDocidFacetStrings(fid_docid_facet_string) => { | ||||
|             let index_fid_docid_facet_strings = | ||||
|                 index.field_id_docid_facet_strings.remap_types::<ByteSlice, ByteSlice>(); | ||||
|                 index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>(); | ||||
|             let mut cursor = fid_docid_facet_string.into_cursor()?; | ||||
|             while let Some((key, value)) = cursor.move_on_next()? { | ||||
|                 let reader = KvReaderDelAdd::new(value); | ||||
|                 if valid_lmdb_key(key) { | ||||
|                     index_fid_docid_facet_strings.put(wtxn, key, value)?; | ||||
|                     match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) { | ||||
|                         (None, None) => {} | ||||
|                         (None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?, | ||||
|                         (Some(_), None) => { | ||||
|                             index_fid_docid_facet_strings.delete(wtxn, key)?; | ||||
|                         } | ||||
|                         (Some(_), Some(new)) => { | ||||
|                             index_fid_docid_facet_strings.put(wtxn, key, new)? | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
| @@ -276,85 +340,113 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|                 // convert the key back to a u32 (4 bytes) | ||||
|                 let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); | ||||
|  | ||||
|                 // convert the latitude and longitude back to a f64 (8 bytes) | ||||
|                 let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap(); | ||||
|                 let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap(); | ||||
|                 let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)]; | ||||
|                 let xyz_point = lat_lng_to_xyz(&point); | ||||
|  | ||||
|                 rtree.insert(GeoPoint::new(xyz_point, (docid, point))); | ||||
|                 geo_faceted_docids.insert(docid); | ||||
|                 let deladd_obkv = KvReaderDelAdd::new(value); | ||||
|                 if let Some(value) = deladd_obkv.get(DelAdd::Deletion) { | ||||
|                     let geopoint = extract_geo_point(value, docid); | ||||
|                     rtree.remove(&geopoint); | ||||
|                     geo_faceted_docids.remove(docid); | ||||
|                 } | ||||
|                 if let Some(value) = deladd_obkv.get(DelAdd::Addition) { | ||||
|                     let geopoint = extract_geo_point(value, docid); | ||||
|                     rtree.insert(geopoint); | ||||
|                     geo_faceted_docids.insert(docid); | ||||
|                 } | ||||
|             } | ||||
|             index.put_geo_rtree(wtxn, &rtree)?; | ||||
|             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?; | ||||
|         } | ||||
|         TypedChunk::VectorPoints(vector_points) => { | ||||
|             let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? { | ||||
|                 Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(), | ||||
|                 None => Default::default(), | ||||
|             }; | ||||
|  | ||||
|             // Convert the PointIds into DocumentIds | ||||
|             let mut docids = Vec::new(); | ||||
|             for pid in pids { | ||||
|                 let docid = | ||||
|                     index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap(); | ||||
|                 docids.push(docid.get()); | ||||
|             let mut vectors_set = HashSet::new(); | ||||
|             // We extract and store the previous vectors | ||||
|             if let Some(hnsw) = index.vector_hnsw(wtxn)? { | ||||
|                 for (pid, point) in hnsw.iter() { | ||||
|                     let pid_key = pid.into_inner(); | ||||
|                     let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap(); | ||||
|                     let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect(); | ||||
|                     vectors_set.insert((docid, vector)); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             let mut expected_dimensions = points.get(0).map(|p| p.len()); | ||||
|             let mut cursor = vector_points.into_cursor()?; | ||||
|             while let Some((key, value)) = cursor.move_on_next()? { | ||||
|                 // convert the key back to a u32 (4 bytes) | ||||
|                 let (left, _index) = try_split_array_at(key).unwrap(); | ||||
|                 let docid = DocumentId::from_be_bytes(left); | ||||
|                 // convert the vector back to a Vec<f32> | ||||
|                 let vector: Vec<f32> = pod_collect_to_vec(value); | ||||
|  | ||||
|                 // TODO Inform the user about the document that has a wrong `_vectors` | ||||
|                 let found = vector.len(); | ||||
|                 let expected = *expected_dimensions.get_or_insert(found); | ||||
|                 if expected != found { | ||||
|                     return Err(UserError::InvalidVectorDimensions { expected, found })?; | ||||
|                 let vector_deladd_obkv = KvReaderDelAdd::new(value); | ||||
|                 if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) { | ||||
|                     // convert the vector back to a Vec<f32> | ||||
|                     let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); | ||||
|                     let key = (docid, vector); | ||||
|                     if !vectors_set.remove(&key) { | ||||
|                         error!("Unable to delete the vector: {:?}", key.1); | ||||
|                     } | ||||
|                 } | ||||
|                 if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) { | ||||
|                     // convert the vector back to a Vec<f32> | ||||
|                     let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect(); | ||||
|                     vectors_set.insert((docid, vector)); | ||||
|                 } | ||||
|  | ||||
|                 points.push(NDotProductPoint::new(vector)); | ||||
|                 docids.push(docid); | ||||
|             } | ||||
|  | ||||
|             assert_eq!(docids.len(), points.len()); | ||||
|             // Extract the most common vector dimension | ||||
|             let expected_dimension_size = { | ||||
|                 let mut dims = HashMap::new(); | ||||
|                 vectors_set.iter().for_each(|(_, v)| *dims.entry(v.len()).or_insert(0) += 1); | ||||
|                 dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len) | ||||
|             }; | ||||
|  | ||||
|             // Ensure that the vector lengths are correct and | ||||
|             // prepare the vectors before inserting them in the HNSW. | ||||
|             let mut points = Vec::new(); | ||||
|             let mut docids = Vec::new(); | ||||
|             for (docid, vector) in vectors_set { | ||||
|                 if expected_dimension_size.map_or(false, |expected| expected != vector.len()) { | ||||
|                     return Err(UserError::InvalidVectorDimensions { | ||||
|                         expected: expected_dimension_size.unwrap_or(vector.len()), | ||||
|                         found: vector.len(), | ||||
|                     } | ||||
|                     .into()); | ||||
|                 } else { | ||||
|                     let vector = vector.into_iter().map(OrderedFloat::into_inner).collect(); | ||||
|                     points.push(NDotProductPoint::new(vector)); | ||||
|                     docids.push(docid); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             let hnsw_length = points.len(); | ||||
|             let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points); | ||||
|  | ||||
|             assert_eq!(docids.len(), pids.len()); | ||||
|  | ||||
|             // Store the vectors in the point-docid relation database | ||||
|             index.vector_id_docid.clear(wtxn)?; | ||||
|             for (docid, pid) in docids.into_iter().zip(pids) { | ||||
|                 index.vector_id_docid.put( | ||||
|                     wtxn, | ||||
|                     &BEU32::new(pid.into_inner()), | ||||
|                     &BEU32::new(docid), | ||||
|                 )?; | ||||
|                 index.vector_id_docid.put(wtxn, &pid.into_inner(), &docid)?; | ||||
|             } | ||||
|  | ||||
|             log::debug!("There are {} entries in the HNSW so far", hnsw_length); | ||||
|             index.put_vector_hnsw(wtxn, &new_hnsw)?; | ||||
|         } | ||||
|         TypedChunk::ScriptLanguageDocids(hash_pair) => { | ||||
|             let mut buffer = Vec::new(); | ||||
|             for (key, value) in hash_pair { | ||||
|                 buffer.clear(); | ||||
|         TypedChunk::ScriptLanguageDocids(sl_map) => { | ||||
|             for (key, (deletion, addition)) in sl_map { | ||||
|                 let mut db_key_exists = false; | ||||
|                 let final_value = match index.script_language_docids.get(wtxn, &key)? { | ||||
|                     Some(db_values) => { | ||||
|                         let mut db_value_buffer = Vec::new(); | ||||
|                         serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?; | ||||
|                         let mut new_value_buffer = Vec::new(); | ||||
|                         serialize_roaring_bitmap(&value, &mut new_value_buffer)?; | ||||
|                         merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?; | ||||
|                         RoaringBitmap::deserialize_from(&buffer[..])? | ||||
|                         db_key_exists = true; | ||||
|                         (db_values - deletion) | addition | ||||
|                     } | ||||
|                     None => value, | ||||
|                     None => addition, | ||||
|                 }; | ||||
|                 index.script_language_docids.put(wtxn, &key, &final_value)?; | ||||
|  | ||||
|                 if final_value.is_empty() { | ||||
|                     // If the database entry exists, delete it. | ||||
|                     if db_key_exists { | ||||
|                         index.script_language_docids.delete(wtxn, &key)?; | ||||
|                     } | ||||
|                 } else { | ||||
|                     index.script_language_docids.put(wtxn, &key, &final_value)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| @@ -362,6 +454,15 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|     Ok((RoaringBitmap::new(), is_merged_database)) | ||||
| } | ||||
|  | ||||
| /// Rebuilds the xyz-projected `GeoPoint` of `docid` from a serialized latitude/longitude pair. | ||||
| /// | ||||
| /// `value` must start with two consecutive 8-byte native-endian `f64`s (latitude, then | ||||
| /// longitude). | ||||
| /// | ||||
| /// # Panics | ||||
| /// | ||||
| /// Panics if either 8-byte chunk cannot be split off `value`. | ||||
| fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint { | ||||
|     let (lat_bytes, rest) = helpers::try_split_array_at::<u8, 8>(value).unwrap(); | ||||
|     let (lng_bytes, _) = helpers::try_split_array_at::<u8, 8>(rest).unwrap(); | ||||
|     let lat_lng = [f64::from_ne_bytes(lat_bytes), f64::from_ne_bytes(lng_bytes)]; | ||||
|     GeoPoint::new(lat_lng_to_xyz(&lat_lng), (docid, lat_lng)) | ||||
| } | ||||
|  | ||||
| fn merge_word_docids_reader_into_fst( | ||||
|     word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>, | ||||
|     exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>, | ||||
| @@ -379,24 +480,6 @@ fn merge_word_docids_reader_into_fst( | ||||
|     Ok(builder.into_set()) | ||||
| } | ||||
|  | ||||
| /// Unions the incoming serialized bitmap with the one read from the database and | ||||
| /// serializes the result into `buffer`. | ||||
| /// | ||||
| /// Returns an error if either input fails to deserialize or if serialization fails. | ||||
| fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> { | ||||
|     let incoming = RoaringBitmap::deserialize_from(new_value)?; | ||||
|     let stored = RoaringBitmap::deserialize_from(db_value)?; | ||||
|     let merged = incoming | stored; | ||||
|     serialize_roaring_bitmap(&merged, buffer)?; | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// Merges the database value and the new value (in that order) with | ||||
| /// `CboRoaringBitmapCodec::merge_into`, writing the result into `buffer`. | ||||
| fn merge_cbo_roaring_bitmaps( | ||||
|     new_value: &[u8], | ||||
|     db_value: &[u8], | ||||
|     buffer: &mut Vec<u8>, | ||||
| ) -> Result<()> { | ||||
|     // The database value goes first, then the new one, as in the original call. | ||||
|     let operands = [Cow::Borrowed(db_value), Cow::Borrowed(new_value)]; | ||||
|     CboRoaringBitmapCodec::merge_into(&operands, buffer)?; | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// Write provided entries in database using serialize_value function. | ||||
| /// The merge_values function is used if an entry already exists in the database. | ||||
| fn write_entries_into_database<R, K, V, FS, FM>( | ||||
| @@ -410,29 +493,31 @@ fn write_entries_into_database<R, K, V, FS, FM>( | ||||
| where | ||||
|     R: io::Read + io::Seek, | ||||
|     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, | ||||
|     FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>, | ||||
|     FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>, | ||||
| { | ||||
|     puffin::profile_function!(format!("number of entries: {}", data.len())); | ||||
|  | ||||
|     let mut buffer = Vec::new(); | ||||
|     let database = database.remap_types::<ByteSlice, ByteSlice>(); | ||||
|     let database = database.remap_types::<Bytes, Bytes>(); | ||||
|  | ||||
|     let mut cursor = data.into_cursor()?; | ||||
|     while let Some((key, value)) = cursor.move_on_next()? { | ||||
|         if valid_lmdb_key(key) { | ||||
|             buffer.clear(); | ||||
|             let value = if index_is_empty { | ||||
|                 serialize_value(value, &mut buffer)? | ||||
|                 Some(serialize_value(value, &mut buffer)?) | ||||
|             } else { | ||||
|                 match database.get(wtxn, key)? { | ||||
|                     Some(prev_value) => { | ||||
|                         merge_values(value, prev_value, &mut buffer)?; | ||||
|                         &buffer[..] | ||||
|                     } | ||||
|                     None => serialize_value(value, &mut buffer)?, | ||||
|                     Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, | ||||
|                     None => Some(serialize_value(value, &mut buffer)?), | ||||
|                 } | ||||
|             }; | ||||
|             database.put(wtxn, key, value)?; | ||||
|             match value { | ||||
|                 Some(value) => database.put(wtxn, key, value)?, | ||||
|                 None => { | ||||
|                     database.delete(wtxn, key)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -454,7 +539,8 @@ fn append_entries_into_database<R, K, V, FS, FM>( | ||||
| where | ||||
|     R: io::Read + io::Seek, | ||||
|     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>, | ||||
|     FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>, | ||||
|     FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>, | ||||
|     K: for<'a> heed::BytesDecode<'a>, | ||||
| { | ||||
|     puffin::profile_function!(format!("number of entries: {}", data.len())); | ||||
|  | ||||
| @@ -470,14 +556,23 @@ where | ||||
|     } | ||||
|  | ||||
|     let mut buffer = Vec::new(); | ||||
|     let mut database = database.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>(); | ||||
|     let mut database = database.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>(); | ||||
|  | ||||
|     let mut cursor = data.into_cursor()?; | ||||
|     while let Some((key, value)) = cursor.move_on_next()? { | ||||
|         if valid_lmdb_key(key) { | ||||
|             debug_assert!( | ||||
|                 K::bytes_decode(key).is_ok(), | ||||
|                 "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}", | ||||
|                 key.len(), | ||||
|                 &key | ||||
|             ); | ||||
|             buffer.clear(); | ||||
|             let value = serialize_value(value, &mut buffer)?; | ||||
|             unsafe { database.append(key, value)? }; | ||||
|             unsafe { | ||||
|                 // safety: We do not keep a reference to anything that lives inside the database | ||||
|                 database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, value)? | ||||
|             }; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -1,6 +1,5 @@ | ||||
| pub use self::available_documents_ids::AvailableDocumentsIds; | ||||
| pub use self::clear_documents::ClearDocuments; | ||||
| pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDeletionResult}; | ||||
| pub use self::facet::bulk::FacetsUpdateBulk; | ||||
| pub use self::facet::incremental::FacetsUpdateIncrementalInner; | ||||
| pub use self::index_documents::{ | ||||
| @@ -9,10 +8,6 @@ pub use self::index_documents::{ | ||||
|     MergeFn, | ||||
| }; | ||||
| pub use self::indexer_config::IndexerConfig; | ||||
| pub use self::prefix_word_pairs::{ | ||||
|     PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, | ||||
|     MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, | ||||
| }; | ||||
| pub use self::settings::{Setting, Settings}; | ||||
| pub use self::update_step::UpdateIndexingStep; | ||||
| pub use self::word_prefix_docids::WordPrefixDocids; | ||||
| @@ -21,11 +16,10 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; | ||||
|  | ||||
| mod available_documents_ids; | ||||
| mod clear_documents; | ||||
| mod delete_documents; | ||||
| pub(crate) mod del_add; | ||||
| pub(crate) mod facet; | ||||
| mod index_documents; | ||||
| mod indexer_config; | ||||
| mod prefix_word_pairs; | ||||
| mod settings; | ||||
| mod update_step; | ||||
| mod word_prefix_docids; | ||||
|   | ||||
| @@ -1,579 +0,0 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::HashSet; | ||||
| use std::io::{BufReader, BufWriter}; | ||||
|  | ||||
| use grenad::CompressionType; | ||||
| use heed::types::ByteSlice; | ||||
|  | ||||
| use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; | ||||
| use crate::{Index, Result}; | ||||
|  | ||||
| mod prefix_word; | ||||
| mod word_prefix; | ||||
|  | ||||
| pub use prefix_word::index_prefix_word_database; | ||||
| pub use word_prefix::index_word_prefix_database; | ||||
|  | ||||
| pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4; | ||||
| pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2; | ||||
|  | ||||
| /// Update step that derives the `word_prefix_pair_proximity_docids` and | ||||
| /// `prefix_word_pair_proximity_docids` databases from `word_pair_proximity_docids`. | ||||
| /// | ||||
| /// Built with `new`, which applies the default proximity and prefix-length limits, | ||||
| /// and run with `execute`. | ||||
| pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { | ||||
|     /// Write transaction handed to both database-building passes. | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     /// Index whose word-pair proximity databases are read and written. | ||||
|     index: &'i Index, | ||||
|     /// Proximity limit forwarded to both passes; `new` sets it to | ||||
|     /// `MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB`. | ||||
|     max_proximity: u8, | ||||
|     /// Prefix-length limit forwarded to both passes; `new` sets it to | ||||
|     /// `MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB`. | ||||
|     max_prefix_length: usize, | ||||
|     /// Compression type forwarded to the database-building passes. | ||||
|     chunk_compression_type: CompressionType, | ||||
|     /// Optional compression level forwarded alongside `chunk_compression_type`. | ||||
|     chunk_compression_level: Option<u32>, | ||||
| } | ||||
| impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { | ||||
|     pub fn new( | ||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|         index: &'i Index, | ||||
|         chunk_compression_type: CompressionType, | ||||
|         chunk_compression_level: Option<u32>, | ||||
|     ) -> Self { | ||||
|         Self { | ||||
|             wtxn, | ||||
|             index, | ||||
|             max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, | ||||
|             max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("WordPrefixPairProximityDocids::{}")] | ||||
|     pub fn execute<'a>( | ||||
|         self, | ||||
|         new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, | ||||
|         new_prefix_fst_words: &'a [String], | ||||
|         common_prefix_fst_words: &[&'a [String]], | ||||
|         del_prefix_fst_words: &HashSet<Vec<u8>>, | ||||
|     ) -> Result<()> { | ||||
|         puffin::profile_function!(); | ||||
|  | ||||
|         index_word_prefix_database( | ||||
|             self.wtxn, | ||||
|             self.index.word_pair_proximity_docids, | ||||
|             self.index.word_prefix_pair_proximity_docids, | ||||
|             self.max_proximity, | ||||
|             self.max_prefix_length, | ||||
|             new_word_pair_proximity_docids.clone(), | ||||
|             new_prefix_fst_words, | ||||
|             common_prefix_fst_words, | ||||
|             del_prefix_fst_words, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|         )?; | ||||
|  | ||||
|         index_prefix_word_database( | ||||
|             self.wtxn, | ||||
|             self.index.word_pair_proximity_docids, | ||||
|             self.index.prefix_word_pair_proximity_docids, | ||||
|             self.max_proximity, | ||||
|             self.max_prefix_length, | ||||
|             new_word_pair_proximity_docids, | ||||
|             new_prefix_fst_words, | ||||
|             common_prefix_fst_words, | ||||
|             del_prefix_fst_words, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|         )?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| // This is adapted from `sorter_into_lmdb_database` | ||||
| pub fn insert_into_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     new_key: &[u8], | ||||
|     new_value: &[u8], | ||||
| ) -> Result<()> { | ||||
|     let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; | ||||
|     match iter.next().transpose()? { | ||||
|         Some((key, old_val)) if new_key == key => { | ||||
|             let val = | ||||
|                 merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) | ||||
|                     .map_err(|_| { | ||||
|                         // TODO just wrap this error? | ||||
|                         crate::error::InternalError::IndexingMergingKeys { | ||||
|                             process: "get-put-merge", | ||||
|                         } | ||||
|                     })?; | ||||
|             // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour | ||||
|             unsafe { iter.put_current(new_key, &val)? }; | ||||
|         } | ||||
|         _ => { | ||||
|             drop(iter); | ||||
|             database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; | ||||
|         } | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| // This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, | ||||
| // but it uses `append` if the database is empty, and it assumes that the values in the | ||||
| // writer don't conflict with values in the database. | ||||
| pub fn write_into_lmdb_database_without_merging( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     database: heed::PolyDatabase, | ||||
|     writer: grenad::Writer<BufWriter<std::fs::File>>, | ||||
| ) -> Result<()> { | ||||
|     let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?; | ||||
|     let reader = grenad::Reader::new(BufReader::new(file))?; | ||||
|     if database.is_empty(wtxn)? { | ||||
|         let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; | ||||
|         let mut cursor = reader.into_cursor()?; | ||||
|         while let Some((k, v)) = cursor.move_on_next()? { | ||||
|             // safety: the key comes from the grenad reader, not the database | ||||
|             unsafe { out_iter.append(k, v)? }; | ||||
|         } | ||||
|     } else { | ||||
|         let mut cursor = reader.into_cursor()?; | ||||
|         while let Some((k, v)) = cursor.move_on_next()? { | ||||
|             database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; | ||||
|         } | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
#[cfg(test)]
mod tests {
    use std::io::Cursor;
    use std::iter::FromIterator;

    use roaring::RoaringBitmap;

    use crate::db_snap;
    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
    use crate::index::tests::TempIndex;
    use crate::update::{DeleteDocuments, DeletionStrategy, IndexDocumentsMethod};

    /// Turns a JSON value that is known to be an object into a document.
    fn json_object(value: serde_json::Value) -> crate::Object {
        value.as_object().unwrap().clone()
    }

    /// Generates 50 documents per prefix, each with a distinct word made of the
    /// prefix followed by a hexadecimal counter. Ids are consecutive and start
    /// at `start_id`.
    fn documents_with_enough_different_words_for_prefixes(
        prefixes: &[&str],
        start_id: usize,
    ) -> Vec<crate::Object> {
        prefixes
            .iter()
            .flat_map(|prefix| (0..50).map(move |i| format!("{prefix}{i:x}")))
            .enumerate()
            .map(|(offset, text)| {
                json_object(serde_json::json!({
                    "id": start_id + offset,
                    "text": text,
                }))
            })
            .collect()
    }

    /// Serializes the documents into an in-memory batch and opens a reader on it.
    fn batch_reader_from_documents(
        documents: Vec<crate::Object>,
    ) -> DocumentsBatchReader<Cursor<Vec<u8>>> {
        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        for document in documents {
            builder.append_json_object(&document).unwrap();
        }
        let bytes = builder.into_inner().unwrap();
        DocumentsBatchReader::from_reader(Cursor::new(bytes)).unwrap()
    }

    /// Deletes the given internal document ids with the given strategy, in its
    /// own committed write transaction.
    fn delete_docids(
        index: &TempIndex,
        strategy: DeletionStrategy,
        docids: impl IntoIterator<Item = u32>,
    ) {
        let mut wtxn = index.write_txn().unwrap();
        let mut delete = DeleteDocuments::new(&mut wtxn, index).unwrap();
        delete.strategy(strategy);
        delete.delete_documents(&RoaringBitmap::from_iter(docids));
        delete.execute().unwrap();
        wtxn.commit().unwrap();
    }

    #[test]
    fn add_new_documents() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": "9000",
            "text": "At an amazing and beautiful house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": "9001",
            "text": "The bell rings at 5 am"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100);
        documents.push(json_object(serde_json::json!({
            "id": "9002",
            "text": "At an extraordinary house"
        })));
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, word_pair_proximity_docids, "update");
        db_snap!(index, word_prefix_pair_proximity_docids, "update");
        db_snap!(index, prefix_word_pair_proximity_docids, "update");
    }

    #[test]
    fn batch_bug_3043() {
        // https://github.com/meilisearch/meilisearch/issues/3043
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "text": "x y"
        })));
        documents.push(json_object(serde_json::json!({
            "text": "x a y"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, word_pair_proximity_docids);
        db_snap!(index, word_prefix_pair_proximity_docids);
        db_snap!(index, prefix_word_pair_proximity_docids);
    }

    #[test]
    fn hard_delete_and_reupdate() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing and beautiful house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings at 5 am"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        delete_docids(&index, DeletionStrategy::AlwaysHard, [50]);

        db_snap!(index, documents_ids, "first_delete");
        db_snap!(index, word_docids, "first_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");

        delete_docids(&index, DeletionStrategy::AlwaysHard, 0..50);

        db_snap!(index, documents_ids, "second_delete");
        db_snap!(index, word_docids, "second_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");

        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "reupdate");
        db_snap!(index, word_docids, "reupdate");
        db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
        db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
    }

    #[test]
    fn soft_delete_and_reupdate() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing and beautiful house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings at 5 am"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        delete_docids(&index, DeletionStrategy::AlwaysSoft, [50]);

        db_snap!(index, documents_ids, "first_delete");
        db_snap!(index, word_docids, "first_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");

        delete_docids(&index, DeletionStrategy::AlwaysSoft, 0..50);

        db_snap!(index, documents_ids, "second_delete");
        db_snap!(index, word_docids, "second_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");

        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "reupdate");
        db_snap!(index, word_docids, "reupdate");
        db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
        db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
    }

    #[test]
    fn replace_soft_deletion() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        // Same ids as the first 50 documents: they replace them.
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "replaced");
        db_snap!(index, word_docids, "replaced");
        db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
        db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
        db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]");
    }

    #[test]
    fn replace_hard_deletion() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(json_object(serde_json::json!({
            "id": 9000,
            "text": "At an amazing house"
        })));
        documents.push(json_object(serde_json::json!({
            "id": 9001,
            "text": "The bell rings"
        })));

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        // Same ids as the first 50 documents: they replace them.
        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "replaced");
        db_snap!(index, word_docids, "replaced");
        db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
        db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
        db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]");
    }
}
| @@ -1,182 +0,0 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::{BTreeMap, HashSet}; | ||||
|  | ||||
| use grenad::CompressionType; | ||||
| use heed::types::ByteSlice; | ||||
| use heed::BytesDecode; | ||||
| use log::debug; | ||||
|  | ||||
| use crate::update::index_documents::{create_writer, CursorClonableMmap}; | ||||
| use crate::update::prefix_word_pairs::{ | ||||
|     insert_into_database, write_into_lmdb_database_without_merging, | ||||
| }; | ||||
| use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; | ||||
|  | ||||
| #[allow(clippy::too_many_arguments)] | ||||
| #[logging_timer::time] | ||||
| pub fn index_prefix_word_database( | ||||
|     wtxn: &mut heed::RwTxn, | ||||
|     word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, | ||||
|     prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>, | ||||
|     max_proximity: u8, | ||||
|     max_prefix_length: usize, | ||||
|     new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>, | ||||
|     new_prefix_fst_words: &[String], | ||||
|     common_prefix_fst_words: &[&[String]], | ||||
|     del_prefix_fst_words: &HashSet<Vec<u8>>, | ||||
|     chunk_compression_type: CompressionType, | ||||
|     chunk_compression_level: Option<u32>, | ||||
| ) -> Result<()> { | ||||
|     puffin::profile_function!(); | ||||
|  | ||||
|     let max_proximity = max_proximity - 1; | ||||
|     debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); | ||||
|  | ||||
|     let common_prefixes: Vec<_> = common_prefix_fst_words | ||||
|         .iter() | ||||
|         .flat_map(|s| s.iter()) | ||||
|         .map(|s| s.as_str()) | ||||
|         .filter(|s| s.len() <= max_prefix_length) | ||||
|         .collect(); | ||||
|  | ||||
|     for proximity in 1..max_proximity { | ||||
|         for prefix in common_prefixes.iter() { | ||||
|             let mut prefix_key = vec![proximity]; | ||||
|             prefix_key.extend_from_slice(prefix.as_bytes()); | ||||
|             let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; | ||||
|             // This is the core of the algorithm | ||||
|             execute_on_word_pairs_and_prefixes( | ||||
|                 proximity, | ||||
|                 prefix.as_bytes(), | ||||
|                 // the next two arguments tell how to iterate over the new word pairs | ||||
|                 &mut cursor, | ||||
|                 |cursor| { | ||||
|                     if let Some((key, value)) = cursor.next()? { | ||||
|                         let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) | ||||
|                             .ok_or(heed::Error::Decoding)?; | ||||
|                         Ok(Some((word2, value))) | ||||
|                     } else { | ||||
|                         Ok(None) | ||||
|                     } | ||||
|                 }, | ||||
|                 // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) | ||||
|                 |key, value| { | ||||
|                     insert_into_database( | ||||
|                         wtxn, | ||||
|                         *prefix_word_pair_proximity_docids.as_polymorph(), | ||||
|                         key, | ||||
|                         value, | ||||
|                     ) | ||||
|                 }, | ||||
|             )?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // Now we do the same thing with the new prefixes and all word pairs in the DB | ||||
|     let new_prefixes: Vec<_> = new_prefix_fst_words | ||||
|         .iter() | ||||
|         .map(|s| s.as_str()) | ||||
|         .filter(|s| s.len() <= max_prefix_length) | ||||
|         .collect(); | ||||
|  | ||||
|     // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) | ||||
|     // element in an intermediary grenad | ||||
|     let mut writer = | ||||
|         create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); | ||||
|  | ||||
|     for proximity in 1..max_proximity { | ||||
|         for prefix in new_prefixes.iter() { | ||||
|             let mut prefix_key = vec![proximity]; | ||||
|             prefix_key.extend_from_slice(prefix.as_bytes()); | ||||
|             let mut db_iter = word_pair_proximity_docids | ||||
|                 .as_polymorph() | ||||
|                 .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? | ||||
|                 .remap_key_type::<UncheckedU8StrStrCodec>(); | ||||
|             execute_on_word_pairs_and_prefixes( | ||||
|                 proximity, | ||||
|                 prefix.as_bytes(), | ||||
|                 &mut db_iter, | ||||
|                 |db_iter| { | ||||
|                     db_iter | ||||
|                         .next() | ||||
|                         .transpose() | ||||
|                         .map(|x| x.map(|((_, _, word2), value)| (word2, value))) | ||||
|                         .map_err(|e| e.into()) | ||||
|                 }, | ||||
|                 |key, value| writer.insert(key, value).map_err(|e| e.into()), | ||||
|             )?; | ||||
|             drop(db_iter); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // and then we write the grenad into the DB | ||||
|     // Since the grenad contains only new prefixes, we know in advance that none | ||||
|     // of its elements already exist in the DB, thus there is no need to specify | ||||
|     // how to merge conflicting elements | ||||
|     write_into_lmdb_database_without_merging( | ||||
|         wtxn, | ||||
|         *prefix_word_pair_proximity_docids.as_polymorph(), | ||||
|         writer, | ||||
|     )?; | ||||
|  | ||||
|     // All of the word prefix pairs in the database that have a w2 | ||||
|     // that is contained in the `suppr_pw` set must be removed as well. | ||||
|     if !del_prefix_fst_words.is_empty() { | ||||
|         let mut iter = | ||||
|             prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?; | ||||
|         while let Some(((_, prefix, _), _)) = iter.next().transpose()? { | ||||
|             if del_prefix_fst_words.contains(prefix.as_bytes()) { | ||||
|                 // Delete this entry as the w2 prefix is no more in the words prefix fst. | ||||
|                 unsafe { iter.del_current()? }; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| /// Core routine used to initialise the Prefix Word Pair Proximity Docids database for a | ||||
| /// single `(proximity, prefix)` pair. | ||||
| /// | ||||
| /// Arguments: | ||||
| /// - `proximity`, `prefix`: the leading components of every key handed to `insert` | ||||
| /// - `iter`: an iterator over the words following the given `prefix` with the given `proximity` | ||||
| /// - `next_word2_and_docids`: pulls the next `(word2, docids)` pair out of `iter`, | ||||
| ///   returning `None` once it is exhausted | ||||
| /// - `insert`: describes how to handle each newly computed (proximity, prefix, word2) element | ||||
| /// | ||||
| /// Any error returned by one of the closures, or by the docids merge, is propagated. | ||||
| fn execute_on_word_pairs_and_prefixes<I>( | ||||
|     proximity: u8, | ||||
|     prefix: &[u8], | ||||
|     iter: &mut I, | ||||
|     mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>, | ||||
|     mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, | ||||
| ) -> Result<()> { | ||||
|     // Every docids payload seen for a given `word2`; the map keeps the `word2` keys sorted. | ||||
|     let mut grouped: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default(); | ||||
|  | ||||
|     // Memory usage note: | ||||
|     // one payload is buffered for each `word2` that follows a word beginning with `prefix` | ||||
|     // at the given proximity. In practice the batch is not expected to grow too large. | ||||
|     loop { | ||||
|         let (word2, docids) = match next_word2_and_docids(iter)? { | ||||
|             Some(pair) => pair, | ||||
|             None => break, | ||||
|         }; | ||||
|         grouped.entry(word2.to_owned()).or_default().push(Cow::Owned(docids.to_owned())); | ||||
|     } | ||||
|  | ||||
|     // Fixed head of every key: the proximity byte, the prefix bytes, then a `0` byte. | ||||
|     let head_len = prefix.len() + 2; | ||||
|     let mut key_buffer = Vec::with_capacity(512); | ||||
|     key_buffer.push(proximity); | ||||
|     key_buffer.extend_from_slice(prefix); | ||||
|     key_buffer.push(0); | ||||
|  | ||||
|     // Scratch space reused whenever several payloads have to be merged for one key. | ||||
|     let mut merged = Vec::with_capacity(65_536); | ||||
|  | ||||
|     for (word2, payloads) in grouped { | ||||
|         // Drop the previous `word2` and append the current one after the fixed head. | ||||
|         key_buffer.truncate(head_len); | ||||
|         key_buffer.extend_from_slice(&word2); | ||||
|  | ||||
|         merged.clear(); | ||||
|         let data: &[u8] = if payloads.len() <= 1 { | ||||
|             // A lone payload is forwarded untouched. | ||||
|             &*payloads[0] | ||||
|         } else { | ||||
|             CboRoaringBitmapCodec::merge_into(&payloads, &mut merged)?; | ||||
|             merged.as_slice() | ||||
|         }; | ||||
|         insert(key_buffer.as_slice(), data)?; | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
| @@ -1,20 +0,0 @@ | ||||
| --- | ||||
| source: milli/src/update/prefix_word_pairs/mod.rs | ||||
| --- | ||||
| 1  a    5                [101, ] | ||||
| 1  a    amazing          [100, ] | ||||
| 1  a    an               [100, ] | ||||
| 1  a    and              [100, ] | ||||
| 1  a    beautiful        [100, ] | ||||
| 1  b    house            [100, ] | ||||
| 1  b    rings            [101, ] | ||||
| 1  be   house            [100, ] | ||||
| 1  be   rings            [101, ] | ||||
| 2  a    am               [101, ] | ||||
| 2  a    amazing          [100, ] | ||||
| 2  a    and              [100, ] | ||||
| 2  a    beautiful        [100, ] | ||||
| 2  a    house            [100, ] | ||||
| 2  b    at               [101, ] | ||||
| 2  be   at               [101, ] | ||||
|  | ||||
| @@ -1,23 +0,0 @@ | ||||
| --- | ||||
| source: milli/src/update/prefix_word_pairs/mod.rs | ||||
| --- | ||||
| 1  5                a    [101, ] | ||||
| 1  amazing          a    [100, ] | ||||
| 1  an               a    [100, ] | ||||
| 1  and              b    [100, ] | ||||
| 1  and              be   [100, ] | ||||
| 1  at               a    [100, ] | ||||
| 1  rings            a    [101, ] | ||||
| 1  the              b    [101, ] | ||||
| 1  the              be   [101, ] | ||||
| 2  amazing          b    [100, ] | ||||
| 2  amazing          be   [100, ] | ||||
| 2  an               a    [100, ] | ||||
| 2  at               a    [100, 101, ] | ||||
| 2  bell             a    [101, ] | ||||
| 3  an               b    [100, ] | ||||
| 3  an               be   [100, ] | ||||
| 3  at               a    [100, ] | ||||
| 3  rings            a    [101, ] | ||||
| 3  the              a    [101, ] | ||||
|  | ||||
| @@ -1,29 +0,0 @@ | ||||
| --- | ||||
| source: milli/src/update/prefix_word_pairs/mod.rs | ||||
| --- | ||||
| 1  a    5                [101, ] | ||||
| 1  a    amazing          [100, ] | ||||
| 1  a    an               [100, 202, ] | ||||
| 1  a    and              [100, ] | ||||
| 1  a    beautiful        [100, ] | ||||
| 1  a    extraordinary    [202, ] | ||||
| 1  am   and              [100, ] | ||||
| 1  an   amazing          [100, ] | ||||
| 1  an   beautiful        [100, ] | ||||
| 1  an   extraordinary    [202, ] | ||||
| 1  b    house            [100, ] | ||||
| 1  b    rings            [101, ] | ||||
| 1  be   house            [100, ] | ||||
| 1  be   rings            [101, ] | ||||
| 2  a    am               [101, ] | ||||
| 2  a    amazing          [100, ] | ||||
| 2  a    and              [100, ] | ||||
| 2  a    beautiful        [100, ] | ||||
| 2  a    extraordinary    [202, ] | ||||
| 2  a    house            [100, 202, ] | ||||
| 2  am   beautiful        [100, ] | ||||
| 2  an   and              [100, ] | ||||
| 2  an   house            [100, 202, ] | ||||
| 2  b    at               [101, ] | ||||
| 2  be   at               [101, ] | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user