mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-10-26 13:36:27 +00:00
			
		
		
		
	Use an LMDB database to store the external documents ids
This commit is contained in:
		
				
					committed by
					
						 Louis Dureuil
						Louis Dureuil
					
				
			
			
				
	
			
			
			
						parent
						
							fdf3f7f627
						
					
				
				
					commit
					dfab6293c9
				
			| @@ -1575,11 +1575,14 @@ fn delete_document_by_filter<'a>( | |||||||
|             } |             } | ||||||
|             e => e.into(), |             e => e.into(), | ||||||
|         })?; |         })?; | ||||||
|         let external_documents_ids = index.external_documents_ids(wtxn)?; |         let external_documents_ids = index.external_documents_ids(); | ||||||
|         // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings). |         // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings). | ||||||
|         // Since what we have is an iterator, it would be better to delete in chunks |         // Since what we have is an iterator, it would be better to delete in chunks | ||||||
|         let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> = |         let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> = | ||||||
|             external_documents_ids.find_external_id_of(candidates).only_external_ids().collect(); |             external_documents_ids | ||||||
|  |                 .find_external_id_of(wtxn, candidates)? | ||||||
|  |                 .only_external_ids() | ||||||
|  |                 .collect(); | ||||||
|         let document_ids = match external_to_internal { |         let document_ids = match external_to_internal { | ||||||
|             Ok(external_ids) => external_ids, |             Ok(external_ids) => external_ids, | ||||||
|             Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids), |             Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids), | ||||||
|   | |||||||
| @@ -612,8 +612,8 @@ fn retrieve_document<S: AsRef<str>>( | |||||||
|     let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); |     let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); | ||||||
|  |  | ||||||
|     let internal_id = index |     let internal_id = index | ||||||
|         .external_documents_ids(&txn)? |         .external_documents_ids() | ||||||
|         .get(doc_id.as_bytes()) |         .get(&txn, doc_id)? | ||||||
|         .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; |         .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; | ||||||
|  |  | ||||||
|     let document = index |     let document = index | ||||||
|   | |||||||
| @@ -1,12 +1,11 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::collections::HashMap; | use std::collections::HashMap; | ||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
| use std::fmt; |  | ||||||
|  |  | ||||||
| use fst::Streamer; | use heed::types::{OwnedType, Str}; | ||||||
|  | use heed::{Database, RoIter, RoTxn, RwTxn}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::DocumentId; | use crate::{DocumentId, BEU32}; | ||||||
|  |  | ||||||
| pub enum DocumentOperationKind { | pub enum DocumentOperationKind { | ||||||
|     Create, |     Create, | ||||||
| @@ -19,41 +18,31 @@ pub struct DocumentOperation { | |||||||
|     pub kind: DocumentOperationKind, |     pub kind: DocumentOperationKind, | ||||||
| } | } | ||||||
|  |  | ||||||
| pub struct ExternalDocumentsIds<'a>(fst::Map<Cow<'a, [u8]>>); | pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>); | ||||||
|  |  | ||||||
| impl<'a> ExternalDocumentsIds<'a> { | impl ExternalDocumentsIds { | ||||||
|     pub fn new(fst: fst::Map<Cow<'a, [u8]>>) -> ExternalDocumentsIds<'a> { |     pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds { | ||||||
|         ExternalDocumentsIds(fst) |         ExternalDocumentsIds(db) | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn into_static(self) -> ExternalDocumentsIds<'static> { |  | ||||||
|         ExternalDocumentsIds(self.0.map_data(|c| Cow::Owned(c.into_owned())).unwrap()) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Returns `true` if hard and soft external documents lists are empty. |     /// Returns `true` if hard and soft external documents lists are empty. | ||||||
|     pub fn is_empty(&self) -> bool { |     pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> { | ||||||
|         self.0.is_empty() |         self.0.is_empty(rtxn).map_err(Into::into) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> { |     pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> { | ||||||
|         let external_id = external_id.as_ref(); |         Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get().try_into().unwrap())) | ||||||
|         self.0.get(external_id).map(|x| x.try_into().unwrap()) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// A helper function to debug this type, returns a `HashMap` of both, |     /// A helper function to debug this type, returns a `HashMap` of both, | ||||||
|     /// soft and hard fst maps, combined. |     /// soft and hard fst maps, combined. | ||||||
|     pub fn to_hash_map(&self) -> HashMap<String, u32> { |     pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, u32>> { | ||||||
|         let mut map = HashMap::default(); |         let mut map = HashMap::default(); | ||||||
|         let mut stream = self.0.stream(); |         for result in self.0.iter(rtxn)? { | ||||||
|         while let Some((k, v)) = stream.next() { |             let (external, internal) = result?; | ||||||
|             let k = String::from_utf8(k.to_vec()).unwrap(); |             map.insert(external.to_owned(), internal.get().try_into().unwrap()); | ||||||
|             map.insert(k, v.try_into().unwrap()); |  | ||||||
|         } |         } | ||||||
|         map |         Ok(map) | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn as_bytes(&self) -> &[u8] { |  | ||||||
|         self.0.as_fst().as_bytes() |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between |     /// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between | ||||||
| @@ -65,12 +54,12 @@ impl<'a> ExternalDocumentsIds<'a> { | |||||||
|     /// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found. |     /// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found. | ||||||
|     ///   In that case the returned bitmap contains the internal ids whose external ids were not found after traversing |     ///   In that case the returned bitmap contains the internal ids whose external ids were not found after traversing | ||||||
|     ///   the entire fst. |     ///   the entire fst. | ||||||
|     pub fn find_external_id_of( |     pub fn find_external_id_of<'t>( | ||||||
|         &self, |         &self, | ||||||
|  |         rtxn: &'t RoTxn, | ||||||
|         internal_ids: RoaringBitmap, |         internal_ids: RoaringBitmap, | ||||||
|     ) -> ExternalToInternalOwnedIterator<'_> { |     ) -> heed::Result<ExternalToInternalOwnedIterator<'t>> { | ||||||
|         let it = ExternalToInternalOwnedIterator { stream: self.0.stream(), internal_ids }; |         self.0.iter(rtxn).map(|iter| ExternalToInternalOwnedIterator { iter, internal_ids }) | ||||||
|         it |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Applies the list of operations passed as argument, modifying the current external to internal id mapping. |     /// Applies the list of operations passed as argument, modifying the current external to internal id mapping. | ||||||
| @@ -81,84 +70,39 @@ impl<'a> ExternalDocumentsIds<'a> { | |||||||
|     /// |     /// | ||||||
|     /// - If attempting to delete a document that doesn't exist |     /// - If attempting to delete a document that doesn't exist | ||||||
|     /// - If attempting to create a document that already exists |     /// - If attempting to create a document that already exists | ||||||
|     pub fn apply(&mut self, mut operations: Vec<DocumentOperation>) { |     pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> { | ||||||
|         operations.sort_unstable_by(|left, right| left.external_id.cmp(&right.external_id)); |         for DocumentOperation { external_id, internal_id, kind } in operations { | ||||||
|         operations.dedup_by(|left, right| left.external_id == right.external_id); |             match kind { | ||||||
|  |                 DocumentOperationKind::Create => { | ||||||
|         let mut builder = fst::MapBuilder::memory(); |                     // TODO should we get before insert to be able to detect bugs? | ||||||
|  |                     // if matches!(kind, DocumentOperationKind::Create) { | ||||||
|         let mut stream = self.0.stream(); |                     //     panic!("Attempting to create an already-existing document"); | ||||||
|         let mut next_stream = stream.next(); |                     // } | ||||||
|         let mut operations = operations.iter(); |                     self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?; | ||||||
|         let mut next_operation = operations.next(); |                 } | ||||||
|  |                 DocumentOperationKind::Delete => { | ||||||
|         loop { |                     if !self.0.delete(wtxn, &external_id)? { | ||||||
|             (next_stream, next_operation) = match (next_stream.take(), next_operation.take()) { |  | ||||||
|                 (None, None) => break, |  | ||||||
|                 (None, Some(DocumentOperation { external_id, internal_id, kind })) => { |  | ||||||
|                     if matches!(kind, DocumentOperationKind::Delete) { |  | ||||||
|                         panic!("Attempting to delete a non-existing document") |                         panic!("Attempting to delete a non-existing document") | ||||||
|                     } |                     } | ||||||
|                     builder.insert(external_id, (*internal_id).into()).unwrap(); |  | ||||||
|                     (None, operations.next()) |  | ||||||
|                 } |                 } | ||||||
|                 (Some((k, v)), None) => { |  | ||||||
|                     builder.insert(k, v).unwrap(); |  | ||||||
|                     (stream.next(), None) |  | ||||||
|                 } |  | ||||||
|                 ( |  | ||||||
|                     current_stream @ Some((left_external_id, left_internal_id)), |  | ||||||
|                     current_operation @ Some(DocumentOperation { |  | ||||||
|                         external_id: right_external_id, |  | ||||||
|                         internal_id: right_internal_id, |  | ||||||
|                         kind, |  | ||||||
|                     }), |  | ||||||
|                 ) => match left_external_id.cmp(right_external_id.as_bytes()) { |  | ||||||
|                     std::cmp::Ordering::Less => { |  | ||||||
|                         builder.insert(left_external_id, left_internal_id).unwrap(); |  | ||||||
|                         (stream.next(), current_operation) |  | ||||||
|                     } |  | ||||||
|                     std::cmp::Ordering::Greater => { |  | ||||||
|                         builder.insert(right_external_id, (*right_internal_id).into()).unwrap(); |  | ||||||
|                         (current_stream, operations.next()) |  | ||||||
|                     } |  | ||||||
|                     std::cmp::Ordering::Equal => { |  | ||||||
|                         if matches!(kind, DocumentOperationKind::Create) { |  | ||||||
|                             panic!("Attempting to create an already-existing document"); |  | ||||||
|                         } |  | ||||||
|                         // we delete the document, so we just advance both iterators to skip in stream |  | ||||||
|                         (stream.next(), operations.next()) |  | ||||||
|                     } |  | ||||||
|                 }, |  | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         self.0 = builder.into_map().map_data(Cow::Owned).unwrap(); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl fmt::Debug for ExternalDocumentsIds<'_> { |         Ok(()) | ||||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |  | ||||||
|         f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish() |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl Default for ExternalDocumentsIds<'static> { |  | ||||||
|     fn default() -> Self { |  | ||||||
|         ExternalDocumentsIds(fst::Map::default().map_data(Cow::Owned).unwrap()) |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| /// An iterator over mappings between requested internal ids and external ids. | /// An iterator over mappings between requested internal ids and external ids. | ||||||
| /// | /// | ||||||
| /// See [`ExternalDocumentsIds::find_external_id_of`] for details. | /// See [`ExternalDocumentsIds::find_external_id_of`] for details. | ||||||
| pub struct ExternalToInternalOwnedIterator<'it> { | pub struct ExternalToInternalOwnedIterator<'t> { | ||||||
|     stream: fst::map::Stream<'it>, |     iter: RoIter<'t, Str, OwnedType<BEU32>>, | ||||||
|     internal_ids: RoaringBitmap, |     internal_ids: RoaringBitmap, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'it> Iterator for ExternalToInternalOwnedIterator<'it> { | impl<'t> Iterator for ExternalToInternalOwnedIterator<'t> { | ||||||
|     /// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids. |     /// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids. | ||||||
|     type Item = Result<(String, DocumentId), RoaringBitmap>; |     type Item = Result<(&'t str, DocumentId), RoaringBitmap>; | ||||||
|  |  | ||||||
|     fn next(&mut self) -> Option<Self::Item> { |     fn next(&mut self) -> Option<Self::Item> { | ||||||
|         // if all requested ids were found, we won't find any other, so short-circuit |         // if all requested ids were found, we won't find any other, so short-circuit | ||||||
| @@ -166,23 +110,28 @@ impl<'it> Iterator for ExternalToInternalOwnedIterator<'it> { | |||||||
|             return None; |             return None; | ||||||
|         } |         } | ||||||
|         loop { |         loop { | ||||||
|             let Some((external, internal)) = self.stream.next() else { |             let (external, internal) = match self.iter.next() { | ||||||
|                 // we exhausted the stream but we still have some internal ids to find |                 Some(Ok((external, internal))) => (external, internal), | ||||||
|                 let remaining_ids = std::mem::take(&mut self.internal_ids); |                 // TODO manage this better, remove panic | ||||||
|                 return Some(Err(remaining_ids)); |                 Some(Err(e)) => panic!("{}", e), | ||||||
|                 // note: next calls to `next` will return `None` since we replaced the internal_ids |                 _ => { | ||||||
|                 // with the default empty bitmap |                     // we exhausted the stream but we still have some internal ids to find | ||||||
|  |                     let remaining_ids = std::mem::take(&mut self.internal_ids); | ||||||
|  |                     return Some(Err(remaining_ids)); | ||||||
|  |                     // note: next calls to `next` will return `None` since we replaced the internal_ids | ||||||
|  |                     // with the default empty bitmap | ||||||
|  |                 } | ||||||
|             }; |             }; | ||||||
|             let internal = internal.try_into().unwrap(); |             let internal = internal.get(); | ||||||
|             let was_contained = self.internal_ids.remove(internal); |             let was_contained = self.internal_ids.remove(internal); | ||||||
|             if was_contained { |             if was_contained { | ||||||
|                 return Some(Ok((std::str::from_utf8(external).unwrap().to_owned(), internal))); |                 return Some(Ok((external, internal))); | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'it> ExternalToInternalOwnedIterator<'it> { | impl<'t> ExternalToInternalOwnedIterator<'t> { | ||||||
|     /// Returns the bitmap of internal ids whose external id are yet to be found |     /// Returns the bitmap of internal ids whose external id are yet to be found | ||||||
|     pub fn remaining_internal_ids(&self) -> &RoaringBitmap { |     pub fn remaining_internal_ids(&self) -> &RoaringBitmap { | ||||||
|         &self.internal_ids |         &self.internal_ids | ||||||
| @@ -191,7 +140,7 @@ impl<'it> ExternalToInternalOwnedIterator<'it> { | |||||||
|     /// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids. |     /// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids. | ||||||
|     /// |     /// | ||||||
|     /// Use this when you don't need the mapping between the external and the internal ids. |     /// Use this when you don't need the mapping between the external and the internal ids. | ||||||
|     pub fn only_external_ids(self) -> impl Iterator<Item = Result<String, RoaringBitmap>> + 'it { |     pub fn only_external_ids(self) -> impl Iterator<Item = Result<String, RoaringBitmap>> + 't { | ||||||
|         self.map(|res| res.map(|(external, _internal)| external)) |         self.map(|res| res.map(|(external, _internal)| external.to_owned())) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -51,7 +51,6 @@ pub mod main_key { | |||||||
|     /// It is concatenated with a big-endian encoded number (non-human readable). |     /// It is concatenated with a big-endian encoded number (non-human readable). | ||||||
|     /// e.g. vector-hnsw0x0032. |     /// e.g. vector-hnsw0x0032. | ||||||
|     pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw"; |     pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw"; | ||||||
|     pub const EXTERNAL_DOCUMENTS_IDS_KEY: &str = "external-documents-ids"; |  | ||||||
|     pub const PRIMARY_KEY_KEY: &str = "primary-key"; |     pub const PRIMARY_KEY_KEY: &str = "primary-key"; | ||||||
|     pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; |     pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; | ||||||
|     pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; |     pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields"; | ||||||
| @@ -81,6 +80,7 @@ pub mod db_name { | |||||||
|     pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; |     pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; | ||||||
|     pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; |     pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; | ||||||
|     pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids"; |     pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids"; | ||||||
|  |     pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids"; | ||||||
|     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; |     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; | ||||||
|     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; |     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; | ||||||
|     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; |     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; | ||||||
| @@ -112,6 +112,9 @@ pub struct Index { | |||||||
|     /// Contains many different types (e.g. the fields ids map). |     /// Contains many different types (e.g. the fields ids map). | ||||||
|     pub(crate) main: PolyDatabase, |     pub(crate) main: PolyDatabase, | ||||||
|  |  | ||||||
|  |     /// Maps the external documents ids with the internal document id. | ||||||
|  |     pub external_documents_ids: Database<Str, OwnedType<BEU32>>, | ||||||
|  |  | ||||||
|     /// A word and all the documents ids containing the word. |     /// A word and all the documents ids containing the word. | ||||||
|     pub word_docids: Database<Str, CboRoaringBitmapCodec>, |     pub word_docids: Database<Str, CboRoaringBitmapCodec>, | ||||||
|  |  | ||||||
| @@ -183,13 +186,15 @@ impl Index { | |||||||
|     ) -> Result<Index> { |     ) -> Result<Index> { | ||||||
|         use db_name::*; |         use db_name::*; | ||||||
|  |  | ||||||
|         options.max_dbs(25); |         options.max_dbs(26); | ||||||
|         unsafe { options.flag(Flags::MdbAlwaysFreePages) }; |         unsafe { options.flag(Flags::MdbAlwaysFreePages) }; | ||||||
|  |  | ||||||
|         let env = options.open(path)?; |         let env = options.open(path)?; | ||||||
|         let mut wtxn = env.write_txn()?; |         let mut wtxn = env.write_txn()?; | ||||||
|         let main = env.create_poly_database(&mut wtxn, Some(MAIN))?; |         let main = env.create_poly_database(&mut wtxn, Some(MAIN))?; | ||||||
|         let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?; |         let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?; | ||||||
|  |         let external_documents_ids = | ||||||
|  |             env.create_database(&mut wtxn, Some(EXTERNAL_DOCUMENTS_IDS))?; | ||||||
|         let exact_word_docids = env.create_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?; |         let exact_word_docids = env.create_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?; | ||||||
|         let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?; |         let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?; | ||||||
|         let exact_word_prefix_docids = |         let exact_word_prefix_docids = | ||||||
| @@ -235,6 +240,7 @@ impl Index { | |||||||
|         Ok(Index { |         Ok(Index { | ||||||
|             env, |             env, | ||||||
|             main, |             main, | ||||||
|  |             external_documents_ids, | ||||||
|             word_docids, |             word_docids, | ||||||
|             exact_word_docids, |             exact_word_docids, | ||||||
|             word_prefix_docids, |             word_prefix_docids, | ||||||
| @@ -386,29 +392,10 @@ impl Index { | |||||||
|  |  | ||||||
|     /* external documents ids */ |     /* external documents ids */ | ||||||
|  |  | ||||||
|     /// Writes the external documents ids and internal ids (i.e. `u32`). |  | ||||||
|     pub(crate) fn put_external_documents_ids( |  | ||||||
|         &self, |  | ||||||
|         wtxn: &mut RwTxn, |  | ||||||
|         external_documents_ids: &ExternalDocumentsIds<'_>, |  | ||||||
|     ) -> heed::Result<()> { |  | ||||||
|         self.main.put::<_, Str, ByteSlice>( |  | ||||||
|             wtxn, |  | ||||||
|             main_key::EXTERNAL_DOCUMENTS_IDS_KEY, |  | ||||||
|             external_documents_ids.as_bytes(), |  | ||||||
|         )?; |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Returns the external documents ids map which associate the external ids |     /// Returns the external documents ids map which associate the external ids | ||||||
|     /// with the internal ids (i.e. `u32`). |     /// with the internal ids (i.e. `u32`). | ||||||
|     pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> { |     pub fn external_documents_ids(&self) -> ExternalDocumentsIds { | ||||||
|         let fst = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::EXTERNAL_DOCUMENTS_IDS_KEY)?; |         ExternalDocumentsIds::new(self.external_documents_ids) | ||||||
|         let fst = match fst { |  | ||||||
|             Some(fst) => fst::Map::new(fst)?.map_data(Cow::Borrowed)?, |  | ||||||
|             None => fst::Map::default().map_data(Cow::Owned)?, |  | ||||||
|         }; |  | ||||||
|         Ok(ExternalDocumentsIds::new(fst)) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /* fields ids map */ |     /* fields ids map */ | ||||||
|   | |||||||
| @@ -1,7 +1,7 @@ | |||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use time::OffsetDateTime; | use time::OffsetDateTime; | ||||||
|  |  | ||||||
| use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; | use crate::{FieldDistribution, Index, Result}; | ||||||
|  |  | ||||||
| pub struct ClearDocuments<'t, 'u, 'i> { | pub struct ClearDocuments<'t, 'u, 'i> { | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
| @@ -20,6 +20,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|         let Index { |         let Index { | ||||||
|             env: _env, |             env: _env, | ||||||
|             main: _main, |             main: _main, | ||||||
|  |             external_documents_ids, | ||||||
|             word_docids, |             word_docids, | ||||||
|             exact_word_docids, |             exact_word_docids, | ||||||
|             word_prefix_docids, |             word_prefix_docids, | ||||||
| @@ -54,7 +55,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|         // We clean some of the main engine datastructures. |         // We clean some of the main engine datastructures. | ||||||
|         self.index.put_words_fst(self.wtxn, &fst::Set::default())?; |         self.index.put_words_fst(self.wtxn, &fst::Set::default())?; | ||||||
|         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; |         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; | ||||||
|         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; |  | ||||||
|         self.index.put_documents_ids(self.wtxn, &empty_roaring)?; |         self.index.put_documents_ids(self.wtxn, &empty_roaring)?; | ||||||
|         self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; |         self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; | ||||||
|         self.index.delete_geo_rtree(self.wtxn)?; |         self.index.delete_geo_rtree(self.wtxn)?; | ||||||
| @@ -62,6 +62,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|         self.index.delete_vector_hnsw(self.wtxn)?; |         self.index.delete_vector_hnsw(self.wtxn)?; | ||||||
|  |  | ||||||
|         // Clear the other databases. |         // Clear the other databases. | ||||||
|  |         external_documents_ids.clear(self.wtxn)?; | ||||||
|         word_docids.clear(self.wtxn)?; |         word_docids.clear(self.wtxn)?; | ||||||
|         exact_word_docids.clear(self.wtxn)?; |         exact_word_docids.clear(self.wtxn)?; | ||||||
|         word_prefix_docids.clear(self.wtxn)?; |         word_prefix_docids.clear(self.wtxn)?; | ||||||
|   | |||||||
| @@ -162,7 +162,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         FA: Fn() -> bool + Sync, |         FA: Fn() -> bool + Sync, | ||||||
|     { |     { | ||||||
|         let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); |         let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); | ||||||
|         let external_documents_ids = self.index.external_documents_ids(wtxn)?; |         let external_documents_ids = self.index.external_documents_ids(); | ||||||
|         let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; |         let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; | ||||||
|  |  | ||||||
|         let primary_key = cursor.primary_key().to_string(); |         let primary_key = cursor.primary_key().to_string(); | ||||||
| @@ -221,7 +221,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { |             let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) { | ||||||
|                 Entry::Occupied(entry) => *entry.get() as u32, |                 Entry::Occupied(entry) => *entry.get() as u32, | ||||||
|                 Entry::Vacant(entry) => { |                 Entry::Vacant(entry) => { | ||||||
|                     let docid = match external_documents_ids.get(entry.key()) { |                     let docid = match external_documents_ids.get(wtxn, entry.key())? { | ||||||
|                         Some(docid) => { |                         Some(docid) => { | ||||||
|                             // If it was already in the list of replaced documents it means it was deleted |                             // If it was already in the list of replaced documents it means it was deleted | ||||||
|                             // by the remove_document method. We should start as if it never existed. |                             // by the remove_document method. We should start as if it never existed. | ||||||
| @@ -373,7 +373,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         to_remove.sort_unstable(); |         to_remove.sort_unstable(); | ||||||
|         to_remove.dedup(); |         to_remove.dedup(); | ||||||
|  |  | ||||||
|         let external_documents_ids = self.index.external_documents_ids(wtxn)?; |         let external_documents_ids = self.index.external_documents_ids(); | ||||||
|  |  | ||||||
|         let mut documents_deleted = 0; |         let mut documents_deleted = 0; | ||||||
|         let mut document_sorter_buffer = Vec::new(); |         let mut document_sorter_buffer = Vec::new(); | ||||||
| @@ -410,7 +410,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|  |  | ||||||
|             // If the document was already in the db we mark it as a `to_delete` document. |             // If the document was already in the db we mark it as a `to_delete` document. | ||||||
|             // Then we push the document in sorters in deletion mode. |             // Then we push the document in sorters in deletion mode. | ||||||
|             let deleted_from_db = match external_documents_ids.get(&to_remove) { |             let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? { | ||||||
|                 Some(docid) => { |                 Some(docid) => { | ||||||
|                     self.replaced_documents_ids.insert(docid); |                     self.replaced_documents_ids.insert(docid); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -194,10 +194,8 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|                     db.delete(wtxn, &BEU32::new(docid))?; |                     db.delete(wtxn, &BEU32::new(docid))?; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|             let mut external_documents_docids = index.external_documents_ids(wtxn)?.into_static(); |             let external_documents_docids = index.external_documents_ids(); | ||||||
|             external_documents_docids.apply(operations); |             external_documents_docids.apply(wtxn, operations)?; | ||||||
|             index.put_external_documents_ids(wtxn, &external_documents_docids)?; |  | ||||||
|  |  | ||||||
|             index.put_documents_ids(wtxn, &docids)?; |             index.put_documents_ids(wtxn, &docids)?; | ||||||
|         } |         } | ||||||
|         TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { |         TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user