mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	Compute and merge discovered ids
This commit is contained in:
		| @@ -85,10 +85,36 @@ impl Main { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn merge_internal_ids(self, writer: &mut heed::RwTxn<MainT>, new_ids: &sdset::Set<DocumentId>) -> ZResult<()> { | ||||
          // Merges `new_ids` into the internal ids read through `writer` and stores the
          // merged set with `put_internal_ids`. Errors from the read or the write are
          // returned to the caller.
|         use sdset::SetOperation; | ||||
|  | ||||
|         // Take the union of the stored internal ids and the new ones. | ||||
|         let internal_ids = self.internal_ids(writer)?; | ||||
|         let internal_ids = sdset::duo::Union::new(&new_ids, &internal_ids).into_set_buf(); | ||||
|         self.put_internal_ids(writer, &internal_ids) | ||||
|     } | ||||
|  | ||||
|     pub fn put_user_ids(self, writer: &mut heed::RwTxn<MainT>, ids: &fst::Map) -> ZResult<()> { | ||||
          // Writes the raw bytes of the `ids` FST into `self.main` under `USER_IDS_KEY`.
|         self.main.put::<_, Str, ByteSlice>(writer, USER_IDS_KEY, ids.as_fst().as_bytes()) | ||||
|     } | ||||
|  | ||||
|     pub fn merge_user_ids(self, writer: &mut heed::RwTxn<MainT>, new_ids: &fst::Map) -> ZResult<()> { | ||||
          // Merges `new_ids` with the user ids map read through `writer` and writes the
          // merged map back into `self.main` under `USER_IDS_KEY`.
|         use fst::{Streamer, IntoStreamer}; | ||||
|  | ||||
|         let user_ids = self.user_ids(writer)?; | ||||
|  | ||||
|         // Build the union of the stored user ids and the new ones. | ||||
|         let mut op = user_ids.op().add(new_ids.into_stream()).r#union(); | ||||
|         let mut build = fst::MapBuilder::memory(); | ||||
|         while let Some((userid, values)) = op.next() { | ||||
              // Only the first value reported for a key is kept.
              // NOTE(review): presumably a user id present in both maps carries the same
              // internal id on each side — confirm. `unwrap` panics if the insertion fails.
|             build.insert(userid, values[0].value).unwrap(); | ||||
|         } | ||||
          // `unwrap` panics if `into_inner` returns an error.
|         let user_ids = build.into_inner().unwrap(); | ||||
|  | ||||
|         // TODO prefer using self.put_user_ids | ||||
|         self.main.put::<_, Str, ByteSlice>(writer, USER_IDS_KEY, user_ids.as_slice()) | ||||
|     } | ||||
|  | ||||
|     pub fn user_ids(self, reader: &heed::RoTxn<MainT>) -> ZResult<fst::Map> { | ||||
|         match self.main.get::<_, Str, ByteSlice>(reader, USER_IDS_KEY)? { | ||||
|             Some(bytes) => { | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| use std::collections::HashMap; | ||||
| use std::collections::{HashMap, BTreeMap}; | ||||
|  | ||||
| use fst::{set::OpBuilder, SetBuilder}; | ||||
| use indexmap::IndexMap; | ||||
| @@ -13,7 +13,7 @@ use crate::database::{UpdateEvent, UpdateEventsEmitter}; | ||||
| use crate::facets; | ||||
| use crate::raw_indexer::RawIndexer; | ||||
| use crate::serde::Deserializer; | ||||
| use crate::store::{self, DocumentsFields, DocumentsFieldsCounts}; | ||||
| use crate::store::{self, DocumentsFields, DocumentsFieldsCounts, DiscoverIds}; | ||||
| use crate::update::helpers::{index_value, value_to_number, extract_document_id}; | ||||
| use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; | ||||
| use crate::{Error, MResult, RankedMap}; | ||||
| @@ -150,17 +150,26 @@ pub fn apply_addition<'a, 'b>( | ||||
|     partial: bool | ||||
| ) -> MResult<()> { | ||||
|     let mut documents_additions = HashMap::new(); | ||||
|     let mut new_user_ids = BTreeMap::new(); | ||||
|     let mut new_internal_ids = Vec::with_capacity(new_documents.len()); | ||||
|  | ||||
|     let mut schema = match index.main.schema(writer)? { | ||||
|         Some(schema) => schema, | ||||
|         None => return Err(Error::SchemaMissing), | ||||
|     }; | ||||
|  | ||||
|     // Retrieve the documents ids related structures | ||||
|     let user_ids = index.main.user_ids(writer)?; | ||||
|     let internal_ids = index.main.internal_ids(writer)?; | ||||
|     let mut available_ids = DiscoverIds::new(&internal_ids); | ||||
|  | ||||
|     let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?; | ||||
|  | ||||
|     // 1. store documents ids for future deletion | ||||
|     for mut document in new_documents { | ||||
|         let document_id = extract_document_id(&primary_key, &document)?; | ||||
|         let (document_id, userid) = extract_document_id(&primary_key, &document, &user_ids, &mut available_ids)?; | ||||
|         new_user_ids.insert(userid, document_id.0); | ||||
|         new_internal_ids.push(document_id); | ||||
|  | ||||
|         if partial { | ||||
|             let mut deserializer = Deserializer { | ||||
| @@ -233,6 +242,11 @@ pub fn apply_addition<'a, 'b>( | ||||
|  | ||||
|     index.main.put_schema(writer, &schema)?; | ||||
|  | ||||
|     let new_user_ids = fst::Map::from_iter(new_user_ids)?; | ||||
|     let new_internal_ids = sdset::SetBuf::from_dirty(new_internal_ids); | ||||
|     index.main.merge_user_ids(writer, &new_user_ids)?; | ||||
|     index.main.merge_internal_ids(writer, &new_internal_ids)?; | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -71,7 +71,10 @@ pub fn apply_documents_deletion( | ||||
|     writer: &mut heed::RwTxn<MainT>, | ||||
|     index: &store::Index, | ||||
|     deletion: Vec<DocumentId>, | ||||
| ) -> MResult<()> { | ||||
| ) -> MResult<()> | ||||
| { | ||||
|     unimplemented!("When we delete documents we must ask for user ids instead of internal ones"); | ||||
|  | ||||
|     let schema = match index.main.schema(writer)? { | ||||
|         Some(schema) => schema, | ||||
|         None => return Err(Error::SchemaMissing), | ||||
|   | ||||
| @@ -1,16 +1,15 @@ | ||||
| use std::fmt::Write as _; | ||||
| use std::hash::{Hash, Hasher}; | ||||
|  | ||||
| use indexmap::IndexMap; | ||||
| use meilisearch_schema::IndexedPos; | ||||
| use meilisearch_types::DocumentId; | ||||
| use ordered_float::OrderedFloat; | ||||
| use serde_json::Value; | ||||
| use siphasher::sip::SipHasher; | ||||
|  | ||||
| use crate::Number; | ||||
| use crate::raw_indexer::RawIndexer; | ||||
| use crate::serde::SerializerError; | ||||
| use crate::Number; | ||||
| use crate::store::DiscoverIds; | ||||
|  | ||||
| /// Returns the number of words indexed or `None` if the type is unindexable. | ||||
| pub fn index_value( | ||||
| @@ -96,28 +95,43 @@ pub fn value_to_number(value: &Value) -> Option<Number> { | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Validates a string representation to be a correct document id and | ||||
| /// returns the hash of the given type, this is the way we produce documents ids. | ||||
| pub fn compute_document_id(string: &str) -> Result<DocumentId, SerializerError> { | ||||
|     if string.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') { | ||||
|         let mut s = SipHasher::new(); | ||||
|         string.hash(&mut s); | ||||
|         Ok(DocumentId(s.finish())) | ||||
| /// Validates that `userid` only contains ASCII alphanumerics, `-` or `_`, then returns | ||||
| /// the internal id already mapped to it in `user_ids` or a new one taken from `available_ids`. | ||||
| pub fn discover_document_id( | ||||
|     userid: &str, | ||||
|     user_ids: &fst::Map, | ||||
|     available_ids: &mut DiscoverIds<'_>, | ||||
| ) -> Result<DocumentId, SerializerError> | ||||
| { | ||||
      // Any other character makes the user id invalid, see the `else` branch below.
|     if userid.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') { | ||||
|         match user_ids.get(userid) { | ||||
              // Known user id: reuse the internal id stored in the map.
|             Some(internal_id) => Ok(DocumentId(internal_id)), | ||||
|             None => { | ||||
                  // Unknown user id: take the next id from `available_ids`.
                  // Panics with "no more ids available" when `next()` yields `None`.
|                 let internal_id = available_ids.next().expect("no more ids available"); | ||||
|                 Ok(internal_id) | ||||
|             }, | ||||
|         } | ||||
|     } else { | ||||
|         Err(SerializerError::InvalidDocumentIdFormat) | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Extracts and validates the document id of a document. | ||||
| pub fn extract_document_id(primary_key: &str, document: &IndexMap<String, Value>) -> Result<DocumentId, SerializerError> { | ||||
| pub fn extract_document_id( | ||||
|     primary_key: &str, | ||||
|     document: &IndexMap<String, Value>, | ||||
|     user_ids: &fst::Map, | ||||
|     available_ids: &mut DiscoverIds<'_>, | ||||
| ) -> Result<(DocumentId, String), SerializerError> | ||||
| { | ||||
|     match document.get(primary_key) { | ||||
|         Some(value) => { | ||||
|             let string = match value { | ||||
|             let userid = match value { | ||||
|                 Value::Number(number) => number.to_string(), | ||||
|                 Value::String(string) => string.clone(), | ||||
|                 _ => return Err(SerializerError::InvalidDocumentIdFormat), | ||||
|             }; | ||||
|             compute_document_id(&string) | ||||
|             discover_document_id(&userid, user_ids, available_ids).map(|id| (id, userid)) | ||||
|         } | ||||
|         None => Err(SerializerError::DocumentIdNotFound), | ||||
|     } | ||||
|   | ||||
| @@ -9,7 +9,7 @@ pub use self::clear_all::{apply_clear_all, push_clear_all}; | ||||
| pub use self::customs_update::{apply_customs_update, push_customs_update}; | ||||
| pub use self::documents_addition::{apply_documents_addition, apply_documents_partial_addition, DocumentsAddition}; | ||||
| pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion}; | ||||
| pub use self::helpers::{index_value, value_to_string, value_to_number, compute_document_id, extract_document_id}; | ||||
| pub use self::helpers::{index_value, value_to_string, value_to_number, discover_document_id, extract_document_id}; | ||||
| pub use self::settings_update::{apply_settings_update, push_settings_update}; | ||||
|  | ||||
| use std::cmp; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user