mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 07:56:28 +00:00 
			
		
		
		
	always push the user defined vectors in arroy
This commit is contained in:
		| @@ -1,244 +0,0 @@ | ||||
| --- | ||||
| source: milli/src/search/new/tests/attribute_fid.rs | ||||
| expression: "format!(\"{document_ids_scores:#?}\")" | ||||
| --- | ||||
| [ | ||||
|     ( | ||||
|         2, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 19, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 91, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         6, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 15, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 81, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         5, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 14, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 79, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         4, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 13, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 77, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         3, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 12, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 83, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         9, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 11, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 75, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         8, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 10, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 79, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         7, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 10, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 73, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         11, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 7, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 77, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         10, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 6, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 81, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         13, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 6, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 81, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         12, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 6, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 78, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         14, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 5, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 75, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
|     ( | ||||
|         0, | ||||
|         [ | ||||
|             Fid( | ||||
|                 Rank { | ||||
|                     rank: 1, | ||||
|                     max_rank: 19, | ||||
|                 }, | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 91, | ||||
|                     max_rank: 91, | ||||
|                 }, | ||||
|             ), | ||||
|         ], | ||||
|     ), | ||||
| ] | ||||
| @@ -1,7 +0,0 @@ | ||||
| --- | ||||
| source: milli/src/index.rs | ||||
| --- | ||||
| age              1      | | ||||
| id               2      | | ||||
| name             2      | | ||||
|  | ||||
| @@ -1,7 +0,0 @@ | ||||
| --- | ||||
| source: milli/src/index.rs | ||||
| --- | ||||
| age              1      | | ||||
| id               2      | | ||||
| name             2      | | ||||
|  | ||||
| @@ -8,7 +8,6 @@ use std::sync::Arc; | ||||
|  | ||||
| use bytemuck::cast_slice; | ||||
| use grenad::Writer; | ||||
| use itertools::EitherOrBoth; | ||||
| use ordered_float::OrderedFloat; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::Value; | ||||
| @@ -50,7 +49,7 @@ enum VectorStateDelta { | ||||
|     // Note: changing the value of the manually specified vector **should not record** this delta | ||||
|     WasGeneratedNowManual(Vec<Vec<f32>>), | ||||
|  | ||||
|     ManualDelta(Vec<Vec<f32>>, Vec<Vec<f32>>), | ||||
|     ManualDelta(Vec<Vec<f32>>), | ||||
|  | ||||
|     // Add the vector computed from the specified prompt | ||||
|     // Remove any previous vector | ||||
| @@ -59,14 +58,12 @@ enum VectorStateDelta { | ||||
| } | ||||
|  | ||||
| impl VectorStateDelta { | ||||
|     fn into_values(self) -> (bool, String, (Vec<Vec<f32>>, Vec<Vec<f32>>)) { | ||||
|     fn into_values(self) -> (bool, String, Vec<Vec<f32>>) { | ||||
|         match self { | ||||
|             VectorStateDelta::NoChange => Default::default(), | ||||
|             VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), | ||||
|             VectorStateDelta::WasGeneratedNowManual(add) => { | ||||
|                 (true, Default::default(), (Default::default(), add)) | ||||
|             } | ||||
|             VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)), | ||||
|             VectorStateDelta::WasGeneratedNowManual(add) => (true, Default::default(), add), | ||||
|             VectorStateDelta::ManualDelta(add) => (false, Default::default(), add), | ||||
|             VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), | ||||
|         } | ||||
|     } | ||||
| @@ -166,8 +163,14 @@ pub fn extract_vector_points<R: io::Read + io::Seek>( | ||||
|         // lazily get it when needed | ||||
|         let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; | ||||
|  | ||||
|         let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid) | ||||
|             .map_err(|error| error.to_crate_error(document_id().to_string()))?; | ||||
|         let mut parsed_vectors = ParsedVectorsDiff::new( | ||||
|             docid, | ||||
|             embedders_configs, | ||||
|             obkv, | ||||
|             old_vectors_fid, | ||||
|             new_vectors_fid, | ||||
|         ) | ||||
|         .map_err(|error| error.to_crate_error(document_id().to_string()))?; | ||||
|  | ||||
|         for EmbedderVectorExtractor { | ||||
|             embedder_name, | ||||
| @@ -182,7 +185,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>( | ||||
|         { | ||||
|             let delta = match parsed_vectors.remove(embedder_name) { | ||||
|                 (Some(old), Some(new)) => { | ||||
|                     match (old.is_user_provided(), new.is_user_provided()) { | ||||
|                     match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) { | ||||
|                         (true, true) | (false, false) => (), | ||||
|                         (true, false) => { | ||||
|                             remove_from_user_defined.insert(docid); | ||||
| @@ -193,7 +196,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>( | ||||
|                     } | ||||
|  | ||||
|                     // no autogeneration | ||||
|                     let del_vectors = old.into_array_of_vectors(); | ||||
|                     let add_vectors = new.into_array_of_vectors(); | ||||
|  | ||||
|                     if add_vectors.len() > usize::from(u8::MAX) { | ||||
| @@ -203,15 +205,15 @@ pub fn extract_vector_points<R: io::Read + io::Seek>( | ||||
|                         ))); | ||||
|                     } | ||||
|  | ||||
|                     VectorStateDelta::ManualDelta(del_vectors, add_vectors) | ||||
|                     VectorStateDelta::ManualDelta(add_vectors) | ||||
|                 } | ||||
|                 (Some(_old), None) => { | ||||
|                 (Some(old), None) => { | ||||
|                     // Do we keep this document? | ||||
|                     let document_is_kept = obkv | ||||
|                         .iter() | ||||
|                         .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) | ||||
|                         .any(|deladd| deladd.get(DelAdd::Addition).is_some()); | ||||
|                     if document_is_kept { | ||||
|                     if document_is_kept && old.is_some() { | ||||
|                         remove_from_user_defined.insert(docid); | ||||
|                         // becomes autogenerated | ||||
|                         VectorStateDelta::NowGenerated(prompt.render( | ||||
| @@ -219,6 +221,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>( | ||||
|                             DelAdd::Addition, | ||||
|                             new_fields_ids_map, | ||||
|                         )?) | ||||
|                     } else if document_is_kept && old.is_none() { | ||||
|                         VectorStateDelta::NoChange | ||||
|                     } else { | ||||
|                         VectorStateDelta::NowRemoved | ||||
|                     } | ||||
| @@ -315,8 +319,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>( | ||||
|     Ok(results) | ||||
| } | ||||
|  | ||||
| /// Computes the diff between both Del and Add numbers and | ||||
| /// only inserts the parts that differ in the sorter. | ||||
| /// We cannot compute the diff between both Del and Add vectors. | ||||
| /// We'll push every vector and compute the difference later in TypedChunk. | ||||
| fn push_vectors_diff( | ||||
|     remove_vectors_writer: &mut Writer<BufWriter<File>>, | ||||
|     prompts_writer: &mut Writer<BufWriter<File>>, | ||||
| @@ -325,7 +329,7 @@ fn push_vectors_diff( | ||||
|     delta: VectorStateDelta, | ||||
|     reindex_vectors: bool, | ||||
| ) -> Result<()> { | ||||
|     let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); | ||||
|     let (must_remove, prompt, mut add_vectors) = delta.into_values(); | ||||
|     if must_remove | ||||
|     // TODO: the below condition works because we erase the vec database when an embedding setting changes. | ||||
|     // When vector pipeline will be optimized, this should be removed. | ||||
| @@ -340,44 +344,25 @@ fn push_vectors_diff( | ||||
|     } | ||||
|  | ||||
|     // We sort and dedup the vectors | ||||
|     del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); | ||||
|     add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); | ||||
|     del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); | ||||
|     add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); | ||||
|  | ||||
|     let merged_vectors_iter = | ||||
|         itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); | ||||
|     // let merged_vectors_iter = | ||||
|     //     itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); | ||||
|  | ||||
|     // insert vectors into the writer | ||||
|     for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { | ||||
|     for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { | ||||
|         // Generate the key by extending the unique index to it. | ||||
|         key_buffer.truncate(TRUNCATE_SIZE); | ||||
|         let index = u16::try_from(i).unwrap(); | ||||
|         key_buffer.extend_from_slice(&index.to_be_bytes()); | ||||
|  | ||||
|         match eob { | ||||
|             EitherOrBoth::Both(_, _) => (), // no need to touch anything | ||||
|             EitherOrBoth::Left(vector) => { | ||||
|                 // TODO: the below condition works because we erase the vec database when a embedding setting changes. | ||||
|                 // When vector pipeline will be optimized, this should be removed. | ||||
|                 if !reindex_vectors { | ||||
|                     // We insert only the Del part of the Obkv to inform | ||||
|                     // that we only want to remove all those vectors. | ||||
|                     let mut obkv = KvWriterDelAdd::memory(); | ||||
|                     obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; | ||||
|                     let bytes = obkv.into_inner()?; | ||||
|                     manual_vectors_writer.insert(&key_buffer, bytes)?; | ||||
|                 } | ||||
|             } | ||||
|             EitherOrBoth::Right(vector) => { | ||||
|                 // We insert only the Add part of the Obkv to inform | ||||
|                 // that we only want to remove all those vectors. | ||||
|                 let mut obkv = KvWriterDelAdd::memory(); | ||||
|                 obkv.insert(DelAdd::Addition, cast_slice(&vector))?; | ||||
|                 let bytes = obkv.into_inner()?; | ||||
|                 manual_vectors_writer.insert(&key_buffer, bytes)?; | ||||
|             } | ||||
|         } | ||||
|         // We insert only the Add part of the Obkv to inform | ||||
|         // that we only want to add all those vectors. | ||||
|         let mut obkv = KvWriterDelAdd::memory(); | ||||
|         obkv.insert(DelAdd::Addition, cast_slice(&vector))?; | ||||
|         let bytes = obkv.into_inner()?; | ||||
|         manual_vectors_writer.insert(&key_buffer, bytes)?; | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
|   | ||||
| @@ -4,8 +4,9 @@ use obkv::KvReader; | ||||
| use serde_json::{from_slice, Value}; | ||||
|  | ||||
| use super::Embedding; | ||||
| use crate::index::IndexEmbeddingConfig; | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd}; | ||||
| use crate::{FieldId, InternalError, UserError}; | ||||
| use crate::{DocumentId, FieldId, InternalError, UserError}; | ||||
|  | ||||
| pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; | ||||
|  | ||||
| @@ -42,17 +43,19 @@ pub struct ExplicitVectors { | ||||
| } | ||||
|  | ||||
| pub struct ParsedVectorsDiff { | ||||
|     pub old: Option<BTreeMap<String, Vectors>>, | ||||
|     pub old: BTreeMap<String, Option<Vectors>>, | ||||
|     pub new: Option<BTreeMap<String, Vectors>>, | ||||
| } | ||||
|  | ||||
| impl ParsedVectorsDiff { | ||||
|     pub fn new( | ||||
|         docid: DocumentId, | ||||
|         embedders_configs: &[IndexEmbeddingConfig], | ||||
|         documents_diff: KvReader<'_, FieldId>, | ||||
|         old_vectors_fid: Option<FieldId>, | ||||
|         new_vectors_fid: Option<FieldId>, | ||||
|     ) -> Result<Self, Error> { | ||||
|         let old = match old_vectors_fid | ||||
|         let mut old = match old_vectors_fid | ||||
|             .and_then(|vectors_fid| documents_diff.get(vectors_fid)) | ||||
|             .map(KvReaderDelAdd::new) | ||||
|             .map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) | ||||
| @@ -68,7 +71,13 @@ impl ParsedVectorsDiff { | ||||
|                 return Err(error); | ||||
|             } | ||||
|         } | ||||
|         .flatten(); | ||||
|         .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, Some(vec))).collect()); | ||||
|         for embedding_config in embedders_configs { | ||||
|             if embedding_config.user_defined.contains(docid) { | ||||
|                 old.entry(embedding_config.name.to_string()).or_insert(None); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         let new = new_vectors_fid | ||||
|             .and_then(|vectors_fid| documents_diff.get(vectors_fid)) | ||||
|             .map(KvReaderDelAdd::new) | ||||
| @@ -78,8 +87,9 @@ impl ParsedVectorsDiff { | ||||
|         Ok(Self { old, new }) | ||||
|     } | ||||
|  | ||||
|     pub fn remove(&mut self, embedder_name: &str) -> (Option<Vectors>, Option<Vectors>) { | ||||
|         let old = self.old.as_mut().and_then(|old| old.remove(embedder_name)); | ||||
|     /// Returns `(Some(None), _)` when the embedder's vectors for this document are user-defined and stored in the database but absent from the old version of the document. | ||||
|     pub fn remove(&mut self, embedder_name: &str) -> (Option<Option<Vectors>>, Option<Vectors>) { | ||||
|         let old = self.old.remove(embedder_name); | ||||
|         let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); | ||||
|         (old, new) | ||||
|     } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user