mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 07:56:28 +00:00 
			
		
		
		
	Add composite embedder
This commit is contained in:
		
							
								
								
									
										280
									
								
								crates/milli/src/vector/composite.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										280
									
								
								crates/milli/src/vector/composite.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,280 @@ | ||||
| use std::time::Instant; | ||||
|  | ||||
| use arroy::Distance; | ||||
|  | ||||
| use super::error::CompositeEmbedderContainsHuggingFace; | ||||
| use super::{ | ||||
|     hf, manual, ollama, openai, rest, DistributionShift, EmbedError, Embedding, NewEmbedderError, | ||||
| }; | ||||
| use crate::ThreadPoolNoAbort; | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub enum SubEmbedder { | ||||
|     /// An embedder based on running local models, fetched from the Hugging Face Hub. | ||||
|     HuggingFace(hf::Embedder), | ||||
|     /// An embedder based on making embedding queries against the OpenAI API. | ||||
|     OpenAi(openai::Embedder), | ||||
|     /// An embedder based on the user providing the embeddings in the documents and queries. | ||||
|     UserProvided(manual::Embedder), | ||||
|     /// An embedder based on making embedding queries against an <https://ollama.com> embedding server. | ||||
|     Ollama(ollama::Embedder), | ||||
|     /// An embedder based on making embedding queries against a generic JSON/REST embedding server. | ||||
|     Rest(rest::Embedder), | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] | ||||
| pub enum SubEmbedderOptions { | ||||
|     HuggingFace(hf::EmbedderOptions), | ||||
|     OpenAi(openai::EmbedderOptions), | ||||
|     Ollama(ollama::EmbedderOptions), | ||||
|     UserProvided(manual::EmbedderOptions), | ||||
|     Rest(rest::EmbedderOptions), | ||||
| } | ||||
|  | ||||
| impl SubEmbedderOptions { | ||||
|     pub fn distribution(&self) -> Option<DistributionShift> { | ||||
|         match self { | ||||
|             SubEmbedderOptions::HuggingFace(embedder_options) => embedder_options.distribution, | ||||
|             SubEmbedderOptions::OpenAi(embedder_options) => embedder_options.distribution, | ||||
|             SubEmbedderOptions::Ollama(embedder_options) => embedder_options.distribution, | ||||
|             SubEmbedderOptions::UserProvided(embedder_options) => embedder_options.distribution, | ||||
|             SubEmbedderOptions::Rest(embedder_options) => embedder_options.distribution, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct Embedder { | ||||
|     pub(super) search: SubEmbedder, | ||||
|     pub(super) index: SubEmbedder, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] | ||||
| pub struct EmbedderOptions { | ||||
|     pub search: SubEmbedderOptions, | ||||
|     pub index: SubEmbedderOptions, | ||||
| } | ||||
|  | ||||
| impl Embedder { | ||||
|     pub fn new( | ||||
|         EmbedderOptions { search, index }: EmbedderOptions, | ||||
|     ) -> Result<Self, NewEmbedderError> { | ||||
|         let search = SubEmbedder::new(search)?; | ||||
|         let index = SubEmbedder::new(index)?; | ||||
|  | ||||
|         // check dimensions | ||||
|         if search.dimensions() != index.dimensions() { | ||||
|             return Err(NewEmbedderError::composite_dimensions_mismatch( | ||||
|                 search.dimensions(), | ||||
|                 index.dimensions(), | ||||
|             )); | ||||
|         } | ||||
|         // check similarity | ||||
|         let search_embeddings = search | ||||
|             .embed( | ||||
|                 vec![ | ||||
|                     "test".into(), | ||||
|                     "a brave dog".into(), | ||||
|                     "This is a sample text. It is meant to compare similarity.".into(), | ||||
|                 ], | ||||
|                 None, | ||||
|             ) | ||||
|             .map_err(|error| NewEmbedderError::composite_test_embedding_failed(error, "search"))?; | ||||
|  | ||||
|         let index_embeddings = index | ||||
|             .embed( | ||||
|                 vec![ | ||||
|                     "test".into(), | ||||
|                     "a brave dog".into(), | ||||
|                     "This is a sample text. It is meant to compare similarity.".into(), | ||||
|                 ], | ||||
|                 None, | ||||
|             ) | ||||
|             .map_err(|error| { | ||||
|                 NewEmbedderError::composite_test_embedding_failed(error, "indexing") | ||||
|             })?; | ||||
|  | ||||
|         let hint = configuration_hint(&search, &index); | ||||
|  | ||||
|         check_similarity(search_embeddings, index_embeddings, hint)?; | ||||
|  | ||||
|         Ok(Self { search, index }) | ||||
|     } | ||||
|  | ||||
|     /// Indicates the dimensions of a single embedding produced by the embedder. | ||||
|     pub fn dimensions(&self) -> usize { | ||||
|         // can use the dimensions of any embedder since they should match | ||||
|         self.index.dimensions() | ||||
|     } | ||||
|  | ||||
|     /// An optional distribution used to apply an affine transformation to the similarity score of a document. | ||||
|     pub fn distribution(&self) -> Option<DistributionShift> { | ||||
|         // 3 cases here: | ||||
|         // 1. distribution provided by user => use that one, which was stored in search | ||||
|         // 2. no user-provided distribution, distribution in search embedder => use that one | ||||
|         // 2. no user-provided distribution, no distribution in search embedder => use the distribution in indexing embedder | ||||
|         self.search.distribution().or_else(|| self.index.distribution()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl SubEmbedder { | ||||
|     pub fn new(options: SubEmbedderOptions) -> std::result::Result<Self, NewEmbedderError> { | ||||
|         Ok(match options { | ||||
|             SubEmbedderOptions::HuggingFace(options) => { | ||||
|                 Self::HuggingFace(hf::Embedder::new(options)?) | ||||
|             } | ||||
|             SubEmbedderOptions::OpenAi(options) => Self::OpenAi(openai::Embedder::new(options)?), | ||||
|             SubEmbedderOptions::Ollama(options) => Self::Ollama(ollama::Embedder::new(options)?), | ||||
|             SubEmbedderOptions::UserProvided(options) => { | ||||
|                 Self::UserProvided(manual::Embedder::new(options)) | ||||
|             } | ||||
|             SubEmbedderOptions::Rest(options) => { | ||||
|                 Self::Rest(rest::Embedder::new(options, rest::ConfigurationSource::User)?) | ||||
|             } | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     pub fn embed( | ||||
|         &self, | ||||
|         texts: Vec<String>, | ||||
|         deadline: Option<Instant>, | ||||
|     ) -> std::result::Result<Vec<Embedding>, EmbedError> { | ||||
|         match self { | ||||
|             SubEmbedder::HuggingFace(embedder) => embedder.embed(texts), | ||||
|             SubEmbedder::OpenAi(embedder) => embedder.embed(&texts, deadline), | ||||
|             SubEmbedder::Ollama(embedder) => embedder.embed(&texts, deadline), | ||||
|             SubEmbedder::UserProvided(embedder) => embedder.embed(&texts), | ||||
|             SubEmbedder::Rest(embedder) => embedder.embed(texts, deadline), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Embed multiple chunks of texts. | ||||
|     /// | ||||
|     /// Each chunk is composed of one or multiple texts. | ||||
|     pub fn embed_index( | ||||
|         &self, | ||||
|         text_chunks: Vec<Vec<String>>, | ||||
|         threads: &ThreadPoolNoAbort, | ||||
|     ) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> { | ||||
|         match self { | ||||
|             SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), | ||||
|             SubEmbedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads), | ||||
|             SubEmbedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads), | ||||
|             SubEmbedder::UserProvided(embedder) => embedder.embed_index(text_chunks), | ||||
|             SubEmbedder::Rest(embedder) => embedder.embed_index(text_chunks, threads), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Non-owning variant of [`Self::embed_index`]. | ||||
|     pub fn embed_index_ref( | ||||
|         &self, | ||||
|         texts: &[&str], | ||||
|         threads: &ThreadPoolNoAbort, | ||||
|     ) -> std::result::Result<Vec<Embedding>, EmbedError> { | ||||
|         match self { | ||||
|             SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts), | ||||
|             SubEmbedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads), | ||||
|             SubEmbedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads), | ||||
|             SubEmbedder::UserProvided(embedder) => embedder.embed_index_ref(texts), | ||||
|             SubEmbedder::Rest(embedder) => embedder.embed_index_ref(texts, threads), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Indicates the preferred number of chunks to pass to [`Self::embed_chunks`] | ||||
|     pub fn chunk_count_hint(&self) -> usize { | ||||
|         match self { | ||||
|             SubEmbedder::HuggingFace(embedder) => embedder.chunk_count_hint(), | ||||
|             SubEmbedder::OpenAi(embedder) => embedder.chunk_count_hint(), | ||||
|             SubEmbedder::Ollama(embedder) => embedder.chunk_count_hint(), | ||||
|             SubEmbedder::UserProvided(_) => 100, | ||||
|             SubEmbedder::Rest(embedder) => embedder.chunk_count_hint(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Indicates the preferred number of texts in a single chunk passed to [`Self::embed`] | ||||
|     pub fn prompt_count_in_chunk_hint(&self) -> usize { | ||||
|         match self { | ||||
|             SubEmbedder::HuggingFace(embedder) => embedder.prompt_count_in_chunk_hint(), | ||||
|             SubEmbedder::OpenAi(embedder) => embedder.prompt_count_in_chunk_hint(), | ||||
|             SubEmbedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(), | ||||
|             SubEmbedder::UserProvided(_) => 1, | ||||
|             SubEmbedder::Rest(embedder) => embedder.prompt_count_in_chunk_hint(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn uses_document_template(&self) -> bool { | ||||
|         match self { | ||||
|             SubEmbedder::HuggingFace(_) | ||||
|             | SubEmbedder::OpenAi(_) | ||||
|             | SubEmbedder::Ollama(_) | ||||
|             | SubEmbedder::Rest(_) => true, | ||||
|             SubEmbedder::UserProvided(_) => false, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Indicates the dimensions of a single embedding produced by the embedder. | ||||
|     pub fn dimensions(&self) -> usize { | ||||
|         match self { | ||||
|             SubEmbedder::HuggingFace(embedder) => embedder.dimensions(), | ||||
|             SubEmbedder::OpenAi(embedder) => embedder.dimensions(), | ||||
|             SubEmbedder::Ollama(embedder) => embedder.dimensions(), | ||||
|             SubEmbedder::UserProvided(embedder) => embedder.dimensions(), | ||||
|             SubEmbedder::Rest(embedder) => embedder.dimensions(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// An optional distribution used to apply an affine transformation to the similarity score of a document. | ||||
|     pub fn distribution(&self) -> Option<DistributionShift> { | ||||
|         match self { | ||||
|             SubEmbedder::HuggingFace(embedder) => embedder.distribution(), | ||||
|             SubEmbedder::OpenAi(embedder) => embedder.distribution(), | ||||
|             SubEmbedder::Ollama(embedder) => embedder.distribution(), | ||||
|             SubEmbedder::UserProvided(embedder) => embedder.distribution(), | ||||
|             SubEmbedder::Rest(embedder) => embedder.distribution(), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn check_similarity( | ||||
|     left: Vec<Embedding>, | ||||
|     right: Vec<Embedding>, | ||||
|     hint: CompositeEmbedderContainsHuggingFace, | ||||
| ) -> Result<(), NewEmbedderError> { | ||||
|     if left.len() != right.len() { | ||||
|         return Err(NewEmbedderError::composite_embedding_count_mismatch(left.len(), right.len())); | ||||
|     } | ||||
|  | ||||
|     for (left, right) in left.into_iter().zip(right) { | ||||
|         let left = arroy::internals::UnalignedVector::from_slice(&left); | ||||
|         let right = arroy::internals::UnalignedVector::from_slice(&right); | ||||
|         let left = arroy::internals::Leaf { | ||||
|             header: arroy::distances::Cosine::new_header(&left), | ||||
|             vector: left, | ||||
|         }; | ||||
|         let right = arroy::internals::Leaf { | ||||
|             header: arroy::distances::Cosine::new_header(&right), | ||||
|             vector: right, | ||||
|         }; | ||||
|  | ||||
|         let distance = arroy::distances::Cosine::built_distance(&left, &right); | ||||
|  | ||||
|         if distance > super::MAX_COMPOSITE_DISTANCE { | ||||
|             return Err(NewEmbedderError::composite_embedding_value_mismatch(distance, hint)); | ||||
|         } | ||||
|     } | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| fn configuration_hint( | ||||
|     search: &SubEmbedder, | ||||
|     index: &SubEmbedder, | ||||
| ) -> CompositeEmbedderContainsHuggingFace { | ||||
|     match (search, index) { | ||||
|         (SubEmbedder::HuggingFace(_), SubEmbedder::HuggingFace(_)) => { | ||||
|             CompositeEmbedderContainsHuggingFace::Both | ||||
|         } | ||||
|         (SubEmbedder::HuggingFace(_), _) => CompositeEmbedderContainsHuggingFace::Search, | ||||
|         (_, SubEmbedder::HuggingFace(_)) => CompositeEmbedderContainsHuggingFace::Indexing, | ||||
|         _ => CompositeEmbedderContainsHuggingFace::None, | ||||
|     } | ||||
| } | ||||
| @@ -6,6 +6,7 @@ use hf_hub::api::sync::ApiError; | ||||
|  | ||||
| use super::parsed_vectors::ParsedVectorsDiff; | ||||
| use super::rest::ConfigurationSource; | ||||
| use super::MAX_COMPOSITE_DISTANCE; | ||||
| use crate::error::FaultSource; | ||||
| use crate::update::new::vector_document::VectorDocument; | ||||
| use crate::{FieldDistribution, PanicCatched}; | ||||
| @@ -335,6 +336,77 @@ impl NewEmbedderError { | ||||
|     pub(crate) fn ollama_unsupported_url(url: String) -> NewEmbedderError { | ||||
|         Self { kind: NewEmbedderErrorKind::OllamaUnsupportedUrl(url), fault: FaultSource::User } | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn composite_dimensions_mismatch( | ||||
|         search_dimensions: usize, | ||||
|         index_dimensions: usize, | ||||
|     ) -> NewEmbedderError { | ||||
|         Self { | ||||
|             kind: NewEmbedderErrorKind::CompositeDimensionsMismatch { | ||||
|                 search_dimensions, | ||||
|                 index_dimensions, | ||||
|             }, | ||||
|             fault: FaultSource::User, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn composite_test_embedding_failed( | ||||
|         inner: EmbedError, | ||||
|         failing_embedder: &'static str, | ||||
|     ) -> NewEmbedderError { | ||||
|         Self { | ||||
|             kind: NewEmbedderErrorKind::CompositeTestEmbeddingFailed { inner, failing_embedder }, | ||||
|             fault: FaultSource::Runtime, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn composite_embedding_count_mismatch( | ||||
|         search_count: usize, | ||||
|         index_count: usize, | ||||
|     ) -> NewEmbedderError { | ||||
|         Self { | ||||
|             kind: NewEmbedderErrorKind::CompositeEmbeddingCountMismatch { | ||||
|                 search_count, | ||||
|                 index_count, | ||||
|             }, | ||||
|             fault: FaultSource::Runtime, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn composite_embedding_value_mismatch( | ||||
|         distance: f32, | ||||
|         hint: CompositeEmbedderContainsHuggingFace, | ||||
|     ) -> NewEmbedderError { | ||||
|         Self { | ||||
|             kind: NewEmbedderErrorKind::CompositeEmbeddingValueMismatch { distance, hint }, | ||||
|             fault: FaultSource::User, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Records which sub-embedders of a composite embedder are Hugging Face embedders.
///
/// Its `Display` output is an extra troubleshooting line for error messages.
#[derive(Debug, Clone, Copy)]
pub enum CompositeEmbedderContainsHuggingFace {
    /// Both the search and the indexing embedders are Hugging Face embedders.
    Both,
    /// Only the search embedder is a Hugging Face embedder.
    Search,
    /// Only the indexing embedder is a Hugging Face embedder.
    Indexing,
    /// Neither embedder is a Hugging Face embedder.
    None,
}
|  | ||||
| impl std::fmt::Display for CompositeEmbedderContainsHuggingFace { | ||||
|     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||||
|         match self { | ||||
|             CompositeEmbedderContainsHuggingFace::Both => f.write_str( | ||||
|                 "\n  - Make sure the `model`, `revision` and `pooling` of both embedders match.", | ||||
|             ), | ||||
|             CompositeEmbedderContainsHuggingFace::Search => f.write_str( | ||||
|                 "\n  - Consider trying a different `pooling` method for the search embedder.", | ||||
|             ), | ||||
|             CompositeEmbedderContainsHuggingFace::Indexing => f.write_str( | ||||
|                 "\n  - Consider trying a different `pooling` method for the indexing embedder.", | ||||
|             ), | ||||
|             CompositeEmbedderContainsHuggingFace::None => Ok(()), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, thiserror::Error)] | ||||
| @@ -419,6 +491,14 @@ pub enum NewEmbedderErrorKind { | ||||
|     CouldNotParseTemplate(String), | ||||
|     #[error("unsupported Ollama URL.\n  - For `ollama` sources, the URL must end with `/api/embed` or `/api/embeddings`\n  - Got `{0}`")] | ||||
|     OllamaUnsupportedUrl(String), | ||||
|     #[error("error while generating test embeddings.\n  - the dimensions of embeddings produced at search time and at indexing time don't match.\n  - Search time dimensions: {search_dimensions}\n  - Indexing time dimensions: {index_dimensions}\n  - Note: Dimensions of embeddings produced by both embedders are required to match.")] | ||||
|     CompositeDimensionsMismatch { search_dimensions: usize, index_dimensions: usize }, | ||||
|     #[error("error while generating test embeddings.\n  - could not generate test embedding with embedder at {failing_embedder} time.\n  - Embedding failed with {inner}")] | ||||
|     CompositeTestEmbeddingFailed { inner: EmbedError, failing_embedder: &'static str }, | ||||
|     #[error("error while generating test embeddings.\n  - the number of generated embeddings differs.\n  - {search_count} embeddings for the search time embedder.\n  - {index_count} embeddings for the indexing time embedder.")] | ||||
|     CompositeEmbeddingCountMismatch { search_count: usize, index_count: usize }, | ||||
|     #[error("error while generating test embeddings.\n  - the embeddings produced at search time and indexing time are not similar enough.\n  - angular distance {distance}\n  - Meilisearch requires a maximum distance of {MAX_COMPOSITE_DISTANCE}.\n  - Note: check that both embedders produce similar embeddings.{hint}")] | ||||
|     CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace }, | ||||
| } | ||||
|  | ||||
| pub struct PossibleEmbeddingMistakes { | ||||
|   | ||||
| @@ -15,6 +15,7 @@ use self::error::{EmbedError, NewEmbedderError}; | ||||
| use crate::prompt::{Prompt, PromptData}; | ||||
| use crate::ThreadPoolNoAbort; | ||||
|  | ||||
| pub mod composite; | ||||
| pub mod error; | ||||
| pub mod hf; | ||||
| pub mod json_template; | ||||
| @@ -31,6 +32,7 @@ pub use self::error::Error; | ||||
| pub type Embedding = Vec<f32>; | ||||
|  | ||||
| pub const REQUEST_PARALLELISM: usize = 40; | ||||
/// Maximum distance tolerated between the test embeddings produced by the search and indexing
/// sub-embedders of a composite embedder.
pub const MAX_COMPOSITE_DISTANCE: f32 = 0.01;
|  | ||||
| pub struct ArroyWrapper { | ||||
|     quantized: bool, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user