mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 16:06:31 +00:00 
			
		
		
		
	Various changes
- use a fixed seed for arroy
- check the vector's dimensions as soon as the vector is provided to search
- don't embed whitespace
This commit is contained in:
		| @@ -154,6 +154,15 @@ impl<'a> Search<'a> { | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> { | ||||
|         if has_vector_search { | ||||
|             let ctx = SearchContext::new(self.index, self.rtxn); | ||||
|             filtered_universe(&ctx, &self.filter) | ||||
|         } else { | ||||
|             Ok(self.execute()?.candidates) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn execute(&self) -> Result<SearchResult> { | ||||
|         let embedder_name; | ||||
|         let embedder_name = match &self.embedder_name { | ||||
| @@ -297,11 +306,16 @@ pub struct SearchForFacetValues<'a> { | ||||
|     query: Option<String>, | ||||
|     facet: String, | ||||
|     search_query: Search<'a>, | ||||
|     is_hybrid: bool, | ||||
| } | ||||
|  | ||||
| impl<'a> SearchForFacetValues<'a> { | ||||
|     pub fn new(facet: String, search_query: Search<'a>) -> SearchForFacetValues<'a> { | ||||
|         SearchForFacetValues { query: None, facet, search_query } | ||||
|     pub fn new( | ||||
|         facet: String, | ||||
|         search_query: Search<'a>, | ||||
|         is_hybrid: bool, | ||||
|     ) -> SearchForFacetValues<'a> { | ||||
|         SearchForFacetValues { query: None, facet, search_query, is_hybrid } | ||||
|     } | ||||
|  | ||||
|     pub fn query(&mut self, query: impl Into<String>) -> &mut Self { | ||||
| @@ -351,7 +365,9 @@ impl<'a> SearchForFacetValues<'a> { | ||||
|             None => return Ok(vec![]), | ||||
|         }; | ||||
|  | ||||
|         let search_candidates = self.search_query.execute()?.candidates; | ||||
|         let search_candidates = self | ||||
|             .search_query | ||||
|             .execute_for_candidates(self.is_hybrid || self.search_query.vector.is_some())?; | ||||
|  | ||||
|         match self.query.as_ref() { | ||||
|             Some(query) => { | ||||
|   | ||||
| @@ -509,7 +509,7 @@ where | ||||
|         // We write the primary key field id into the main database | ||||
|         self.index.put_primary_key(self.wtxn, &primary_key)?; | ||||
|         let number_of_documents = self.index.number_of_documents(self.wtxn)?; | ||||
|         let mut rng = rand::rngs::StdRng::from_entropy(); | ||||
|         let mut rng = rand::rngs::StdRng::seed_from_u64(42); | ||||
|  | ||||
|         for (embedder_name, dimension) in dimension { | ||||
|             let wtxn = &mut *self.wtxn; | ||||
|   | ||||
| @@ -7,7 +7,7 @@ use hf_hub::{Repo, RepoType}; | ||||
| use tokenizers::{PaddingParams, Tokenizer}; | ||||
|  | ||||
| pub use super::error::{EmbedError, Error, NewEmbedderError}; | ||||
| use super::{Embedding, Embeddings}; | ||||
| use super::{DistributionShift, Embedding, Embeddings}; | ||||
|  | ||||
| #[derive( | ||||
|     Debug, | ||||
| @@ -184,4 +184,12 @@ impl Embedder { | ||||
|     pub fn dimensions(&self) -> usize { | ||||
|         self.dimensions | ||||
|     } | ||||
|  | ||||
|     pub fn distribution(&self) -> Option<DistributionShift> { | ||||
|         if self.options.model == "BAAI/bge-base-en-v1.5" { | ||||
|             Some(DistributionShift { current_mean: 0.85, current_sigma: 0.1 }) | ||||
|         } else { | ||||
|             None | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -202,6 +202,14 @@ impl Embedder { | ||||
|             Embedder::UserProvided(embedder) => embedder.dimensions(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn distribution(&self) -> Option<DistributionShift> { | ||||
|         match self { | ||||
|             Embedder::HuggingFace(embedder) => embedder.distribution(), | ||||
|             Embedder::OpenAi(embedder) => embedder.distribution(), | ||||
|             Embedder::UserProvided(_embedder) => None, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
|   | ||||
| @@ -4,7 +4,7 @@ use reqwest::StatusCode; | ||||
| use serde::{Deserialize, Serialize}; | ||||
|  | ||||
| use super::error::{EmbedError, NewEmbedderError}; | ||||
| use super::{Embedding, Embeddings}; | ||||
| use super::{DistributionShift, Embedding, Embeddings}; | ||||
|  | ||||
| #[derive(Debug)] | ||||
| pub struct Embedder { | ||||
| @@ -65,6 +65,14 @@ impl EmbeddingModel { | ||||
|             _ => None, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn distribution(&self) -> Option<DistributionShift> { | ||||
|         match self { | ||||
|             EmbeddingModel::TextEmbeddingAda002 => { | ||||
|                 Some(DistributionShift { current_mean: 0.90, current_sigma: 0.08 }) | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub const OPENAI_EMBEDDINGS_URL: &str = "https://api.openai.com/v1/embeddings"; | ||||
| @@ -326,6 +334,10 @@ impl Embedder { | ||||
|     pub fn dimensions(&self) -> usize { | ||||
|         self.options.embedding_model.dimensions() | ||||
|     } | ||||
|  | ||||
|     pub fn distribution(&self) -> Option<DistributionShift> { | ||||
|         self.options.embedding_model.distribution() | ||||
|     } | ||||
| } | ||||
|  | ||||
| // retrying in case of failure | ||||
|   | ||||
		Reference in New Issue
	
	Block a user