Various changes

- use a fixed seed for arroy
- check vector dimensions as soon as the vector is provided to search
- don't embed whitespace
This commit is contained in:
Louis Dureuil
2023-12-14 16:01:35 +01:00
parent 217105b7da
commit 87bba98bd8
9 changed files with 148 additions and 51 deletions

View File

@ -154,6 +154,15 @@ impl<'a> Search<'a> {
self
}
/// Computes the set of candidate documents for this search.
///
/// When `has_vector_search` is `true`, the search itself is not executed: the candidates are
/// whatever `filtered_universe` returns for this search's filter, evaluated in a fresh
/// `SearchContext`. Otherwise the search is executed and the `candidates` of its result are
/// returned.
///
/// # Errors
///
/// Propagates any error returned by `execute` or by `filtered_universe`.
pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
    if !has_vector_search {
        // Run the search and keep only the candidates of its result.
        let outcome = self.execute()?;
        return Ok(outcome.candidates);
    }

    // Vector search requested: only apply the filter.
    let ctx = SearchContext::new(self.index, self.rtxn);
    filtered_universe(&ctx, &self.filter)
}
pub fn execute(&self) -> Result<SearchResult> {
let embedder_name;
let embedder_name = match &self.embedder_name {
@ -297,11 +306,16 @@ pub struct SearchForFacetValues<'a> {
query: Option<String>,
facet: String,
search_query: Search<'a>,
is_hybrid: bool,
}
impl<'a> SearchForFacetValues<'a> {
pub fn new(facet: String, search_query: Search<'a>) -> SearchForFacetValues<'a> {
SearchForFacetValues { query: None, facet, search_query }
pub fn new(
facet: String,
search_query: Search<'a>,
is_hybrid: bool,
) -> SearchForFacetValues<'a> {
SearchForFacetValues { query: None, facet, search_query, is_hybrid }
}
pub fn query(&mut self, query: impl Into<String>) -> &mut Self {
@ -351,7 +365,9 @@ impl<'a> SearchForFacetValues<'a> {
None => return Ok(vec![]),
};
let search_candidates = self.search_query.execute()?.candidates;
let search_candidates = self
.search_query
.execute_for_candidates(self.is_hybrid || self.search_query.vector.is_some())?;
match self.query.as_ref() {
Some(query) => {

View File

@ -509,7 +509,7 @@ where
// We write the primary key field id into the main database
self.index.put_primary_key(self.wtxn, &primary_key)?;
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
let mut rng = rand::rngs::StdRng::from_entropy();
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
for (embedder_name, dimension) in dimension {
let wtxn = &mut *self.wtxn;

View File

@ -7,7 +7,7 @@ use hf_hub::{Repo, RepoType};
use tokenizers::{PaddingParams, Tokenizer};
pub use super::error::{EmbedError, Error, NewEmbedderError};
use super::{Embedding, Embeddings};
use super::{DistributionShift, Embedding, Embeddings};
#[derive(
Debug,
@ -184,4 +184,12 @@ impl Embedder {
/// Returns the value stored in this embedder's `dimensions` field.
pub fn dimensions(&self) -> usize {
self.dimensions
}
/// Returns the hard-coded distribution shift for this embedder's model, if there is one.
///
/// Only the `"BAAI/bge-base-en-v1.5"` model has a shift (mean `0.85`, sigma `0.1`);
/// every other model yields `None`.
pub fn distribution(&self) -> Option<DistributionShift> {
    // Any model other than the single known one has no shift.
    if self.options.model != "BAAI/bge-base-en-v1.5" {
        return None;
    }

    Some(DistributionShift { current_mean: 0.85, current_sigma: 0.1 })
}
}

View File

@ -202,6 +202,14 @@ impl Embedder {
Embedder::UserProvided(embedder) => embedder.dimensions(),
}
}
pub fn distribution(&self) -> Option<DistributionShift> {
match self {
Embedder::HuggingFace(embedder) => embedder.distribution(),
Embedder::OpenAi(embedder) => embedder.distribution(),
Embedder::UserProvided(_embedder) => None,
}
}
}
#[derive(Debug, Clone, Copy)]

View File

@ -4,7 +4,7 @@ use reqwest::StatusCode;
use serde::{Deserialize, Serialize};
use super::error::{EmbedError, NewEmbedderError};
use super::{Embedding, Embeddings};
use super::{DistributionShift, Embedding, Embeddings};
#[derive(Debug)]
pub struct Embedder {
@ -65,6 +65,14 @@ impl EmbeddingModel {
_ => None,
}
}
/// Returns the hard-coded distribution shift for this embedding model.
///
/// `TextEmbeddingAda002` maps to a mean of `0.90` and a sigma of `0.08`. The match is kept
/// exhaustive on purpose so that adding a variant forces this function to be revisited.
fn distribution(&self) -> Option<DistributionShift> {
    let shift = match self {
        EmbeddingModel::TextEmbeddingAda002 => {
            DistributionShift { current_mean: 0.90, current_sigma: 0.08 }
        }
    };
    Some(shift)
}
}
pub const OPENAI_EMBEDDINGS_URL: &str = "https://api.openai.com/v1/embeddings";
@ -326,6 +334,10 @@ impl Embedder {
/// Returns the result of `dimensions()` on the embedding model stored in `self.options`.
pub fn dimensions(&self) -> usize {
self.options.embedding_model.dimensions()
}
/// Returns the distribution shift of the configured embedding model, if any.
///
/// Forwards to `distribution()` on the embedding model stored in `self.options`.
pub fn distribution(&self) -> Option<DistributionShift> {
    let model = &self.options.embedding_model;
    model.distribution()
}
}
// retrying in case of failure