Merge branch 'main' into fix-threshold-overcounting-bug

This commit is contained in:
Mubelotix
2025-07-07 12:26:37 +02:00
141 changed files with 6398 additions and 1608 deletions

View File

@ -7,6 +7,7 @@ use roaring::RoaringBitmap;
use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
use crate::search::new::{distinct_fid, distinct_single_docid};
use crate::search::SemanticSearch;
use crate::vector::SearchQuery;
use crate::{Index, MatchingWords, Result, Search, SearchResult};
struct ScoreWithRatioResult {
@ -226,12 +227,9 @@ impl Search<'_> {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
}
// no vector search against placeholder search
let Some(query) = search.query.take() else {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
};
// no embedder, no semantic search
let Some(SemanticSearch { vector, embedder_name, embedder, quantized }) = semantic else {
let Some(SemanticSearch { vector, embedder_name, embedder, quantized, media }) = semantic
else {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
};
@ -242,9 +240,17 @@ impl Search<'_> {
let span = tracing::trace_span!(target: "search::hybrid", "embed_one");
let _entered = span.enter();
let q = search.query.as_deref();
let media = media.as_ref();
let query = match (q, media) {
(Some(text), None) => SearchQuery::Text(text),
(q, media) => SearchQuery::Media { q, media },
};
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(3);
match embedder.embed_search(&query, Some(deadline)) {
match embedder.embed_search(query, Some(deadline)) {
Ok(embedding) => embedding,
Err(error) => {
tracing::error!(error=%error, "Embedding failed");
@ -258,8 +264,13 @@ impl Search<'_> {
}
};
search.semantic =
Some(SemanticSearch { vector: Some(vector_query), embedder_name, embedder, quantized });
search.semantic = Some(SemanticSearch {
vector: Some(vector_query),
embedder_name,
embedder,
quantized,
media,
});
// TODO: would be better to have two distinct functions at this point
let vector_results = search.execute()?;

View File

@ -12,7 +12,7 @@ use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats};
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
use crate::index::MatchingStrategy;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::vector::Embedder;
use crate::vector::{Embedder, Embedding};
use crate::{
execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index,
Result, SearchContext, TimeBudget, UserError,
@ -32,6 +32,7 @@ pub mod similar;
#[derive(Debug, Clone)]
pub struct SemanticSearch {
vector: Option<Vec<f32>>,
media: Option<serde_json::Value>,
embedder_name: String,
embedder: Arc<Embedder>,
quantized: bool,
@ -95,9 +96,10 @@ impl<'a> Search<'a> {
embedder_name: String,
embedder: Arc<Embedder>,
quantized: bool,
vector: Option<Vec<f32>>,
vector: Option<Embedding>,
media: Option<serde_json::Value>,
) -> &mut Search<'a> {
self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector });
self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector, media });
self
}
@ -238,26 +240,30 @@ impl<'a> Search<'a> {
degraded,
used_negative_operator,
} = match self.semantic.as_ref() {
Some(SemanticSearch { vector: Some(vector), embedder_name, embedder, quantized }) => {
execute_vector_search(
&mut ctx,
vector,
self.scoring_strategy,
self.exhaustive_number_hits,
self.max_total_hits,
universe,
&self.sort_criteria,
&self.distinct,
self.geo_param,
self.offset,
self.limit,
embedder_name,
embedder,
*quantized,
self.time_budget.clone(),
self.ranking_score_threshold,
)?
}
Some(SemanticSearch {
vector: Some(vector),
embedder_name,
embedder,
quantized,
media: _,
}) => execute_vector_search(
&mut ctx,
vector,
self.scoring_strategy,
self.exhaustive_number_hits,
self.max_total_hits,
universe,
&self.sort_criteria,
&self.distinct,
self.geo_param,
self.offset,
self.limit,
embedder_name,
embedder,
*quantized,
self.time_budget.clone(),
self.ranking_score_threshold,
)?,
_ => execute_search(
&mut ctx,
self.query.as_deref(),

View File

@ -8,7 +8,7 @@ use maplit::{btreemap, hashset};
use crate::progress::Progress;
use crate::update::new::indexer;
use crate::update::{IndexerConfig, Settings};
use crate::vector::EmbeddingConfigs;
use crate::vector::RuntimeEmbedders;
use crate::{db_snap, Criterion, FilterableAttributesRule, Index};
pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson");
use crate::constants::RESERVED_GEO_FIELD_NAME;
@ -55,7 +55,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let mut new_fields_ids_map = db_fields_ids_map.clone();
let embedders = EmbeddingConfigs::default();
let embedders = RuntimeEmbedders::default();
let mut indexer = indexer::DocumentOperation::new();
let mut file = tempfile::tempfile().unwrap();

View File

@ -32,8 +32,8 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
) -> Result<Self> {
let embedder_index = ctx
.index
.embedder_category_id
.get(ctx.txn, embedder_name)?
.embedding_configs()
.embedder_id(ctx.txn, embedder_name)?
.ok_or_else(|| crate::UserError::InvalidSearchEmbedder(embedder_name.to_owned()))?;
Ok(Self {

View File

@ -64,10 +64,13 @@ impl<'a> Similar<'a> {
let universe = universe;
let embedder_index =
self.index.embedder_category_id.get(self.rtxn, &self.embedder_name)?.ok_or_else(
|| crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()),
)?;
let embedder_index = self
.index
.embedding_configs()
.embedder_id(self.rtxn, &self.embedder_name)?
.ok_or_else(|| {
crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned())
})?;
let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized);
let results = reader.nns_by_item(