Merge remote-tracking branch 'origin/release-v1.16.0' into document-sorting
@@ -302,6 +302,8 @@ and can not be more than 511 bytes.", .document_id.to_string()
     InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError),
     #[error("Too many embedders in the configuration. Found {0}, but limited to 256.")]
     TooManyEmbedders(usize),
+    #[error("Too many fragments in the configuration. Found {0}, but limited to 256.")]
+    TooManyFragments(usize),
     #[error("Cannot find embedder with name `{0}`.")]
     InvalidSearchEmbedder(String),
     #[error("Cannot find embedder with name `{0}`.")]
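Note: the variants above use `thiserror`, whose `#[error(...)]` attribute derives `Display` and interpolates `{0}` from the first tuple field. A minimal standalone sketch of that mechanism, assuming a `thiserror` dependency (the variant names echo the diff; the `main` is illustrative only):

```rust
use thiserror::Error;

#[derive(Debug, Error)]
pub enum UserError {
    #[error("Too many fragments in the configuration. Found {0}, but limited to 256.")]
    TooManyFragments(usize),
    #[error("Cannot find embedder with name `{0}`.")]
    InvalidSearchEmbedder(String),
}

fn main() {
    // `{0}` is filled with the tuple field when the error is displayed.
    println!("{}", UserError::TooManyFragments(300));
    println!("{}", UserError::InvalidSearchEmbedder("default".into()));
}
```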
@@ -30,7 +30,8 @@ use crate::order_by_map::OrderByMap;
 use crate::prompt::PromptData;
 use crate::proximity::ProximityPrecision;
 use crate::update::new::StdResult;
-use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig};
+use crate::vector::db::IndexEmbeddingConfigs;
+use crate::vector::{ArroyStats, ArroyWrapper, Embedding};
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
     FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
@@ -177,7 +178,7 @@ pub struct Index {
     pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,
 
     /// Maps an embedder name to its id in the arroy store.
-    pub embedder_category_id: Database<Str, U8>,
+    pub(crate) embedder_category_id: Database<Unspecified, Unspecified>,
     /// Vector store based on arroy™.
     pub vector_arroy: arroy::Database<Unspecified>,
 
@@ -1745,34 +1746,6 @@ impl Index {
         self.main.remap_key_type::<Str>().delete(txn, main_key::LOCALIZED_ATTRIBUTES_RULES)
     }
 
-    /// Put the embedding configs:
-    /// 1. The name of the embedder
-    /// 2. The configuration option for this embedder
-    /// 3. The list of documents with a user provided embedding
-    pub(crate) fn put_embedding_configs(
-        &self,
-        wtxn: &mut RwTxn<'_>,
-        configs: Vec<IndexEmbeddingConfig>,
-    ) -> heed::Result<()> {
-        self.main.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>().put(
-            wtxn,
-            main_key::EMBEDDING_CONFIGS,
-            &configs,
-        )
-    }
-
-    pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
-        self.main.remap_key_type::<Str>().delete(wtxn, main_key::EMBEDDING_CONFIGS)
-    }
-
-    pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result<Vec<IndexEmbeddingConfig>> {
-        Ok(self
-            .main
-            .remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>()
-            .get(rtxn, main_key::EMBEDDING_CONFIGS)?
-            .unwrap_or_default())
-    }
-
     pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
         self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
     }
@@ -1785,19 +1758,29 @@ impl Index {
         self.main.remap_key_type::<Str>().delete(wtxn, main_key::SEARCH_CUTOFF)
     }
 
+    pub fn embedding_configs(&self) -> IndexEmbeddingConfigs {
+        IndexEmbeddingConfigs::new(self.main, self.embedder_category_id)
+    }
+
     pub fn embeddings(
         &self,
         rtxn: &RoTxn<'_>,
         docid: DocumentId,
-    ) -> Result<BTreeMap<String, Vec<Embedding>>> {
+    ) -> Result<BTreeMap<String, (Vec<Embedding>, bool)>> {
         let mut res = BTreeMap::new();
-        let embedding_configs = self.embedding_configs(rtxn)?;
-        for config in embedding_configs {
-            let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
-            let reader =
-                ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
+        let embedders = self.embedding_configs();
+        for config in embedders.embedding_configs(rtxn)? {
+            let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap();
+            let reader = ArroyWrapper::new(
+                self.vector_arroy,
+                embedder_info.embedder_id,
+                config.config.quantized(),
+            );
             let embeddings = reader.item_vectors(rtxn, docid)?;
-            res.insert(config.name.to_owned(), embeddings);
+            res.insert(
+                config.name.to_owned(),
+                (embeddings, embedder_info.embedding_status.must_regenerate(docid)),
+            );
         }
         Ok(res)
     }
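Note: `embeddings` now returns, per embedder, the stored vectors together with a flag derived from `embedding_status.must_regenerate(docid)`. A self-contained sketch of how a caller might consume that shape (plain-Rust stand-ins, not the real index API; in the crate `Embedding` is an alias for `Vec<f32>`):

```rust
use std::collections::BTreeMap;

type Embedding = Vec<f32>;

// Mirrors the new `Index::embeddings` return type: per embedder name,
// the stored vectors plus a flag telling whether they must be regenerated.
fn partition(embeddings: BTreeMap<String, (Vec<Embedding>, bool)>) {
    for (embedder, (vectors, must_regenerate)) in embeddings {
        if must_regenerate {
            println!("{embedder}: {} vector(s), stale, re-embed", vectors.len());
        } else {
            println!("{embedder}: {} vector(s), up to date", vectors.len());
        }
    }
}

fn main() {
    let mut res = BTreeMap::new();
    res.insert("default".to_string(), (vec![vec![0.1, 0.2]], false));
    res.insert("multimodal".to_string(), (vec![], true));
    partition(res);
}
```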
@@ -1809,9 +1792,9 @@ impl Index {
 
     pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> {
         let mut stats = ArroyStats::default();
-        let embedding_configs = self.embedding_configs(rtxn)?;
-        for config in embedding_configs {
-            let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
+        let embedding_configs = self.embedding_configs();
+        for config in embedding_configs.embedding_configs(rtxn)? {
+            let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap();
             let reader =
                 ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
             reader.aggregate_stats(rtxn, &mut stats)?;
@@ -1936,13 +1919,6 @@ impl Index {
     }
 }
 
-#[derive(Debug, Deserialize, Serialize)]
-pub struct IndexEmbeddingConfig {
-    pub name: String,
-    pub config: EmbeddingConfig,
-    pub user_provided: RoaringBitmap,
-}
-
 #[derive(Debug, Default, Deserialize, Serialize)]
 pub struct ChatConfig {
     pub description: String,
@@ -1,7 +1,7 @@
 use std::any::TypeId;
 use std::borrow::Cow;
 use std::marker::PhantomData;
-use std::sync::atomic::{AtomicU32, Ordering};
+use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering};
 use std::sync::{Arc, RwLock};
 use std::time::{Duration, Instant};
 
@@ -22,6 +22,25 @@ pub struct Progress {
     steps: Arc<RwLock<InnerProgress>>,
 }
 
+#[derive(Default)]
+pub struct EmbedderStats {
+    pub errors: Arc<RwLock<(Option<String>, u32)>>,
+    pub total_count: AtomicUsize,
+}
+
+impl std::fmt::Debug for EmbedderStats {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let guard = self.errors.read().unwrap_or_else(|p| p.into_inner());
+        let (error, count) = (guard.0.clone(), guard.1);
+        std::mem::drop(guard);
+        f.debug_struct("EmbedderStats")
+            .field("last_error", &error)
+            .field("total_count", &self.total_count.load(Ordering::Relaxed))
+            .field("error_count", &count)
+            .finish()
+    }
+}
+
 #[derive(Default)]
 struct InnerProgress {
     /// The hierarchy of steps.
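Note: the `Debug` impl above deliberately reads through lock poisoning via `unwrap_or_else(|p| p.into_inner())`, so stats stay readable even if a writer panicked. A runnable sketch of the same pattern with hypothetical recording helpers (the diff only defines the struct and its `Debug` impl; `record_success`/`record_error` are assumptions for illustration):

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, RwLock};

#[derive(Default)]
struct EmbedderStats {
    errors: Arc<RwLock<(Option<String>, u32)>>,
    total_count: AtomicUsize,
}

impl EmbedderStats {
    // Hypothetical recording helpers, not part of the diff.
    fn record_success(&self) {
        self.total_count.fetch_add(1, Ordering::Relaxed);
    }
    fn record_error(&self, msg: String) {
        self.total_count.fetch_add(1, Ordering::Relaxed);
        // `into_inner` on the poison error recovers the guard even if a
        // previous writer panicked, so the counters survive a poisoned lock.
        let mut guard = self.errors.write().unwrap_or_else(|p| p.into_inner());
        guard.0 = Some(msg);
        guard.1 += 1;
    }
}

fn main() {
    let stats = EmbedderStats::default();
    stats.record_success();
    stats.record_error("HTTP 429 from embedder".to_string());
    let guard = stats.errors.read().unwrap_or_else(|p| p.into_inner());
    assert_eq!(guard.1, 1);
    assert_eq!(stats.total_count.load(Ordering::Relaxed), 2);
}
```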
@@ -6,12 +6,18 @@ use liquid::{ObjectView, ValueView};
 #[derive(Debug, Clone)]
 pub struct Context<'a, D: ObjectView, F: ArrayView> {
     document: &'a D,
-    fields: &'a F,
+    fields: Option<&'a F>,
 }
 
 impl<'a, D: ObjectView, F: ArrayView> Context<'a, D, F> {
     pub fn new(document: &'a D, fields: &'a F) -> Self {
-        Self { document, fields }
+        Self { document, fields: Some(fields) }
     }
 }
 
+impl<'a, D: ObjectView> Context<'a, D, Vec<bool>> {
+    pub fn without_fields(document: &'a D) -> Self {
+        Self { document, fields: None }
+    }
+}
+
@@ -21,17 +27,27 @@ impl<D: ObjectView, F: ArrayView> ObjectView for Context<'_, D, F> {
     }
 
     fn size(&self) -> i64 {
-        2
+        if self.fields.is_some() {
+            2
+        } else {
+            1
+        }
     }
 
     fn keys<'k>(&'k self) -> Box<dyn Iterator<Item = KStringCow<'k>> + 'k> {
-        Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s)))
+        let keys = if self.fields.is_some() {
+            either::Either::Left(["doc", "fields"])
+        } else {
+            either::Either::Right(["doc"])
+        };
+
+        Box::new(keys.into_iter().map(KStringCow::from_static))
     }
 
     fn values<'k>(&'k self) -> Box<dyn Iterator<Item = &'k dyn ValueView> + 'k> {
         Box::new(
             std::iter::once(self.document.as_value())
-                .chain(std::iter::once(self.fields.as_value())),
+                .chain(self.fields.iter().map(|fields| fields.as_value())),
         )
     }
 
@@ -40,13 +56,13 @@ impl<D: ObjectView, F: ArrayView> ObjectView for Context<'_, D, F> {
     }
 
     fn contains_key(&self, index: &str) -> bool {
-        index == "doc" || index == "fields"
+        index == "doc" || (index == "fields" && self.fields.is_some())
     }
 
     fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> {
-        match index {
-            "doc" => Some(self.document.as_value()),
-            "fields" => Some(self.fields.as_value()),
+        match (index, &self.fields) {
+            ("doc", _) => Some(self.document.as_value()),
+            ("fields", Some(fields)) => Some(fields.as_value()),
             _ => None,
         }
     }
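Note: `Context` can now render with or without a `fields` array, and the object view shrinks its key set accordingly. A dependency-free sketch of that optional-member pattern (the names and methods are illustrative stand-ins, not the real `liquid::ObjectView` trait):

```rust
// An object view whose key set shrinks when the optional `fields`
// member is absent, mirroring the `Option<&'a F>` change above.
struct Context<'a> {
    document: &'a str,
    fields: Option<&'a [&'a str]>,
}

impl Context<'_> {
    fn size(&self) -> i64 {
        if self.fields.is_some() { 2 } else { 1 }
    }

    fn keys(&self) -> Vec<&'static str> {
        // Same idea as the `either::Either` trick in the diff: expose
        // `fields` only when it is actually backed by data.
        if self.fields.is_some() { vec!["doc", "fields"] } else { vec!["doc"] }
    }

    fn contains_key(&self, index: &str) -> bool {
        index == "doc" || (index == "fields" && self.fields.is_some())
    }
}

fn main() {
    let with = Context { document: "{...}", fields: Some(&["title", "overview"]) };
    let without = Context { document: "{...}", fields: None };
    assert_eq!(with.size(), 2);
    assert_eq!(without.keys(), vec!["doc"]);
    assert!(!without.contains_key("fields"));
}
```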
@@ -144,18 +144,19 @@ impl ValueView for Document<'_> {
 use crate::update::new::document::Document as DocumentTrait;
 
 #[derive(Debug)]
-pub struct ParseableDocument<'doc, D> {
+pub struct ParseableDocument<'a, 'doc, D: DocumentTrait<'a> + Debug> {
     document: D,
     doc_alloc: &'doc Bump,
+    _marker: std::marker::PhantomData<&'a ()>,
 }
 
-impl<'doc, D> ParseableDocument<'doc, D> {
+impl<'a, 'doc, D: DocumentTrait<'a> + Debug> ParseableDocument<'a, 'doc, D> {
     pub fn new(document: D, doc_alloc: &'doc Bump) -> Self {
-        Self { document, doc_alloc }
+        Self { document, doc_alloc, _marker: std::marker::PhantomData }
     }
 }
 
-impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc, D> {
+impl<'a, D: DocumentTrait<'a> + Debug> ObjectView for ParseableDocument<'a, '_, D> {
     fn as_value(&self) -> &dyn ValueView {
         self
     }
@@ -195,7 +196,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc
     }
 }
 
-impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> {
+impl<'a, D: DocumentTrait<'a> + Debug> ValueView for ParseableDocument<'a, '_, D> {
     fn as_debug(&self) -> &dyn Debug {
         self
     }
@@ -121,10 +121,10 @@ impl<D: ObjectView> ObjectView for FieldValue<'_, D> {
 pub struct OwnedFields<'a, D: ObjectView>(Vec<FieldValue<'a, D>>);
 
 #[derive(Debug)]
-pub struct BorrowedFields<'a, 'map, D: ObjectView> {
+pub struct BorrowedFields<'a, 'doc, 'map, D: ObjectView> {
     document: &'a D,
     field_id_map: &'a RefCell<GlobalFieldsIdsMap<'map>>,
-    doc_alloc: &'a Bump,
+    doc_alloc: &'doc Bump,
 }
 
 impl<'a, D: ObjectView> OwnedFields<'a, D> {
@@ -138,11 +138,11 @@ impl<'a, D: ObjectView> OwnedFields<'a, D> {
     }
 }
 
-impl<'a, 'map, D: ObjectView> BorrowedFields<'a, 'map, D> {
+impl<'a, 'doc, 'map, D: ObjectView> BorrowedFields<'a, 'doc, 'map, D> {
     pub fn new(
         document: &'a D,
         field_id_map: &'a RefCell<GlobalFieldsIdsMap<'map>>,
-        doc_alloc: &'a Bump,
+        doc_alloc: &'doc Bump,
     ) -> Self {
         Self { document, field_id_map, doc_alloc }
     }
@@ -170,7 +170,7 @@ impl<D: ObjectView> ArrayView for OwnedFields<'_, D> {
     }
 }
 
-impl<D: ObjectView> ArrayView for BorrowedFields<'_, '_, D> {
+impl<D: ObjectView> ArrayView for BorrowedFields<'_, '_, '_, D> {
     fn as_value(&self) -> &dyn ValueView {
         self
     }
@@ -212,7 +212,7 @@ impl<D: ObjectView> ArrayView for BorrowedFields<'_, '_, D> {
     }
 }
 
-impl<D: ObjectView> ValueView for BorrowedFields<'_, '_, D> {
+impl<D: ObjectView> ValueView for BorrowedFields<'_, '_, '_, D> {
     fn as_debug(&self) -> &dyn std::fmt::Debug {
         self
     }
@@ -288,11 +288,11 @@ impl<D: ObjectView> ValueView for OwnedFields<'_, D> {
     }
 }
 
-struct ArraySource<'a, 'map, D: ObjectView> {
-    s: &'a BorrowedFields<'a, 'map, D>,
+struct ArraySource<'a, 'doc, 'map, D: ObjectView> {
+    s: &'a BorrowedFields<'a, 'doc, 'map, D>,
 }
 
-impl<D: ObjectView> fmt::Display for ArraySource<'_, '_, D> {
+impl<D: ObjectView> fmt::Display for ArraySource<'_, '_, '_, D> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         write!(f, "[")?;
         for item in self.s.values() {
@@ -303,11 +303,11 @@ impl<D: ObjectView> fmt::Display for ArraySource<'_, '_, D> {
     }
 }
 
-struct ArrayRender<'a, 'map, D: ObjectView> {
-    s: &'a BorrowedFields<'a, 'map, D>,
+struct ArrayRender<'a, 'doc, 'map, D: ObjectView> {
+    s: &'a BorrowedFields<'a, 'doc, 'map, D>,
 }
 
-impl<D: ObjectView> fmt::Display for ArrayRender<'_, '_, D> {
+impl<D: ObjectView> fmt::Display for ArrayRender<'_, '_, '_, D> {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         for item in self.s.values() {
             write!(f, "{}", item.render())?;
@@ -9,12 +9,11 @@ use std::fmt::Debug;
 use std::num::NonZeroUsize;
 
 use bumpalo::Bump;
-use document::ParseableDocument;
+pub(crate) use document::{Document, ParseableDocument};
 use error::{NewPromptError, RenderPromptError};
-use fields::{BorrowedFields, OwnedFields};
+pub use fields::{BorrowedFields, OwnedFields};
 
-use self::context::Context;
-use self::document::Document;
+pub use self::context::Context;
 use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
 use crate::update::del_add::DelAdd;
 use crate::GlobalFieldsIdsMap;
@@ -108,8 +107,8 @@ impl Prompt {
     }
 
     pub fn render_document<
-        'a, // lifetime of the borrow of the document
-        'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents
+        'a,   // lifetime of the borrow of the document
+        'doc, // lifetime of the allocator, will live for an entire chunk of documents
     >(
         &self,
         external_docid: &str,
@@ -7,6 +7,7 @@ use roaring::RoaringBitmap;
 use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
 use crate::search::new::{distinct_fid, distinct_single_docid};
 use crate::search::SemanticSearch;
+use crate::vector::SearchQuery;
 use crate::{Index, MatchingWords, Result, Search, SearchResult};
 
 struct ScoreWithRatioResult {
@@ -225,12 +226,9 @@ impl Search<'_> {
             return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
         }
 
-        // no vector search against placeholder search
-        let Some(query) = search.query.take() else {
-            return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
-        };
         // no embedder, no semantic search
-        let Some(SemanticSearch { vector, embedder_name, embedder, quantized }) = semantic else {
+        let Some(SemanticSearch { vector, embedder_name, embedder, quantized, media }) = semantic
+        else {
             return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
         };
 
@@ -241,9 +239,17 @@ impl Search<'_> {
                 let span = tracing::trace_span!(target: "search::hybrid", "embed_one");
                 let _entered = span.enter();
 
+                let q = search.query.as_deref();
+                let media = media.as_ref();
+
+                let query = match (q, media) {
+                    (Some(text), None) => SearchQuery::Text(text),
+                    (q, media) => SearchQuery::Media { q, media },
+                };
+
                 let deadline = std::time::Instant::now() + std::time::Duration::from_secs(3);
 
-                match embedder.embed_search(&query, Some(deadline)) {
+                match embedder.embed_search(query, Some(deadline)) {
                     Ok(embedding) => embedding,
                     Err(error) => {
                         tracing::error!(error=%error, "Embedding failed");
@@ -257,8 +263,13 @@ impl Search<'_> {
             }
         };
 
-        search.semantic =
-            Some(SemanticSearch { vector: Some(vector_query), embedder_name, embedder, quantized });
+        search.semantic = Some(SemanticSearch {
+            vector: Some(vector_query),
+            embedder_name,
+            embedder,
+            quantized,
+            media,
+        });
 
         // TODO: would be better to have two distinct functions at this point
        let vector_results = search.execute()?;
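Note: the hybrid path now builds a `SearchQuery` from the optional text and the optional media payload instead of bailing out when no text query exists. A standalone sketch of that dispatch rule (a simplified stand-in enum, assuming `serde_json` for the media value as the diff does):

```rust
// Text-only queries keep the historical path; anything involving media
// (including "neither", for media-capable placeholder search) goes through
// the media variant.
#[derive(Debug)]
enum SearchQuery<'a> {
    Text(&'a str),
    Media { q: Option<&'a str>, media: Option<&'a serde_json::Value> },
}

fn build_query<'a>(
    q: Option<&'a str>,
    media: Option<&'a serde_json::Value>,
) -> SearchQuery<'a> {
    match (q, media) {
        (Some(text), None) => SearchQuery::Text(text),
        (q, media) => SearchQuery::Media { q, media },
    }
}

fn main() {
    let image = serde_json::json!({ "image_url": "https://example.com/cat.png" });
    println!("{:?}", build_query(Some("cute cat"), None));
    println!("{:?}", build_query(Some("cute cat"), Some(&image)));
    println!("{:?}", build_query(None, Some(&image)));
}
```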
@@ -13,7 +13,7 @@ use crate::documents::GeoSortParameter;
 use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
 use crate::index::MatchingStrategy;
 use crate::score_details::{ScoreDetails, ScoringStrategy};
-use crate::vector::Embedder;
+use crate::vector::{Embedder, Embedding};
 use crate::{
     execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index,
     Result, SearchContext, TimeBudget, UserError,
@@ -33,6 +33,7 @@ pub mod similar;
 #[derive(Debug, Clone)]
 pub struct SemanticSearch {
     vector: Option<Vec<f32>>,
+    media: Option<serde_json::Value>,
     embedder_name: String,
     embedder: Arc<Embedder>,
     quantized: bool,
@@ -94,9 +95,10 @@ impl<'a> Search<'a> {
         embedder_name: String,
         embedder: Arc<Embedder>,
         quantized: bool,
-        vector: Option<Vec<f32>>,
+        vector: Option<Embedding>,
+        media: Option<serde_json::Value>,
     ) -> &mut Search<'a> {
-        self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector });
+        self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector, media });
         self
     }
 
@@ -232,24 +234,28 @@ impl<'a> Search<'a> {
             degraded,
             used_negative_operator,
         } = match self.semantic.as_ref() {
-            Some(SemanticSearch { vector: Some(vector), embedder_name, embedder, quantized }) => {
-                execute_vector_search(
-                    &mut ctx,
-                    vector,
-                    self.scoring_strategy,
-                    universe,
-                    &self.sort_criteria,
-                    &self.distinct,
-                    self.geo_param,
-                    self.offset,
-                    self.limit,
-                    embedder_name,
-                    embedder,
-                    *quantized,
-                    self.time_budget.clone(),
-                    self.ranking_score_threshold,
-                )?
-            }
+            Some(SemanticSearch {
+                vector: Some(vector),
+                embedder_name,
+                embedder,
+                quantized,
+                media: _,
+            }) => execute_vector_search(
+                &mut ctx,
+                vector,
+                self.scoring_strategy,
+                universe,
+                &self.sort_criteria,
+                &self.distinct,
+                self.geo_param,
+                self.offset,
+                self.limit,
+                embedder_name,
+                embedder,
+                *quantized,
+                self.time_budget.clone(),
+                self.ranking_score_threshold,
+            )?,
             _ => execute_search(
                 &mut ctx,
                 self.query.as_deref(),
@@ -8,7 +8,7 @@ use maplit::{btreemap, hashset};
 use crate::progress::Progress;
 use crate::update::new::indexer;
 use crate::update::{IndexerConfig, Settings};
-use crate::vector::EmbeddingConfigs;
+use crate::vector::RuntimeEmbedders;
 use crate::{db_snap, Criterion, FilterableAttributesRule, Index};
 pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson");
 use crate::constants::RESERVED_GEO_FIELD_NAME;
@@ -44,7 +44,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
         S("america") => vec![S("the united states")],
     });
     builder.set_searchable_fields(vec![S("title"), S("description")]);
-    builder.execute(|_| (), || false).unwrap();
+    builder.execute(&|| false, &Progress::default(), Default::default()).unwrap();
     wtxn.commit().unwrap();
 
     // index documents
@@ -55,7 +55,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
     let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
     let mut new_fields_ids_map = db_fields_ids_map.clone();
 
-    let embedders = EmbeddingConfigs::default();
+    let embedders = RuntimeEmbedders::default();
     let mut indexer = indexer::DocumentOperation::new();
 
     let mut file = tempfile::tempfile().unwrap();
@@ -95,6 +95,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
             embedders,
             &|| false,
             &Progress::default(),
+            &Default::default(),
         )
         .unwrap();
 
@@ -32,8 +32,8 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
     ) -> Result<Self> {
         let embedder_index = ctx
             .index
-            .embedder_category_id
-            .get(ctx.txn, embedder_name)?
+            .embedding_configs()
+            .embedder_id(ctx.txn, embedder_name)?
             .ok_or_else(|| crate::UserError::InvalidSearchEmbedder(embedder_name.to_owned()))?;
 
         Ok(Self {
@@ -64,10 +64,13 @@ impl<'a> Similar<'a> {
 
         let universe = universe;
 
-        let embedder_index =
-            self.index.embedder_category_id.get(self.rtxn, &self.embedder_name)?.ok_or_else(
-                || crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()),
-            )?;
+        let embedder_index = self
+            .index
+            .embedding_configs()
+            .embedder_id(self.rtxn, &self.embedder_name)?
+            .ok_or_else(|| {
+                crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned())
+            })?;
 
         let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized);
         let results = reader.nns_by_item(
@@ -18,7 +18,7 @@ use crate::update::{
     self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, Settings,
 };
 use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
-use crate::vector::EmbeddingConfigs;
+use crate::vector::RuntimeEmbedders;
 use crate::{db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult};
 
 pub(crate) struct TempIndex {
@@ -66,7 +66,7 @@ impl TempIndex {
         let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?;
         let mut new_fields_ids_map = db_fields_ids_map.clone();
 
-        let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.embedding_configs;
+        let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.runtime_embedders;
         let mut indexer = indexer::DocumentOperation::new();
         match self.index_documents_config.update_method {
             IndexDocumentsMethod::ReplaceDocuments => {
@@ -103,6 +103,7 @@ impl TempIndex {
                     embedders,
                     &|| false,
                     &Progress::default(),
+                    &Default::default(),
                 )
             })
             .unwrap()?;
@@ -134,7 +135,7 @@ impl TempIndex {
     ) -> Result<(), crate::error::Error> {
         let mut builder = update::Settings::new(wtxn, &self.inner, &self.indexer_config);
         update(&mut builder);
-        builder.execute(drop, || false)?;
+        builder.execute(&|| false, &Progress::default(), Default::default())?;
         Ok(())
     }
 
@@ -150,7 +151,7 @@ impl TempIndex {
         let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?;
         let mut new_fields_ids_map = db_fields_ids_map.clone();
 
-        let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.embedding_configs;
+        let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.runtime_embedders;
 
         let mut indexer = indexer::DocumentOperation::new();
         let external_document_ids: Vec<_> =
@@ -185,6 +186,7 @@ impl TempIndex {
                     embedders,
                     &|| false,
                     &Progress::default(),
+                    &Default::default(),
                 )
             })
             .unwrap()?;
@@ -221,7 +223,7 @@ fn aborting_indexation() {
     let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap();
     let mut new_fields_ids_map = db_fields_ids_map.clone();
 
-    let embedders = EmbeddingConfigs::default();
+    let embedders = RuntimeEmbedders::default();
     let mut indexer = indexer::DocumentOperation::new();
     let payload = documents!([
         { "id": 1, "name": "kevin" },
@@ -259,6 +261,7 @@ fn aborting_indexation() {
                 embedders,
                 &|| should_abort.load(Relaxed),
                 &Progress::default(),
+                &Default::default(),
             )
         })
        .unwrap()
@@ -1,7 +1,7 @@
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::Arc;
 
-use rayon::{ThreadPool, ThreadPoolBuilder};
+use rayon::{BroadcastContext, ThreadPool, ThreadPoolBuilder};
 use thiserror::Error;
 
 /// A rayon ThreadPool wrapper that can catch panics in the pool
@@ -32,6 +32,22 @@ impl ThreadPoolNoAbort {
         }
     }
 
+    pub fn broadcast<OP, R>(&self, op: OP) -> Result<Vec<R>, PanicCatched>
+    where
+        OP: Fn(BroadcastContext<'_>) -> R + Sync,
+        R: Send,
+    {
+        self.active_operations.fetch_add(1, Ordering::Relaxed);
+        let output = self.thread_pool.broadcast(op);
+        self.active_operations.fetch_sub(1, Ordering::Relaxed);
+        // While resetting the pool panic catcher we return an error if we caught one.
+        if self.pool_catched_panic.swap(false, Ordering::SeqCst) {
+            Err(PanicCatched)
+        } else {
+            Ok(output)
+        }
+    }
+
     pub fn current_num_threads(&self) -> usize {
         self.thread_pool.current_num_threads()
     }
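Note: the new `broadcast` wrapper only adds bookkeeping (the active-operation counter and the panic flag) around rayon's own `ThreadPool::broadcast`, which runs a closure once on every worker and collects the per-thread results. A small usage sketch of the underlying rayon call:

```rust
use rayon::ThreadPoolBuilder;

fn main() {
    let pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();
    let ids: Vec<usize> = pool.broadcast(|ctx| {
        // `ctx.index()` is the worker index, in 0..num_threads.
        ctx.index() * 10
    });
    // One result per worker thread, in worker order.
    assert_eq!(ids.len(), pool.current_num_threads());
    println!("{ids:?}"); // e.g. [0, 10, 20, 30]
}
```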
@@ -64,11 +64,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
         self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
 
         // Remove all user-provided bits from the configs
-        let mut configs = self.index.embedding_configs(self.wtxn)?;
-        for config in configs.iter_mut() {
-            config.user_provided.clear();
-        }
-        self.index.put_embedding_configs(self.wtxn, configs)?;
+        self.index.embedding_configs().clear_embedder_info_docids(self.wtxn)?;
 
         // Clear the other databases.
         external_documents_ids.clear(self.wtxn)?;
File diff suppressed because it is too large
@@ -23,15 +23,17 @@ use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, Extra
 use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
 use self::extract_geo_points::extract_geo_points;
 use self::extract_vector_points::{
-    extract_embeddings, extract_vector_points, ExtractedVectorPoints,
+    extract_embeddings_from_prompts, extract_vector_points, ExtractedVectorPoints,
 };
 use self::extract_word_docids::extract_word_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
 use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
 use super::{helpers, TypedChunk};
-use crate::index::IndexEmbeddingConfig;
+use crate::progress::EmbedderStats;
+use crate::update::index_documents::extract::extract_vector_points::extract_embeddings_from_fragments;
 use crate::update::settings::InnerIndexSettingsDiff;
+use crate::vector::db::EmbedderInfo;
 use crate::vector::error::PossibleEmbeddingMistakes;
 use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
 
@@ -45,10 +47,11 @@ pub(crate) fn data_from_obkv_documents(
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     primary_key_id: FieldId,
-    embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
     settings_diff: Arc<InnerIndexSettingsDiff>,
     max_positions_per_attributes: Option<u32>,
+    embedder_info: Arc<Vec<(String, EmbedderInfo)>>,
     possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
+    embedder_stats: &Arc<EmbedderStats>,
 ) -> Result<()> {
     let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
         || {
@@ -59,9 +62,10 @@ pub(crate) fn data_from_obkv_documents(
                     original_documents_chunk,
                     indexer,
                     lmdb_writer_sx.clone(),
-                    embedders_configs.clone(),
                     settings_diff.clone(),
+                    embedder_info.clone(),
                     possible_embedding_mistakes.clone(),
+                    embedder_stats.clone(),
                 )
             })
             .collect::<Result<()>>()
@@ -210,7 +214,7 @@ fn run_extraction_task<FE, FS, M>(
     })
 }
 
-fn request_threads() -> &'static ThreadPoolNoAbort {
+pub fn request_threads() -> &'static ThreadPoolNoAbort {
     static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
 
     REQUEST_THREADS.get_or_init(|| {
@@ -228,20 +232,20 @@ fn send_original_documents_data(
     original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
-    embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
     settings_diff: Arc<InnerIndexSettingsDiff>,
+    embedder_info: Arc<Vec<(String, EmbedderInfo)>>,
     possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
+    embedder_stats: Arc<EmbedderStats>,
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
 
     let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
         // no point in indexing vectors without embedders
-        && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
+        && (!settings_diff.new.runtime_embedders.inner_as_ref().is_empty());
 
     if index_vectors {
         let settings_diff = settings_diff.clone();
-        let embedders_configs = embedders_configs.clone();
 
         let original_documents_chunk = original_documents_chunk.clone();
         let lmdb_writer_sx = lmdb_writer_sx.clone();
@@ -249,8 +253,8 @@ fn send_original_documents_data(
             match extract_vector_points(
                 original_documents_chunk.clone(),
                 indexer,
-                &embedders_configs,
                 &settings_diff,
+                embedder_info.as_slice(),
                 &possible_embedding_mistakes,
             ) {
                 Ok((extracted_vectors, unused_vectors_distribution)) => {
@@ -258,18 +262,19 @@ fn send_original_documents_data(
                         manual_vectors,
                         remove_vectors,
                         prompts,
+                        inputs,
                         embedder_name,
-                        embedder,
-                        add_to_user_provided,
-                        remove_from_user_provided,
+                        runtime,
+                        embedding_status_delta,
                     } in extracted_vectors
                     {
-                        let embeddings = match extract_embeddings(
+                        let embeddings_from_prompts = match extract_embeddings_from_prompts(
                             prompts,
                             indexer,
-                            embedder.clone(),
+                            runtime.clone(),
                             &embedder_name,
                             &possible_embedding_mistakes,
+                            &embedder_stats,
                             &unused_vectors_distribution,
                             request_threads(),
                         ) {
@@ -279,18 +284,37 @@ fn send_original_documents_data(
                                 None
                             }
                         };
 
+                        let embeddings_from_fragments = match extract_embeddings_from_fragments(
+                            inputs,
+                            indexer,
+                            runtime.clone(),
+                            &embedder_name,
+                            &possible_embedding_mistakes,
+                            &embedder_stats,
+                            &unused_vectors_distribution,
+                            request_threads(),
+                        ) {
+                            Ok(results) => Some(results),
+                            Err(error) => {
+                                let _ = lmdb_writer_sx.send(Err(error));
+                                None
+                            }
+                        };
+
                         if !(remove_vectors.is_empty()
                             && manual_vectors.is_empty()
-                            && embeddings.as_ref().is_none_or(|e| e.is_empty()))
+                            && embeddings_from_prompts.as_ref().is_none_or(|e| e.is_empty())
+                            && embeddings_from_fragments.as_ref().is_none_or(|e| e.is_empty()))
                         {
                             let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
                                 remove_vectors,
-                                embeddings,
-                                expected_dimension: embedder.dimensions(),
+                                embeddings_from_prompts,
+                                embeddings_from_fragments,
+                                expected_dimension: runtime.embedder.dimensions(),
                                 manual_vectors,
                                 embedder_name,
-                                add_to_user_provided,
-                                remove_from_user_provided,
+                                embedding_status_delta,
                             }));
                        }
                    }
@@ -12,6 +12,7 @@ use std::sync::Arc;
 
 use crossbeam_channel::{Receiver, Sender};
 use enrich::enrich_documents_batch;
+pub use extract::request_threads;
 use grenad::{Merger, MergerBuilder};
 use hashbrown::HashMap;
 use heed::types::Str;
@@ -32,12 +33,13 @@ use crate::database_stats::DatabaseStats;
 use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError};
 use crate::index::{PrefixSearch, PrefixSettings};
-use crate::progress::Progress;
+use crate::progress::{EmbedderStats, Progress};
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
-use crate::vector::{ArroyWrapper, EmbeddingConfigs};
+use crate::vector::db::EmbedderInfo;
+use crate::vector::{ArroyWrapper, RuntimeEmbedders};
 use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
 
 static MERGED_DATABASE_COUNT: usize = 7;
@@ -80,7 +82,8 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
     should_abort: FA,
     added_documents: u64,
     deleted_documents: u64,
-    embedders: EmbeddingConfigs,
+    embedders: RuntimeEmbedders,
+    embedder_stats: &'t Arc<EmbedderStats>,
 }
 
 #[derive(Default, Debug, Clone)]
@@ -103,6 +106,7 @@ where
         config: IndexDocumentsConfig,
         progress: FP,
         should_abort: FA,
+        embedder_stats: &'t Arc<EmbedderStats>,
     ) -> Result<IndexDocuments<'t, 'i, 'a, FP, FA>> {
         let transform = Some(Transform::new(
             wtxn,
@@ -123,6 +127,7 @@ where
             added_documents: 0,
             deleted_documents: 0,
             embedders: Default::default(),
+            embedder_stats,
         })
     }
 
@@ -168,7 +173,7 @@ where
         Ok((self, Ok(indexed_documents)))
     }
 
-    pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self {
+    pub fn with_embedders(mut self, embedders: RuntimeEmbedders) -> Self {
         self.embedders = embedders;
         self
     }
@@ -222,7 +227,13 @@ where
         settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
 
         let settings_diff = Arc::new(settings_diff);
-        let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);
+        let embedder_infos: heed::Result<Vec<(String, EmbedderInfo)>> = self
+            .index
+            .embedding_configs()
+            .iter_embedder_info(self.wtxn)?
+            .map(|res| res.map(|(name, info)| (name.to_owned(), info)))
+            .collect();
+        let embedder_infos = Arc::new(embedder_infos?);
 
         let possible_embedding_mistakes =
             crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution);
@@ -292,6 +303,7 @@ where
 
         // Run extraction pipeline in parallel.
         let mut modified_docids = RoaringBitmap::new();
+        let embedder_stats = self.embedder_stats.clone();
         pool.install(|| {
             let settings_diff_cloned = settings_diff.clone();
             rayon::spawn(move || {
@@ -323,10 +335,11 @@ where
                         pool_params,
                         lmdb_writer_sx.clone(),
                         primary_key_id,
-                        embedders_configs.clone(),
                         settings_diff_cloned,
                         max_positions_per_attributes,
-                        Arc::new(possible_embedding_mistakes)
+                        embedder_infos,
+                        Arc::new(possible_embedding_mistakes),
+                        &embedder_stats
                     )
                 });
 
@@ -424,21 +437,21 @@ where
                     TypedChunk::VectorPoints {
                         expected_dimension,
                         remove_vectors,
-                        embeddings,
+                        embeddings_from_prompts,
+                        embeddings_from_fragments,
                         manual_vectors,
                         embedder_name,
-                        add_to_user_provided,
-                        remove_from_user_provided,
+                        embedding_status_delta,
                     } => {
                         dimension.insert(embedder_name.clone(), expected_dimension);
                         TypedChunk::VectorPoints {
                             remove_vectors,
-                            embeddings,
+                            embeddings_from_prompts,
+                            embeddings_from_fragments,
                             expected_dimension,
                             manual_vectors,
                             embedder_name,
-                            add_to_user_provided,
-                            remove_from_user_provided,
+                            embedding_status_delta,
                         }
                     }
                     otherwise => otherwise,
@@ -474,7 +487,7 @@ where
             // we should insert it in `dimension`
             for (name, action) in settings_diff.embedding_config_updates.iter() {
                 if action.is_being_quantized && !dimension.contains_key(name.as_str()) {
-                    let index = self.index.embedder_category_id.get(self.wtxn, name)?.ok_or(
+                    let index = self.index.embedding_configs().embedder_id(self.wtxn, name)?.ok_or(
                         InternalError::DatabaseMissingEntry {
                             db_name: "embedder_category_id",
                             key: None,
@@ -482,7 +495,9 @@ where
                     )?;
                     let reader =
                         ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
-                    let dim = reader.dimensions(self.wtxn)?;
+                    let Some(dim) = reader.dimensions(self.wtxn)? else {
+                        continue;
+                    };
                    dimension.insert(name.to_string(), dim);
                }
            }
@@ -492,12 +507,19 @@ where
            let vector_arroy = self.index.vector_arroy;
            let cancel = &self.should_abort;
 
-            let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
-                InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
-            )?;
+            let embedder_index =
+                self.index.embedding_configs().embedder_id(wtxn, &embedder_name)?.ok_or(
+                    InternalError::DatabaseMissingEntry {
+                        db_name: "embedder_category_id",
+                        key: None,
+                    },
+                )?;
            let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name);
-            let was_quantized =
-                settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2);
+            let was_quantized = settings_diff
+                .old
+                .runtime_embedders
+                .get(&embedder_name)
+                .is_some_and(|conf| conf.is_quantized);
            let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized);
 
            pool.install(|| {
@@ -767,11 +789,11 @@ mod tests {
     use crate::constants::RESERVED_GEO_FIELD_NAME;
     use crate::documents::mmap_from_objects;
     use crate::index::tests::TempIndex;
-    use crate::index::IndexEmbeddingConfig;
     use crate::progress::Progress;
     use crate::search::TermsMatchingStrategy;
     use crate::update::new::indexer;
     use crate::update::Setting;
+    use crate::vector::db::IndexEmbeddingConfig;
     use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError};
 
     #[test]
@@ -2022,9 +2044,10 @@ mod tests {
                 new_fields_ids_map,
                 primary_key,
                 &document_changes,
-                EmbeddingConfigs::default(),
+                RuntimeEmbedders::default(),
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2109,9 +2132,10 @@ mod tests {
                 new_fields_ids_map,
                 primary_key,
                 &document_changes,
-                EmbeddingConfigs::default(),
+                RuntimeEmbedders::default(),
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2269,7 +2293,7 @@ mod tests {
         ]);
 
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let mut indexer = indexer::DocumentOperation::new();
         indexer.replace_documents(&documents).unwrap();
         indexer.delete_documents(&["2"]);
@@ -2297,6 +2321,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2334,7 +2359,7 @@ mod tests {
         indexer.delete_documents(&["1", "2"]);
 
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let (document_changes, _operation_stats, primary_key) = indexer
             .into_changes(
                 &indexer_alloc,
@@ -2359,6 +2384,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2384,7 +2410,7 @@ mod tests {
             { "id": 3, "name": "jean", "age": 25 },
         ]);
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let mut indexer = indexer::DocumentOperation::new();
         indexer.update_documents(&documents).unwrap();
 
@@ -2412,6 +2438,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2435,7 +2462,7 @@ mod tests {
             { "id": 3, "legs": 4 },
         ]);
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let mut indexer = indexer::DocumentOperation::new();
         indexer.update_documents(&documents).unwrap();
         indexer.delete_documents(&["1", "2"]);
@@ -2464,6 +2491,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2484,7 +2512,7 @@ mod tests {
         let mut new_fields_ids_map = db_fields_ids_map.clone();
 
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let mut indexer = indexer::DocumentOperation::new();
         indexer.delete_documents(&["1", "2"]);
 
@@ -2518,6 +2546,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2539,7 +2568,7 @@ mod tests {
         let mut new_fields_ids_map = db_fields_ids_map.clone();
 
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let mut indexer = indexer::DocumentOperation::new();
 
         indexer.delete_documents(&["1", "2", "1", "2"]);
@@ -2577,6 +2606,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2597,7 +2627,7 @@ mod tests {
         let mut new_fields_ids_map = db_fields_ids_map.clone();
 
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let mut indexer = indexer::DocumentOperation::new();
 
         let documents = documents!([
@@ -2629,6 +2659,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2646,7 +2677,7 @@ mod tests {
         let mut new_fields_ids_map = db_fields_ids_map.clone();
 
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let mut indexer = indexer::DocumentOperation::new();
 
         indexer.delete_documents(&["1"]);
@@ -2681,6 +2712,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2759,6 +2791,8 @@ mod tests {
             document_template: Setting::NotSet,
             document_template_max_bytes: Setting::NotSet,
             url: Setting::NotSet,
+            indexing_fragments: Setting::NotSet,
+            search_fragments: Setting::NotSet,
             request: Setting::NotSet,
             response: Setting::NotSet,
             distribution: Setting::NotSet,
@@ -2785,17 +2819,27 @@ mod tests {
             .unwrap();
 
         let rtxn = index.read_txn().unwrap();
-        let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
-        let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } =
+        let embedders = index.embedding_configs();
+        let mut embedding_configs = embedders.embedding_configs(&rtxn).unwrap();
+        let IndexEmbeddingConfig { name: embedder_name, config: embedder, fragments } =
             embedding_configs.pop().unwrap();
+        let info = embedders.embedder_info(&rtxn, &embedder_name).unwrap().unwrap();
+        insta::assert_snapshot!(info.embedder_id, @"0");
+        insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0, 1, 2]>");
+        insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0, 1, 2]>");
         insta::assert_snapshot!(embedder_name, @"manual");
-        insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>");
+        insta::assert_debug_snapshot!(fragments, @r###"
+        FragmentConfigs(
+            [],
+        )
+        "###);
 
         let embedder = std::sync::Arc::new(
             crate::vector::Embedder::new(embedder.embedder_options, 0).unwrap(),
         );
         let res = index
             .search(&rtxn)
-            .semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec()))
+            .semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec()), None)
             .execute()
             .unwrap();
         assert_eq!(res.documents_ids.len(), 3);
@@ -2844,7 +2888,7 @@ mod tests {
         let mut new_fields_ids_map = db_fields_ids_map.clone();
 
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let mut indexer = indexer::DocumentOperation::new();
 
         // OP
@@ -2879,6 +2923,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2904,7 +2949,7 @@ mod tests {
         let mut new_fields_ids_map = db_fields_ids_map.clone();
 
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let mut indexer = indexer::DocumentOperation::new();
 
         indexer.delete_documents(&["1"]);
@@ -2938,6 +2983,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
             .unwrap();
         wtxn.commit().unwrap();
@@ -2962,7 +3008,7 @@ mod tests {
         let mut new_fields_ids_map = db_fields_ids_map.clone();
 
         let indexer_alloc = Bump::new();
-        let embedders = EmbeddingConfigs::default();
+        let embedders = RuntimeEmbedders::default();
         let mut indexer = indexer::DocumentOperation::new();
 
         let documents = documents!([
@@ -2994,6 +3040,7 @@ mod tests {
                 embedders,
                 &|| false,
                 &Progress::default(),
+                &Default::default(),
             )
            .unwrap();
        wtxn.commit().unwrap();
@@ -31,7 +31,7 @@ use crate::update::index_documents::GrenadParameters;
 use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
 use crate::update::{AvailableIds, UpdateIndexingStep};
 use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
-use crate::vector::settings::WriteBackToDocuments;
+use crate::vector::settings::{RemoveFragments, WriteBackToDocuments};
 use crate::vector::ArroyWrapper;
 use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result};
 
@@ -933,10 +933,47 @@ impl<'a, 'i> Transform<'a, 'i> {
 
         // delete all vectors from the embedders that need removal
         for (_, (reader, _)) in readers {
-            let dimensions = reader.dimensions(wtxn)?;
+            let Some(dimensions) = reader.dimensions(wtxn)? else {
+                continue;
+            };
             reader.clear(wtxn, dimensions)?;
         }
 
+        // remove all vectors for the specified fragments
+        for (embedder_name, RemoveFragments { fragment_ids }, was_quantized) in
+            settings_diff.embedding_config_updates.iter().filter_map(|(name, action)| {
+                action.remove_fragments().map(|fragments| (name, fragments, action.was_quantized))
+            })
+        {
+            let Some(infos) = self.index.embedding_configs().embedder_info(wtxn, embedder_name)?
+            else {
+                continue;
+            };
+            let arroy =
+                ArroyWrapper::new(self.index.vector_arroy, infos.embedder_id, was_quantized);
+            let Some(dimensions) = arroy.dimensions(wtxn)? else {
+                continue;
+            };
+            for fragment_id in fragment_ids {
+                // we must keep the user provided embeddings that ended up in this store
+
+                if infos.embedding_status.user_provided_docids().is_empty() {
+                    // no user provided: clear store
+                    arroy.clear_store(wtxn, *fragment_id, dimensions)?;
+                    continue;
+                }
+
+                // some user provided, remove only the ids that are not user provided
+                let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| {
+                    items - infos.embedding_status.user_provided_docids()
+                })?;
+
+                for to_delete in to_delete {
+                    arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
+                }
+            }
+        }
+
         let grenad_params = GrenadParameters {
             chunk_compression_type: self.indexer_settings.chunk_compression_type,
             chunk_compression_level: self.indexer_settings.chunk_compression_level,
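Note: the fragment-removal loop keeps user-provided embeddings by deleting only the set difference `items - user_provided`. A minimal sketch of that bitmap arithmetic with the `roaring` crate (the arroy store itself is mocked as a plain bitmap here):

```rust
use roaring::RoaringBitmap;

fn main() {
    // Documents currently stored for one fragment.
    let items_in_store: RoaringBitmap = (0..10).collect();
    // Documents whose embeddings were provided by the user and must survive.
    let user_provided: RoaringBitmap = [2u32, 5, 7].into_iter().collect();

    // Same computation as `items - infos.embedding_status.user_provided_docids()`.
    let to_delete = &items_in_store - &user_provided;

    assert!(to_delete.contains(0) && !to_delete.contains(5));
    println!("would delete {} of {} items", to_delete.len(), items_in_store.len());
}
```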
@ -4,6 +4,7 @@ use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use bytemuck::allocation::pod_collect_to_vec;
|
||||
use byteorder::{BigEndian, ReadBytesExt as _};
|
||||
use grenad::{MergeFunction, Merger, MergerBuilder};
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, RwTxn};
|
||||
@ -18,7 +19,6 @@ use super::helpers::{
|
||||
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||
use crate::facet::FacetType;
|
||||
use crate::index::db_name::DOCUMENTS;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
@ -26,6 +26,7 @@ use crate::update::index_documents::helpers::{
|
||||
as_cloneable_grenad, try_split_array_at, KeepLatestObkv,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig};
|
||||
use crate::vector::ArroyWrapper;
|
||||
use crate::{
|
||||
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
|
||||
@ -86,12 +87,14 @@ pub(crate) enum TypedChunk {
|
||||
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||
VectorPoints {
|
||||
remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
embeddings: Option<grenad::Reader<BufReader<File>>>,
|
||||
// docid -> vector
|
||||
embeddings_from_prompts: Option<grenad::Reader<BufReader<File>>>,
|
||||
// docid, extractor_id -> Option<vector>,
|
||||
embeddings_from_fragments: Option<grenad::Reader<BufReader<File>>>,
|
||||
expected_dimension: usize,
|
||||
manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
embedder_name: String,
|
||||
add_to_user_provided: RoaringBitmap,
|
||||
remove_from_user_provided: RoaringBitmap,
|
||||
embedding_status_delta: EmbeddingStatusDelta,
|
||||
},
|
||||
}
|
||||
|
||||
@ -155,6 +158,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
|
||||
let embedders: BTreeSet<_> = index
|
||||
.embedding_configs()
|
||||
.embedding_configs(wtxn)?
|
||||
.into_iter()
|
||||
.map(|IndexEmbeddingConfig { name, .. }| name)
|
||||
@ -614,57 +618,66 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let embedders = index.embedding_configs();
|
||||
|
||||
let mut remove_vectors_builder = MergerBuilder::new(KeepFirst);
|
||||
let mut manual_vectors_builder = MergerBuilder::new(KeepFirst);
let mut embeddings_builder = MergerBuilder::new(KeepFirst);
let mut add_to_user_provided = RoaringBitmap::new();
let mut remove_from_user_provided = RoaringBitmap::new();
let mut embeddings_from_prompts_builder = MergerBuilder::new(KeepFirst);
let mut embeddings_from_fragments_builder = MergerBuilder::new(KeepFirst);
let mut params = None;
let mut infos = None;
for typed_chunk in typed_chunks {
    let TypedChunk::VectorPoints {
        remove_vectors,
        manual_vectors,
        embeddings,
        embeddings_from_prompts,
        embeddings_from_fragments,
        expected_dimension,
        embedder_name,
        add_to_user_provided: aud,
        remove_from_user_provided: rud,
        embedding_status_delta,
    } = typed_chunk
    else {
        unreachable!();
    };

    if infos.is_none() {
        infos = Some(embedders.embedder_info(wtxn, &embedder_name)?.ok_or(
            InternalError::DatabaseMissingEntry {
                db_name: "embedder_category_id",
                key: None,
            },
        )?);
    }

    params = Some((expected_dimension, embedder_name));

    remove_vectors_builder.push(remove_vectors.into_cursor()?);
    manual_vectors_builder.push(manual_vectors.into_cursor()?);
    if let Some(embeddings) = embeddings {
        embeddings_builder.push(embeddings.into_cursor()?);
    if let Some(embeddings) = embeddings_from_prompts {
        embeddings_from_prompts_builder.push(embeddings.into_cursor()?);
    }
    if let Some(embeddings) = embeddings_from_fragments {
        embeddings_from_fragments_builder.push(embeddings.into_cursor()?);
    }

    if let Some(infos) = &mut infos {
        embedding_status_delta.apply_to(&mut infos.embedding_status);
    }
    add_to_user_provided |= aud;
    remove_from_user_provided |= rud;
}

// typed chunks has always at least 1 chunk.
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
let Some(infos) = infos else { unreachable!() };

let mut embedding_configs = index.embedding_configs(wtxn)?;
let index_embedder_config = embedding_configs
    .iter_mut()
    .find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name)
    .unwrap();
index_embedder_config.user_provided -= remove_from_user_provided;
index_embedder_config.user_provided |= add_to_user_provided;
embedders.put_embedder_info(wtxn, &embedder_name, &infos)?;

index.put_embedding_configs(wtxn, embedding_configs)?;

let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
    InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
)?;
let binary_quantized =
    settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2);
let binary_quantized = settings_diff
    .old
    .runtime_embedders
    .get(&embedder_name)
    .is_some_and(|conf| conf.is_quantized);
// FIXME: allow customizing distance
let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized);
let writer = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, binary_quantized);

// remove vectors for docids we want them removed
let merger = remove_vectors_builder.build();
@ -674,8 +687,8 @@ pub(crate) fn write_typed_chunk_into_index(
    writer.del_items(wtxn, expected_dimension, docid)?;
}

// add generated embeddings
let merger = embeddings_builder.build();
// add generated embeddings -- from prompts
let merger = embeddings_from_prompts_builder.build();
let mut iter = merger.into_stream_merger_iter()?;
while let Some((key, value)) = iter.next()? {
    let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
@ -702,6 +715,24 @@ pub(crate) fn write_typed_chunk_into_index(
    writer.add_items(wtxn, docid, &embeddings)?;
}

// add generated embeddings -- from fragments
let merger = embeddings_from_fragments_builder.build();
let mut iter = merger.into_stream_merger_iter()?;
while let Some((mut key, value)) = iter.next()? {
    let docid = key.read_u32::<BigEndian>().unwrap();
    let extractor_id = key.read_u8().unwrap();
    if value.is_empty() {
        writer.del_item_in_store(wtxn, docid, extractor_id, expected_dimension)?;
    } else {
        let data = pod_collect_to_vec(value);
        // it is a code error to have embeddings and not expected_dimension
        if data.len() != expected_dimension {
            panic!("wrong dimensions")
        }
        writer.add_item_in_store(wtxn, docid, extractor_id, &data)?;
    }
}
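// --- editorial sketch (not part of this change) ----------------------------
// The merger key consumed by the fragments loop above is, per the
// `read_u32::<BigEndian>()` then `read_u8()` reads, a big-endian document id
// followed by the extractor (fragment) id. A writer-side encoding under that
// assumption:
fn fragment_key(docid: u32, extractor_id: u8) -> Vec<u8> {
    use byteorder::{BigEndian, WriteBytesExt};
    let mut key = Vec::with_capacity(5);
    key.write_u32::<BigEndian>(docid).unwrap(); // 4 bytes, big-endian docid
    key.write_u8(extractor_id).unwrap(); // 1 byte, fragment extractor id
    key
}
// ----------------------------------------------------------------------------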

// perform the manual diff
let merger = manual_vectors_builder.build();
let mut iter = merger.into_stream_merger_iter()?;
@ -15,6 +15,7 @@ pub struct IndexerConfig {
    pub thread_pool: ThreadPoolNoAbort,
    pub max_positions_per_attributes: Option<u32>,
    pub skip_index_budget: bool,
    pub experimental_no_edition_2024_for_settings: bool,
}

impl IndexerConfig {
@ -63,6 +64,7 @@ impl Default for IndexerConfig {
            chunk_compression_level: None,
            max_positions_per_attributes: None,
            skip_index_budget: false,
            experimental_no_edition_2024_for_settings: false,
        }
    }
}
@ -4,7 +4,7 @@ pub use self::clear_documents::ClearDocuments;
pub use self::concurrent_available_ids::ConcurrentAvailableIds;
pub use self::facet::bulk::FacetsUpdateBulk;
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
pub use self::index_documents::*;
pub use self::index_documents::{request_threads, *};
pub use self::indexer_config::{default_thread_pool_and_threads, IndexerConfig};
pub use self::new::ChannelCongestion;
pub use self::settings::{validate_embedding_settings, Setting, Settings};
@ -138,6 +138,7 @@ pub enum ReceiverAction {
    WakeUp,
    LargeEntry(LargeEntry),
    LargeVectors(LargeVectors),
    LargeVector(LargeVector),
}

/// An entry that cannot fit in the BBQueue buffers has been
@ -174,6 +175,24 @@ impl LargeVectors {
    }
}

#[derive(Debug)]
pub struct LargeVector {
    /// The document id associated to the large embedding.
    pub docid: DocumentId,
    /// The embedder id in which to insert the large embedding.
    pub embedder_id: u8,
    /// The extractor id in which to insert the large embedding.
    pub extractor_id: u8,
    /// The large embedding that must be written.
    pub embedding: Mmap,
}

impl LargeVector {
    pub fn read_embedding(&self, dimensions: usize) -> &[f32] {
        self.embedding.chunks_exact(dimensions).map(bytemuck::cast_slice).next().unwrap()
    }
}

impl<'a> WriterBbqueueReceiver<'a> {
    /// Tries to receive an action to do until the timeout occurs
    /// and if it does, consider it as a spurious wake up.
@ -238,6 +257,7 @@ pub enum EntryHeader {
    DbOperation(DbOperation),
    ArroyDeleteVector(ArroyDeleteVector),
    ArroySetVectors(ArroySetVectors),
    ArroySetVector(ArroySetVector),
}

impl EntryHeader {
@ -250,6 +270,7 @@ impl EntryHeader {
            EntryHeader::DbOperation(_) => 0,
            EntryHeader::ArroyDeleteVector(_) => 1,
            EntryHeader::ArroySetVectors(_) => 2,
            EntryHeader::ArroySetVector(_) => 3,
        }
    }

@ -274,11 +295,17 @@ impl EntryHeader {
        Self::variant_size() + mem::size_of::<ArroySetVectors>() + embedding_size * count
    }

    fn total_set_vector_size(dimensions: usize) -> usize {
        let embedding_size = dimensions * mem::size_of::<f32>();
        Self::variant_size() + mem::size_of::<ArroySetVector>() + embedding_size
    }

    fn header_size(&self) -> usize {
        let payload_size = match self {
            EntryHeader::DbOperation(op) => mem::size_of_val(op),
            EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv),
            EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs),
            EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv),
        };
        Self::variant_size() + payload_size
    }
@ -301,6 +328,11 @@ impl EntryHeader {
                let header = checked::pod_read_unaligned(header_bytes);
                EntryHeader::ArroySetVectors(header)
            }
            3 => {
                let header_bytes = &remaining[..mem::size_of::<ArroySetVector>()];
                let header = checked::pod_read_unaligned(header_bytes);
                EntryHeader::ArroySetVector(header)
            }
            id => panic!("invalid variant id: {id}"),
        }
    }
@ -311,6 +343,7 @@ impl EntryHeader {
            EntryHeader::DbOperation(op) => bytemuck::bytes_of(op),
            EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv),
            EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs),
            EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv),
        };
        *first = self.variant_id();
        remaining.copy_from_slice(payload_bytes);
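// --- editorial sketch (not part of this change) ----------------------------
// Frame layout implied by `serialize_into` and `total_set_vector_size` above,
// for the new variant 3:
//
//   [ variant id: 1 byte | ArroySetVector header: 8 bytes | dimensions * 4 bytes of f32 ]
//
// The 8-byte header assumes the #[repr(C)] { u32, u8, u8, [u8; 2] } struct
// declared below; a compile-time check of that assumption:
const _: () = assert!(std::mem::size_of::<ArroySetVector>() == 8);
// ----------------------------------------------------------------------------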
@ -379,6 +412,37 @@ impl ArroySetVectors {
    }
}

#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
#[repr(C)]
/// The embeddings are in the remaining space and represents
/// non-aligned [f32] each with dimensions f32s.
pub struct ArroySetVector {
    pub docid: DocumentId,
    pub embedder_id: u8,
    pub extractor_id: u8,
    _padding: [u8; 2],
}

impl ArroySetVector {
    fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] {
        let skip = EntryHeader::variant_size() + mem::size_of::<Self>();
        &frame[skip..]
    }

    /// Read the embedding and write it into an aligned `f32` Vec.
    pub fn read_all_embeddings_into_vec<'v>(
        &self,
        frame: &FrameGrantR<'_>,
        vec: &'v mut Vec<f32>,
    ) -> &'v [f32] {
        let embeddings_bytes = Self::embeddings_bytes(frame);
        let embeddings_count = embeddings_bytes.len() / mem::size_of::<f32>();
        vec.resize(embeddings_count, 0.0);
        bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes);
        &vec[..]
    }
}

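// --- editorial sketch (not part of this change) ----------------------------
// Receiver-side usage of `read_all_embeddings_into_vec` above: a single
// scratch Vec<f32> can be reused across frames because the helper resizes it
// before copying the unaligned frame bytes in. A `None` embedding is sent as
// a zero-length payload (see `set_vector_for_extractor` below), so an empty
// slice is the "no vector" case.
fn handle_set_vector_frame(
    asv: &ArroySetVector,
    frame: &FrameGrantR<'_>,
    scratch: &mut Vec<f32>,
) -> usize {
    let embedding: &[f32] = asv.read_all_embeddings_into_vec(frame, scratch);
    embedding.len() // zero when the sender had no vector for this extractor
}
// ----------------------------------------------------------------------------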
#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)]
#[repr(u16)]
pub enum Database {
@ -398,6 +462,7 @@ pub enum Database {
    FacetIdStringDocids,
    FieldIdDocidFacetStrings,
    FieldIdDocidFacetF64s,
    VectorEmbedderCategoryId,
}

impl Database {
@ -419,6 +484,7 @@ impl Database {
            Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(),
            Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(),
            Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(),
            Database::VectorEmbedderCategoryId => index.embedder_category_id.remap_types(),
        }
    }

@ -440,6 +506,7 @@ impl Database {
            Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS,
            Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS,
            Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S,
            Database::VectorEmbedderCategoryId => db_name::VECTOR_EMBEDDER_CATEGORY_ID,
        }
    }
}
@ -568,6 +635,82 @@ impl<'b> ExtractorBbqueueSender<'b> {
        Ok(())
    }

    fn set_vector_for_extractor(
        &self,
        docid: u32,
        embedder_id: u8,
        extractor_id: u8,
        embedding: Option<Embedding>,
    ) -> crate::Result<()> {
        let max_grant = self.max_grant;
        let refcell = self.producers.get().unwrap();
        let mut producer = refcell.0.borrow_mut_or_yield();

        // If there are no vectors we specify the dimensions
        // to zero to allocate no extra space at all
        let dimensions = embedding.as_ref().map_or(0, |emb| emb.len());

        let arroy_set_vector =
            ArroySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] };
        let payload_header = EntryHeader::ArroySetVector(arroy_set_vector);
        let total_length = EntryHeader::total_set_vector_size(dimensions);
        if total_length > max_grant {
            let mut value_file = tempfile::tempfile().map(BufWriter::new)?;
            let embedding = embedding.expect("set_vector without a vector does not fit in RAM");

            let mut embedding_bytes = bytemuck::cast_slice(&embedding);
            io::copy(&mut embedding_bytes, &mut value_file)?;

            let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?;
            let embedding = unsafe { Mmap::map(&value_file)? };

            let large_vectors = LargeVector { docid, embedder_id, extractor_id, embedding };
            self.sender.send(ReceiverAction::LargeVector(large_vectors)).unwrap();

            return Ok(());
        }

        // Spin loop to have a frame the size we requested.
        reserve_and_write_grant(
            &mut producer,
            total_length,
            &self.sender,
            &self.sent_messages_attempts,
            &self.blocking_sent_messages_attempts,
            |grant| {
                let header_size = payload_header.header_size();
                let (header_bytes, remaining) = grant.split_at_mut(header_size);
                payload_header.serialize_into(header_bytes);

                if dimensions != 0 {
                    let output_iter =
                        remaining.chunks_exact_mut(dimensions * mem::size_of::<f32>());

                    for (embedding, output) in embedding.iter().zip(output_iter) {
                        output.copy_from_slice(bytemuck::cast_slice(embedding));
                    }
                }

                Ok(())
            },
        )?;

        Ok(())
    }
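// --- editorial sketch (not part of this change) ----------------------------
// The spill rule in `set_vector_for_extractor` above, isolated: an embedding
// travels through the BBQueue only when the whole frame fits in one grant;
// otherwise it is written to a tempfile, memory-mapped, and sent as a
// ReceiverAction::LargeVector. Assuming the 1-byte variant tag and the 8-byte
// ArroySetVector header:
fn spills_to_large_vector(dimensions: usize, max_grant: usize) -> bool {
    1 + 8 + dimensions * std::mem::size_of::<f32>() > max_grant
}
// ----------------------------------------------------------------------------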

    fn embedding_status(
        &self,
        name: &str,
        infos: crate::vector::db::EmbedderInfo,
    ) -> crate::Result<()> {
        let bytes = infos.to_bytes().map_err(|_| {
            InternalError::Serialization(crate::SerializationError::Encoding {
                db_name: Some(Database::VectorEmbedderCategoryId.database_name()),
            })
        })?;
        self.write_key_value(Database::VectorEmbedderCategoryId, name.as_bytes(), &bytes)
    }

    fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> {
        let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| {
            InternalError::StorePut {
@ -942,9 +1085,18 @@ impl EmbeddingSender<'_, '_> {
        &self,
        docid: DocumentId,
        embedder_id: u8,
        embedding: Embedding,
        extractor_id: u8,
        embedding: Option<Embedding>,
    ) -> crate::Result<()> {
        self.0.set_vectors(docid, embedder_id, &[embedding])
        self.0.set_vector_for_extractor(docid, embedder_id, extractor_id, embedding)
    }

    pub(crate) fn embedding_status(
        &self,
        name: &str,
        infos: crate::vector::db::EmbedderInfo,
    ) -> crate::Result<()> {
        self.0.embedding_status(name, infos)
    }
}

@ -1,7 +1,10 @@
use std::cell::{Cell, RefCell};
use std::collections::{BTreeMap, BTreeSet};
use std::sync::RwLock;

use bumpalo::Bump;
use bumparaw_collections::RawMap;
use heed::RoTxn;
use heed::{RoTxn, WithoutTls};
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue;

@ -9,7 +12,14 @@ use super::vector_document::VectorDocument;
use super::{KvReaderFieldId, KvWriterFieldId};
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
use crate::documents::FieldIdMapper;
use crate::{DocumentId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError};
use crate::update::del_add::KvReaderDelAdd;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::new::vector_document::VectorDocumentFromDb;
use crate::vector::settings::EmbedderAction;
use crate::{
    DocumentId, FieldIdMapWithMetadata, FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError,
    Result, UserError,
};

/// A view into a document that can represent either the current version from the DB,
/// the update data from payload or other means, or the merged updated version.
@ -309,6 +319,7 @@ where
pub fn write_to_obkv<'s, 'a, 'map, 'buffer>(
    document: &'s impl Document<'s>,
    vector_document: Option<&'s impl VectorDocument<'s>>,
    embedder_actions: &'a BTreeMap<String, EmbedderAction>,
    fields_ids_map: &'a mut GlobalFieldsIdsMap<'map>,
    mut document_buffer: &'a mut bumpalo::collections::Vec<'buffer, u8>,
) -> Result<&'a KvReaderFieldId>
@ -338,20 +349,39 @@ where
for res in vector_document.iter_vectors() {
    let (name, entry) = res?;
    if entry.has_configured_embedder {
        continue; // we don't write vectors with configured embedder in documents
        if let Some(action) = embedder_actions.get(name) {
            if action.write_back().is_some() && !entry.regenerate {
                vectors.insert(
                    name,
                    serde_json::json!({
                        "regenerate": entry.regenerate,
                        // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object
                        "embeddings": entry.embeddings,
                    }),
                );
            }
        }
    } else {
        match embedder_actions.get(name) {
            Some(action) if action.write_back().is_none() => {
                continue;
            }
            _ => {
                vectors.insert(
                    name,
                    if entry.implicit {
                        serde_json::json!(entry.embeddings)
                    } else {
                        serde_json::json!({
                            "regenerate": entry.regenerate,
                            // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object
                            "embeddings": entry.embeddings,
                        })
                    },
                );
            }
        }
    }
    vectors.insert(
        name,
        if entry.implicit {
            serde_json::json!(entry.embeddings)
        } else {
            serde_json::json!({
                "regenerate": entry.regenerate,
                // TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object
                "embeddings": entry.embeddings,
            })
        },
    );
}

if vectors.is_empty() {
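// --- editorial sketch (not part of this change) ----------------------------
// Shape produced by the `vectors.insert` calls above for one embedder entry;
// the values here are hypothetical. Implicit entries are written as the bare
// `embeddings` array instead of this object.
fn sample_vector_entry() -> serde_json::Value {
    serde_json::json!({
        "regenerate": false,
        "embeddings": [[0.1, 0.2, 0.3]],
    })
}
// ----------------------------------------------------------------------------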
@ -439,3 +469,231 @@ impl<'doc> Versions<'doc> {
        self.data.get(k)
    }
}

#[derive(Debug)]
pub struct KvDelAddDocument<'a, Mapper: FieldIdMapper> {
    document: &'a obkv::KvReaderU16,
    side: crate::update::del_add::DelAdd,
    fields_ids_map: &'a Mapper,
}

impl<'a, Mapper: FieldIdMapper> KvDelAddDocument<'a, Mapper> {
    pub fn new(
        document: &'a obkv::KvReaderU16,
        side: crate::update::del_add::DelAdd,
        fields_ids_map: &'a Mapper,
    ) -> Self {
        Self { document, side, fields_ids_map }
    }

    fn get(&self, k: &str) -> Result<Option<&'a RawValue>> {
        let Some(id) = self.fields_ids_map.id(k) else { return Ok(None) };
        let Some(value) = self.document.get(id) else { return Ok(None) };
        let Some(value) = KvReaderDelAdd::from_slice(value).get(self.side) else { return Ok(None) };

        let value = serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?;

        Ok(Some(value))
    }
}

impl<'a, Mapper: FieldIdMapper> Document<'a> for KvDelAddDocument<'a, Mapper> {
    fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'a str, &'a RawValue)>> {
        let mut it = self.document.iter();

        std::iter::from_fn(move || loop {
            let (fid, value) = it.next()?;
            let Some(value) = KvReaderDelAdd::from_slice(value).get(self.side) else {
                continue;
            };
            let name = match self.fields_ids_map.name(fid).ok_or(
                InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId {
                    field_id: fid,
                    process: "getting current document",
                }),
            ) {
                Ok(name) => name,
                Err(error) => return Some(Err(error.into())),
            };

            if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME {
                continue;
            }

            let res = (|| {
                let value =
                    serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?;

                Ok((name, value))
            })();

            return Some(res);
        })
    }

    fn top_level_fields_count(&self) -> usize {
        let mut it = self.document.iter();

        std::iter::from_fn(move || loop {
            let (fid, value) = it.next()?;
            let Some(_) = KvReaderDelAdd::from_slice(value).get(self.side) else {
                continue;
            };
            let name = match self.fields_ids_map.name(fid).ok_or(
                InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId {
                    field_id: fid,
                    process: "getting current document",
                }),
            ) {
                Ok(name) => name,
                Err(_) => return Some(()),
            };

            if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME {
                continue;
            }

            return Some(());
        })
        .count()
    }

    fn top_level_field(&self, k: &str) -> Result<Option<&'a RawValue>> {
        if k == RESERVED_VECTORS_FIELD_NAME || k == RESERVED_GEO_FIELD_NAME {
            return Ok(None);
        }
        self.get(k)
    }

    fn vectors_field(&self) -> Result<Option<&'a RawValue>> {
        self.get(RESERVED_VECTORS_FIELD_NAME)
    }

    fn geo_field(&self) -> Result<Option<&'a RawValue>> {
        self.get(RESERVED_GEO_FIELD_NAME)
    }
}

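// --- editorial sketch (not part of this change) ----------------------------
// Intended usage of `KvDelAddDocument` above: pick one del/add side of an
// obkv and read it through the regular `Document` trait. The field name
// "title" is hypothetical.
fn deleted_side_title<'a, M: FieldIdMapper>(
    obkv: &'a obkv::KvReaderU16,
    mapper: &'a M,
) -> Result<Option<&'a RawValue>> {
    let doc = KvDelAddDocument::new(obkv, crate::update::del_add::DelAdd::Deletion, mapper);
    doc.top_level_field("title")
}
// ----------------------------------------------------------------------------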
pub struct DocumentIdentifiers<'doc> {
    docid: DocumentId,
    external_document_id: &'doc str,
}

impl<'doc> DocumentIdentifiers<'doc> {
    pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self {
        Self { docid, external_document_id }
    }

    pub fn docid(&self) -> DocumentId {
        self.docid
    }

    pub fn external_document_id(&self) -> &'doc str {
        self.external_document_id
    }

    pub fn current<'a, Mapper: FieldIdMapper>(
        &self,
        rtxn: &'a RoTxn,
        index: &'a Index,
        mapper: &'a Mapper,
    ) -> Result<DocumentFromDb<'a, Mapper>> {
        Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
            crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
        )?)
    }

    pub fn current_vectors<'a, Mapper: FieldIdMapper>(
        &self,
        rtxn: &'a RoTxn,
        index: &'a Index,
        mapper: &'a Mapper,
        doc_alloc: &'a Bump,
    ) -> Result<VectorDocumentFromDb<'a>> {
        Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or(
            crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
        )?)
    }
}

pub struct DocumentContext<
    'doc,             // covariant lifetime of a single `process` call
    'extractor: 'doc, // invariant lifetime of the extractor_allocs
    'fid: 'doc,       // invariant lifetime of the new_fields_ids_map
    'indexer: 'doc,   // covariant lifetime of objects that outlive a single `process` call
    T: MostlySend,
> {
    /// The index we're indexing in
    pub index: &'indexer Index,
    /// The fields ids map as it was at the start of this indexing process. Contains at least all top-level fields from documents
    /// inside of the DB.
    pub db_fields_ids_map: &'indexer FieldsIdsMap,
    /// A transaction providing data from the DB before all indexing operations
    pub rtxn: RoTxn<'indexer, WithoutTls>,

    /// Global field id map that is up to date with the current state of the indexing process.
    ///
    /// - Inserting a field will take a lock
    /// - Retrieving a field may take a lock as well
    pub new_fields_ids_map: &'doc std::cell::RefCell<GlobalFieldsIdsMap<'fid>>,

    /// Data allocated in this allocator is cleared between each call to `process`.
    pub doc_alloc: Bump,

    /// Data allocated in this allocator is not cleared between each call to `process`, unless the data spills.
    pub extractor_alloc: &'extractor Bump,

    /// Pool of doc allocators, used to retrieve the doc allocator we provided for the documents
    pub doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>,

    /// Extractor-specific data
    pub data: &'doc T,
}

impl<
        'doc,             // covariant lifetime of a single `process` call
        'data: 'doc,      // invariant on T lifetime of the datastore
        'extractor: 'doc, // invariant lifetime of extractor_allocs
        'fid: 'doc,       // invariant lifetime of fields ids map
        'indexer: 'doc,   // covariant lifetime of objects that survive a `process` call
        T: MostlySend,
    > DocumentContext<'doc, 'extractor, 'fid, 'indexer, T>
{
    #[allow(clippy::too_many_arguments)]
    pub fn new<F>(
        index: &'indexer Index,
        db_fields_ids_map: &'indexer FieldsIdsMap,
        new_fields_ids_map: &'fid RwLock<FieldIdMapWithMetadata>,
        extractor_allocs: &'extractor ThreadLocal<FullySend<Bump>>,
        doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>,
        datastore: &'data ThreadLocal<T>,
        fields_ids_map_store: &'doc ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
        init_data: F,
    ) -> Result<Self>
    where
        F: FnOnce(&'extractor Bump) -> Result<T>,
    {
        let doc_alloc =
            doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024))));
        let doc_alloc = doc_alloc.0.take();
        let fields_ids_map = fields_ids_map_store
            .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into());

        let fields_ids_map = &fields_ids_map.0;
        let extractor_alloc = extractor_allocs.get_or_default();

        let data = datastore.get_or_try(move || init_data(&extractor_alloc.0))?;

        let txn = index.read_txn()?;
        Ok(DocumentContext {
            index,
            rtxn: txn,
            db_fields_ids_map,
            new_fields_ids_map: fields_ids_map,
            doc_alloc,
            extractor_alloc: &extractor_alloc.0,
            data,
            doc_allocs,
        })
    }
}

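// --- editorial sketch (not part of this change) ----------------------------
// The allocator handoff in `DocumentContext::new` above, in isolation: each
// thread parks a Bump in a Cell and `take`s it out for one batch, leaving a
// default (empty) Bump in the slot. `Bump: Default` makes this a cheap move.
fn take_doc_alloc(slot: &std::cell::Cell<bumpalo::Bump>) -> bumpalo::Bump {
    slot.take()
}
// ----------------------------------------------------------------------------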
@ -10,20 +10,16 @@ use super::vector_document::{
};
use crate::attribute_patterns::PatternMatch;
use crate::documents::FieldIdMapper;
use crate::vector::EmbeddingConfigs;
use crate::update::new::document::DocumentIdentifiers;
use crate::vector::RuntimeEmbedders;
use crate::{DocumentId, Index, InternalError, Result};

pub enum DocumentChange<'doc> {
    Deletion(Deletion<'doc>),
    Deletion(DocumentIdentifiers<'doc>),
    Update(Update<'doc>),
    Insertion(Insertion<'doc>),
}

pub struct Deletion<'doc> {
    docid: DocumentId,
    external_document_id: &'doc str,
}

pub struct Update<'doc> {
    docid: DocumentId,
    external_document_id: &'doc str,
@ -55,31 +51,6 @@ impl<'doc> DocumentChange<'doc> {
    }
}

impl<'doc> Deletion<'doc> {
    pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self {
        Self { docid, external_document_id }
    }

    pub fn docid(&self) -> DocumentId {
        self.docid
    }

    pub fn external_document_id(&self) -> &'doc str {
        self.external_document_id
    }

    pub fn current<'a, Mapper: FieldIdMapper>(
        &self,
        rtxn: &'a RoTxn,
        index: &'a Index,
        mapper: &'a Mapper,
    ) -> Result<DocumentFromDb<'a, Mapper>> {
        Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
            crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
        )?)
    }
}

impl<'doc> Insertion<'doc> {
    pub fn create(docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>) -> Self {
        Insertion { docid, external_document_id, new }
@ -99,7 +70,7 @@ impl<'doc> Insertion<'doc> {
    pub fn inserted_vectors(
        &self,
        doc_alloc: &'doc Bump,
        embedders: &'doc EmbeddingConfigs,
        embedders: &'doc RuntimeEmbedders,
    ) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
        VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
    }
@ -270,7 +241,7 @@ impl<'doc> Update<'doc> {
    pub fn only_changed_vectors(
        &self,
        doc_alloc: &'doc Bump,
        embedders: &'doc EmbeddingConfigs,
        embedders: &'doc RuntimeEmbedders,
    ) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
        VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders)
    }
@ -281,7 +252,7 @@ impl<'doc> Update<'doc> {
        index: &'doc Index,
        mapper: &'doc Mapper,
        doc_alloc: &'doc Bump,
        embedders: &'doc EmbeddingConfigs,
        embedders: &'doc RuntimeEmbedders,
    ) -> Result<Option<MergedVectorDocument<'doc>>> {
        if self.from_scratch {
            MergedVectorDocument::without_db(
@ -1,26 +1,33 @@
use std::cell::RefCell;
use std::collections::BTreeMap;

use bumpalo::Bump;
use hashbrown::HashMap;

use super::DelAddRoaringBitmap;
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::update::new::channel::DocumentsSender;
use crate::update::new::document::{write_to_obkv, Document as _};
use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor};
use crate::update::new::channel::{DocumentsSender, ExtractorBbqueueSender};
use crate::update::new::document::{write_to_obkv, Document, DocumentContext, DocumentIdentifiers};
use crate::update::new::indexer::document_changes::{Extractor, IndexingContext};
use crate::update::new::indexer::settings_changes::{
    settings_change_extract, DocumentsIndentifiers, SettingsChangeExtractor,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::thread_local::FullySend;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::vector_document::VectorDocument;
use crate::update::new::DocumentChange;
use crate::vector::EmbeddingConfigs;
use crate::update::settings::SettingsDelta;
use crate::vector::settings::EmbedderAction;
use crate::vector::RuntimeEmbedders;
use crate::Result;

pub struct DocumentsExtractor<'a, 'b> {
    document_sender: DocumentsSender<'a, 'b>,
    embedders: &'a EmbeddingConfigs,
    embedders: &'a RuntimeEmbedders,
}

impl<'a, 'b> DocumentsExtractor<'a, 'b> {
    pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self {
    pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a RuntimeEmbedders) -> Self {
        Self { document_sender, embedders }
    }
}
@ -41,10 +48,11 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> {
    fn process<'doc>(
        &self,
        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
        context: &DocumentChangeContext<Self::Data>,
        context: &DocumentContext<Self::Data>,
    ) -> Result<()> {
        let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc);
        let mut document_extractor_data = context.data.0.borrow_mut_or_yield();
        let embedder_actions = &Default::default();

        for change in changes {
            let change = change?;
@ -121,9 +129,11 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> {
                    let content = write_to_obkv(
                        &content,
                        vector_content.as_ref(),
                        embedder_actions,
                        &mut new_fields_ids_map,
                        &mut document_buffer,
                    )?;

                    self.document_sender.uncompressed(docid, external_docid, content).unwrap();
                }
                DocumentChange::Insertion(insertion) => {
@ -146,6 +156,7 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> {
                    let content = write_to_obkv(
                        &content,
                        inserted_vectors.as_ref(),
                        embedder_actions,
                        &mut new_fields_ids_map,
                        &mut document_buffer,
                    )?;
@ -158,3 +169,144 @@ impl<'extractor> Extractor<'extractor> for DocumentsExtractor<'_, '_> {
        Ok(())
    }
}

pub struct SettingsChangeDocumentExtractor<'a, 'b> {
    document_sender: DocumentsSender<'a, 'b>,
    embedder_actions: &'a BTreeMap<String, EmbedderAction>,
}

impl<'a, 'b> SettingsChangeDocumentExtractor<'a, 'b> {
    pub fn new(
        document_sender: DocumentsSender<'a, 'b>,
        embedder_actions: &'a BTreeMap<String, EmbedderAction>,
    ) -> Self {
        Self { document_sender, embedder_actions }
    }
}

impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeDocumentExtractor<'_, '_> {
    type Data = FullySend<()>;

    fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
        Ok(FullySend(()))
    }

    fn process<'doc>(
        &self,
        documents: impl Iterator<Item = Result<DocumentIdentifiers<'doc>>>,
        context: &DocumentContext<Self::Data>,
    ) -> Result<()> {
        let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc);

        for document in documents {
            let document = document?;
            // **WARNING**: the exclusive borrow on `new_fields_ids_map` needs to be taken **inside** of the `for change in changes` loop
            // Otherwise, `BorrowMutError` will occur for document changes that also need the new_fields_ids_map (e.g.: UpdateByFunction)
            let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();

            let external_docid = document.external_document_id().to_owned();
            let content =
                document.current(&context.rtxn, context.index, &context.db_fields_ids_map)?;
            let vector_content = document.current_vectors(
                &context.rtxn,
                context.index,
                &context.db_fields_ids_map,
                &context.doc_alloc,
            )?;

            // if the document doesn't need to be updated, we skip it
            if !must_update_document(&vector_content, self.embedder_actions)? {
                continue;
            }

            let content = write_to_obkv(
                &content,
                Some(&vector_content),
                self.embedder_actions,
                &mut new_fields_ids_map,
                &mut document_buffer,
            )?;

            self.document_sender.uncompressed(document.docid(), external_docid, content).unwrap();
        }

        Ok(())
    }
}

/// Modify the database documents based on the settings changes.
///
/// This function extracts the documents from the database,
/// modifies them by adding or removing vector fields based on embedder actions,
/// and then updates the database.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents::extract")]
pub fn update_database_documents<'indexer, 'extractor, MSP, SD>(
    documents: &'indexer DocumentsIndentifiers<'indexer>,
    indexing_context: IndexingContext<MSP>,
    extractor_sender: &ExtractorBbqueueSender,
    settings_delta: &SD,
    extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
) -> Result<()>
where
    MSP: Fn() -> bool + Sync,
    SD: SettingsDelta,
{
    if !must_update_database(settings_delta) {
        return Ok(());
    }

    let document_sender = extractor_sender.documents();
    let document_extractor =
        SettingsChangeDocumentExtractor::new(document_sender, settings_delta.embedder_actions());
    let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());

    settings_change_extract(
        documents,
        &document_extractor,
        indexing_context,
        extractor_allocs,
        &datastore,
        crate::update::new::steps::IndexingStep::ExtractingDocuments,
    )?;

    Ok(())
}

fn must_update_database<SD: SettingsDelta>(settings_delta: &SD) -> bool {
    settings_delta.embedder_actions().iter().any(|(name, action)| {
        if action.reindex().is_some() {
            // if action has a reindex, we need to update the documents database if the embedder is a new one
            settings_delta.old_embedders().get(name).is_none()
        } else {
            // if action has a write_back, we need to update the documents database
            action.write_back().is_some()
        }
    })
}

fn must_update_document<'s, 'a>(
    vector_document: &'s impl VectorDocument<'s>,
    embedder_actions: &'a BTreeMap<String, EmbedderAction>,
) -> Result<bool>
where
    's: 'a,
{
    // Check if any vector needs to be written back for the document
    for (name, action) in embedder_actions {
        // if the vector entry is not found, we don't need to update the document
        let Some(vector_entry) = vector_document.vectors_for_key(name)? else {
            continue;
        };

        // if the vector entry is user provided, we need to update the document by writing back vectors.
        let write_back = action.write_back().is_some() && !vector_entry.regenerate;
        // if the vector entry is a new embedder, we need to update the document removing the vectors from the document.
        let new_embedder = action.reindex().is_some() && !vector_entry.has_configured_embedder;

        if write_back || new_embedder {
            return Ok(true);
        }
    }

    Ok(false)
}
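// --- editorial sketch (not part of this change) ----------------------------
// The two predicates above, condensed into one rule (function and parameter
// names are hypothetical): a reindex action only forces a documents-database
// update for a brand-new embedder, while a write-back action always does.
fn needs_document_update(reindex: bool, embedder_is_new: bool, write_back: bool) -> bool {
    if reindex {
        embedder_is_new
    } else {
        write_back
    }
}
// ----------------------------------------------------------------------------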
@ -15,9 +15,10 @@ use crate::filterable_attributes_rules::match_faceted_field;
use crate::heed_codec::facet::OrderedF64Codec;
use crate::update::del_add::DelAdd;
use crate::update::new::channel::FieldIdDocidFacetSender;
use crate::update::new::document::DocumentContext;
use crate::update::new::extract::perm_json_p;
use crate::update::new::indexer::document_changes::{
    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
    extract, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
@ -51,7 +52,7 @@ impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'_, '_> {
    fn process<'doc>(
        &self,
        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
        context: &DocumentChangeContext<Self::Data>,
        context: &DocumentContext<Self::Data>,
    ) -> Result<()> {
        for change in changes {
            let change = change?;
@ -75,7 +76,7 @@ pub struct FacetedDocidsExtractor;
impl FacetedDocidsExtractor {
    #[allow(clippy::too_many_arguments)]
    fn extract_document_change(
        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
        context: &DocumentContext<RefCell<BalancedCaches>>,
        filterable_attributes: &[FilterableAttributesRule],
        sortable_fields: &HashSet<String>,
        asc_desc_fields: &HashSet<String>,
@ -10,8 +10,8 @@ use serde_json::value::RawValue;
use serde_json::Value;

use crate::error::GeoError;
use crate::update::new::document::Document;
use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor};
use crate::update::new::document::{Document, DocumentContext};
use crate::update::new::indexer::document_changes::Extractor;
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::thread_local::MostlySend;
use crate::update::new::DocumentChange;
@ -150,7 +150,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor {
    fn process<'doc>(
        &'doc self,
        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
        context: &'doc DocumentChangeContext<Self::Data>,
        context: &'doc DocumentContext<Self::Data>,
    ) -> Result<()> {
        let rtxn = &context.rtxn;
        let index = context.index;
@ -12,7 +12,7 @@ pub use documents::*;
pub use faceted::*;
pub use geo::*;
pub use searchable::*;
pub use vectors::EmbeddingExtractor;
pub use vectors::{EmbeddingExtractor, SettingsChangeEmbeddingExtractor};

/// TODO move in permissive json pointer
pub mod perm_json_p {
@ -8,10 +8,11 @@ use bumpalo::Bump;

use super::match_searchable_field;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::update::new::document::DocumentContext;
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::extract::perm_json_p::contained_in;
use crate::update::new::indexer::document_changes::{
    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
    extract, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
@ -226,7 +227,7 @@ impl<'extractor> Extractor<'extractor> for WordDocidsExtractorData<'_> {
    fn process<'doc>(
        &self,
        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
        context: &DocumentChangeContext<Self::Data>,
        context: &DocumentContext<Self::Data>,
    ) -> Result<()> {
        for change in changes {
            let change = change?;
@ -305,7 +306,7 @@ impl WordDocidsExtractors {
    }

    fn extract_document_change(
        context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
        context: &DocumentContext<RefCell<Option<WordDocidsBalancedCaches>>>,
        document_tokenizer: &DocumentTokenizer,
        searchable_attributes: Option<&[&str]>,
        document_change: DocumentChange,
@ -7,10 +7,10 @@ use bumpalo::Bump;
use super::match_searchable_field;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::proximity::{index_proximity, MAX_DISTANCE};
use crate::update::new::document::Document;
use crate::update::new::document::{Document, DocumentContext};
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::indexer::document_changes::{
    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
    extract, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
@ -39,7 +39,7 @@ impl<'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractorData<
    fn process<'doc>(
        &self,
        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
        context: &DocumentChangeContext<Self::Data>,
        context: &DocumentContext<Self::Data>,
    ) -> Result<()> {
        for change in changes {
            let change = change?;
@ -116,7 +116,7 @@ impl WordPairProximityDocidsExtractor {
    // and to store the docids of the documents that have a number of words in a given field
    // equal to or under than MAX_COUNTED_WORDS.
    fn extract_document_change(
        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
        context: &DocumentContext<RefCell<BalancedCaches>>,
        document_tokenizer: &DocumentTokenizer,
        searchable_attributes: Option<&[&str]>,
        document_change: DocumentChange,

(File diff suppressed because it is too large.)
@ -3,100 +3,18 @@ use std::sync::atomic::Ordering;
use std::sync::{Arc, RwLock};

use bumpalo::Bump;
use heed::{RoTxn, WithoutTls};
use rayon::iter::IndexedParallelIterator;

use super::super::document_change::DocumentChange;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::progress::{AtomicDocumentStep, Progress};
use crate::update::new::document::DocumentContext;
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::GrenadParameters;
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result};

pub struct DocumentChangeContext<
    'doc,             // covariant lifetime of a single `process` call
    'extractor: 'doc, // invariant lifetime of the extractor_allocs
    'fid: 'doc,       // invariant lifetime of the new_fields_ids_map
    'indexer: 'doc,   // covariant lifetime of objects that outlive a single `process` call
    T: MostlySend,
> {
    /// The index we're indexing in
    pub index: &'indexer Index,
    /// The fields ids map as it was at the start of this indexing process. Contains at least all top-level fields from documents
    /// inside of the DB.
    pub db_fields_ids_map: &'indexer FieldsIdsMap,
    /// A transaction providing data from the DB before all indexing operations
    pub rtxn: RoTxn<'indexer, WithoutTls>,

    /// Global field id map that is up to date with the current state of the indexing process.
    ///
    /// - Inserting a field will take a lock
    /// - Retrieving a field may take a lock as well
    pub new_fields_ids_map: &'doc std::cell::RefCell<GlobalFieldsIdsMap<'fid>>,

    /// Data allocated in this allocator is cleared between each call to `process`.
    pub doc_alloc: Bump,

    /// Data allocated in this allocator is not cleared between each call to `process`, unless the data spills.
    pub extractor_alloc: &'extractor Bump,

    /// Pool of doc allocators, used to retrieve the doc allocator we provided for the documents
    doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>,

    /// Extractor-specific data
    pub data: &'doc T,
}

impl<
        'doc,             // covariant lifetime of a single `process` call
        'data: 'doc,      // invariant on T lifetime of the datastore
        'extractor: 'doc, // invariant lifetime of extractor_allocs
        'fid: 'doc,       // invariant lifetime of fields ids map
        'indexer: 'doc,   // covariant lifetime of objects that survive a `process` call
        T: MostlySend,
    > DocumentChangeContext<'doc, 'extractor, 'fid, 'indexer, T>
{
    #[allow(clippy::too_many_arguments)]
    pub fn new<F>(
        index: &'indexer Index,
        db_fields_ids_map: &'indexer FieldsIdsMap,
        new_fields_ids_map: &'fid RwLock<FieldIdMapWithMetadata>,
        extractor_allocs: &'extractor ThreadLocal<FullySend<Bump>>,
        doc_allocs: &'doc ThreadLocal<FullySend<Cell<Bump>>>,
        datastore: &'data ThreadLocal<T>,
        fields_ids_map_store: &'doc ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
        init_data: F,
    ) -> Result<Self>
    where
        F: FnOnce(&'extractor Bump) -> Result<T>,
    {
        let doc_alloc =
            doc_allocs.get_or(|| FullySend(Cell::new(Bump::with_capacity(1024 * 1024))));
        let doc_alloc = doc_alloc.0.take();
        let fields_ids_map = fields_ids_map_store
            .get_or(|| RefCell::new(GlobalFieldsIdsMap::new(new_fields_ids_map)).into());

        let fields_ids_map = &fields_ids_map.0;
        let extractor_alloc = extractor_allocs.get_or_default();

        let data = datastore.get_or_try(move || init_data(&extractor_alloc.0))?;

        let txn = index.read_txn()?;
        Ok(DocumentChangeContext {
            index,
            rtxn: txn,
            db_fields_ids_map,
            new_fields_ids_map: fields_ids_map,
            doc_alloc,
            extractor_alloc: &extractor_alloc.0,
            data,
            doc_allocs,
        })
    }
}

/// An internal iterator (i.e. using `foreach`) of `DocumentChange`s
pub trait Extractor<'extractor>: Sync {
    type Data: MostlySend;
@ -106,7 +24,7 @@ pub trait Extractor<'extractor>: Sync {
    fn process<'doc>(
        &'doc self,
        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
        context: &'doc DocumentChangeContext<Self::Data>,
        context: &'doc DocumentContext<Self::Data>,
    ) -> Result<()>;
}

@ -125,7 +43,7 @@ pub trait DocumentChanges<'pl // lifetime of the underlying payload
    fn item_to_document_change<'doc, // lifetime of a single `process` call
        T: MostlySend>(
        &'doc self,
        context: &'doc DocumentChangeContext<T>,
        context: &'doc DocumentContext<T>,
        item: &'doc Self::Item,
    ) -> Result<Option<DocumentChange<'doc>>> where 'pl: 'doc // the payload must survive the process calls
    ;
@ -224,7 +142,7 @@ where
    let pi = document_changes.iter(CHUNK_SIZE);
    pi.try_arc_for_each_try_init(
        || {
            DocumentChangeContext::new(
            DocumentContext::new(
                index,
                db_fields_ids_map,
                new_fields_ids_map,
@ -4,10 +4,11 @@ use rayon::iter::IndexedParallelIterator;
use rayon::slice::ParallelSlice as _;
use roaring::RoaringBitmap;

use super::document_changes::{DocumentChangeContext, DocumentChanges};
use super::document_changes::DocumentChanges;
use crate::documents::PrimaryKey;
use crate::update::new::document::DocumentContext;
use crate::update::new::thread_local::MostlySend;
use crate::update::new::{Deletion, DocumentChange};
use crate::update::new::{DocumentChange, DocumentIdentifiers};
use crate::{DocumentId, Result};

#[derive(Default)]
@ -58,7 +59,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {
        T: MostlySend,
    >(
        &'doc self,
        context: &'doc DocumentChangeContext<T>,
        context: &'doc DocumentContext<T>,
        docid: &'doc Self::Item,
    ) -> Result<Option<DocumentChange<'doc>>>
    where
@ -74,7 +75,10 @@ impl<'pl> DocumentChanges<'pl> for DocumentDeletionChanges<'pl> {

        let external_document_id = external_document_id.to_bump(&context.doc_alloc);

        Ok(Some(DocumentChange::Deletion(Deletion::create(*docid, external_document_id))))
        Ok(Some(DocumentChange::Deletion(DocumentIdentifiers::create(
            *docid,
            external_document_id,
        ))))
    }

    fn len(&self) -> usize {
@ -93,9 +97,8 @@ mod test {
    use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
    use crate::index::tests::TempIndex;
    use crate::progress::Progress;
    use crate::update::new::indexer::document_changes::{
        extract, DocumentChangeContext, Extractor, IndexingContext,
    };
    use crate::update::new::document::DocumentContext;
    use crate::update::new::indexer::document_changes::{extract, Extractor, IndexingContext};
    use crate::update::new::indexer::DocumentDeletion;
    use crate::update::new::steps::IndexingStep;
    use crate::update::new::thread_local::{MostlySend, ThreadLocal};
@ -125,7 +128,7 @@ mod test {
        fn process<'doc>(
            &self,
            changes: impl Iterator<Item = crate::Result<DocumentChange<'doc>>>,
            context: &DocumentChangeContext<Self::Data>,
            context: &DocumentContext<Self::Data>,
        ) -> crate::Result<()> {
            for change in changes {
                let change = change?;
@ -12,14 +12,14 @@ use serde_json::value::RawValue;
use serde_json::Deserializer;

use super::super::document_change::DocumentChange;
use super::document_changes::{DocumentChangeContext, DocumentChanges};
use super::document_changes::DocumentChanges;
use super::guess_primary_key::retrieve_or_guess_primary_key;
use crate::documents::PrimaryKey;
use crate::progress::{AtomicPayloadStep, Progress};
use crate::update::new::document::Versions;
use crate::update::new::document::{DocumentContext, Versions};
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::MostlySend;
use crate::update::new::{Deletion, Insertion, Update};
use crate::update::new::{DocumentIdentifiers, Insertion, Update};
use crate::update::{AvailableIds, IndexDocumentsMethod};
use crate::{DocumentId, Error, FieldsIdsMap, Index, InternalError, Result, UserError};

@ -411,7 +411,7 @@ impl<'pl> DocumentChanges<'pl> for DocumentOperationChanges<'pl> {

    fn item_to_document_change<'doc, T: MostlySend + 'doc>(
        &'doc self,
        context: &'doc DocumentChangeContext<T>,
        context: &'doc DocumentContext<T>,
        item: &'doc Self::Item,
    ) -> Result<Option<DocumentChange<'doc>>>
    where
@ -577,7 +577,7 @@ impl<'pl> PayloadOperations<'pl> {
        if self.is_new {
            Ok(None)
        } else {
            let deletion = Deletion::create(self.docid, external_doc);
            let deletion = DocumentIdentifiers::create(self.docid, external_doc);
            Ok(Some(DocumentChange::Deletion(deletion)))
        }
    }
@ -12,14 +12,18 @@ use super::super::steps::IndexingStep;
|
||||
use super::super::thread_local::{FullySend, ThreadLocal};
|
||||
use super::super::FacetFieldIdsDelta;
|
||||
use super::document_changes::{extract, DocumentChanges, IndexingContext};
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::progress::MergingWordCache;
|
||||
use super::settings_changes::settings_change_extract;
|
||||
use crate::documents::{FieldIdMapper, PrimaryKey};
|
||||
use crate::progress::{EmbedderStats, MergingWordCache};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::new::extract::EmbeddingExtractor;
|
||||
use crate::update::new::indexer::settings_changes::DocumentsIndentifiers;
|
||||
use crate::update::new::merger::merge_and_send_rtree;
|
||||
use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases};
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
use crate::update::settings::SettingsDelta;
|
||||
use crate::vector::db::{EmbedderInfo, IndexEmbeddingConfig};
|
||||
use crate::vector::RuntimeEmbedders;
|
||||
use crate::{Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
|
||||
@ -27,13 +31,14 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
|
||||
indexing_context: IndexingContext<MSP>,
|
||||
indexer_span: Span,
|
||||
extractor_sender: ExtractorBbqueueSender,
|
||||
embedders: &EmbeddingConfigs,
|
||||
embedders: &RuntimeEmbedders,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_extraction: &AtomicBool,
|
||||
field_distribution: &mut BTreeMap<String, u64>,
|
||||
mut index_embeddings: Vec<IndexEmbeddingConfig>,
|
||||
document_ids: &mut RoaringBitmap,
|
||||
modified_docids: &mut RoaringBitmap,
|
||||
embedder_stats: &EmbedderStats,
|
||||
) -> Result<(FacetFieldIdsDelta, Vec<IndexEmbeddingConfig>)>
|
||||
where
|
||||
DC: DocumentChanges<'pl>,
|
||||
@ -245,6 +250,7 @@ where
|
||||
embedders,
|
||||
embedding_sender,
|
||||
field_distribution,
|
||||
embedder_stats,
|
||||
request_threads(),
|
||||
);
|
||||
let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
|
||||
@ -265,14 +271,19 @@ where
|
||||
let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors");
|
||||
let _entered = span.enter();
|
||||
|
||||
let embedder_configs = index.embedding_configs();
|
||||
for config in &mut index_embeddings {
|
||||
let mut infos = embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap();
|
||||
|
||||
'data: for data in datastore.iter_mut() {
|
||||
let data = &mut data.get_mut().0;
|
||||
let Some(deladd) = data.remove(&config.name) else {
|
||||
let Some(delta) = data.remove(&config.name) else {
|
||||
continue 'data;
|
||||
};
|
||||
deladd.apply_to(&mut config.user_provided, modified_docids);
|
||||
delta.apply_to(&mut infos.embedding_status);
|
||||
}
|
||||
|
||||
extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -312,6 +323,122 @@ where
|
||||
Result::Ok((facet_field_ids_delta, index_embeddings))
|
||||
}

#[allow(clippy::too_many_arguments)]
pub(super) fn extract_all_settings_changes<MSP, SD>(
    indexing_context: IndexingContext<MSP>,
    indexer_span: Span,
    extractor_sender: ExtractorBbqueueSender,
    settings_delta: &SD,
    extractor_allocs: &mut ThreadLocal<FullySend<Bump>>,
    finished_extraction: &AtomicBool,
    field_distribution: &mut BTreeMap<String, u64>,
    mut index_embeddings: Vec<IndexEmbeddingConfig>,
    embedder_stats: &EmbedderStats,
) -> Result<Vec<IndexEmbeddingConfig>>
where
    MSP: Fn() -> bool + Sync,
    SD: SettingsDelta + Sync,
{
    // Create the list of document ids to extract
    let rtxn = indexing_context.index.read_txn()?;
    let all_document_ids =
        indexing_context.index.documents_ids(&rtxn)?.into_iter().collect::<Vec<_>>();
    let primary_key =
        primary_key_from_db(indexing_context.index, &rtxn, &indexing_context.db_fields_ids_map)?;
    let documents = DocumentsIndentifiers::new(&all_document_ids, primary_key);

    let span =
        tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "extract");
    let _entered = span.enter();

    update_database_documents(
        &documents,
        indexing_context,
        &extractor_sender,
        settings_delta,
        extractor_allocs,
    )?;

    'vectors: {
        if settings_delta.embedder_actions().is_empty() {
            break 'vectors;
        }

        let embedding_sender = extractor_sender.embeddings();

        // extract the remaining embeddings
        let extractor = SettingsChangeEmbeddingExtractor::new(
            settings_delta,
            embedder_stats,
            embedding_sender,
            field_distribution,
            request_threads(),
        );
        let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
        {
            let span = tracing::debug_span!(target: "indexing::documents::extract", "vectors");
            let _entered = span.enter();

            settings_change_extract(
                &documents,
                &extractor,
                indexing_context,
                extractor_allocs,
                &datastore,
                IndexingStep::ExtractingEmbeddings,
            )?;
        }
        {
            let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors");
            let _entered = span.enter();

            let embedder_configs = indexing_context.index.embedding_configs();
            for config in &mut index_embeddings {
                // retrieve infos for existing embedder or create a fresh one
                let mut infos =
                    embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap_or_else(|| {
                        let embedder_id =
                            *settings_delta.new_embedder_category_id().get(&config.name).unwrap();
                        EmbedderInfo { embedder_id, embedding_status: Default::default() }
                    });

                'data: for data in datastore.iter_mut() {
                    let data = &mut data.get_mut().0;
                    let Some(delta) = data.remove(&config.name) else {
                        continue 'data;
                    };
                    delta.apply_to(&mut infos.embedding_status);
                }

                extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap();
            }
        }
    }

    indexing_context.progress.update_progress(IndexingStep::WaitingForDatabaseWrites);
    finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed);

    Result::Ok(index_embeddings)
}
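
The merge step above drains each thread-local datastore once per embedder, folds every collected delta into that embedder's EmbedderInfo, and only then sends the final status over the channel. Below is a minimal sketch of the same fold, under stated assumptions: the StatusDelta type and its apply_to shape are illustrative stand-ins, not the crate's real EmbeddingStatusDelta.

use std::collections::HashMap;

/// Stand-in for the per-embedder delta one extraction thread collects.
#[derive(Default)]
struct StatusDelta {
    added_docids: Vec<u32>,
}

impl StatusDelta {
    fn apply_to(&self, status: &mut Vec<u32>) {
        status.extend_from_slice(&self.added_docids);
    }
}

/// Folds every thread's delta for `embedder` into a single status.
fn merge_deltas(
    thread_datastores: &mut [HashMap<String, StatusDelta>],
    embedder: &str,
    status: &mut Vec<u32>,
) {
    for datastore in thread_datastores.iter_mut() {
        // `remove` drains the delta so it is applied exactly once.
        let Some(delta) = datastore.remove(embedder) else { continue };
        delta.apply_to(status);
    }
}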

fn primary_key_from_db<'indexer>(
    index: &'indexer Index,
    rtxn: &'indexer heed::RoTxn<'_>,
    fields: &'indexer impl FieldIdMapper,
) -> Result<PrimaryKey<'indexer>> {
    let Some(primary_key) = index.primary_key(rtxn)? else {
        return Err(InternalError::DatabaseMissingEntry {
            db_name: crate::index::db_name::MAIN,
            key: Some(crate::index::main_key::PRIMARY_KEY_KEY),
        }
        .into());
    };
    let Some(primary_key) = PrimaryKey::new(primary_key, fields) else {
        unreachable!("Primary key must exist at this point");
    };
    Ok(primary_key)
}

fn request_threads() -> &'static ThreadPoolNoAbort {
    static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();

@ -1,5 +1,6 @@
use std::collections::BTreeMap;
use std::sync::atomic::AtomicBool;
use std::sync::{Once, RwLock};
use std::sync::{Arc, Once, RwLock};
use std::thread::{self, Builder};

use big_s::S;
@ -19,9 +20,11 @@ use super::steps::IndexingStep;
use super::thread_local::ThreadLocal;
use crate::documents::PrimaryKey;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::progress::Progress;
use crate::progress::{EmbedderStats, Progress};
use crate::update::settings::SettingsDelta;
use crate::update::GrenadParameters;
use crate::vector::{ArroyWrapper, EmbeddingConfigs};
use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments};
use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders};
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort};

pub(crate) mod de;
@ -32,6 +35,7 @@ mod extract;
mod guess_primary_key;
mod partial_dump;
mod post_processing;
pub mod settings_changes;
mod update_by_function;
mod write;

@ -40,8 +44,6 @@ static LOG_MEMORY_METRICS_ONCE: Once = Once::new();
/// This is the main function of this crate.
///
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
///
/// TODO return stats
#[allow(clippy::too_many_arguments)] // clippy: 😝
pub fn index<'pl, 'indexer, 'index, DC, MSP>(
    wtxn: &mut RwTxn,
@ -52,9 +54,10 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>(
    new_fields_ids_map: FieldsIdsMap,
    new_primary_key: Option<PrimaryKey<'pl>>,
    document_changes: &DC,
    embedders: EmbeddingConfigs,
    embedders: RuntimeEmbedders,
    must_stop_processing: &'indexer MSP,
    progress: &'indexer Progress,
    embedder_stats: &'indexer EmbedderStats,
) -> Result<ChannelCongestion>
where
    DC: DocumentChanges<'pl>,
@ -65,48 +68,8 @@ where

    let arroy_memory = grenad_parameters.max_memory;

    // We reduce the actual memory used to 5%. The reason we do this here and not in Meilisearch
    // is because we still use the old indexer for the settings and it is highly impacted by the
    // max memory. So we keep the changes here and will remove these changes once we use the new
    // indexer to also index settings. Related to #5125 and #5141.
    let grenad_parameters = GrenadParameters {
        max_memory: grenad_parameters.max_memory.map(|mm| mm * 5 / 100),
        ..grenad_parameters
    };

    // 5% of the allocated memory for the extractors, or min 100MiB
    // 5% of the allocated memory for the bbqueues, or min 50MiB
    //
    // Minimum capacity for bbqueues
    let minimum_total_bbbuffer_capacity = 50 * 1024 * 1024 * pool.current_num_threads(); // 50 MiB
    let minimum_total_extractors_capacity = minimum_total_bbbuffer_capacity * 2;

    let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or(
        (
            GrenadParameters {
                max_memory: Some(minimum_total_extractors_capacity),
                ..grenad_parameters
            },
            minimum_total_bbbuffer_capacity,
        ), // 100 MiB per thread by default
        |max_memory| {
            let total_bbbuffer_capacity = max_memory.max(minimum_total_bbbuffer_capacity);
            let new_grenad_parameters = GrenadParameters {
                max_memory: Some(max_memory.max(minimum_total_extractors_capacity)),
                ..grenad_parameters
            };
            (new_grenad_parameters, total_bbbuffer_capacity)
        },
    );

    LOG_MEMORY_METRICS_ONCE.call_once(|| {
        tracing::debug!(
            "Indexation allocated memory metrics - \
            Total BBQueue size: {total_bbbuffer_capacity}, \
            Total extractor memory: {:?}",
            grenad_parameters.max_memory,
        );
    });
    let (grenad_parameters, total_bbbuffer_capacity) =
        indexer_memory_settings(pool.current_num_threads(), grenad_parameters);

    let (extractor_sender, writer_receiver) = pool
        .install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000))
@ -130,7 +93,7 @@ where
        grenad_parameters: &grenad_parameters,
    };

    let index_embeddings = index.embedding_configs(wtxn)?;
    let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?;
    let mut field_distribution = index.field_distribution(wtxn)?;
    let mut document_ids = index.documents_ids(wtxn)?;
    let mut modified_docids = roaring::RoaringBitmap::new();
@ -158,6 +121,7 @@ where
                    index_embeddings,
                    document_ids,
                    modified_docids,
                    embedder_stats,
                )
            })
            .unwrap()
@ -169,20 +133,21 @@ where
    let arroy_writers: Result<HashMap<_, _>> = embedders
        .inner_as_ref()
        .iter()
        .map(|(embedder_name, (embedder, _, was_quantized))| {
            let embedder_index = index.embedder_category_id.get(wtxn, embedder_name)?.ok_or(
                InternalError::DatabaseMissingEntry {
        .map(|(embedder_name, runtime)| {
            let embedder_index = index
                .embedding_configs()
                .embedder_id(wtxn, embedder_name)?
                .ok_or(InternalError::DatabaseMissingEntry {
                    db_name: "embedder_category_id",
                    key: None,
                },
            )?;
                })?;

            let dimensions = embedder.dimensions();
            let writer = ArroyWrapper::new(vector_arroy, embedder_index, *was_quantized);
            let dimensions = runtime.embedder.dimensions();
            let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized);

            Ok((
                embedder_index,
                (embedder_name.as_str(), embedder.as_ref(), writer, dimensions),
                (embedder_name.as_str(), &*runtime.embedder, writer, dimensions),
            ))
        })
        .collect();
@ -206,6 +171,7 @@ where
                index_embeddings,
                arroy_memory,
                &mut arroy_writers,
                None,
                &indexing_context.must_stop_processing,
            )
        })
@ -239,3 +205,275 @@ where

    Ok(congestion)
}

#[allow(clippy::too_many_arguments)]
pub fn reindex<'indexer, 'index, MSP, SD>(
    wtxn: &mut RwTxn<'index>,
    index: &'index Index,
    pool: &ThreadPoolNoAbort,
    grenad_parameters: GrenadParameters,
    settings_delta: &'indexer SD,
    must_stop_processing: &'indexer MSP,
    progress: &'indexer Progress,
    embedder_stats: Arc<EmbedderStats>,
) -> Result<ChannelCongestion>
where
    MSP: Fn() -> bool + Sync,
    SD: SettingsDelta + Sync,
{
    delete_old_embedders_and_fragments(wtxn, index, settings_delta)?;

    let mut bbbuffers = Vec::new();
    let finished_extraction = AtomicBool::new(false);

    let arroy_memory = grenad_parameters.max_memory;

    let (grenad_parameters, total_bbbuffer_capacity) =
        indexer_memory_settings(pool.current_num_threads(), grenad_parameters);

    let (extractor_sender, writer_receiver) = pool
        .install(|| extractor_writer_bbqueue(&mut bbbuffers, total_bbbuffer_capacity, 1000))
        .unwrap();

    let mut extractor_allocs = ThreadLocal::with_capacity(rayon::current_num_threads());

    let db_fields_ids_map = index.fields_ids_map(wtxn)?;
    let new_fields_ids_map = settings_delta.new_fields_ids_map().clone();
    let new_fields_ids_map = RwLock::new(new_fields_ids_map);
    let fields_ids_map_store = ThreadLocal::with_capacity(rayon::current_num_threads());
    let doc_allocs = ThreadLocal::with_capacity(rayon::current_num_threads());

    let indexing_context = IndexingContext {
        index,
        db_fields_ids_map: &db_fields_ids_map,
        new_fields_ids_map: &new_fields_ids_map,
        doc_allocs: &doc_allocs,
        fields_ids_map_store: &fields_ids_map_store,
        must_stop_processing,
        progress,
        grenad_parameters: &grenad_parameters,
    };

    let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?;
    let mut field_distribution = index.field_distribution(wtxn)?;

    let congestion = thread::scope(|s| -> Result<ChannelCongestion> {
        let indexer_span = tracing::Span::current();
        let finished_extraction = &finished_extraction;
        // prevent moving the field_distribution and document_ids in the inner closure...
        let field_distribution = &mut field_distribution;
        let extractor_handle =
            Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
                pool.install(move || {
                    extract::extract_all_settings_changes(
                        indexing_context,
                        indexer_span,
                        extractor_sender,
                        settings_delta,
                        &mut extractor_allocs,
                        finished_extraction,
                        field_distribution,
                        index_embeddings,
                        &embedder_stats,
                    )
                })
                .unwrap()
            })?;

        let new_embedders = settings_delta.new_embedders();
        let embedder_actions = settings_delta.embedder_actions();
        let index_embedder_category_ids = settings_delta.new_embedder_category_id();
        let mut arroy_writers = arroy_writers_from_embedder_actions(
            index,
            embedder_actions,
            new_embedders,
            index_embedder_category_ids,
        )?;

        let congestion =
            write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?;

        indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);

        let index_embeddings = extractor_handle.join().unwrap()?;

        indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase);

        pool.install(|| {
            build_vectors(
                index,
                wtxn,
                indexing_context.progress,
                index_embeddings,
                arroy_memory,
                &mut arroy_writers,
                Some(embedder_actions),
                &indexing_context.must_stop_processing,
            )
        })
        .unwrap()?;

        indexing_context.progress.update_progress(IndexingStep::Finalizing);

        Ok(congestion) as Result<_>
    })?;

    // required to into_inner the new_fields_ids_map
    drop(fields_ids_map_store);

    let new_fields_ids_map = new_fields_ids_map.into_inner().unwrap();
    let document_ids = index.documents_ids(wtxn)?;
    update_index(
        index,
        wtxn,
        new_fields_ids_map,
        None,
        settings_delta.new_embedders().clone(),
        field_distribution,
        document_ids,
    )?;

    Ok(congestion)
}

fn arroy_writers_from_embedder_actions<'indexer>(
    index: &Index,
    embedder_actions: &'indexer BTreeMap<String, EmbedderAction>,
    embedders: &'indexer RuntimeEmbedders,
    index_embedder_category_ids: &'indexer std::collections::HashMap<String, u8>,
) -> Result<HashMap<u8, (&'indexer str, &'indexer Embedder, ArroyWrapper, usize)>> {
    let vector_arroy = index.vector_arroy;

    embedders
        .inner_as_ref()
        .iter()
        .filter_map(|(embedder_name, runtime)| match embedder_actions.get(embedder_name) {
            None => None,
            Some(action) if action.write_back().is_some() => None,
            Some(action) => {
                let Some(&embedder_category_id) = index_embedder_category_ids.get(embedder_name)
                else {
                    return Some(Err(crate::error::Error::InternalError(
                        crate::InternalError::DatabaseMissingEntry {
                            db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
                            key: None,
                        },
                    )));
                };
                let writer =
                    ArroyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized);
                let dimensions = runtime.embedder.dimensions();
                Some(Ok((
                    embedder_category_id,
                    (embedder_name.as_str(), runtime.embedder.as_ref(), writer, dimensions),
                )))
            }
        })
        .collect()
}

fn delete_old_embedders_and_fragments<SD>(
    wtxn: &mut RwTxn<'_>,
    index: &Index,
    settings_delta: &SD,
) -> Result<()>
where
    SD: SettingsDelta,
{
    for action in settings_delta.embedder_actions().values() {
        let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else {
            continue;
        };
        let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized);
        let Some(dimensions) = reader.dimensions(wtxn)? else {
            continue;
        };
        reader.clear(wtxn, dimensions)?;
    }

    // remove all vectors for the specified fragments
    for (embedder_name, RemoveFragments { fragment_ids }, was_quantized) in
        settings_delta.embedder_actions().iter().filter_map(|(name, action)| {
            action.remove_fragments().map(|fragments| (name, fragments, action.was_quantized))
        })
    {
        let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else {
            continue;
        };
        let arroy = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, was_quantized);
        let Some(dimensions) = arroy.dimensions(wtxn)? else {
            continue;
        };
        for fragment_id in fragment_ids {
            // we must keep the user provided embeddings that ended up in this store

            if infos.embedding_status.user_provided_docids().is_empty() {
                // no user provided: clear store
                arroy.clear_store(wtxn, *fragment_id, dimensions)?;
                continue;
            }

            // some user provided, remove only the ids that are not user provided
            let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| {
                items - infos.embedding_status.user_provided_docids()
            })?;

            for to_delete in to_delete {
                arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?;
            }
        }
    }

    Ok(())
}
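
The retention rule in the fragment loop above is a plain set difference: every item currently in the fragment's store is deleted except the user-provided document ids. In isolation, with illustrative values:

use roaring::RoaringBitmap;

fn main() {
    let items_in_store: RoaringBitmap = [1u32, 2, 3, 4].into_iter().collect();
    let user_provided: RoaringBitmap = [2u32, 4].into_iter().collect();

    // Same shape as the closure passed to `items_in_store` above:
    // whatever is in the store but not user-provided gets deleted.
    let to_delete = &items_in_store - &user_provided;
    assert!(to_delete.iter().eq([1u32, 3]));
}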

fn indexer_memory_settings(
    current_num_threads: usize,
    grenad_parameters: GrenadParameters,
) -> (GrenadParameters, usize) {
    // We reduce the actual memory used to 5%. The reason we do this here and not in Meilisearch
    // is because we still use the old indexer for the settings and it is highly impacted by the
    // max memory. So we keep the changes here and will remove these changes once we use the new
    // indexer to also index settings. Related to #5125 and #5141.
    let grenad_parameters = GrenadParameters {
        max_memory: grenad_parameters.max_memory.map(|mm| mm * 5 / 100),
        ..grenad_parameters
    };

    // 5% of the allocated memory for the extractors, or min 100MiB
    // 5% of the allocated memory for the bbqueues, or min 50MiB
    //
    // Minimum capacity for bbqueues
    let minimum_total_bbbuffer_capacity = 50 * 1024 * 1024 * current_num_threads; // 50 MiB
    let minimum_total_extractors_capacity = minimum_total_bbbuffer_capacity * 2;

    let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or(
        (
            GrenadParameters {
                max_memory: Some(minimum_total_extractors_capacity),
                ..grenad_parameters
            },
            minimum_total_bbbuffer_capacity,
        ), // 100 MiB per thread by default
        |max_memory| {
            let total_bbbuffer_capacity = max_memory.max(minimum_total_bbbuffer_capacity);
            let new_grenad_parameters = GrenadParameters {
                max_memory: Some(max_memory.max(minimum_total_extractors_capacity)),
                ..grenad_parameters
            };
            (new_grenad_parameters, total_bbbuffer_capacity)
        },
    );

    LOG_MEMORY_METRICS_ONCE.call_once(|| {
        tracing::debug!(
            "Indexation allocated memory metrics - \
            Total BBQueue size: {total_bbbuffer_capacity}, \
            Total extractor memory: {:?}",
            grenad_parameters.max_memory,
        );
    });

    (grenad_parameters, total_bbbuffer_capacity)
}
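
As a worked example of the sizing above (hypothetical figures, not taken from the diff): with 8 threads and a 2 GiB budget, the 5% scaling falls below both floors, so the minimums win.

fn main() {
    let threads = 8;
    let max_memory: usize = 2 * 1024 * 1024 * 1024; // 2 GiB budget

    // Same arithmetic as `indexer_memory_settings`.
    let scaled = max_memory * 5 / 100; // 107_374_182 B, about 102 MiB
    let min_bbqueues = 50 * 1024 * 1024 * threads; // 400 MiB
    let min_extractors = min_bbqueues * 2; // 800 MiB

    assert_eq!(scaled.max(min_bbqueues), min_bbqueues); // bbqueue capacity
    assert_eq!(scaled.max(min_extractors), min_extractors); // extractor memory
}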

@ -5,10 +5,10 @@ use rayon::iter::IndexedParallelIterator;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue;

use super::document_changes::{DocumentChangeContext, DocumentChanges};
use super::document_changes::DocumentChanges;
use crate::documents::PrimaryKey;
use crate::update::concurrent_available_ids::ConcurrentAvailableIds;
use crate::update::new::document::Versions;
use crate::update::new::document::{DocumentContext, Versions};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::thread_local::MostlySend;
use crate::update::new::{DocumentChange, Insertion};
@ -55,7 +55,7 @@ where

    fn item_to_document_change<'doc, T: MostlySend + 'doc>(
        &'doc self,
        context: &'doc DocumentChangeContext<T>,
        context: &'doc DocumentContext<T>,
        document: &'doc Self::Item,
    ) -> Result<Option<DocumentChange<'doc>>>
    where

crates/milli/src/update/new/indexer/settings_changes.rs (new file, +146)
@ -0,0 +1,146 @@

use std::sync::atomic::Ordering;
use std::sync::Arc;

use bumpalo::Bump;
use rayon::iter::IndexedParallelIterator;
use rayon::slice::ParallelSlice;

use super::document_changes::IndexingContext;
use crate::documents::PrimaryKey;
use crate::progress::AtomicDocumentStep;
use crate::update::new::document::{DocumentContext, DocumentIdentifiers};
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::{DocumentId, InternalError, Result};

/// An internal iterator (i.e. using `foreach`) of `DocumentChange`s
pub trait SettingsChangeExtractor<'extractor>: Sync {
    type Data: MostlySend;

    fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> Result<Self::Data>;

    fn process<'doc>(
        &'doc self,
        documents: impl Iterator<Item = Result<DocumentIdentifiers<'doc>>>,
        context: &'doc DocumentContext<Self::Data>,
    ) -> Result<()>;
}
pub struct DocumentsIndentifiers<'indexer> {
    documents: &'indexer [DocumentId],
    primary_key: PrimaryKey<'indexer>,
}

impl<'indexer> DocumentsIndentifiers<'indexer> {
    pub fn new(documents: &'indexer [DocumentId], primary_key: PrimaryKey<'indexer>) -> Self {
        Self { documents, primary_key }
    }

    fn iter(&self, chunk_size: usize) -> impl IndexedParallelIterator<Item = &[DocumentId]> {
        self.documents.par_chunks(chunk_size)
    }

    fn item_to_database_document<
        'doc, // lifetime of a single `process` call
        T: MostlySend,
    >(
        &'doc self,
        context: &'doc DocumentContext<T>,
        docid: &'doc DocumentId,
    ) -> Result<Option<DocumentIdentifiers<'doc>>> {
        let current = context.index.document(&context.rtxn, *docid)?;

        let external_document_id = self.primary_key.extract_docid_from_db(
            current,
            &context.db_fields_ids_map,
            &context.doc_alloc,
        )?;

        let external_document_id = external_document_id.to_bump(&context.doc_alloc);

        Ok(Some(DocumentIdentifiers::create(*docid, external_document_id)))
    }

    fn len(&self) -> usize {
        self.documents.len()
    }
}

const CHUNK_SIZE: usize = 100;

pub fn settings_change_extract<
    'extractor, // invariant lifetime of extractor_alloc
    'fid,       // invariant lifetime of fields ids map
    'indexer,   // covariant lifetime of objects that are borrowed during the entire indexing
    'data,      // invariant on EX::Data lifetime of datastore
    'index,     // covariant lifetime of the index
    EX: SettingsChangeExtractor<'extractor>,
    MSP: Fn() -> bool + Sync,
>(
    documents: &'indexer DocumentsIndentifiers<'indexer>,
    extractor: &EX,
    IndexingContext {
        index,
        db_fields_ids_map,
        new_fields_ids_map,
        doc_allocs,
        fields_ids_map_store,
        must_stop_processing,
        progress,
        grenad_parameters: _,
    }: IndexingContext<'fid, 'indexer, 'index, MSP>,
    extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
    datastore: &'data ThreadLocal<EX::Data>,
    step: IndexingStep,
) -> Result<()> {
    tracing::trace!("We are resetting the extractor allocators");
    progress.update_progress(step);
    // Clean up and reuse the extractor allocs
    for extractor_alloc in extractor_allocs.iter_mut() {
        tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes());
        extractor_alloc.0.reset();
    }

    let total_documents = documents.len() as u32;
    let (step, progress_step) = AtomicDocumentStep::new(total_documents);
    progress.update_progress(progress_step);

    let pi = documents.iter(CHUNK_SIZE);
    pi.try_arc_for_each_try_init(
        || {
            DocumentContext::new(
                index,
                db_fields_ids_map,
                new_fields_ids_map,
                extractor_allocs,
                doc_allocs,
                datastore,
                fields_ids_map_store,
                move |index_alloc| extractor.init_data(index_alloc),
            )
        },
        |context, items| {
            if (must_stop_processing)() {
                return Err(Arc::new(InternalError::AbortedIndexation.into()));
            }

            // Clean up and reuse the document-specific allocator
            context.doc_alloc.reset();

            let documents = items
                .iter()
                .filter_map(|item| documents.item_to_database_document(context, item).transpose());

            let res = extractor.process(documents, context).map_err(Arc::new);
            step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);

            // send back the doc_alloc in the pool
            context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));

            res
        },
    )?;
    step.store(total_documents, Ordering::Relaxed);

    Ok(())
}
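
For orientation, a trivial SettingsChangeExtractor implementation could look like the sketch below. It is hypothetical: it assumes DocumentContext exposes the per-thread data as a `data` field, as the document-change extractors do, and that RefCell<usize> satisfies the MostlySend bound (a FullySend wrapper may be needed otherwise).

/// Counts the documents handed to each thread; illustration only.
struct CountingExtractor;

impl<'extractor> SettingsChangeExtractor<'extractor> for CountingExtractor {
    type Data = std::cell::RefCell<usize>;

    fn init_data<'doc>(&'doc self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
        Ok(std::cell::RefCell::new(0))
    }

    fn process<'doc>(
        &'doc self,
        documents: impl Iterator<Item = Result<DocumentIdentifiers<'doc>>>,
        context: &'doc DocumentContext<Self::Data>,
    ) -> Result<()> {
        for document in documents {
            let _identifiers = document?;
            *context.data.borrow_mut() += 1;
        }
        Ok(())
    }
}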

@ -5,15 +5,14 @@ use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};
use roaring::RoaringBitmap;
use rustc_hash::FxBuildHasher;

use super::document_changes::DocumentChangeContext;
use super::DocumentChanges;
use crate::documents::Error::InvalidDocumentFormat;
use crate::documents::PrimaryKey;
use crate::error::{FieldIdMapMissingEntry, InternalError};
use crate::update::new::document::Versions;
use crate::update::new::document::{DocumentContext, Versions};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::thread_local::MostlySend;
use crate::update::new::{Deletion, DocumentChange, KvReaderFieldId, Update};
use crate::update::new::{DocumentChange, DocumentIdentifiers, KvReaderFieldId, Update};
use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};

pub struct UpdateByFunction {
@ -86,13 +85,13 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {

    fn item_to_document_change<'doc, T: MostlySend + 'doc>(
        &self,
        context: &'doc DocumentChangeContext<T>,
        context: &'doc DocumentContext<T>,
        docid: &'doc Self::Item,
    ) -> Result<Option<DocumentChange<'doc>>>
    where
        'index: 'doc,
    {
        let DocumentChangeContext {
        let DocumentContext {
            index,
            db_fields_ids_map,
            rtxn: txn,
@ -128,10 +127,9 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {

        match scope.remove::<Dynamic>("doc") {
            // If the "doc" variable has been set to (), we effectively delete the document.
            Some(doc) if doc.is_unit() => Ok(Some(DocumentChange::Deletion(Deletion::create(
                docid,
                doc_alloc.alloc_str(&document_id),
            )))),
            Some(doc) if doc.is_unit() => Ok(Some(DocumentChange::Deletion(
                DocumentIdentifiers::create(docid, doc_alloc.alloc_str(&document_id)),
            ))),
            None => unreachable!("missing doc variable from the Rhai scope"),
            Some(new_document) => match new_document.try_cast() {
                Some(new_rhai_document) => {
@ -1,3 +1,4 @@
use std::collections::BTreeMap;
use std::sync::atomic::AtomicBool;

use bstr::ByteSlice as _;
@ -10,10 +11,11 @@ use super::super::channel::*;
use crate::database_stats::DatabaseStats;
use crate::documents::PrimaryKey;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::index::IndexEmbeddingConfig;
use crate::progress::Progress;
use crate::update::settings::InnerIndexSettings;
use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings};
use crate::vector::db::IndexEmbeddingConfig;
use crate::vector::settings::EmbedderAction;
use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders};
use crate::{Error, Index, InternalError, Result, UserError};

pub fn write_to_db(
@ -62,6 +64,14 @@ pub fn write_to_db(
                writer.del_items(wtxn, *dimensions, docid)?;
                writer.add_items(wtxn, docid, &embeddings)?;
            }
            ReceiverAction::LargeVector(
                large_vector @ LargeVector { docid, embedder_id, extractor_id, .. },
            ) => {
                let (_, _, writer, dimensions) =
                    arroy_writers.get(&embedder_id).expect("requested a missing embedder");
                let embedding = large_vector.read_embedding(*dimensions);
                writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
            }
        }

        // Every time there is a message in the channel we search
@ -99,6 +109,7 @@ impl ChannelCongestion {
}

#[tracing::instrument(level = "debug", skip_all, target = "indexing::vectors")]
#[allow(clippy::too_many_arguments)]
pub fn build_vectors<MSP>(
    index: &Index,
    wtxn: &mut RwTxn<'_>,
@ -106,6 +117,7 @@ pub fn build_vectors<MSP>(
    index_embeddings: Vec<IndexEmbeddingConfig>,
    arroy_memory: Option<usize>,
    arroy_writers: &mut HashMap<u8, (&str, &Embedder, ArroyWrapper, usize)>,
    embeder_actions: Option<&BTreeMap<String, EmbedderAction>>,
    must_stop_processing: &MSP,
) -> Result<()>
where
@ -117,20 +129,23 @@ where

    let seed = rand::random();
    let mut rng = rand::rngs::StdRng::seed_from_u64(seed);
    for (_index, (_embedder_name, _embedder, writer, dimensions)) in arroy_writers {
    for (_index, (embedder_name, _embedder, writer, dimensions)) in arroy_writers {
        let dimensions = *dimensions;
        let is_being_quantized = embeder_actions
            .and_then(|actions| actions.get(*embedder_name).map(|action| action.is_being_quantized))
            .unwrap_or(false);
        writer.build_and_quantize(
            wtxn,
            progress,
            &mut rng,
            dimensions,
            false,
            is_being_quantized,
            arroy_memory,
            must_stop_processing,
        )?;
    }

    index.put_embedding_configs(wtxn, index_embeddings)?;
    index.embedding_configs().put_embedding_configs(wtxn, index_embeddings)?;
    Ok(())
}

@ -140,7 +155,7 @@ pub(super) fn update_index(
    wtxn: &mut RwTxn<'_>,
    new_fields_ids_map: FieldIdMapWithMetadata,
    new_primary_key: Option<PrimaryKey<'_>>,
    embedders: EmbeddingConfigs,
    embedders: RuntimeEmbedders,
    field_distribution: std::collections::BTreeMap<String, u64>,
    document_ids: roaring::RoaringBitmap,
) -> Result<()> {
@ -219,14 +234,36 @@ pub fn write_from_bbqueue(
                    arroy_writers.get(&embedder_id).expect("requested a missing embedder");
                let mut embeddings = Embeddings::new(*dimensions);
                let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding);
                if embeddings.append(all_embeddings.to_vec()).is_err() {
                    return Err(Error::UserError(UserError::InvalidVectorDimensions {
                        expected: *dimensions,
                        found: all_embeddings.len(),
                    }));
                }
                writer.del_items(wtxn, *dimensions, docid)?;
                writer.add_items(wtxn, docid, &embeddings)?;
                if !all_embeddings.is_empty() {
                    if embeddings.append(all_embeddings.to_vec()).is_err() {
                        return Err(Error::UserError(UserError::InvalidVectorDimensions {
                            expected: *dimensions,
                            found: all_embeddings.len(),
                        }));
                    }
                    writer.add_items(wtxn, docid, &embeddings)?;
                }
            }
            EntryHeader::ArroySetVector(
                asv @ ArroySetVector { docid, embedder_id, extractor_id, .. },
            ) => {
                let frame = frame_with_header.frame();
                let (_, _, writer, dimensions) =
                    arroy_writers.get(&embedder_id).expect("requested a missing embedder");
                let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding);

                if embedding.is_empty() {
                    writer.del_item_in_store(wtxn, docid, extractor_id, *dimensions)?;
                } else {
                    if embedding.len() != *dimensions {
                        return Err(Error::UserError(UserError::InvalidVectorDimensions {
                            expected: *dimensions,
                            found: embedding.len(),
                        }));
                    }
                    writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?;
                }
            }
        }
    }
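
The ArroySetVector arm above encodes deletion as an empty payload: no floats means the item is removed from the store, and anything else must match the embedder's dimension count before being written. Restated in isolation (illustrative, with a local error type):

enum SetVector {
    Delete,
    Add,
}

fn classify(embedding: &[f32], dimensions: usize) -> Result<SetVector, String> {
    if embedding.is_empty() {
        Ok(SetVector::Delete) // an empty payload acts as a tombstone
    } else if embedding.len() != dimensions {
        Err(format!("expected {dimensions} dimensions, found {}", embedding.len()))
    } else {
        Ok(SetVector::Add)
    }
}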

@ -1,4 +1,5 @@
pub use document_change::{Deletion, DocumentChange, Insertion, Update};
pub use document::DocumentIdentifiers;
pub use document_change::{DocumentChange, Insertion, Update};
pub use indexer::ChannelCongestion;
pub use merger::{
    merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases, FacetFieldIdsDelta,

@ -12,9 +12,9 @@ use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions};
use super::indexer::de::DeserrRawValue;
use crate::constants::RESERVED_VECTORS_FIELD_NAME;
use crate::documents::FieldIdMapper;
use crate::index::IndexEmbeddingConfig;
use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig};
use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors};
use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs};
use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders};
use crate::{DocumentId, Index, InternalError, Result, UserError};

#[derive(Serialize)]
@ -109,7 +109,7 @@ impl<'t> VectorDocumentFromDb<'t> {
            None => None,
        };

        let embedding_config = index.embedding_configs(rtxn)?;
        let embedding_config = index.embedding_configs().embedding_configs(rtxn)?;

        Ok(Some(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc }))
    }
@ -118,6 +118,7 @@ impl<'t> VectorDocumentFromDb<'t> {
        &self,
        embedder_id: u8,
        config: &IndexEmbeddingConfig,
        status: &EmbeddingStatus,
    ) -> Result<VectorEntry<'t>> {
        let reader =
            ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized());
@ -126,7 +127,7 @@ impl<'t> VectorDocumentFromDb<'t> {
        Ok(VectorEntry {
            has_configured_embedder: true,
            embeddings: Some(Embeddings::FromDb(vectors)),
            regenerate: !config.user_provided.contains(self.docid),
            regenerate: status.must_regenerate(self.docid),
            implicit: false,
        })
    }
@ -137,9 +138,9 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
        self.embedding_config
            .iter()
            .map(|config| {
                let embedder_id =
                    self.index.embedder_category_id.get(self.rtxn, &config.name)?.unwrap();
                let entry = self.entry_from_db(embedder_id, config)?;
                let info =
                    self.index.embedding_configs().embedder_info(self.rtxn, &config.name)?.unwrap();
                let entry = self.entry_from_db(info.embedder_id, config, &info.embedding_status)?;
                let config_name = self.doc_alloc.alloc_str(config.name.as_str());
                Ok((&*config_name, entry))
            })
@ -156,11 +157,11 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> {
    }

    fn vectors_for_key(&self, key: &str) -> Result<Option<VectorEntry<'t>>> {
        Ok(match self.index.embedder_category_id.get(self.rtxn, key)? {
            Some(embedder_id) => {
        Ok(match self.index.embedding_configs().embedder_info(self.rtxn, key)? {
            Some(info) => {
                let config =
                    self.embedding_config.iter().find(|config| config.name == key).unwrap();
                Some(self.entry_from_db(embedder_id, config)?)
                Some(self.entry_from_db(info.embedder_id, config, &info.embedding_status)?)
            }
            None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) {
                Some(embedding_from_doc) => {
@ -222,7 +223,7 @@ fn entry_from_raw_value(
pub struct VectorDocumentFromVersions<'doc> {
    external_document_id: &'doc str,
    vectors: RawMap<'doc, FxBuildHasher>,
    embedders: &'doc EmbeddingConfigs,
    embedders: &'doc RuntimeEmbedders,
}

impl<'doc> VectorDocumentFromVersions<'doc> {
@ -230,7 +231,7 @@ impl<'doc> VectorDocumentFromVersions<'doc> {
        external_document_id: &'doc str,
        versions: &Versions<'doc>,
        bump: &'doc Bump,
        embedders: &'doc EmbeddingConfigs,
        embedders: &'doc RuntimeEmbedders,
    ) -> Result<Option<Self>> {
        let document = DocumentFromVersions::new(versions);
        if let Some(vectors_field) = document.vectors_field()? {
@ -283,7 +284,7 @@ impl<'doc> MergedVectorDocument<'doc> {
        db_fields_ids_map: &'doc Mapper,
        versions: &Versions<'doc>,
        doc_alloc: &'doc Bump,
        embedders: &'doc EmbeddingConfigs,
        embedders: &'doc RuntimeEmbedders,
    ) -> Result<Option<Self>> {
        let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?;
        let new_doc =
@ -295,7 +296,7 @@ impl<'doc> MergedVectorDocument<'doc> {
        external_document_id: &'doc str,
        versions: &Versions<'doc>,
        doc_alloc: &'doc Bump,
        embedders: &'doc EmbeddingConfigs,
        embedders: &'doc RuntimeEmbedders,
    ) -> Result<Option<Self>> {
        let Some(new_doc) =
            VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)?

@ -7,7 +7,6 @@ use std::sync::Arc;
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
use deserr::{DeserializeError, Deserr};
use itertools::{merge_join_by, EitherOrBoth, Itertools};
use roaring::RoaringBitmap;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime;

@ -23,20 +22,28 @@ use crate::error::UserError::{self, InvalidChatSettingsDocumentTemplateMaxBytes}
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::filterable_attributes_rules::match_faceted_field;
use crate::index::{
    ChatConfig, IndexEmbeddingConfig, PrefixSearch, SearchParameters,
    DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
    ChatConfig, PrefixSearch, SearchParameters, DEFAULT_MIN_WORD_LEN_ONE_TYPO,
    DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
};
use crate::order_by_map::OrderByMap;
use crate::progress::{EmbedderStats, Progress};
use crate::prompt::{default_max_bytes, default_template_text, PromptData};
use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::new::indexer::reindex;
use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::vector::db::{FragmentConfigs, IndexEmbeddingConfig};
use crate::vector::json_template::JsonTemplate;
use crate::vector::settings::{
    EmbedderAction, EmbedderSource, EmbeddingSettings, NestingContext, ReindexAction,
    SubEmbeddingSettings, WriteBackToDocuments,
    EmbedderAction, EmbedderSource, EmbeddingSettings, EmbeddingValidationContext, NestingContext,
    ReindexAction, SubEmbeddingSettings, WriteBackToDocuments,
};
use crate::vector::{
    Embedder, EmbeddingConfig, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment,
};
use crate::{
    ChannelCongestion, FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result,
};
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
use crate::{FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result};

#[derive(Debug, Clone, PartialEq, Eq, Copy)]
pub enum Setting<T> {
@ -466,7 +473,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {

    #[tracing::instrument(
        level = "trace",
        skip(self, progress_callback, should_abort, settings_diff),
        skip(self, progress_callback, should_abort, settings_diff, embedder_stats),
        target = "indexing::documents"
    )]
    fn reindex<FP, FA>(
@ -474,6 +481,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
        progress_callback: &FP,
        should_abort: &FA,
        settings_diff: InnerIndexSettingsDiff,
        embedder_stats: &Arc<EmbedderStats>,
    ) -> Result<()>
    where
        FP: Fn(UpdateIndexingStep) + Sync,
@ -505,6 +513,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
            IndexDocumentsConfig::default(),
            &progress_callback,
            &should_abort,
            embedder_stats,
        )?;

        indexing_builder.execute_raw(output)?;
@ -545,10 +554,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
        match self.searchable_fields {
            Setting::Set(ref fields) => {
                // Check to see if the searchable fields changed before doing anything else
                let old_fields = self.index.searchable_fields(self.wtxn)?;
                let old_fields = self.index.user_defined_searchable_fields(self.wtxn)?;
                let did_change = {
                    let new_fields = fields.iter().map(String::as_str).collect::<Vec<_>>();
                    new_fields != old_fields
                    old_fields.is_none_or(|old| new_fields != old)
                };
                if !did_change {
                    return Ok(false);
@ -1037,22 +1046,27 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
        match std::mem::take(&mut self.embedder_settings) {
            Setting::Set(configs) => self.update_embedding_configs_set(configs),
            Setting::Reset => {
                let embedders = self.index.embedding_configs();
                // all vectors should be written back to documents
                let old_configs = self.index.embedding_configs(self.wtxn)?;
                let old_configs = embedders.embedding_configs(self.wtxn)?;
                let remove_all: Result<BTreeMap<String, EmbedderAction>> = old_configs
                    .into_iter()
                    .map(|IndexEmbeddingConfig { name, config, user_provided }| -> Result<_> {
                        let embedder_id =
                            self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
                                crate::InternalError::DatabaseMissingEntry {
                                    db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
                                    key: None,
                                },
                            )?;
                    .map(|IndexEmbeddingConfig { name, config, fragments: _ }| -> Result<_> {
                        let embedder_info = embedders.embedder_info(self.wtxn, &name)?.ok_or(
                            crate::InternalError::DatabaseMissingEntry {
                                db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
                                key: None,
                            },
                        )?;
                        Ok((
                            name,
                            EmbedderAction::with_write_back(
                                WriteBackToDocuments { embedder_id, user_provided },
                                WriteBackToDocuments {
                                    embedder_id: embedder_info.embedder_id,
                                    user_provided: embedder_info
                                        .embedding_status
                                        .into_user_provided(),
                                },
                                config.quantized(),
                            ),
                        ))
@ -1062,7 +1076,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
                let remove_all = remove_all?;

                self.index.embedder_category_id.clear(self.wtxn)?;
                self.index.delete_embedding_configs(self.wtxn)?;
                embedders.delete_embedding_configs(self.wtxn)?;
                Ok(remove_all)
            }
            Setting::NotSet => Ok(Default::default()),
@ -1074,12 +1088,12 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
        configs: BTreeMap<String, Setting<EmbeddingSettings>>,
    ) -> Result<BTreeMap<String, EmbedderAction>> {
        use crate::vector::settings::SettingsDiff;

        let old_configs = self.index.embedding_configs(self.wtxn)?;
        let old_configs: BTreeMap<String, (EmbeddingSettings, RoaringBitmap)> = old_configs
        let embedders = self.index.embedding_configs();
        let old_configs = embedders.embedding_configs(self.wtxn)?;
        let old_configs: BTreeMap<String, (EmbeddingSettings, FragmentConfigs)> = old_configs
            .into_iter()
            .map(|IndexEmbeddingConfig { name, config, user_provided }| {
                (name, (config.into(), user_provided))
            .map(|IndexEmbeddingConfig { name, config, fragments }| {
                (name, (config.into(), fragments))
            })
            .collect();
        let mut updated_configs = BTreeMap::new();
@ -1090,71 +1104,111 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
        {
            match joined {
                // updated config
                EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => {
                EitherOrBoth::Both((name, (old, mut fragments)), (_, new)) => {
                    let was_quantized = old.binary_quantized.set().unwrap_or_default();
                    let settings_diff = SettingsDiff::from_settings(&name, old, new)?;
                    match settings_diff {
                        SettingsDiff::Remove => {
                            let info = embedders.remove_embedder(self.wtxn, &name)?.ok_or(
                                crate::InternalError::DatabaseMissingEntry {
                                    db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
                                    key: None,
                                },
                            )?;
                            tracing::debug!(
                                embedder = name,
                                user_provided = user_provided.len(),
                                user_provided = info.embedding_status.user_provided_docids().len(),
                                "removing embedder"
                            );
                            let embedder_id =
                                self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
                                    crate::InternalError::DatabaseMissingEntry {
                                        db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
                                        key: None,
                                    },
                                )?;
                            // free id immediately
                            self.index.embedder_category_id.delete(self.wtxn, &name)?;
                            embedder_actions.insert(
                                name,
                                EmbedderAction::with_write_back(
                                    WriteBackToDocuments { embedder_id, user_provided },
                                    WriteBackToDocuments {
                                        embedder_id: info.embedder_id,
                                        user_provided: info.embedding_status.into_user_provided(),
                                    },
                                    was_quantized,
                                ),
                            );
                        }
                        SettingsDiff::Reindex { action, updated_settings, quantize } => {
                            tracing::debug!(
                                embedder = name,
                                user_provided = user_provided.len(),
                                ?action,
                                "reindex embedder"
                            );
                            embedder_actions.insert(
                                name.clone(),
                            let mut remove_fragments = None;
                            let updated_settings = Setting::Set(updated_settings);
                            if let ReindexAction::RegenerateFragments(regenerate_fragments) =
                                &action
                            {
                                let it = regenerate_fragments
                                    .iter()
                                    .filter(|(_, action)| {
                                        matches!(
                                            action,
                                            crate::vector::settings::RegenerateFragment::Remove
                                        )
                                    })
                                    .map(|(name, _)| name.as_str());

                                remove_fragments = fragments.remove_fragments(it);

                                let it = regenerate_fragments
                                    .iter()
                                    .filter(|(_, action)| {
                                        matches!(
                                            action,
                                            crate::vector::settings::RegenerateFragment::Add
                                        )
                                    })
                                    .map(|(name, _)| name.clone());
                                fragments.add_new_fragments(it)?;
                            } else {
                                // needs full reindex of fragments
                                fragments = FragmentConfigs::new();
                                fragments.add_new_fragments(
                                    crate::vector::settings::fragments_from_settings(
                                        &updated_settings,
                                    ),
                                )?;
                            }
                            tracing::debug!(embedder = name, ?action, "reindex embedder");

                            let embedder_action =
                                EmbedderAction::with_reindex(action, was_quantized)
                                    .with_is_being_quantized(quantize),
                            );
                            let new =
                                validate_embedding_settings(Setting::Set(updated_settings), &name)?;
                            updated_configs.insert(name, (new, user_provided));
                                    .with_is_being_quantized(quantize);

                            let embedder_action = if let Some(remove_fragments) = remove_fragments {
                                embedder_action.with_remove_fragments(remove_fragments)
                            } else {
                                embedder_action
                            };

                            embedder_actions.insert(name.clone(), embedder_action);
                            let new = validate_embedding_settings(
                                updated_settings,
                                &name,
                                EmbeddingValidationContext::FullSettings,
                            )?;
                            updated_configs.insert(name, (new, fragments));
                        }
                        SettingsDiff::UpdateWithoutReindex { updated_settings, quantize } => {
                            tracing::debug!(
                                embedder = name,
                                user_provided = user_provided.len(),
                                "update without reindex embedder"
                            );
                            let new =
                                validate_embedding_settings(Setting::Set(updated_settings), &name)?;
                            tracing::debug!(embedder = name, "update without reindex embedder");
                            let new = validate_embedding_settings(
                                Setting::Set(updated_settings),
                                &name,
                                EmbeddingValidationContext::FullSettings,
                            )?;
                            if quantize {
                                embedder_actions.insert(
                                    name.clone(),
                                    EmbedderAction::default().with_is_being_quantized(true),
                                );
                            }
                            updated_configs.insert(name, (new, user_provided));
                            updated_configs.insert(name, (new, fragments));
                        }
                    }
                }
                // unchanged config
                EitherOrBoth::Left((name, (setting, user_provided))) => {
                EitherOrBoth::Left((name, (setting, fragments))) => {
                    tracing::debug!(embedder = name, "unchanged embedder");
                    updated_configs.insert(name, (Setting::Set(setting), user_provided));
                    updated_configs.insert(name, (Setting::Set(setting), fragments));
                }
                // new config
                EitherOrBoth::Right((name, mut setting)) => {
@ -1164,52 +1218,51 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
                    crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
                        &mut setting,
                    );
                    let setting = validate_embedding_settings(setting, &name)?;
                    let setting = validate_embedding_settings(
                        setting,
                        &name,
                        EmbeddingValidationContext::FullSettings,
                    )?;
                    embedder_actions.insert(
                        name.clone(),
                        EmbedderAction::with_reindex(ReindexAction::FullReindex, false),
                    );
                    updated_configs.insert(name, (setting, RoaringBitmap::new()));
                    let mut fragments = FragmentConfigs::new();
                    fragments.add_new_fragments(
                        crate::vector::settings::fragments_from_settings(&setting),
                    )?;
                    updated_configs.insert(name, (setting, fragments));
                }
            }
        }
        let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize];
        for res in self.index.embedder_category_id.iter(self.wtxn)? {
            let (_name, id) = res?;
            free_indices[id as usize] = false;
        }
        let mut free_indices = free_indices.iter_mut().enumerate();
        let mut find_free_index =
            move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8);
        for (name, action) in embedder_actions.iter() {
            // ignore actions that are not possible for a new embedder
            if matches!(action.reindex(), Some(ReindexAction::FullReindex))
                && self.index.embedder_category_id.get(self.wtxn, name)?.is_none()
            {
                let id =
                    find_free_index().ok_or(UserError::TooManyEmbedders(updated_configs.len()))?;
                tracing::debug!(embedder = name, id, "assigning free id to new embedder");
                self.index.embedder_category_id.put(self.wtxn, name, &id)?;
            }
        }
        embedders.add_new_embedders(
            self.wtxn,
            embedder_actions
                .iter()
                // ignore actions that are not possible for a new embedder, most critically deleted embedders
                .filter(|(_, action)| matches!(action.reindex(), Some(ReindexAction::FullReindex)))
                .map(|(name, _)| name.as_str()),
            updated_configs.len(),
        )?;

        let updated_configs: Vec<IndexEmbeddingConfig> = updated_configs
            .into_iter()
            .filter_map(|(name, (config, user_provided))| match config {
            .filter_map(|(name, (config, fragments))| match config {
                Setting::Set(config) => {
                    Some(IndexEmbeddingConfig { name, config: config.into(), user_provided })
                    Some(IndexEmbeddingConfig { name, config: config.into(), fragments })
                }
                Setting::Reset => None,
                Setting::NotSet => Some(IndexEmbeddingConfig {
                    name,
                    config: EmbeddingSettings::default().into(),
                    user_provided,
                    fragments: Default::default(),
                }),
            })
            .collect();
        if updated_configs.is_empty() {
            self.index.delete_embedding_configs(self.wtxn)?;
            embedders.delete_embedding_configs(self.wtxn)?;
        } else {
            self.index.put_embedding_configs(self.wtxn, updated_configs)?;
            embedders.put_embedding_configs(self.wtxn, updated_configs)?;
        }
        Ok(embedder_actions)
    }
@ -1355,7 +1408,12 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
        }
    }

    pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()>
    pub fn legacy_execute<FP, FA>(
        mut self,
        progress_callback: FP,
        should_abort: FA,
        embedder_stats: Arc<EmbedderStats>,
    ) -> Result<()>
    where
        FP: Fn(UpdateIndexingStep) + Sync,
        FA: Fn() -> bool + Sync,
@ -1413,11 +1471,113 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
        );

        if inner_settings_diff.any_reindexing_needed() {
            self.reindex(&progress_callback, &should_abort, inner_settings_diff)?;
            self.reindex(&progress_callback, &should_abort, inner_settings_diff, &embedder_stats)?;
        }

        Ok(())
    }

    pub fn execute<'indexer, MSP>(
        mut self,
        must_stop_processing: &'indexer MSP,
        progress: &'indexer Progress,
        embedder_stats: Arc<EmbedderStats>,
    ) -> Result<Option<ChannelCongestion>>
    where
        MSP: Fn() -> bool + Sync,
    {
        // force the old indexer if the environment says so
        if self.indexer_config.experimental_no_edition_2024_for_settings {
            return self
                .legacy_execute(
                    |indexing_step| tracing::debug!(update = ?indexing_step),
                    must_stop_processing,
                    embedder_stats,
                )
                .map(|_| None);
        }

        // only use the new indexer when only the embedder possibly changed
        if let Self {
            searchable_fields: Setting::NotSet,
            displayed_fields: Setting::NotSet,
            filterable_fields: Setting::NotSet,
            sortable_fields: Setting::NotSet,
            criteria: Setting::NotSet,
            stop_words: Setting::NotSet,
            non_separator_tokens: Setting::NotSet,
            separator_tokens: Setting::NotSet,
            dictionary: Setting::NotSet,
            distinct_field: Setting::NotSet,
            synonyms: Setting::NotSet,
            primary_key: Setting::NotSet,
            authorize_typos: Setting::NotSet,
            min_word_len_two_typos: Setting::NotSet,
            min_word_len_one_typo: Setting::NotSet,
            exact_words: Setting::NotSet,
            exact_attributes: Setting::NotSet,
            max_values_per_facet: Setting::NotSet,
            sort_facet_values_by: Setting::NotSet,
            pagination_max_total_hits: Setting::NotSet,
            proximity_precision: Setting::NotSet,
            embedder_settings: _,
            search_cutoff: Setting::NotSet,
            localized_attributes_rules: Setting::NotSet,
            prefix_search: Setting::NotSet,
            facet_search: Setting::NotSet,
            disable_on_numbers: Setting::NotSet,
            chat: Setting::NotSet,
            wtxn: _,
            index: _,
            indexer_config: _,
        } = &self
        {
            self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;

            let old_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?;

            // Update index settings
            let embedding_config_updates = self.update_embedding_configs()?;

            let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?;

            let primary_key_id = self
                .index
                .primary_key(self.wtxn)?
                .and_then(|name| new_inner_settings.fields_ids_map.id(name));
            let settings_update_only = true;
            let inner_settings_diff = InnerIndexSettingsDiff::new(
                old_inner_settings,
                new_inner_settings,
                primary_key_id,
                embedding_config_updates,
                settings_update_only,
            );

            if self.index.number_of_documents(self.wtxn)? > 0 {
                reindex(
                    self.wtxn,
                    self.index,
                    &self.indexer_config.thread_pool,
                    self.indexer_config.grenad_parameters(),
                    &inner_settings_diff,
                    must_stop_processing,
                    progress,
                    embedder_stats,
                )
                .map(Some)
            } else {
                Ok(None)
            }
        } else {
            self.legacy_execute(
                |indexing_step| tracing::debug!(update = ?indexing_step),
                must_stop_processing,
                embedder_stats,
            )
            .map(|_| None)
        }
    }
}
||||
|
||||
pub struct InnerIndexSettingsDiff {
|
||||
@ -1429,6 +1589,7 @@ pub struct InnerIndexSettingsDiff {
|
||||
/// The set of only the additional searchable fields.
|
||||
/// If any other searchable field has been modified, is set to None.
|
||||
pub(crate) only_additional_fields: Option<HashSet<String>>,
|
||||
fragment_diffs: BTreeMap<String, Vec<(Option<usize>, usize)>>,
|
||||
|
||||
// Cache the check to see if all the stop_words, allowed_separators, dictionary,
|
||||
// exact_attributes, proximity_precision are different.
|
||||
@ -1497,13 +1658,13 @@ impl InnerIndexSettingsDiff {
|
||||
|
||||
// if the user-defined searchables changed, then we need to reindex prompts.
|
||||
if cache_user_defined_searchables {
|
||||
for (embedder_name, (config, _, _quantized)) in
|
||||
new_settings.embedding_configs.inner_as_ref()
|
||||
{
|
||||
let was_quantized =
|
||||
old_settings.embedding_configs.get(embedder_name).is_some_and(|conf| conf.2);
|
||||
for (embedder_name, runtime) in new_settings.runtime_embedders.inner_as_ref() {
|
||||
let was_quantized = old_settings
|
||||
.runtime_embedders
|
||||
.get(embedder_name)
|
||||
.is_some_and(|conf| conf.is_quantized);
|
||||
// skip embedders that don't use document templates
|
||||
if !config.uses_document_template() {
|
||||
if !runtime.embedder.uses_document_template() {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -1516,22 +1677,86 @@ impl InnerIndexSettingsDiff {
|
||||
was_quantized,
|
||||
));
|
||||
}
|
||||
std::collections::btree_map::Entry::Occupied(entry) => {
|
||||
std::collections::btree_map::Entry::Occupied(mut entry) => {
|
||||
// future-proofing, make sure to destructure here so that any new field is taken into account in this case
|
||||
// case in point: adding `remove_fragments` was detected.
|
||||
let EmbedderAction {
|
||||
was_quantized: _,
|
||||
is_being_quantized: _,
|
||||
write_back: _, // We are deleting this embedder, so no point in regeneration
|
||||
reindex: _, // We are already fully reindexing
|
||||
} = entry.get();
|
||||
write_back, // We are deleting this embedder, so no point in regeneration
|
||||
reindex,
|
||||
remove_fragments: _,
|
||||
} = entry.get_mut();
|
||||
|
||||
// fixup reindex to make sure we regenerate all fragments
|
||||
*reindex = match reindex.take() {
|
||||
Some(reindex) => Some(reindex), // We are at least regenerating prompts
|
||||
None => {
|
||||
if write_back.is_none() {
|
||||
Some(ReindexAction::RegeneratePrompts) // quantization case
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
// build the fragment diffs
|
||||
let mut fragment_diffs = BTreeMap::new();
|
||||
for (embedder_name, embedder_action) in &embedding_config_updates {
|
||||
let Some(new_embedder) = new_settings.runtime_embedders.get(embedder_name) else {
|
||||
continue;
|
||||
};
|
||||
let regenerate_fragments =
|
||||
if let Some(ReindexAction::RegenerateFragments(regenerate_fragments)) =
|
||||
embedder_action.reindex()
|
||||
{
|
||||
either::Either::Left(
|
||||
regenerate_fragments
|
||||
.iter()
|
||||
.filter(|(_, action)| {
|
||||
!matches!(
|
||||
action,
|
||||
crate::vector::settings::RegenerateFragment::Remove
|
||||
)
|
||||
})
|
||||
.map(|(name, _)| name),
|
||||
)
|
||||
} else {
|
||||
either::Either::Right(
|
||||
new_embedder.fragments().iter().map(|fragment| &fragment.name),
|
||||
)
|
||||
};
|
||||
|
||||
let old_embedder = old_settings.runtime_embedders.get(embedder_name);
|
||||
|
||||
let mut fragments = Vec::new();
|
||||
for fragment_name in regenerate_fragments {
|
||||
let Ok(new) = new_embedder
|
||||
.fragments()
|
||||
.binary_search_by_key(&fragment_name, |fragment| &fragment.name)
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
let old = old_embedder.as_ref().and_then(|old_embedder| {
|
||||
old_embedder
|
||||
.fragments()
|
||||
.binary_search_by_key(&fragment_name, |fragment| &fragment.name)
|
||||
.ok()
|
||||
});
|
||||
fragments.push((old, new));
|
||||
}
|
||||
fragment_diffs.insert(embedder_name.clone(), fragments);
|
||||
}
|
||||
|
||||
InnerIndexSettingsDiff {
|
||||
old: old_settings,
|
||||
new: new_settings,
|
||||
primary_key_id,
|
||||
fragment_diffs,
|
||||
embedding_config_updates,
|
||||
settings_update_only,
|
||||
only_additional_fields,
|
||||
@ -1676,7 +1901,8 @@ pub(crate) struct InnerIndexSettings {
    pub exact_attributes: HashSet<FieldId>,
    pub disabled_typos_terms: DisabledTyposTerms,
    pub proximity_precision: ProximityPrecision,
-    pub embedding_configs: EmbeddingConfigs,
+    pub runtime_embedders: RuntimeEmbedders,
+    pub embedder_category_id: HashMap<String, u8>,
    pub geo_fields_ids: Option<(FieldId, FieldId)>,
    pub prefix_search: PrefixSearch,
    pub facet_search: bool,

@ -1686,7 +1912,7 @@ impl InnerIndexSettings {
    pub fn from_index(
        index: &Index,
        rtxn: &heed::RoTxn<'_>,
-        embedding_configs: Option<EmbeddingConfigs>,
+        runtime_embedders: Option<RuntimeEmbedders>,
    ) -> Result<Self> {
        let stop_words = index.stop_words(rtxn)?;
        let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());

@ -1695,10 +1921,15 @@ impl InnerIndexSettings {
        let mut fields_ids_map = index.fields_ids_map(rtxn)?;
        let exact_attributes = index.exact_attributes_ids(rtxn)?;
        let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
-        let embedding_configs = match embedding_configs {
+        let runtime_embedders = match runtime_embedders {
            Some(embedding_configs) => embedding_configs,
-            None => embedders(index.embedding_configs(rtxn)?)?,
+            None => embedders(index.embedding_configs().embedding_configs(rtxn)?)?,
        };
+        let embedder_category_id = index
+            .embedding_configs()
+            .iter_embedder_id(rtxn)?
+            .map(|r| r.map(|(k, v)| (k.to_string(), v)))
+            .collect::<heed::Result<_>>()?;
        let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default();
        let facet_search = index.facet_search(rtxn)?;
        let geo_fields_ids = match fields_ids_map.id(RESERVED_GEO_FIELD_NAME) {

@ -1737,7 +1968,8 @@ impl InnerIndexSettings {
            sortable_fields,
            exact_attributes,
            proximity_precision,
-            embedding_configs,
+            runtime_embedders,
+            embedder_category_id,
            geo_fields_ids,
            prefix_search,
            facet_search,

@ -1779,28 +2011,49 @@ impl InnerIndexSettings {
    }
}

-fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<EmbeddingConfigs> {
+fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<RuntimeEmbedders> {
    let res: Result<_> = embedding_configs
        .into_iter()
        .map(
            |IndexEmbeddingConfig {
                 name,
                 config: EmbeddingConfig { embedder_options, prompt, quantized },
-                 ..
+                 fragments,
             }| {
-                let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
+                let document_template = prompt.try_into().map_err(crate::Error::from)?;

-                let embedder = Arc::new(
+                let embedder =
                    // cache_cap: no cache needed for indexing purposes
-                    Embedder::new(embedder_options.clone(), 0)
+                    Arc::new(Embedder::new(embedder_options.clone(), 0)
                        .map_err(crate::vector::Error::from)
-                        .map_err(crate::Error::from)?,
-                );
-                Ok((name, (embedder, prompt, quantized.unwrap_or_default())))
+                        .map_err(crate::Error::from)?);
+
+                let fragments = fragments
+                    .into_inner()
+                    .into_iter()
+                    .map(|fragment| {
+                        let template = JsonTemplate::new(
+                            embedder_options.fragment(&fragment.name).unwrap().clone(),
+                        )
+                        .unwrap();
+
+                        RuntimeFragment { name: fragment.name, id: fragment.id, template }
+                    })
+                    .collect();
+
+                Ok((
+                    name,
+                    Arc::new(RuntimeEmbedder::new(
+                        embedder,
+                        document_template,
+                        fragments,
+                        quantized.unwrap_or_default(),
+                    )),
+                ))
            },
        )
        .collect();
-    res.map(EmbeddingConfigs::new)
+    res.map(RuntimeEmbedders::new)
}
fn validate_prompt(

@ -1837,6 +2090,7 @@ fn validate_prompt(
pub fn validate_embedding_settings(
    settings: Setting<EmbeddingSettings>,
    name: &str,
+    context: EmbeddingValidationContext,
) -> Result<Setting<EmbeddingSettings>> {
    let Setting::Set(settings) = settings else { return Ok(settings) };
    let EmbeddingSettings {

@ -1849,6 +2103,8 @@ pub fn validate_embedding_settings(
        document_template,
        document_template_max_bytes,
        url,
+        indexing_fragments,
+        search_fragments,
        request,
        response,
        search_embedder,

@ -1875,9 +2131,106 @@ pub fn validate_embedding_settings(
        })?;
    }

+    // used below
+    enum WithFragments {
+        Yes {
+            indexing_fragments: BTreeMap<String, serde_json::Value>,
+            search_fragments: BTreeMap<String, serde_json::Value>,
+        },
+        No,
+        Maybe,
+    }
+
+    let with_fragments = {
+        let has_reset = matches!(indexing_fragments, Setting::Reset)
+            || matches!(search_fragments, Setting::Reset);
+        let indexing_fragments: BTreeMap<_, _> = indexing_fragments
+            .as_ref()
+            .set()
+            .iter()
+            .flat_map(|map| map.iter())
+            .filter_map(|(name, fragment)| {
+                Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?))
+            })
+            .collect();
+        let search_fragments: BTreeMap<_, _> = search_fragments
+            .as_ref()
+            .set()
+            .iter()
+            .flat_map(|map| map.iter())
+            .filter_map(|(name, fragment)| {
+                Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?))
+            })
+            .collect();
+
+        let has_fragments = !indexing_fragments.is_empty() || !search_fragments.is_empty();
+
+        if context == EmbeddingValidationContext::FullSettings {
+            let are_fragments_inconsistent =
+                indexing_fragments.is_empty() ^ search_fragments.is_empty();
+            if are_fragments_inconsistent {
+                return Err(crate::vector::error::NewEmbedderError::rest_inconsistent_fragments(
+                    indexing_fragments.is_empty(),
+                    indexing_fragments,
+                    search_fragments,
+                ))
+                .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into());
+            }
+        }
+        if has_fragments {
+            if context == EmbeddingValidationContext::SettingsPartialUpdate
+                && matches!(document_template, Setting::Set(_))
+            {
+                return Err(
+                    crate::vector::error::NewEmbedderError::rest_document_template_and_fragments(
+                        indexing_fragments.len(),
+                        search_fragments.len(),
+                    ),
+                )
+                .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into());
+            }
+            WithFragments::Yes { indexing_fragments, search_fragments }
+        } else if has_reset || context == EmbeddingValidationContext::FullSettings {
+            WithFragments::No
+        } else {
+            // if we are working with partial settings, the user could have changed only the `request` and not given again the fragments
+            WithFragments::Maybe
+        }
+    };
    if let Some(request) = request.as_ref().set() {
-        let request = crate::vector::rest::Request::new(request.to_owned())
-            .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?;
+        let request = match with_fragments {
+            WithFragments::Yes { indexing_fragments, search_fragments } => {
+                crate::vector::rest::RequestData::new(
+                    request.to_owned(),
+                    indexing_fragments,
+                    search_fragments,
+                )
+                .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))
+            }
+            WithFragments::No => crate::vector::rest::RequestData::new(
+                request.to_owned(),
+                Default::default(),
+                Default::default(),
+            )
+            .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())),
+            WithFragments::Maybe => {
+                let mut indexing_fragments = BTreeMap::new();
+                indexing_fragments.insert("test".to_string(), serde_json::json!("test"));
+                crate::vector::rest::RequestData::new(
+                    request.to_owned(),
+                    indexing_fragments,
+                    Default::default(),
+                )
+                .or_else(|_| {
+                    crate::vector::rest::RequestData::new(
+                        request.to_owned(),
+                        Default::default(),
+                        Default::default(),
+                    )
+                })
+                .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))
+            }
+        }?;
        if let Some(response) = response.as_ref().set() {
            crate::vector::rest::Response::new(response.to_owned(), &request)
                .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?;

@ -1896,6 +2249,8 @@ pub fn validate_embedding_settings(
        document_template,
        document_template_max_bytes,
        url,
+        indexing_fragments,
+        search_fragments,
        request,
        response,
        search_embedder,

@ -1915,6 +2270,8 @@ pub fn validate_embedding_settings(
        &dimensions,
        &api_key,
        &url,
+        &indexing_fragments,
+        &search_fragments,
        &request,
        &response,
        &document_template,

@ -1993,6 +2350,8 @@ pub fn validate_embedding_settings(
            &embedder.dimensions,
            &embedder.api_key,
            &embedder.url,
+            &embedder.indexing_fragments,
+            &embedder.search_fragments,
            &embedder.request,
            &embedder.response,
            &embedder.document_template,

@ -2048,6 +2407,8 @@ pub fn validate_embedding_settings(
            &embedder.dimensions,
            &embedder.api_key,
            &embedder.url,
+            &embedder.indexing_fragments,
+            &embedder.search_fragments,
            &embedder.request,
            &embedder.response,
            &embedder.document_template,

@ -2080,6 +2441,8 @@ pub fn validate_embedding_settings(
        document_template,
        document_template_max_bytes,
        url,
+        indexing_fragments,
+        search_fragments,
        request,
        response,
        search_embedder,

@ -2107,6 +2470,81 @@ fn deserialize_sub_embedder(
    }
}

+/// Implement this trait for the settings delta type.
+/// This is used in the new settings update flow and will make it easy to replace the old settings delta type: `InnerIndexSettingsDiff`.
+pub trait SettingsDelta {
+    fn new_embedders(&self) -> &RuntimeEmbedders;
+    fn old_embedders(&self) -> &RuntimeEmbedders;
+    fn new_embedder_category_id(&self) -> &HashMap<String, u8>;
+    fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction>;
+    fn try_for_each_fragment_diff<F, E>(
+        &self,
+        embedder_name: &str,
+        for_each: F,
+    ) -> std::result::Result<(), E>
+    where
+        F: FnMut(FragmentDiff) -> std::result::Result<(), E>;
+    fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
+}
+
+pub struct FragmentDiff<'a> {
+    pub old: Option<&'a RuntimeFragment>,
+    pub new: &'a RuntimeFragment,
+}
+
+impl SettingsDelta for InnerIndexSettingsDiff {
+    fn new_embedders(&self) -> &RuntimeEmbedders {
+        &self.new.runtime_embedders
+    }
+
+    fn old_embedders(&self) -> &RuntimeEmbedders {
+        &self.old.runtime_embedders
+    }
+
+    fn new_embedder_category_id(&self) -> &HashMap<String, u8> {
+        &self.new.embedder_category_id
+    }
+
+    fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction> {
+        &self.embedding_config_updates
+    }
+
+    fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
+        &self.new.fields_ids_map
+    }
+
+    fn try_for_each_fragment_diff<F, E>(
+        &self,
+        embedder_name: &str,
+        mut for_each: F,
+    ) -> std::result::Result<(), E>
+    where
+        F: FnMut(FragmentDiff) -> std::result::Result<(), E>,
+    {
+        let Some(fragment_diff) = self.fragment_diffs.get(embedder_name) else { return Ok(()) };
+        for (old, new) in fragment_diff {
+            let Some(new_runtime) = self.new.runtime_embedders.get(embedder_name) else {
+                continue;
+            };
+
+            let new = new_runtime.fragments().get(*new).unwrap();
+
+            match old {
+                Some(old) => {
+                    if let Some(old_runtime) = self.old.runtime_embedders.get(embedder_name) {
+                        let old = &old_runtime.fragments().get(*old).unwrap();
+                        for_each(FragmentDiff { old: Some(old), new })?;
+                    } else {
+                        for_each(FragmentDiff { old: None, new })?;
+                    }
+                }
+                None => for_each(FragmentDiff { old: None, new })?,
+            };
+        }
+        Ok(())
+    }
+}

#[cfg(test)]
#[path = "test_settings.rs"]
mod tests;
@ -2,6 +2,7 @@ mod v1_12;
mod v1_13;
mod v1_14;
mod v1_15;
+mod v1_16;
use heed::RwTxn;
use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13};

@ -10,6 +11,7 @@ use v1_15::Latest_V1_14_To_Latest_V1_15;

use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::progress::{Progress, VariableNameStep};
+use crate::update::upgrade::v1_16::Latest_V1_15_To_V1_16_0;
use crate::{Index, InternalError, Result};

trait UpgradeIndex {

@ -24,6 +26,59 @@ trait UpgradeIndex {
    fn target_version(&self) -> (u32, u32, u32);
}

+const UPGRADE_FUNCTIONS: &[&dyn UpgradeIndex] = &[
+    &V1_12_To_V1_12_3 {},
+    &V1_12_3_To_V1_13_0 {},
+    &V1_13_0_To_V1_13_1 {},
+    &V1_13_1_To_Latest_V1_13 {},
+    &Latest_V1_13_To_Latest_V1_14 {},
+    &Latest_V1_14_To_Latest_V1_15 {},
+    &Latest_V1_15_To_V1_16_0 {},
+    // This is the last upgrade function, it will be called when the index is up to date.
+    // any other upgrade function should be added before this one.
+    &ToCurrentNoOp {},
+];
+
+/// Causes a compile-time error if the argument is not in range of `0..UPGRADE_FUNCTIONS.len()`
+macro_rules! function_index {
+    ($start:expr) => {{
+        const _CHECK_INDEX: () = {
+            if $start >= $crate::update::upgrade::UPGRADE_FUNCTIONS.len() {
+                panic!("upgrade functions out of range")
+            }
+        };
+
+        $start
+    }};
+}
+
+const fn start(from: (u32, u32, u32)) -> Option<usize> {
+    let start = match from {
+        (1, 12, 0..=2) => function_index!(0),
+        (1, 12, 3..) => function_index!(1),
+        (1, 13, 0) => function_index!(2),
+        (1, 13, _) => function_index!(4),
+        (1, 14, _) => function_index!(5),
+        // We must handle the current version in the match because in case of a failure some indexes may have been upgraded but not others.
+        (1, 15, _) => function_index!(6),
+        (1, 16, _) => function_index!(7),
+        // We deliberately don't add a placeholder with (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) here to force manually
+        // considering dumpless upgrade.
+        (_major, _minor, _patch) => return None,
+    };
+
+    Some(start)
+}
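The `function_index!` macro turns an out-of-range index into a build failure: the `const _CHECK_INDEX` block is evaluated during constant evaluation, so the `panic!` aborts compilation rather than the running process. A minimal standalone sketch of the same trick, independent of the Meilisearch types:

const TABLE: &[&str] = &["a", "b", "c"];

macro_rules! checked_index {
    ($i:expr) => {{
        // Evaluated at compile time: an out-of-range index fails the build.
        const _: () = assert!($i < TABLE.len(), "index out of range");
        $i
    }};
}

fn main() {
    let i = checked_index!(2); // compiles
    // let j = checked_index!(9); // would be a compile-time error
    println!("{}", TABLE[i]);
}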

+/// Causes a compile-time error if the latest package cannot be upgraded.
+///
+/// This serves as a reminder to consider the proper dumpless upgrade implementation when changing the package version.
+const _CHECK_PACKAGE_CAN_UPGRADE: () = {
+    if start((VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH)).is_none() {
+        panic!("cannot upgrade from latest package version")
+    }
+};

/// Return true if the cached stats of the index must be regenerated
pub fn upgrade<MSP>(
    wtxn: &mut RwTxn,

@ -36,33 +91,12 @@ where
    MSP: Fn() -> bool + Sync,
{
    let from = index.get_version(wtxn)?.unwrap_or(db_version);
-    let upgrade_functions: &[&dyn UpgradeIndex] = &[
-        &V1_12_To_V1_12_3 {},
-        &V1_12_3_To_V1_13_0 {},
-        &V1_13_0_To_V1_13_1 {},
-        &V1_13_1_To_Latest_V1_13 {},
-        &Latest_V1_13_To_Latest_V1_14 {},
-        &Latest_V1_14_To_Latest_V1_15 {},
-        // This is the last upgrade function, it will be called when the index is up to date.
-        // any other upgrade function should be added before this one.
-        &ToCurrentNoOp {},
-    ];
-
-    let start = match from {
-        (1, 12, 0..=2) => 0,
-        (1, 12, 3..) => 1,
-        (1, 13, 0) => 2,
-        (1, 13, _) => 4,
-        (1, 14, _) => 5,
-        // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other.
-        (1, 15, _) => 6,
-        (major, minor, patch) => {
-            return Err(InternalError::CannotUpgradeToVersion(major, minor, patch).into())
-        }
-    };
+    let start =
+        start(from).ok_or_else(|| InternalError::CannotUpgradeToVersion(from.0, from.1, from.2))?;

    enum UpgradeVersion {}
-    let upgrade_path = &upgrade_functions[start..];
+    let upgrade_path = &UPGRADE_FUNCTIONS[start..];

    let mut current_version = from;
    let mut regenerate_stats = false;

@ -1,4 +1,6 @@
use heed::RwTxn;
+use roaring::RoaringBitmap;
+use serde::Deserialize;

use super::UpgradeIndex;
use crate::progress::Progress;

@ -26,3 +28,14 @@ impl UpgradeIndex for Latest_V1_14_To_Latest_V1_15 {
        (1, 15, 0)
    }
}
+
+/// Parts of v1.15 `IndexingEmbeddingConfig` that are relevant for upgrade to v1.16
+///
+/// # Warning
+///
+/// This object should not be rewritten to the DB, only read to get the name and `user_provided` roaring.
+#[derive(Debug, Deserialize)]
+pub struct IndexEmbeddingConfig {
+    pub name: String,
+    pub user_provided: RoaringBitmap,
+}

crates/milli/src/update/upgrade/v1_16.rs (new file)
@ -0,0 +1,48 @@
use heed::types::{SerdeJson, Str};
use heed::RwTxn;

use super::UpgradeIndex;
use crate::progress::Progress;
use crate::vector::db::{EmbedderInfo, EmbeddingStatus};
use crate::{Index, InternalError, Result};

#[allow(non_camel_case_types)]
pub(super) struct Latest_V1_15_To_V1_16_0();

impl UpgradeIndex for Latest_V1_15_To_V1_16_0 {
    fn upgrade(
        &self,
        wtxn: &mut RwTxn,
        index: &Index,
        _original: (u32, u32, u32),
        _progress: Progress,
    ) -> Result<bool> {
        let v1_15_indexing_configs = index
            .main
            .remap_types::<Str, SerdeJson<Vec<super::v1_15::IndexEmbeddingConfig>>>()
            .get(wtxn, crate::index::main_key::EMBEDDING_CONFIGS)?
            .unwrap_or_default();

        let embedders = index.embedding_configs();
        for config in v1_15_indexing_configs {
            let embedder_id = embedders.embedder_id(wtxn, &config.name)?.ok_or(
                InternalError::DatabaseMissingEntry {
                    db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
                    key: None,
                },
            )?;
            let info = EmbedderInfo {
                embedder_id,
                // v1.15 did not distinguish between `user_provided` and `!regenerate`.
                embedding_status: EmbeddingStatus::from_user_provided(config.user_provided),
            };
            embedders.put_embedder_info(wtxn, &config.name, &info)?;
        }

        Ok(false)
    }

    fn target_version(&self) -> (u32, u32, u32) {
        (1, 16, 0)
    }
}
@ -7,6 +7,7 @@ use super::{
    hf, manual, ollama, openai, rest, DistributionShift, EmbedError, Embedding, EmbeddingCache,
    NewEmbedderError,
};
+use crate::progress::EmbedderStats;
use crate::ThreadPoolNoAbort;

#[derive(Debug)]

@ -81,6 +82,7 @@ impl Embedder {
                "This is a sample text. It is meant to compare similarity.".into(),
            ],
            None,
+            None,
        )
        .map_err(|error| NewEmbedderError::composite_test_embedding_failed(error, "search"))?;

@ -92,6 +94,7 @@ impl Embedder {
                "This is a sample text. It is meant to compare similarity.".into(),
            ],
            None,
+            None,
        )
        .map_err(|error| {
            NewEmbedderError::composite_test_embedding_failed(error, "indexing")

@ -150,13 +153,14 @@ impl SubEmbedder {
        &self,
        texts: Vec<String>,
        deadline: Option<Instant>,
+        embedder_stats: Option<&EmbedderStats>,
    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.embed(texts),
-            SubEmbedder::OpenAi(embedder) => embedder.embed(&texts, deadline),
-            SubEmbedder::Ollama(embedder) => embedder.embed(&texts, deadline),
+            SubEmbedder::OpenAi(embedder) => embedder.embed(&texts, deadline, embedder_stats),
+            SubEmbedder::Ollama(embedder) => embedder.embed(&texts, deadline, embedder_stats),
            SubEmbedder::UserProvided(embedder) => embedder.embed(&texts),
-            SubEmbedder::Rest(embedder) => embedder.embed(texts, deadline),
+            SubEmbedder::Rest(embedder) => embedder.embed(texts, deadline, embedder_stats),
        }
    }

@ -164,18 +168,21 @@ impl SubEmbedder {
        &self,
        text: &str,
        deadline: Option<Instant>,
+        embedder_stats: Option<&EmbedderStats>,
    ) -> std::result::Result<Embedding, EmbedError> {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.embed_one(text),
-            SubEmbedder::OpenAi(embedder) => {
-                embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding)
-            }
-            SubEmbedder::Ollama(embedder) => {
-                embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding)
-            }
+            SubEmbedder::OpenAi(embedder) => embedder
+                .embed(&[text], deadline, embedder_stats)?
+                .pop()
+                .ok_or_else(EmbedError::missing_embedding),
+            SubEmbedder::Ollama(embedder) => embedder
+                .embed(&[text], deadline, embedder_stats)?
+                .pop()
+                .ok_or_else(EmbedError::missing_embedding),
            SubEmbedder::UserProvided(embedder) => embedder.embed_one(text),
            SubEmbedder::Rest(embedder) => embedder
-                .embed_ref(&[text], deadline)?
+                .embed_ref(&[text], deadline, embedder_stats)?
                .pop()
                .ok_or_else(EmbedError::missing_embedding),
        }

@ -188,13 +195,20 @@ impl SubEmbedder {
        &self,
        text_chunks: Vec<Vec<String>>,
        threads: &ThreadPoolNoAbort,
+        embedder_stats: &EmbedderStats,
    ) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.embed_index(text_chunks),
-            SubEmbedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads),
-            SubEmbedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads),
+            SubEmbedder::OpenAi(embedder) => {
+                embedder.embed_index(text_chunks, threads, embedder_stats)
+            }
+            SubEmbedder::Ollama(embedder) => {
+                embedder.embed_index(text_chunks, threads, embedder_stats)
+            }
            SubEmbedder::UserProvided(embedder) => embedder.embed_index(text_chunks),
-            SubEmbedder::Rest(embedder) => embedder.embed_index(text_chunks, threads),
+            SubEmbedder::Rest(embedder) => {
+                embedder.embed_index(text_chunks, threads, embedder_stats)
+            }
        }
    }

@ -203,13 +217,18 @@ impl SubEmbedder {
        &self,
        texts: &[&str],
        threads: &ThreadPoolNoAbort,
+        embedder_stats: &EmbedderStats,
    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
        match self {
            SubEmbedder::HuggingFace(embedder) => embedder.embed_index_ref(texts),
-            SubEmbedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads),
-            SubEmbedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads),
+            SubEmbedder::OpenAi(embedder) => {
+                embedder.embed_index_ref(texts, threads, embedder_stats)
+            }
+            SubEmbedder::Ollama(embedder) => {
+                embedder.embed_index_ref(texts, threads, embedder_stats)
+            }
            SubEmbedder::UserProvided(embedder) => embedder.embed_index_ref(texts),
-            SubEmbedder::Rest(embedder) => embedder.embed_index_ref(texts, threads),
+            SubEmbedder::Rest(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats),
        }
    }
crates/milli/src/vector/db.rs (new file)
@ -0,0 +1,450 @@
//! Module containing types and methods to store meta-information about the embedders and fragments

use std::borrow::Cow;

use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
use heed::types::{SerdeJson, Str, U8};
use heed::{BytesEncode, Database, RoTxn, RwTxn, Unspecified};
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};

use crate::vector::settings::RemoveFragments;
use crate::vector::EmbeddingConfig;
use crate::{CboRoaringBitmapCodec, DocumentId, UserError};

#[derive(Debug, Deserialize, Serialize)]
pub struct IndexEmbeddingConfig {
    pub name: String,
    pub config: EmbeddingConfig,
    #[serde(default)]
    pub fragments: FragmentConfigs,
}

#[derive(Debug, Clone, Deserialize, Serialize, Default)]
pub struct FragmentConfigs(Vec<FragmentConfig>);

impl FragmentConfigs {
    pub fn new() -> Self {
        Default::default()
    }
    pub fn as_slice(&self) -> &[FragmentConfig] {
        self.0.as_slice()
    }

    pub fn into_inner(self) -> Vec<FragmentConfig> {
        self.0
    }

    pub fn remove_fragments<'a>(
        &mut self,
        fragments: impl IntoIterator<Item = &'a str>,
    ) -> Option<RemoveFragments> {
        let mut remove_fragments = Vec::new();
        for fragment in fragments {
            let Ok(index_to_remove) = self.0.binary_search_by_key(&fragment, |f| &f.name) else {
                continue;
            };
            let fragment = self.0.swap_remove(index_to_remove);
            remove_fragments.push(fragment.id);
        }
        (!remove_fragments.is_empty()).then_some(RemoveFragments { fragment_ids: remove_fragments })
    }

    pub fn add_new_fragments(
        &mut self,
        new_fragments: impl IntoIterator<Item = String>,
    ) -> crate::Result<()> {
        let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize];

        for FragmentConfig { id, name: _ } in self.0.iter() {
            free_indices[*id as usize] = false;
        }
        let mut free_indices = free_indices.iter_mut().enumerate();
        let mut find_free_index =
            move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8);

        let mut new_fragments = new_fragments.into_iter();

        for name in &mut new_fragments {
            let id = match find_free_index() {
                Some(id) => id,
                None => {
                    let more = (&mut new_fragments).count();
                    return Err(UserError::TooManyFragments(u8::MAX as usize + more + 1).into());
                }
            };
            self.0.push(FragmentConfig { id, name });
        }
        Ok(())
    }
}
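`add_new_fragments` assigns each new fragment the lowest id not already in use, so the ids of deleted fragments are recycled and the id space stays within a `u8`. A standalone sketch of the same allocation scheme:

fn lowest_free_id(used: &[u8]) -> Option<u8> {
    let mut free = [true; u8::MAX as usize];
    for &id in used {
        free[id as usize] = false;
    }
    // Pick the first index still marked free, as `add_new_fragments` does.
    free.iter().position(|&f| f).map(|i| i as u8)
}

fn main() {
    assert_eq!(lowest_free_id(&[0, 1, 3]), Some(2)); // recycles the gap left by id 2
}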

#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct FragmentConfig {
    pub id: u8,
    pub name: String,
}

pub struct IndexEmbeddingConfigs {
    main: Database<Unspecified, Unspecified>,
    embedder_info: Database<Str, EmbedderInfoCodec>,
}

pub struct EmbedderInfo {
    pub embedder_id: u8,
    pub embedding_status: EmbeddingStatus,
}

impl EmbedderInfo {
    pub fn to_bytes(&self) -> Result<Cow<'_, [u8]>, heed::BoxedError> {
        EmbedderInfoCodec::bytes_encode(self)
    }
}

/// Optimized struct to hold the list of documents that are `user_provided` and `must_regenerate`.
///
/// Because most documents have the same value for `user_provided` and `must_regenerate`, we store only
/// the `user_provided` and a list of the documents for which `must_regenerate` assumes the other value
/// than `user_provided`.
#[derive(Default)]
pub struct EmbeddingStatus {
    user_provided: RoaringBitmap,
    skip_regenerate_different_from_user_provided: RoaringBitmap,
}

impl EmbeddingStatus {
    pub fn new() -> Self {
        Default::default()
    }

    /// Create a new `EmbeddingStatus` that assumes that any `user_provided` docid is also skipping regenerate.
    ///
    /// Used for migration from v1.15 and earlier DBs.
    pub(crate) fn from_user_provided(user_provided: RoaringBitmap) -> Self {
        Self { user_provided, skip_regenerate_different_from_user_provided: Default::default() }
    }

    /// Whether the document contains user-provided vectors for that embedder.
    pub fn is_user_provided(&self, docid: DocumentId) -> bool {
        self.user_provided.contains(docid)
    }
    /// Whether vectors should be regenerated for that document and that embedder.
    pub fn must_regenerate(&self, docid: DocumentId) -> bool {
        let invert = self.skip_regenerate_different_from_user_provided.contains(docid);
        let user_provided = self.user_provided.contains(docid);
        !(user_provided ^ invert)
    }

    pub fn is_user_provided_must_regenerate(&self, docid: DocumentId) -> (bool, bool) {
        let invert = self.skip_regenerate_different_from_user_provided.contains(docid);
        let user_provided = self.user_provided.contains(docid);
        (user_provided, !(user_provided ^ invert))
    }

    pub fn user_provided_docids(&self) -> &RoaringBitmap {
        &self.user_provided
    }

    pub fn skip_regenerate_docids(&self) -> RoaringBitmap {
        &self.user_provided ^ &self.skip_regenerate_different_from_user_provided
    }

    pub(crate) fn into_user_provided(self) -> RoaringBitmap {
        self.user_provided
    }
}
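The XOR in `must_regenerate` encodes regeneration relative to `user_provided`: by default a user-provided document skips regeneration and a non-user-provided one regenerates, and the second bitmap lists only the exceptions. A quick standalone check of the decoding logic, with plain booleans standing in for bitmap membership:

// Decode must_regenerate the way `EmbeddingStatus::must_regenerate` does.
fn must_regenerate(user_provided: bool, in_exception_bitmap: bool) -> bool {
    !(user_provided ^ in_exception_bitmap)
}

fn main() {
    assert!(!must_regenerate(true, false)); // user-provided: skips regeneration by default
    assert!(must_regenerate(false, false)); // not user-provided: regenerates by default
    assert!(must_regenerate(true, true)); // exception: user-provided but still regenerates
    assert!(!must_regenerate(false, true)); // exception: not user-provided, never regenerates
}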

#[derive(Default)]
pub struct EmbeddingStatusDelta {
    del_status: EmbeddingStatus,
    add_status: EmbeddingStatus,
}

impl EmbeddingStatusDelta {
    pub fn new() -> Self {
        Self::default()
    }

    pub fn needs_change(
        old_is_user_provided: bool,
        old_must_regenerate: bool,
        new_is_user_provided: bool,
        new_must_regenerate: bool,
    ) -> bool {
        let old_skip_regenerate_different_user_provided =
            old_is_user_provided == old_must_regenerate;
        let new_skip_regenerate_different_user_provided =
            new_is_user_provided == new_must_regenerate;

        old_is_user_provided != new_is_user_provided
            || old_skip_regenerate_different_user_provided
                != new_skip_regenerate_different_user_provided
    }

    pub fn needs_clear(is_user_provided: bool, must_regenerate: bool) -> bool {
        Self::needs_change(is_user_provided, must_regenerate, false, true)
    }

    pub fn clear_docid(
        &mut self,
        docid: DocumentId,
        is_user_provided: bool,
        must_regenerate: bool,
    ) {
        self.push_delta(docid, is_user_provided, must_regenerate, false, true);
    }

    pub fn push_delta(
        &mut self,
        docid: DocumentId,
        old_is_user_provided: bool,
        old_must_regenerate: bool,
        new_is_user_provided: bool,
        new_must_regenerate: bool,
    ) {
        // must_regenerate == !skip_regenerate
        let old_skip_regenerate_different_user_provided =
            old_is_user_provided == old_must_regenerate;
        let new_skip_regenerate_different_user_provided =
            new_is_user_provided == new_must_regenerate;

        match (old_is_user_provided, new_is_user_provided) {
            (true, true) | (false, false) => { /* no change */ }
            (true, false) => {
                self.del_status.user_provided.insert(docid);
            }
            (false, true) => {
                self.add_status.user_provided.insert(docid);
            }
        }

        match (
            old_skip_regenerate_different_user_provided,
            new_skip_regenerate_different_user_provided,
        ) {
            (true, true) | (false, false) => { /* no change */ }
            (true, false) => {
                self.del_status.skip_regenerate_different_from_user_provided.insert(docid);
            }
            (false, true) => {
                self.add_status.skip_regenerate_different_from_user_provided.insert(docid);
            }
        }
    }

    pub fn push_new(&mut self, docid: DocumentId, is_user_provided: bool, must_regenerate: bool) {
        self.push_delta(
            docid,
            !is_user_provided,
            !must_regenerate,
            is_user_provided,
            must_regenerate,
        );
    }

    pub fn apply_to(&self, status: &mut EmbeddingStatus) {
        status.user_provided -= &self.del_status.user_provided;
        status.user_provided |= &self.add_status.user_provided;

        status.skip_regenerate_different_from_user_provided -=
            &self.del_status.skip_regenerate_different_from_user_provided;
        status.skip_regenerate_different_from_user_provided |=
            &self.add_status.skip_regenerate_different_from_user_provided;
    }
}
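`EmbeddingStatusDelta` accumulates per-document transitions as del/add bitmap pairs so a whole batch of changes can be applied to the stored `EmbeddingStatus` in one pass. A sketch of the intended call pattern using only the public methods above (the docid is illustrative):

// Record that document 42 switched from regenerated (not user-provided)
// to user-provided, then apply the batch to a fresh status.
fn example() {
    let mut delta = EmbeddingStatusDelta::new();
    delta.push_delta(
        42, /* old_is_user_provided */ false, /* old_must_regenerate */ true,
        /* new_is_user_provided */ true, /* new_must_regenerate */ false,
    );

    let mut status = EmbeddingStatus::new();
    delta.apply_to(&mut status);
    assert!(status.is_user_provided(42));
    assert!(!status.must_regenerate(42));
}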

struct EmbedderInfoCodec;

impl<'a> heed::BytesDecode<'a> for EmbedderInfoCodec {
    type DItem = EmbedderInfo;

    fn bytes_decode(mut bytes: &'a [u8]) -> Result<Self::DItem, heed::BoxedError> {
        let embedder_id = bytes.read_u8()?;
        // Support all versions that didn't store the embedding status
        if bytes.is_empty() {
            return Ok(EmbedderInfo { embedder_id, embedding_status: EmbeddingStatus::new() });
        }
        let first_bitmap_size = bytes.read_u32::<BigEndian>()?;
        let first_bitmap_bytes = &bytes[..first_bitmap_size as usize];
        let user_provided = CboRoaringBitmapCodec::bytes_decode(first_bitmap_bytes)?;
        let skip_regenerate_different_from_user_provided =
            CboRoaringBitmapCodec::bytes_decode(&bytes[first_bitmap_size as usize..])?;
        Ok(EmbedderInfo {
            embedder_id,
            embedding_status: EmbeddingStatus {
                user_provided,
                skip_regenerate_different_from_user_provided,
            },
        })
    }
}

impl<'a> heed::BytesEncode<'a> for EmbedderInfoCodec {
    type EItem = EmbedderInfo;

    fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, heed::BoxedError> {
        let first_bitmap_size =
            CboRoaringBitmapCodec::serialized_size(&item.embedding_status.user_provided);
        let second_bitmap_size = CboRoaringBitmapCodec::serialized_size(
            &item.embedding_status.skip_regenerate_different_from_user_provided,
        );

        let mut bytes = Vec::with_capacity(1 + 4 + first_bitmap_size + second_bitmap_size);
        bytes.write_u8(item.embedder_id)?;
        bytes.write_u32::<BigEndian>(first_bitmap_size.try_into()?)?;
        CboRoaringBitmapCodec::serialize_into_writer(
            &item.embedding_status.user_provided,
            &mut bytes,
        )?;
        CboRoaringBitmapCodec::serialize_into_writer(
            &item.embedding_status.skip_regenerate_different_from_user_provided,
            &mut bytes,
        )?;
        Ok(bytes.into())
    }
}
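Reading `bytes_encode` and `bytes_decode` together, the on-disk value layout can be summarized as follows (a sketch inferred from the code above, not an official format document):

// Assumed EmbedderInfo value layout:
//
//   [ embedder_id: u8 ]
//   [ first_bitmap_size: u32, big-endian ]
//   [ user_provided: CBO roaring bitmap, first_bitmap_size bytes ]
//   [ skip_regenerate_different_from_user_provided: CBO roaring bitmap, to the end ]
//
// Legacy values written before v1.16 contain only the id byte; the decoder
// detects this by the empty remainder and returns an empty EmbeddingStatus,
// so `bytes_decode(bytes_encode(info))` round-trips and old entries still decode.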

impl IndexEmbeddingConfigs {
    pub(crate) fn new(
        main: Database<Unspecified, Unspecified>,
        embedder_info: Database<Unspecified, Unspecified>,
    ) -> Self {
        Self { main, embedder_info: embedder_info.remap_types() }
    }

    pub(crate) fn put_embedding_configs(
        &self,
        wtxn: &mut RwTxn<'_>,
        configs: Vec<IndexEmbeddingConfig>,
    ) -> heed::Result<()> {
        self.main.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>().put(
            wtxn,
            crate::index::main_key::EMBEDDING_CONFIGS,
            &configs,
        )
    }

    pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
        self.main.remap_key_type::<Str>().delete(wtxn, crate::index::main_key::EMBEDDING_CONFIGS)
    }

    pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> heed::Result<Vec<IndexEmbeddingConfig>> {
        Ok(self
            .main
            .remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>()
            .get(rtxn, crate::index::main_key::EMBEDDING_CONFIGS)?
            .unwrap_or_default())
    }

    pub fn embedder_id(&self, rtxn: &RoTxn<'_>, name: &str) -> heed::Result<Option<u8>> {
        self.embedder_info.remap_data_type::<U8>().get(rtxn, name)
    }

    pub fn put_fresh_embedder_id(
        &self,
        wtxn: &mut RwTxn<'_>,
        name: &str,
        embedder_id: u8,
    ) -> heed::Result<()> {
        let info = EmbedderInfo { embedder_id, embedding_status: EmbeddingStatus::new() };
        self.put_embedder_info(wtxn, name, &info)
    }

    /// Iterate through the passed list of embedder names, associating a fresh embedder id to any new names.
    ///
    /// Passing the name of a currently existing embedder is not an error, and will not modify its embedder id,
    /// so it is not necessary to differentiate between new and existing embedders before calling this function.
    pub fn add_new_embedders<'a>(
        &self,
        wtxn: &mut RwTxn<'_>,
        embedder_names: impl IntoIterator<Item = &'a str>,
        total_embedder_count: usize,
    ) -> crate::Result<()> {
        let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize];

        for res in self.embedder_info.iter(wtxn)? {
            let (_name, EmbedderInfo { embedder_id, embedding_status: _ }) = res?;
            free_indices[embedder_id as usize] = false;
        }

        let mut free_indices = free_indices.iter_mut().enumerate();
        let mut find_free_index =
            move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8);

        for embedder_name in embedder_names {
            if self.embedder_id(wtxn, embedder_name)?.is_some() {
                continue;
            }
            let embedder_id = find_free_index()
                .ok_or(crate::UserError::TooManyEmbedders(total_embedder_count))?;
            tracing::debug!(
                embedder = embedder_name,
                embedder_id,
                "assigning free id to new embedder"
            );
            self.put_fresh_embedder_id(wtxn, embedder_name, embedder_id)?;
        }
        Ok(())
    }

    pub fn embedder_info(
        &self,
        rtxn: &RoTxn<'_>,
        name: &str,
    ) -> heed::Result<Option<EmbedderInfo>> {
        self.embedder_info.get(rtxn, name)
    }

    /// Clear the list of docids that are `user_provided` or `must_regenerate` across all embedders.
    pub fn clear_embedder_info_docids(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<()> {
        let mut it = self.embedder_info.iter_mut(wtxn)?;
        while let Some(res) = it.next() {
            let (embedder_name, info) = res?;
            let embedder_name = embedder_name.to_owned();
            // SAFETY: we copied the `embedder_name` so are not using the reference while using put
            unsafe {
                it.put_current(
                    &embedder_name,
                    &EmbedderInfo {
                        embedder_id: info.embedder_id,
                        embedding_status: EmbeddingStatus::new(),
                    },
                )?;
            }
        }
        Ok(())
    }

    pub fn iter_embedder_info<'a>(
        &self,
        rtxn: &'a RoTxn<'_>,
    ) -> heed::Result<impl Iterator<Item = heed::Result<(&'a str, EmbedderInfo)>>> {
        self.embedder_info.iter(rtxn)
    }

    pub fn iter_embedder_id<'a>(
        &self,
        rtxn: &'a RoTxn<'_>,
    ) -> heed::Result<impl Iterator<Item = heed::Result<(&'a str, u8)>>> {
        self.embedder_info.remap_data_type::<U8>().iter(rtxn)
    }

    pub fn remove_embedder(
        &self,
        wtxn: &mut RwTxn<'_>,
        name: &str,
    ) -> heed::Result<Option<EmbedderInfo>> {
        let info = self.embedder_info.get(wtxn, name)?;
        self.embedder_info.delete(wtxn, name)?;
        Ok(info)
    }

    pub fn put_embedder_info(
        &self,
        wtxn: &mut RwTxn<'_>,
        name: &str,
        info: &EmbedderInfo,
    ) -> heed::Result<()> {
        self.embedder_info.put(wtxn, name, info)
    }
}
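The registry contract of `add_new_embedders` mirrors the fragment-id allocation: ids are stable per name, new names get the lowest free id, and the id space is capped by the `u8` array, with `total_embedder_count` only feeding the `TooManyEmbedders` error. A compact way to see the same contract, using an in-memory map in place of the LMDB database (names and signature are illustrative, not part of the crate):

use std::collections::BTreeMap;

fn add_new_embedder(registry: &mut BTreeMap<String, u8>, name: &str) -> Result<u8, String> {
    if let Some(&id) = registry.get(name) {
        return Ok(id); // existing embedders keep their id
    }
    let mut free = [true; u8::MAX as usize];
    for &id in registry.values() {
        free[id as usize] = false;
    }
    let id = free.iter().position(|&f| f).ok_or("too many embedders")? as u8;
    registry.insert(name.to_string(), id);
    Ok(id)
}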
@ -3,6 +3,7 @@ use std::path::PathBuf;

use bumpalo::Bump;
use hf_hub::api::sync::ApiError;
+use itertools::Itertools as _;

use super::parsed_vectors::ParsedVectorsDiff;
use super::rest::ConfigurationSource;

@ -101,6 +102,32 @@ pub enum EmbedErrorKind {
    MissingEmbedding,
    #[error(transparent)]
    PanicInThreadPool(#[from] PanicCatched),
+    #[error("`media` requested but the configuration doesn't have source `rest`")]
+    RestMediaNotARest,
+    #[error("`media` requested, and the configuration has source `rest`, but the configuration doesn't have `searchFragments`.")]
+    RestMediaNotAFragment,
+
+    #[error("Query matches multiple search fragments.\n - Note: First matched fragment `{name}`.\n - Note: Second matched fragment `{second_name}`.\n - Note: {}",
+        {
+            serde_json::json!({
+                "q": q,
+                "media": media
+            })
+        })]
+    RestSearchMatchesMultipleFragments {
+        name: String,
+        second_name: String,
+        q: Option<String>,
+        media: Option<serde_json::Value>,
+    },
+    #[error("Query matches no search fragment.\n - Note: {}",
+        {
+            serde_json::json!({
+                "q": q,
+                "media": media
+            })
+        })]
+    RestSearchMatchesNoFragment { q: Option<String>, media: Option<serde_json::Value> },
}

fn option_info(info: Option<&str>, prefix: &str) -> String {

@ -210,6 +237,44 @@ impl EmbedError {
    pub(crate) fn rest_extraction_error(error: String) -> EmbedError {
        Self { kind: EmbedErrorKind::RestExtractionError(error), fault: FaultSource::Runtime }
    }
+
+    pub(crate) fn rest_media_not_a_rest() -> EmbedError {
+        Self { kind: EmbedErrorKind::RestMediaNotARest, fault: FaultSource::User }
+    }
+
+    pub(crate) fn rest_media_not_a_fragment() -> EmbedError {
+        Self { kind: EmbedErrorKind::RestMediaNotAFragment, fault: FaultSource::User }
+    }
+
+    pub(crate) fn rest_search_matches_multiple_fragments(
+        name: &str,
+        second_name: &str,
+        q: Option<&str>,
+        media: Option<&serde_json::Value>,
+    ) -> EmbedError {
+        Self {
+            kind: EmbedErrorKind::RestSearchMatchesMultipleFragments {
+                name: name.to_string(),
+                second_name: second_name.to_string(),
+                q: q.map(String::from),
+                media: media.cloned(),
+            },
+            fault: FaultSource::User,
+        }
+    }
+
+    pub(crate) fn rest_search_matches_no_fragment(
+        q: Option<&str>,
+        media: Option<&serde_json::Value>,
+    ) -> EmbedError {
+        Self {
+            kind: EmbedErrorKind::RestSearchMatchesNoFragment {
+                q: q.map(String::from),
+                media: media.cloned(),
+            },
+            fault: FaultSource::User,
+        }
+    }
}

#[derive(Debug, thiserror::Error)]

@ -382,6 +447,49 @@ impl NewEmbedderError {
            fault: FaultSource::User,
        }
    }
+
+    pub(crate) fn rest_cannot_infer_dimensions_for_fragment() -> NewEmbedderError {
+        Self {
+            kind: NewEmbedderErrorKind::RestCannotInferDimensionsForFragment,
+            fault: FaultSource::User,
+        }
+    }
+
+    pub(crate) fn rest_inconsistent_fragments(
+        indexing_fragments_is_empty: bool,
+        indexing_fragments: BTreeMap<String, serde_json::Value>,
+        search_fragments: BTreeMap<String, serde_json::Value>,
+    ) -> NewEmbedderError {
+        let message = if indexing_fragments_is_empty {
+            format!("`indexingFragments` is empty, but `searchFragments` declares {} fragments: {}{}\n - Hint: declare at least one fragment in `indexingFragments` or remove fragments from `searchFragments` by setting them to `null`",
+                search_fragments.len(),
+                search_fragments.keys().take(3).join(", "), if search_fragments.len() > 3 { ", ..." } else { "" }
+            )
+        } else {
+            format!("`searchFragments` is empty, but `indexingFragments` declares {} fragments: {}{}\n - Hint: declare at least one fragment in `searchFragments` or remove fragments from `indexingFragments` by setting them to `null`",
+                indexing_fragments.len(),
+                indexing_fragments.keys().take(3).join(", "), if indexing_fragments.len() > 3 { ", ..." } else { "" }
+            )
+        };
+
+        Self {
+            kind: NewEmbedderErrorKind::RestInconsistentFragments { message },
+            fault: FaultSource::User,
+        }
+    }
+
+    pub(crate) fn rest_document_template_and_fragments(
+        indexing_fragments_len: usize,
+        search_fragments_len: usize,
+    ) -> Self {
+        Self {
+            kind: NewEmbedderErrorKind::RestDocumentTemplateAndFragments {
+                indexing_fragments_len,
+                search_fragments_len,
+            },
+            fault: FaultSource::User,
+        }
+    }
}

#[derive(Debug, Clone, Copy)]

@ -499,6 +607,12 @@ pub enum NewEmbedderErrorKind {
    CompositeEmbeddingCountMismatch { search_count: usize, index_count: usize },
    #[error("error while generating test embeddings.\n - the embeddings produced at search time and indexing time are not similar enough.\n - angular distance {distance:.2}\n - Meilisearch requires a maximum distance of {MAX_COMPOSITE_DISTANCE}.\n - Note: check that both embedders produce similar embeddings.{hint}")]
    CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace },
+    #[error("cannot infer `dimensions` for an embedder using `indexingFragments`.\n - Note: Specify `dimensions` explicitly or don't use `indexingFragments`.")]
+    RestCannotInferDimensionsForFragment,
+    #[error("inconsistent fragments: {message}")]
+    RestInconsistentFragments { message: String },
+    #[error("cannot pass both fragments and a document template.\n - Note: {indexing_fragments_len} fragments declared in `indexingFragments` and {search_fragments_len} fragments declared in `searchFragments`.\n - Hint: remove the declared fragments or remove the `documentTemplate`")]
+    RestDocumentTemplateAndFragments { indexing_fragments_len: usize, search_fragments_len: usize },
}

pub struct PossibleEmbeddingMistakes {
crates/milli/src/vector/extractor.rs (new file)
@ -0,0 +1,244 @@
use std::cell::RefCell;
use std::collections::BTreeMap;
use std::fmt::Debug;

use bumpalo::Bump;
use serde_json::Value;

use super::json_template::{self, JsonTemplate};
use crate::prompt::error::RenderPromptError;
use crate::prompt::Prompt;
use crate::update::new::document::Document;
use crate::vector::RuntimeFragment;
use crate::GlobalFieldsIdsMap;

/// Trait for types that extract embedder inputs from a document.
///
/// An embedder input can then be sent to an embedder by using an [`super::session::EmbedSession`].
pub trait Extractor<'doc> {
    /// The embedder input that is extracted from documents by this extractor.
    ///
    /// The inputs have to be comparable for equality so that diffing is possible.
    type Input: PartialEq;

    /// The error that can happen while extracting from a document.
    type Error;

    /// Metadata associated with a document.
    type DocumentMetadata;

    /// Extract the embedder input from a document and its metadata.
    fn extract<'a, D: Document<'a> + Debug>(
        &self,
        doc: D,
        meta: &Self::DocumentMetadata,
    ) -> Result<Option<Self::Input>, Self::Error>;

    /// Unique `id` associated with this extractor.
    ///
    /// This will serve to decide where to store the vectors in the vector store.
    /// The id should be stable for a given extractor.
    fn extractor_id(&self) -> u8;

    /// The result of diffing the embedder inputs extracted from two versions of a document.
    ///
    /// # Parameters
    ///
    /// - `old`: old version of the document
    /// - `new`: new version of the document
    /// - `meta`: metadata associated to the document
    fn diff_documents<'a, OD: Document<'a> + Debug, ND: Document<'a> + Debug>(
        &self,
        old: OD,
        new: ND,
        meta: &Self::DocumentMetadata,
    ) -> Result<ExtractorDiff<Self::Input>, Self::Error>
    where
        'doc: 'a,
    {
        let old_input = self.extract(old, meta);
        let new_input = self.extract(new, meta);
        to_diff(old_input, new_input)
    }

    /// The result of diffing the embedder inputs extracted from a document by two versions of this extractor.
    ///
    /// # Parameters
    ///
    /// - `doc`: the document from which to extract the embedder inputs
    /// - `meta`: metadata associated to the document
    /// - `old`: If `Some`, the old version of this extractor. If `None`, this is equivalent to calling `ExtractorDiff::Added(self.extract(_))`.
    fn diff_settings<'a, D: Document<'a> + Debug>(
        &self,
        doc: D,
        meta: &Self::DocumentMetadata,
        old: Option<&Self>,
    ) -> Result<ExtractorDiff<Self::Input>, Self::Error> {
        let old_input = if let Some(old) = old { old.extract(&doc, meta) } else { Ok(None) };
        let new_input = self.extract(&doc, meta);

        to_diff(old_input, new_input)
    }

    /// Returns an extractor wrapping `self` and set to ignore all errors arising from extracting with this extractor.
    fn ignore_errors(self) -> IgnoreErrorExtractor<Self>
    where
        Self: Sized,
    {
        IgnoreErrorExtractor(self)
    }
}
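As a minimal illustration of the trait's contract (a hypothetical extractor, not part of the crate): an extractor that always produces the same static input, for which `diff_documents` then reports `Unchanged` and no re-embedding is triggered.

// Hypothetical extractor returning a constant input; `Document`, `Infallible`,
// and the diffing plumbing come from the definitions in this file.
struct ConstExtractor;

impl<'doc> Extractor<'doc> for ConstExtractor {
    type Input = &'static str;
    type Error = Infallible;
    type DocumentMetadata = ();

    fn extractor_id(&self) -> u8 {
        42 // must be stable: it decides where the vectors are stored
    }

    fn extract<'a, D: Document<'a> + Debug>(
        &self,
        _doc: D,
        _meta: &(),
    ) -> Result<Option<&'static str>, Infallible> {
        Ok(Some("always the same input"))
    }
}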
|
||||
|
||||
fn to_diff<I: PartialEq, E>(
|
||||
old_input: Result<Option<I>, E>,
|
||||
new_input: Result<Option<I>, E>,
|
||||
) -> Result<ExtractorDiff<I>, E> {
|
||||
let old_input = old_input.ok().unwrap_or(None);
|
||||
let new_input = new_input?;
|
||||
Ok(match (old_input, new_input) {
|
||||
(Some(old), Some(new)) if old == new => ExtractorDiff::Unchanged,
|
||||
(None, None) => ExtractorDiff::Unchanged,
|
||||
(None, Some(input)) => ExtractorDiff::Added(input),
|
||||
(Some(_), None) => ExtractorDiff::Removed,
|
||||
(Some(_), Some(input)) => ExtractorDiff::Updated(input),
|
||||
})
|
||||
}
|
||||
|
||||
pub enum ExtractorDiff<Input> {
|
||||
Removed,
|
||||
Added(Input),
|
||||
Updated(Input),
|
||||
Unchanged,
|
||||
}
|
||||
|
||||
impl<Input> ExtractorDiff<Input> {
|
||||
pub fn into_input(self) -> Option<Input> {
|
||||
match self {
|
||||
ExtractorDiff::Removed => None,
|
||||
ExtractorDiff::Added(input) => Some(input),
|
||||
ExtractorDiff::Updated(input) => Some(input),
|
||||
ExtractorDiff::Unchanged => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn needs_change(&self) -> bool {
|
||||
match self {
|
||||
ExtractorDiff::Removed => true,
|
||||
ExtractorDiff::Added(_) => true,
|
||||
ExtractorDiff::Updated(_) => true,
|
||||
ExtractorDiff::Unchanged => false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_list_of_changes(
|
||||
named_diffs: impl IntoIterator<Item = (String, Self)>,
|
||||
) -> BTreeMap<String, Option<Input>> {
|
||||
named_diffs
|
||||
.into_iter()
|
||||
.filter(|(_, diff)| diff.needs_change())
|
||||
.map(|(name, diff)| (name, diff.into_input()))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentTemplateExtractor<'a, 'b, 'c> {
|
||||
doc_alloc: &'a Bump,
|
||||
field_id_map: &'a RefCell<GlobalFieldsIdsMap<'b>>,
|
||||
template: &'c Prompt,
|
||||
}
|
||||
|
||||
impl<'a, 'b, 'c> DocumentTemplateExtractor<'a, 'b, 'c> {
|
||||
pub fn new(
|
||||
template: &'c Prompt,
|
||||
doc_alloc: &'a Bump,
|
||||
field_id_map: &'a RefCell<GlobalFieldsIdsMap<'b>>,
|
||||
) -> Self {
|
||||
Self { template, doc_alloc, field_id_map }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'doc> Extractor<'doc> for DocumentTemplateExtractor<'doc, '_, '_> {
|
||||
type DocumentMetadata = &'doc str;
|
||||
type Input = &'doc str;
|
||||
type Error = RenderPromptError;
|
||||
|
||||
fn extractor_id(&self) -> u8 {
|
||||
0
|
||||
}
|
||||
|
||||
fn extract<'a, D: Document<'a> + Debug>(
|
||||
&self,
|
||||
doc: D,
|
||||
external_docid: &Self::DocumentMetadata,
|
||||
) -> Result<Option<Self::Input>, Self::Error> {
|
||||
Ok(Some(self.template.render_document(
|
||||
external_docid,
|
||||
doc,
|
||||
self.field_id_map,
|
||||
self.doc_alloc,
|
||||
)?))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RequestFragmentExtractor<'a> {
|
||||
fragment: &'a JsonTemplate,
|
||||
extractor_id: u8,
|
||||
doc_alloc: &'a Bump,
|
||||
}
|
||||
|
||||
impl<'a> RequestFragmentExtractor<'a> {
|
||||
pub fn new(fragment: &'a RuntimeFragment, doc_alloc: &'a Bump) -> Self {
|
||||
Self { fragment: &fragment.template, extractor_id: fragment.id, doc_alloc }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'doc> Extractor<'doc> for RequestFragmentExtractor<'doc> {
|
||||
type DocumentMetadata = ();
|
||||
type Input = Value;
|
||||
type Error = json_template::Error;
|
||||
|
||||
fn extractor_id(&self) -> u8 {
|
||||
self.extractor_id
|
||||
}
|
||||
|
||||
fn extract<'a, D: Document<'a> + Debug>(
|
||||
&self,
|
||||
doc: D,
|
||||
_meta: &Self::DocumentMetadata,
|
||||
) -> Result<Option<Self::Input>, Self::Error> {
|
||||
Ok(Some(self.fragment.render_document(doc, self.doc_alloc)?))
|
||||
}
|
||||
}
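// Hedged sketch: a fragment's stable `id` doubles as the extractor id, which is
// what later routes this fragment's embeddings to its dedicated arroy store.
fn fragment_extractor_sketch(fragment: &RuntimeFragment) {
    let doc_alloc = Bump::new();
    let extractor = RequestFragmentExtractor::new(fragment, &doc_alloc);
    assert_eq!(extractor.extractor_id(), fragment.id);
}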
|
||||
|
||||
pub struct IgnoreErrorExtractor<E>(E);
|
||||
|
||||
impl<'doc, E> Extractor<'doc> for IgnoreErrorExtractor<E>
|
||||
where
|
||||
E: Extractor<'doc>,
|
||||
{
|
||||
type DocumentMetadata = E::DocumentMetadata;
|
||||
type Input = E::Input;
|
||||
|
||||
type Error = Infallible;
|
||||
|
||||
fn extractor_id(&self) -> u8 {
|
||||
self.0.extractor_id()
|
||||
}
|
||||
|
||||
fn extract<'a, D: Document<'a> + Debug>(
|
||||
&self,
|
||||
doc: D,
|
||||
meta: &Self::DocumentMetadata,
|
||||
) -> Result<Option<Self::Input>, Self::Error> {
|
||||
Ok(self.0.extract(doc, meta).ok().flatten())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Infallible {}
|
||||
|
||||
impl From<Infallible> for crate::Error {
|
||||
fn from(_: Infallible) -> Self {
|
||||
unreachable!("Infallible values cannot be built")
|
||||
}
|
||||
}
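// In-module sketch: wrapping a fallible extractor so that extraction failures
// degrade to `None` instead of aborting the batch. With `Error = Infallible`,
// the `unwrap` below can never panic. (The trait helper above constructs the
// wrapper the same way.)
fn lossy_extract_sketch<'doc, D: Document<'doc> + Debug>(
    doc: D,
    fragment_extractor: RequestFragmentExtractor<'doc>,
) -> Option<Value> {
    let lossy = IgnoreErrorExtractor(fragment_extractor);
    lossy.extract(doc, &()).unwrap()
}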
|
@ -1,20 +1,17 @@
|
||||
//! Module to manipulate JSON templates.
|
||||
//! Module to manipulate JSON values containing placeholder strings.
|
||||
//!
|
||||
//! This module allows two main operations:
|
||||
//! 1. Render JSON values from a template and a context value.
|
||||
//! 2. Retrieve data from a template and JSON values.
|
||||
|
||||
#![warn(rustdoc::broken_intra_doc_links)]
|
||||
#![warn(missing_docs)]
|
||||
//! 1. Render JSON values from a template value containing placeholders and a value to inject.
|
||||
//! 2. Extract data from a template value containing placeholders and a concrete JSON value that fits the template value.
|
||||
|
||||
use serde::Deserialize;
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
type ValuePath = Vec<PathComponent>;
|
||||
use super::{format_value, inject_value, path_with_root, PathComponent, ValuePath};
|
||||
|
||||
/// Encapsulates a JSON template and allows injecting and extracting values from it.
|
||||
#[derive(Debug)]
|
||||
pub struct ValueTemplate {
|
||||
pub struct InjectableValue {
|
||||
template: Value,
|
||||
value_kind: ValueKind,
|
||||
}
|
||||
@ -32,34 +29,13 @@ struct ArrayPath {
|
||||
value_path_in_array: ValuePath,
|
||||
}
|
||||
|
||||
/// Component of a path to a Value
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum PathComponent {
|
||||
/// A key inside of an object
|
||||
MapKey(String),
|
||||
/// An index inside of an array
|
||||
ArrayIndex(usize),
|
||||
}
|
||||
|
||||
impl PartialEq for PathComponent {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
match (self, other) {
|
||||
(Self::MapKey(l0), Self::MapKey(r0)) => l0 == r0,
|
||||
(Self::ArrayIndex(l0), Self::ArrayIndex(r0)) => l0 == r0,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for PathComponent {}
|
||||
|
||||
/// Error that occurs when no few value was provided to a template for injection.
|
||||
/// Error that occurs when no value was provided to a template for injection.
|
||||
#[derive(Debug)]
|
||||
pub struct MissingValue;
|
||||
|
||||
/// Error that occurs when trying to parse a template in [`ValueTemplate::new`]
|
||||
/// Error that occurs when trying to parse a template in [`InjectableValue::new`]
|
||||
#[derive(Debug)]
|
||||
pub enum TemplateParsingError {
|
||||
pub enum InjectableParsingError {
|
||||
/// A repeat string appears inside a repeated value
|
||||
NestedRepeatString(ValuePath),
|
||||
/// A repeat string appears outside of an array
|
||||
@ -85,42 +61,42 @@ pub enum TemplateParsingError {
|
||||
},
|
||||
}
|
||||
|
||||
impl TemplateParsingError {
|
||||
impl InjectableParsingError {
|
||||
/// Produce an error message from the error kind, the name of the root object, the placeholder string and the repeat string
|
||||
pub fn error_message(&self, root: &str, placeholder: &str, repeat: &str) -> String {
|
||||
match self {
|
||||
TemplateParsingError::NestedRepeatString(path) => {
|
||||
InjectableParsingError::NestedRepeatString(path) => {
|
||||
format!(
|
||||
r#"in {}: "{repeat}" appears nested inside of a value that is itself repeated"#,
|
||||
path_with_root(root, path)
|
||||
)
|
||||
}
|
||||
TemplateParsingError::RepeatStringNotInArray(path) => format!(
|
||||
InjectableParsingError::RepeatStringNotInArray(path) => format!(
|
||||
r#"in {}: "{repeat}" appears outside of an array"#,
|
||||
path_with_root(root, path)
|
||||
),
|
||||
TemplateParsingError::BadIndexForRepeatString(path, index) => format!(
|
||||
InjectableParsingError::BadIndexForRepeatString(path, index) => format!(
|
||||
r#"in {}: "{repeat}" expected at position #1, but found at position #{index}"#,
|
||||
path_with_root(root, path)
|
||||
),
|
||||
TemplateParsingError::MissingPlaceholderInRepeatedValue(path) => format!(
|
||||
InjectableParsingError::MissingPlaceholderInRepeatedValue(path) => format!(
|
||||
r#"in {}: Expected "{placeholder}" inside of the repeated value"#,
|
||||
path_with_root(root, path)
|
||||
),
|
||||
TemplateParsingError::MultipleRepeatString(current, previous) => format!(
|
||||
InjectableParsingError::MultipleRepeatString(current, previous) => format!(
|
||||
r#"in {}: Found "{repeat}", but it was already present in {}"#,
|
||||
path_with_root(root, current),
|
||||
path_with_root(root, previous)
|
||||
),
|
||||
TemplateParsingError::MultiplePlaceholderString(current, previous) => format!(
|
||||
InjectableParsingError::MultiplePlaceholderString(current, previous) => format!(
|
||||
r#"in {}: Found "{placeholder}", but it was already present in {}"#,
|
||||
path_with_root(root, current),
|
||||
path_with_root(root, previous)
|
||||
),
|
||||
TemplateParsingError::MissingPlaceholderString => {
|
||||
InjectableParsingError::MissingPlaceholderString => {
|
||||
format!(r#"in `{root}`: "{placeholder}" not found"#)
|
||||
}
|
||||
TemplateParsingError::BothArrayAndSingle {
|
||||
InjectableParsingError::BothArrayAndSingle {
|
||||
single_path,
|
||||
path_to_array,
|
||||
array_to_placeholder,
|
||||
@ -140,41 +116,41 @@ impl TemplateParsingError {
|
||||
|
||||
fn prepend_path(self, mut prepended_path: ValuePath) -> Self {
|
||||
match self {
|
||||
TemplateParsingError::NestedRepeatString(mut path) => {
|
||||
InjectableParsingError::NestedRepeatString(mut path) => {
|
||||
prepended_path.append(&mut path);
|
||||
TemplateParsingError::NestedRepeatString(prepended_path)
|
||||
InjectableParsingError::NestedRepeatString(prepended_path)
|
||||
}
|
||||
TemplateParsingError::RepeatStringNotInArray(mut path) => {
|
||||
InjectableParsingError::RepeatStringNotInArray(mut path) => {
|
||||
prepended_path.append(&mut path);
|
||||
TemplateParsingError::RepeatStringNotInArray(prepended_path)
|
||||
InjectableParsingError::RepeatStringNotInArray(prepended_path)
|
||||
}
|
||||
TemplateParsingError::BadIndexForRepeatString(mut path, index) => {
|
||||
InjectableParsingError::BadIndexForRepeatString(mut path, index) => {
|
||||
prepended_path.append(&mut path);
|
||||
TemplateParsingError::BadIndexForRepeatString(prepended_path, index)
|
||||
InjectableParsingError::BadIndexForRepeatString(prepended_path, index)
|
||||
}
|
||||
TemplateParsingError::MissingPlaceholderInRepeatedValue(mut path) => {
|
||||
InjectableParsingError::MissingPlaceholderInRepeatedValue(mut path) => {
|
||||
prepended_path.append(&mut path);
|
||||
TemplateParsingError::MissingPlaceholderInRepeatedValue(prepended_path)
|
||||
InjectableParsingError::MissingPlaceholderInRepeatedValue(prepended_path)
|
||||
}
|
||||
TemplateParsingError::MultipleRepeatString(mut path, older_path) => {
|
||||
InjectableParsingError::MultipleRepeatString(mut path, older_path) => {
|
||||
let older_prepended_path =
|
||||
prepended_path.iter().cloned().chain(older_path).collect();
|
||||
prepended_path.append(&mut path);
|
||||
TemplateParsingError::MultipleRepeatString(prepended_path, older_prepended_path)
|
||||
InjectableParsingError::MultipleRepeatString(prepended_path, older_prepended_path)
|
||||
}
|
||||
TemplateParsingError::MultiplePlaceholderString(mut path, older_path) => {
|
||||
InjectableParsingError::MultiplePlaceholderString(mut path, older_path) => {
|
||||
let older_prepended_path =
|
||||
prepended_path.iter().cloned().chain(older_path).collect();
|
||||
prepended_path.append(&mut path);
|
||||
TemplateParsingError::MultiplePlaceholderString(
|
||||
InjectableParsingError::MultiplePlaceholderString(
|
||||
prepended_path,
|
||||
older_prepended_path,
|
||||
)
|
||||
}
|
||||
TemplateParsingError::MissingPlaceholderString => {
|
||||
TemplateParsingError::MissingPlaceholderString
|
||||
InjectableParsingError::MissingPlaceholderString => {
|
||||
InjectableParsingError::MissingPlaceholderString
|
||||
}
|
||||
TemplateParsingError::BothArrayAndSingle {
|
||||
InjectableParsingError::BothArrayAndSingle {
|
||||
single_path,
|
||||
mut path_to_array,
|
||||
array_to_placeholder,
|
||||
@ -184,7 +160,7 @@ impl TemplateParsingError {
|
||||
prepended_path.iter().cloned().chain(single_path).collect();
|
||||
prepended_path.append(&mut path_to_array);
|
||||
// we don't prepend the array_to_placeholder path as it is the array path that is prepended
|
||||
TemplateParsingError::BothArrayAndSingle {
|
||||
InjectableParsingError::BothArrayAndSingle {
|
||||
single_path: single_prepended_path,
|
||||
path_to_array: prepended_path,
|
||||
array_to_placeholder,
|
||||
@ -194,7 +170,7 @@ impl TemplateParsingError {
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that occurs when [`ValueTemplate::extract`] fails.
|
||||
/// Error that occurs when [`InjectableValue::extract`] fails.
|
||||
#[derive(Debug)]
|
||||
pub struct ExtractionError {
|
||||
/// The cause of the failure
|
||||
@ -336,27 +312,6 @@ enum LastNamedObject<'a> {
|
||||
NestedArrayInsideObject { object_name: &'a str, index: usize, nesting_level: usize },
|
||||
}
|
||||
|
||||
/// Builds a string representation of a path, prepending the name of the root value.
|
||||
pub fn path_with_root<'a>(
|
||||
root: &str,
|
||||
path: impl IntoIterator<Item = &'a PathComponent> + 'a,
|
||||
) -> String {
|
||||
use std::fmt::Write as _;
|
||||
let mut res = format!("`{root}");
|
||||
for component in path.into_iter() {
|
||||
match component {
|
||||
PathComponent::MapKey(key) => {
|
||||
let _ = write!(&mut res, ".{key}");
|
||||
}
|
||||
PathComponent::ArrayIndex(index) => {
|
||||
let _ = write!(&mut res, "[{index}]");
|
||||
}
|
||||
}
|
||||
}
|
||||
res.push('`');
|
||||
res
|
||||
}
|
||||
|
||||
/// Context where an extraction failure happened
|
||||
///
|
||||
/// The operation that failed
|
||||
@ -405,7 +360,7 @@ enum ArrayParsingContext<'a> {
|
||||
NotNested(&'a mut Option<ArrayPath>),
|
||||
}
|
||||
|
||||
impl ValueTemplate {
|
||||
impl InjectableValue {
|
||||
/// Prepare a template for injection or extraction.
|
||||
///
|
||||
/// # Parameters
|
||||
@ -419,12 +374,12 @@ impl ValueTemplate {
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// - [`TemplateParsingError`]: refer to the documentation of this type
|
||||
/// - [`InjectableParsingError`]: refer to the documentation of this type
|
||||
pub fn new(
|
||||
template: Value,
|
||||
placeholder_string: &str,
|
||||
repeat_string: &str,
|
||||
) -> Result<Self, TemplateParsingError> {
|
||||
) -> Result<Self, InjectableParsingError> {
|
||||
let mut value_path = None;
|
||||
let mut array_path = None;
|
||||
let mut current_path = Vec::new();
|
||||
@ -438,11 +393,11 @@ impl ValueTemplate {
|
||||
)?;
|
||||
|
||||
let value_kind = match (array_path, value_path) {
|
||||
(None, None) => return Err(TemplateParsingError::MissingPlaceholderString),
|
||||
(None, None) => return Err(InjectableParsingError::MissingPlaceholderString),
|
||||
(None, Some(value_path)) => ValueKind::Single(value_path),
|
||||
(Some(array_path), None) => ValueKind::Array(array_path),
|
||||
(Some(array_path), Some(value_path)) => {
|
||||
return Err(TemplateParsingError::BothArrayAndSingle {
|
||||
return Err(InjectableParsingError::BothArrayAndSingle {
|
||||
single_path: value_path,
|
||||
path_to_array: array_path.path_to_array,
|
||||
array_to_placeholder: array_path.value_path_in_array,
|
||||
@ -564,29 +519,29 @@ impl ValueTemplate {
|
||||
value_path: &mut Option<ValuePath>,
|
||||
mut array_path: &mut ArrayParsingContext,
|
||||
current_path: &mut ValuePath,
|
||||
) -> Result<(), TemplateParsingError> {
|
||||
) -> Result<(), InjectableParsingError> {
|
||||
// Two modes for parsing the array.
|
||||
match array {
|
||||
// 1. array contains a repeat string in second position
|
||||
[first, second, rest @ ..] if second == repeat_string => {
|
||||
let ArrayParsingContext::NotNested(array_path) = &mut array_path else {
|
||||
return Err(TemplateParsingError::NestedRepeatString(current_path.clone()));
|
||||
return Err(InjectableParsingError::NestedRepeatString(current_path.clone()));
|
||||
};
|
||||
if let Some(array_path) = array_path {
|
||||
return Err(TemplateParsingError::MultipleRepeatString(
|
||||
return Err(InjectableParsingError::MultipleRepeatString(
|
||||
current_path.clone(),
|
||||
array_path.path_to_array.clone(),
|
||||
));
|
||||
}
|
||||
if first == repeat_string {
|
||||
return Err(TemplateParsingError::BadIndexForRepeatString(
|
||||
return Err(InjectableParsingError::BadIndexForRepeatString(
|
||||
current_path.clone(),
|
||||
0,
|
||||
));
|
||||
}
|
||||
if let Some(position) = rest.iter().position(|value| value == repeat_string) {
|
||||
let position = position + 2;
|
||||
return Err(TemplateParsingError::BadIndexForRepeatString(
|
||||
return Err(InjectableParsingError::BadIndexForRepeatString(
|
||||
current_path.clone(),
|
||||
position,
|
||||
));
|
||||
@ -609,7 +564,9 @@ impl ValueTemplate {
|
||||
value_path.ok_or_else(|| {
|
||||
let mut repeated_value_path = current_path.clone();
|
||||
repeated_value_path.push(PathComponent::ArrayIndex(0));
|
||||
TemplateParsingError::MissingPlaceholderInRepeatedValue(repeated_value_path)
|
||||
InjectableParsingError::MissingPlaceholderInRepeatedValue(
|
||||
repeated_value_path,
|
||||
)
|
||||
})?
|
||||
};
|
||||
**array_path = Some(ArrayPath {
|
||||
@ -621,7 +578,7 @@ impl ValueTemplate {
|
||||
// 2. array does not contain a repeat string
|
||||
array => {
|
||||
if let Some(position) = array.iter().position(|value| value == repeat_string) {
|
||||
return Err(TemplateParsingError::BadIndexForRepeatString(
|
||||
return Err(InjectableParsingError::BadIndexForRepeatString(
|
||||
current_path.clone(),
|
||||
position,
|
||||
));
|
||||
@ -650,7 +607,7 @@ impl ValueTemplate {
|
||||
value_path: &mut Option<ValuePath>,
|
||||
array_path: &mut ArrayParsingContext,
|
||||
current_path: &mut ValuePath,
|
||||
) -> Result<(), TemplateParsingError> {
|
||||
) -> Result<(), InjectableParsingError> {
|
||||
for (key, value) in object.iter() {
|
||||
current_path.push(PathComponent::MapKey(key.to_owned()));
|
||||
Self::parse_value(
|
||||
@ -673,12 +630,12 @@ impl ValueTemplate {
|
||||
value_path: &mut Option<ValuePath>,
|
||||
array_path: &mut ArrayParsingContext,
|
||||
current_path: &mut ValuePath,
|
||||
) -> Result<(), TemplateParsingError> {
|
||||
) -> Result<(), InjectableParsingError> {
|
||||
match value {
|
||||
Value::String(str) => {
|
||||
if placeholder_string == str {
|
||||
if let Some(value_path) = value_path {
|
||||
return Err(TemplateParsingError::MultiplePlaceholderString(
|
||||
return Err(InjectableParsingError::MultiplePlaceholderString(
|
||||
current_path.clone(),
|
||||
value_path.clone(),
|
||||
));
|
||||
@ -687,7 +644,9 @@ impl ValueTemplate {
|
||||
*value_path = Some(current_path.clone());
|
||||
}
|
||||
if repeat_string == str {
|
||||
return Err(TemplateParsingError::RepeatStringNotInArray(current_path.clone()));
|
||||
return Err(InjectableParsingError::RepeatStringNotInArray(
|
||||
current_path.clone(),
|
||||
));
|
||||
}
|
||||
}
|
||||
Value::Null | Value::Bool(_) | Value::Number(_) => {}
|
||||
@ -712,27 +671,6 @@ impl ValueTemplate {
|
||||
}
|
||||
}
|
||||
|
||||
fn inject_value(rendered: &mut Value, injection_path: &Vec<PathComponent>, injected_value: Value) {
|
||||
let mut current_value = rendered;
|
||||
for injection_component in injection_path {
|
||||
current_value = match injection_component {
|
||||
PathComponent::MapKey(key) => current_value.get_mut(key).unwrap(),
|
||||
PathComponent::ArrayIndex(index) => current_value.get_mut(index).unwrap(),
|
||||
}
|
||||
}
|
||||
*current_value = injected_value;
|
||||
}
|
||||
|
||||
fn format_value(value: &Value) -> String {
|
||||
match value {
|
||||
Value::Array(array) => format!("an array of size {}", array.len()),
|
||||
Value::Object(object) => {
|
||||
format!("an object with {} field(s)", object.len())
|
||||
}
|
||||
value => value.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_value<T>(
|
||||
extraction_path: &[PathComponent],
|
||||
initial_value: &mut Value,
|
||||
@ -838,10 +776,10 @@ impl<T> ExtractionResultErrorContext<T> for Result<T, ExtractionErrorKind> {
|
||||
mod test {
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use super::{PathComponent, TemplateParsingError, ValueTemplate};
|
||||
use super::{InjectableParsingError, InjectableValue, PathComponent};
|
||||
|
||||
fn new_template(template: Value) -> Result<ValueTemplate, TemplateParsingError> {
|
||||
ValueTemplate::new(template, "{{text}}", "{{..}}")
|
||||
fn new_template(template: Value) -> Result<InjectableValue, InjectableParsingError> {
|
||||
InjectableValue::new(template, "{{text}}", "{{..}}")
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -853,7 +791,7 @@ mod test {
|
||||
});
|
||||
|
||||
let error = new_template(template.clone()).unwrap_err();
|
||||
assert!(matches!(error, TemplateParsingError::MissingPlaceholderString))
|
||||
assert!(matches!(error, InjectableParsingError::MissingPlaceholderString))
|
||||
}
|
||||
|
||||
#[test]
|
||||
@ -887,7 +825,7 @@ mod test {
|
||||
});
|
||||
|
||||
match new_template(template.clone()) {
|
||||
Err(TemplateParsingError::MultiplePlaceholderString(left, right)) => {
|
||||
Err(InjectableParsingError::MultiplePlaceholderString(left, right)) => {
|
||||
assert_eq!(
|
||||
left,
|
||||
vec![PathComponent::MapKey("titi".into()), PathComponent::ArrayIndex(3)]
|
crates/milli/src/vector/json_template/mod.rs (new file, 282 lines)
@ -0,0 +1,282 @@
|
||||
//! Exposes types to manipulate JSON values
|
||||
//!
|
||||
//! - [`JsonTemplate`]: renders JSON values by rendering its strings as [`Template`]s.
|
||||
//! - [`InjectableValue`]: Describes a JSON value containing placeholders,
|
||||
//! then allows injecting values in place of the placeholder to produce new concrete JSON values,
|
||||
//! or extracting sub-values at the placeholder location from concrete JSON values.
|
||||
//!
|
||||
//! The module also exposes foundational types to work with JSON paths:
|
||||
//!
|
||||
//! - [`ValuePath`] is made of [`PathComponent`]s to indicate the location of a sub-value inside of a JSON value.
|
||||
//! - [`inject_value`] is a primitive that replaces the sub-value at the described location by an injected value.
|
||||
|
||||
#![warn(rustdoc::broken_intra_doc_links)]
|
||||
#![warn(missing_docs)]
|
||||
|
||||
use bumpalo::Bump;
|
||||
use liquid::{Parser, Template};
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
use crate::prompt::ParseableDocument;
|
||||
use crate::update::new::document::Document;
|
||||
|
||||
mod injectable_value;
|
||||
|
||||
pub use injectable_value::InjectableValue;
|
||||
|
||||
/// Represents a JSON [`Value`] where each string is rendered as a [`Template`].
|
||||
#[derive(Debug)]
|
||||
pub struct JsonTemplate {
|
||||
value: Value,
|
||||
templates: Vec<TemplateAtPath>,
|
||||
}
|
||||
|
||||
impl Clone for JsonTemplate {
|
||||
fn clone(&self) -> Self {
|
||||
Self::new(self.value.clone()).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
struct TemplateAtPath {
|
||||
template: Template,
|
||||
path: ValuePath,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for TemplateAtPath {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("TemplateAtPath")
|
||||
.field("template", &&"template")
|
||||
.field("path", &self.path)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that can occur either when parsing the templates in the value, or when trying to render them.
|
||||
#[derive(Debug)]
|
||||
pub struct Error {
|
||||
template_error: liquid::Error,
|
||||
path: ValuePath,
|
||||
}
|
||||
|
||||
impl Error {
|
||||
/// Produces an error message when the error happened at rendering time.
|
||||
pub fn rendering_error(&self, root: &str) -> String {
|
||||
format!(
|
||||
"in `{}`, error while rendering template: {}",
|
||||
path_with_root(root, self.path.iter()),
|
||||
&self.template_error
|
||||
)
|
||||
}
|
||||
|
||||
/// Produces an error message when the error happened at parsing time.
|
||||
pub fn parsing(&self, root: &str) -> String {
|
||||
format!(
|
||||
"in `{}`, error while parsing template: {}",
|
||||
path_with_root(root, self.path.iter()),
|
||||
&self.template_error
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl JsonTemplate {
|
||||
/// Creates a new `JsonTemplate` by parsing all strings inside the value as templates.
|
||||
///
|
||||
/// # Error
|
||||
///
|
||||
/// - If any of the strings contains a template that cannot be parsed.
|
||||
pub fn new(value: Value) -> Result<Self, Error> {
|
||||
let templates = build_templates(&value)?;
|
||||
Ok(Self { value, templates })
|
||||
}
|
||||
|
||||
/// Renders this value by replacing all its strings with the rendered version of the template they represent from the given context.
|
||||
///
|
||||
/// # Error
|
||||
///
|
||||
/// - If any of the strings contains a template that cannot be rendered with the given context.
|
||||
pub fn render(&self, context: &dyn liquid::ObjectView) -> Result<Value, Error> {
|
||||
let mut rendered = self.value.clone();
|
||||
for TemplateAtPath { template, path } in &self.templates {
|
||||
let injected_value =
|
||||
template.render(context).map_err(|err| error_with_path(err, path.clone()))?;
|
||||
inject_value(&mut rendered, path, Value::String(injected_value));
|
||||
}
|
||||
Ok(rendered)
|
||||
}
|
||||
|
||||
/// Renders this value by replacing all its strings with the rendered version of the template they represent from the contents of the given document.
|
||||
///
|
||||
/// # Error
|
||||
///
|
||||
/// - If any of the strings contains a template that cannot be rendered with the given document.
|
||||
pub fn render_document<'a, 'doc, D: Document<'a> + std::fmt::Debug>(
|
||||
&self,
|
||||
document: D,
|
||||
doc_alloc: &'doc Bump,
|
||||
) -> Result<Value, Error> {
|
||||
let document = ParseableDocument::new(document, doc_alloc);
|
||||
let context = crate::prompt::Context::without_fields(&document);
|
||||
self.render(&context)
|
||||
}
|
||||
|
||||
/// Renders this value by replacing all its strings with the rendered version of the template they represent from the contents of the search query.
|
||||
///
|
||||
/// # Error
|
||||
///
|
||||
/// - If any of the strings contains a template that cannot be rendered from the contents of the search query
|
||||
pub fn render_search(&self, q: Option<&str>, media: Option<&Value>) -> Result<Value, Error> {
|
||||
let search_data = match (q, media) {
|
||||
(None, None) => liquid::object!({}),
|
||||
(None, Some(media)) => liquid::object!({ "media": media }),
|
||||
(Some(q), None) => liquid::object!({"q": q}),
|
||||
(Some(q), Some(media)) => liquid::object!({"q": q, "media": media}),
|
||||
};
|
||||
self.render(&search_data)
|
||||
}
|
||||
|
||||
/// The JSON value representing the underlying template
|
||||
pub fn template(&self) -> &Value {
|
||||
&self.value
|
||||
}
|
||||
}
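// Illustrative sketch (field names are hypothetical): every string in the
// value is parsed as a liquid template, so placeholders and constants mix freely.
fn render_search_sketch() {
    let template = JsonTemplate::new(serde_json::json!({
        "input": "{{ q }}",
        "model": "example-model"
    }))
    .unwrap();
    let rendered = template.render_search(Some("hello"), None).unwrap();
    assert_eq!(rendered["input"], "hello");
    assert_eq!(rendered["model"], "example-model");
}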
|
||||
|
||||
fn build_templates(value: &Value) -> Result<Vec<TemplateAtPath>, Error> {
|
||||
let mut current_path = ValuePath::new();
|
||||
let mut templates = Vec::new();
|
||||
let compiler = liquid::ParserBuilder::with_stdlib().build().unwrap();
|
||||
parse_value(value, &mut current_path, &mut templates, &compiler)?;
|
||||
Ok(templates)
|
||||
}
|
||||
|
||||
fn error_with_path(template_error: liquid::Error, path: ValuePath) -> Error {
|
||||
Error { template_error, path }
|
||||
}
|
||||
|
||||
fn parse_value(
|
||||
value: &Value,
|
||||
current_path: &mut ValuePath,
|
||||
templates: &mut Vec<TemplateAtPath>,
|
||||
compiler: &Parser,
|
||||
) -> Result<(), Error> {
|
||||
match value {
|
||||
Value::String(template) => {
|
||||
let template = compiler
|
||||
.parse(template)
|
||||
.map_err(|err| error_with_path(err, current_path.clone()))?;
|
||||
templates.push(TemplateAtPath { template, path: current_path.clone() });
|
||||
}
|
||||
Value::Array(values) => {
|
||||
parse_array(values, current_path, templates, compiler)?;
|
||||
}
|
||||
Value::Object(map) => {
|
||||
parse_object(map, current_path, templates, compiler)?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_object(
|
||||
map: &Map<String, Value>,
|
||||
current_path: &mut ValuePath,
|
||||
templates: &mut Vec<TemplateAtPath>,
|
||||
compiler: &Parser,
|
||||
) -> Result<(), Error> {
|
||||
for (key, value) in map {
|
||||
current_path.push(PathComponent::MapKey(key.clone()));
|
||||
parse_value(value, current_path, templates, compiler)?;
|
||||
current_path.pop();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn parse_array(
|
||||
values: &[Value],
|
||||
current_path: &mut ValuePath,
|
||||
templates: &mut Vec<TemplateAtPath>,
|
||||
compiler: &Parser,
|
||||
) -> Result<(), Error> {
|
||||
for (index, value) in values.iter().enumerate() {
|
||||
current_path.push(PathComponent::ArrayIndex(index));
|
||||
parse_value(value, current_path, templates, compiler)?;
|
||||
current_path.pop();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// A list of [`PathComponent`]s describing a path to a value inside a JSON value.
|
||||
///
|
||||
/// The empty list refers to the root value.
|
||||
pub type ValuePath = Vec<PathComponent>;
|
||||
|
||||
/// Component of a path to a Value
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum PathComponent {
|
||||
/// A key inside of an object
|
||||
MapKey(String),
|
||||
/// An index inside of an array
|
||||
ArrayIndex(usize),
|
||||
}
|
||||
|
||||
impl PartialEq for PathComponent {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
match (self, other) {
|
||||
(Self::MapKey(l0), Self::MapKey(r0)) => l0 == r0,
|
||||
(Self::ArrayIndex(l0), Self::ArrayIndex(r0)) => l0 == r0,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Eq for PathComponent {}
|
||||
|
||||
/// Builds a string representation of a path, prepending the name of the root value.
|
||||
pub fn path_with_root<'a>(
|
||||
root: &str,
|
||||
path: impl IntoIterator<Item = &'a PathComponent> + 'a,
|
||||
) -> String {
|
||||
use std::fmt::Write as _;
|
||||
let mut res = format!("`{root}");
|
||||
for component in path.into_iter() {
|
||||
match component {
|
||||
PathComponent::MapKey(key) => {
|
||||
let _ = write!(&mut res, ".{key}");
|
||||
}
|
||||
PathComponent::ArrayIndex(index) => {
|
||||
let _ = write!(&mut res, "[{index}]");
|
||||
}
|
||||
}
|
||||
}
|
||||
res.push('`');
|
||||
res
|
||||
}
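// Sketch of the rendered form: map keys use dot notation, array indexes use
// brackets, and the whole path is wrapped in backticks.
fn path_rendering_sketch() {
    let path = vec![PathComponent::MapKey("context".into()), PathComponent::ArrayIndex(0)];
    assert_eq!(path_with_root("doc", path.iter()), "`doc.context[0]`");
}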
|
||||
|
||||
/// Modifies `rendered` to replace the sub-value at the `injection_path` location by the `injected_value`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if the provided `injection_path` cannot be traversed in `rendered`.
|
||||
pub fn inject_value(
|
||||
rendered: &mut Value,
|
||||
injection_path: &Vec<PathComponent>,
|
||||
injected_value: Value,
|
||||
) {
|
||||
let mut current_value = rendered;
|
||||
for injection_component in injection_path {
|
||||
current_value = match injection_component {
|
||||
PathComponent::MapKey(key) => current_value.get_mut(key).unwrap(),
|
||||
PathComponent::ArrayIndex(index) => current_value.get_mut(index).unwrap(),
|
||||
}
|
||||
}
|
||||
*current_value = injected_value;
|
||||
}
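// Sketch: replacing the sub-value at `.data[1]` inside a concrete JSON value.
fn inject_sketch() {
    let mut rendered = serde_json::json!({ "data": ["keep", "{{text}}"] });
    let path = vec![PathComponent::MapKey("data".into()), PathComponent::ArrayIndex(1)];
    inject_value(&mut rendered, &path, serde_json::json!("hello"));
    assert_eq!(rendered, serde_json::json!({ "data": ["keep", "hello"] }));
}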
|
||||
|
||||
fn format_value(value: &Value) -> String {
|
||||
match value {
|
||||
Value::Array(array) => format!("an array of size {}", array.len()),
|
||||
Value::Object(object) => {
|
||||
format!("an object with {} field(s)", object.len())
|
||||
}
|
||||
value => value.to_string(),
|
||||
}
|
||||
}
|
@ -13,17 +13,22 @@ use serde::{Deserialize, Serialize};
|
||||
use utoipa::ToSchema;
|
||||
|
||||
use self::error::{EmbedError, NewEmbedderError};
|
||||
use crate::progress::Progress;
|
||||
use crate::progress::{EmbedderStats, Progress};
|
||||
use crate::prompt::{Prompt, PromptData};
|
||||
use crate::vector::composite::SubEmbedderOptions;
|
||||
use crate::vector::json_template::JsonTemplate;
|
||||
use crate::ThreadPoolNoAbort;
|
||||
|
||||
pub mod composite;
|
||||
pub mod db;
|
||||
pub mod error;
|
||||
pub mod extractor;
|
||||
pub mod hf;
|
||||
pub mod json_template;
|
||||
pub mod manual;
|
||||
pub mod openai;
|
||||
pub mod parsed_vectors;
|
||||
pub mod session;
|
||||
pub mod settings;
|
||||
|
||||
pub mod ollama;
|
||||
@ -60,7 +65,7 @@ impl ArroyWrapper {
|
||||
rtxn: &'a RoTxn<'a>,
|
||||
db: arroy::Database<D>,
|
||||
) -> impl Iterator<Item = Result<arroy::Reader<'a, D>, arroy::Error>> + 'a {
|
||||
arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| {
|
||||
arroy_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
|
||||
match arroy::Reader::open(rtxn, index, db) {
|
||||
Ok(reader) => match reader.is_empty(rtxn) {
|
||||
Ok(false) => Some(Ok(reader)),
|
||||
@ -73,12 +78,57 @@ impl ArroyWrapper {
|
||||
})
|
||||
}
|
||||
|
||||
pub fn dimensions(&self, rtxn: &RoTxn) -> Result<usize, arroy::Error> {
|
||||
let first_id = arroy_db_range_for_embedder(self.embedder_index).next().unwrap();
|
||||
/// The item ids that are present in the store specified by its id.
|
||||
///
|
||||
/// The ids are accessed via a lambda to avoid lifetime shenanigans.
|
||||
pub fn items_in_store<F, O>(
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
store_id: u8,
|
||||
with_items: F,
|
||||
) -> Result<O, arroy::Error>
|
||||
where
|
||||
F: FnOnce(&RoaringBitmap) -> O,
|
||||
{
|
||||
if self.quantized {
|
||||
Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions())
|
||||
self._items_in_store(rtxn, self.quantized_db(), store_id, with_items)
|
||||
} else {
|
||||
Ok(arroy::Reader::open(rtxn, first_id, self.angular_db())?.dimensions())
|
||||
self._items_in_store(rtxn, self.angular_db(), store_id, with_items)
|
||||
}
|
||||
}
|
||||
|
||||
fn _items_in_store<D: arroy::Distance, F, O>(
|
||||
&self,
|
||||
rtxn: &RoTxn,
|
||||
db: arroy::Database<D>,
|
||||
store_id: u8,
|
||||
with_items: F,
|
||||
) -> Result<O, arroy::Error>
|
||||
where
|
||||
F: FnOnce(&RoaringBitmap) -> O,
|
||||
{
|
||||
let index = arroy_store_for_embedder(self.embedder_index, store_id);
|
||||
let reader = arroy::Reader::open(rtxn, index, db);
|
||||
match reader {
|
||||
Ok(reader) => Ok(with_items(reader.item_ids())),
|
||||
Err(arroy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())),
|
||||
Err(err) => Err(err),
|
||||
}
|
||||
}
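// Illustrative sketch (not part of the diff): counting the documents embedded
// in fragment store 0 of this embedder; the closure only borrows the bitmap,
// so nothing escapes the read transaction.
fn count_items_in_first_store(
    wrapper: &ArroyWrapper,
    rtxn: &RoTxn,
) -> Result<u64, arroy::Error> {
    wrapper.items_in_store(rtxn, 0, |items| items.len())
}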
|
||||
|
||||
pub fn dimensions(&self, rtxn: &RoTxn) -> Result<Option<usize>, arroy::Error> {
|
||||
if self.quantized {
|
||||
Ok(self
|
||||
.readers(rtxn, self.quantized_db())
|
||||
.next()
|
||||
.transpose()?
|
||||
.map(|reader| reader.dimensions()))
|
||||
} else {
|
||||
Ok(self
|
||||
.readers(rtxn, self.angular_db())
|
||||
.next()
|
||||
.transpose()?
|
||||
.map(|reader| reader.dimensions()))
|
||||
}
|
||||
}
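// Thin sketch of the new contract: the dimension comes from the first
// non-empty store, and a fresh embedder with no vectors yields `None`
// instead of an error.
fn has_vectors(wrapper: &ArroyWrapper, rtxn: &RoTxn) -> Result<bool, arroy::Error> {
    Ok(wrapper.dimensions(rtxn)?.is_some())
}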
|
||||
|
||||
@ -93,13 +143,13 @@ impl ArroyWrapper {
|
||||
arroy_memory: Option<usize>,
|
||||
cancel: &(impl Fn() -> bool + Sync + Send),
|
||||
) -> Result<(), arroy::Error> {
|
||||
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
||||
if self.quantized {
|
||||
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
||||
if writer.need_build(wtxn)? {
|
||||
writer.builder(rng).build(wtxn)?
|
||||
} else if writer.is_empty(wtxn)? {
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
||||
@ -124,7 +174,7 @@ impl ArroyWrapper {
|
||||
.cancel(cancel)
|
||||
.build(wtxn)?;
|
||||
} else if writer.is_empty(wtxn)? {
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -143,7 +193,7 @@ impl ArroyWrapper {
|
||||
) -> Result<(), arroy::Error> {
|
||||
let dimension = embeddings.dimension();
|
||||
for (index, vector) in
|
||||
arroy_db_range_for_embedder(self.embedder_index).zip(embeddings.iter())
|
||||
arroy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter())
|
||||
{
|
||||
if self.quantized {
|
||||
arroy::Writer::new(self.quantized_db(), index, dimension)
|
||||
@ -179,7 +229,7 @@ impl ArroyWrapper {
|
||||
) -> Result<(), arroy::Error> {
|
||||
let dimension = vector.len();
|
||||
|
||||
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
||||
let writer = arroy::Writer::new(db, index, dimension);
|
||||
if !writer.contains_item(wtxn, item_id)? {
|
||||
writer.add_item(wtxn, item_id, vector)?;
|
||||
@ -189,6 +239,38 @@ impl ArroyWrapper {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Add a vector associated with a document in the store specified by its id.
|
||||
///
|
||||
/// Any existing vector associated with the document in the store will be replaced by the new vector.
|
||||
pub fn add_item_in_store(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
item_id: arroy::ItemId,
|
||||
store_id: u8,
|
||||
vector: &[f32],
|
||||
) -> Result<(), arroy::Error> {
|
||||
if self.quantized {
|
||||
self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector)
|
||||
} else {
|
||||
self._add_item_in_store(wtxn, self.angular_db(), item_id, store_id, vector)
|
||||
}
|
||||
}
|
||||
|
||||
fn _add_item_in_store<D: arroy::Distance>(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
db: arroy::Database<D>,
|
||||
item_id: arroy::ItemId,
|
||||
store_id: u8,
|
||||
vector: &[f32],
|
||||
) -> Result<(), arroy::Error> {
|
||||
let dimension = vector.len();
|
||||
|
||||
let index = arroy_store_for_embedder(self.embedder_index, store_id);
|
||||
let writer = arroy::Writer::new(db, index, dimension);
|
||||
writer.add_item(wtxn, item_id, vector)
|
||||
}
|
||||
|
||||
/// Delete all embeddings from a specific `item_id`
|
||||
pub fn del_items(
|
||||
&self,
|
||||
@ -196,24 +278,84 @@ impl ArroyWrapper {
|
||||
dimension: usize,
|
||||
item_id: arroy::ItemId,
|
||||
) -> Result<(), arroy::Error> {
|
||||
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
||||
if self.quantized {
|
||||
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
||||
if !writer.del_item(wtxn, item_id)? {
|
||||
break;
|
||||
}
|
||||
writer.del_item(wtxn, item_id)?;
|
||||
} else {
|
||||
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
||||
if !writer.del_item(wtxn, item_id)? {
|
||||
break;
|
||||
}
|
||||
writer.del_item(wtxn, item_id)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Delete one item.
|
||||
/// Removes the item specified by its id from the store specified by its id.
|
||||
///
|
||||
/// Returns whether the item was removed.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// - This function will silently fail to remove the item if used against an arroy database that was never built.
|
||||
pub fn del_item_in_store(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
item_id: arroy::ItemId,
|
||||
store_id: u8,
|
||||
dimensions: usize,
|
||||
) -> Result<bool, arroy::Error> {
|
||||
if self.quantized {
|
||||
self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions)
|
||||
} else {
|
||||
self._del_item_in_store(wtxn, self.angular_db(), item_id, store_id, dimensions)
|
||||
}
|
||||
}
|
||||
|
||||
fn _del_item_in_store<D: arroy::Distance>(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
db: arroy::Database<D>,
|
||||
item_id: arroy::ItemId,
|
||||
store_id: u8,
|
||||
dimensions: usize,
|
||||
) -> Result<bool, arroy::Error> {
|
||||
let index = arroy_store_for_embedder(self.embedder_index, store_id);
|
||||
let writer = arroy::Writer::new(db, index, dimensions);
|
||||
writer.del_item(wtxn, item_id)
|
||||
}
|
||||
|
||||
/// Removes all items from the store specified by its id.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// - This function will silently fail to remove the items if used against an arroy database that was never built.
|
||||
pub fn clear_store(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
store_id: u8,
|
||||
dimensions: usize,
|
||||
) -> Result<(), arroy::Error> {
|
||||
if self.quantized {
|
||||
self._clear_store(wtxn, self.quantized_db(), store_id, dimensions)
|
||||
} else {
|
||||
self._clear_store(wtxn, self.angular_db(), store_id, dimensions)
|
||||
}
|
||||
}
|
||||
|
||||
fn _clear_store<D: arroy::Distance>(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
db: arroy::Database<D>,
|
||||
store_id: u8,
|
||||
dimensions: usize,
|
||||
) -> Result<(), arroy::Error> {
|
||||
let index = arroy_store_for_embedder(self.embedder_index, store_id);
|
||||
let writer = arroy::Writer::new(db, index, dimensions);
|
||||
writer.clear(wtxn)
|
||||
}
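// Sketch: when a fragment is removed from the settings, its whole store can be
// dropped in one call; `dimensions` must match the embedder's output size.
fn drop_fragment_store(
    wrapper: &ArroyWrapper,
    wtxn: &mut RwTxn,
    fragment_id: u8,
    dimensions: usize,
) -> Result<(), arroy::Error> {
    wrapper.clear_store(wtxn, fragment_id, dimensions)
}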
|
||||
|
||||
/// Delete one item from its value.
|
||||
pub fn del_item(
|
||||
&self,
|
||||
wtxn: &mut RwTxn,
|
||||
@ -235,54 +377,31 @@ impl ArroyWrapper {
|
||||
vector: &[f32],
|
||||
) -> Result<bool, arroy::Error> {
|
||||
let dimension = vector.len();
|
||||
let mut deleted_index = None;
|
||||
|
||||
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
||||
let writer = arroy::Writer::new(db, index, dimension);
|
||||
let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
|
||||
// uses invariant: vectors are packed in the first writers.
|
||||
break;
|
||||
continue;
|
||||
};
|
||||
if candidate == vector {
|
||||
writer.del_item(wtxn, item_id)?;
|
||||
deleted_index = Some(index);
|
||||
return writer.del_item(wtxn, item_id);
|
||||
}
|
||||
}
|
||||
|
||||
// 🥲 enforce invariant: vectors are packed in the first writers.
|
||||
if let Some(deleted_index) = deleted_index {
|
||||
let mut last_index_with_a_vector = None;
|
||||
for index in
|
||||
arroy_db_range_for_embedder(self.embedder_index).skip(deleted_index as usize)
|
||||
{
|
||||
let writer = arroy::Writer::new(db, index, dimension);
|
||||
let Some(candidate) = writer.item_vector(wtxn, item_id)? else {
|
||||
break;
|
||||
};
|
||||
last_index_with_a_vector = Some((index, candidate));
|
||||
}
|
||||
if let Some((last_index, vector)) = last_index_with_a_vector {
|
||||
let writer = arroy::Writer::new(db, last_index, dimension);
|
||||
writer.del_item(wtxn, item_id)?;
|
||||
let writer = arroy::Writer::new(db, deleted_index, dimension);
|
||||
writer.add_item(wtxn, item_id, &vector)?;
|
||||
}
|
||||
}
|
||||
Ok(deleted_index.is_some())
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> {
|
||||
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
||||
if self.quantized {
|
||||
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
||||
if writer.is_empty(wtxn)? {
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
writer.clear(wtxn)?;
|
||||
} else {
|
||||
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
||||
if writer.is_empty(wtxn)? {
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
writer.clear(wtxn)?;
|
||||
}
|
||||
@ -296,17 +415,17 @@ impl ArroyWrapper {
|
||||
dimension: usize,
|
||||
item: arroy::ItemId,
|
||||
) -> Result<bool, arroy::Error> {
|
||||
for index in arroy_db_range_for_embedder(self.embedder_index) {
|
||||
for index in arroy_store_range_for_embedder(self.embedder_index) {
|
||||
let contains = if self.quantized {
|
||||
let writer = arroy::Writer::new(self.quantized_db(), index, dimension);
|
||||
if writer.is_empty(rtxn)? {
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
writer.contains_item(rtxn, item)?
|
||||
} else {
|
||||
let writer = arroy::Writer::new(self.angular_db(), index, dimension);
|
||||
if writer.is_empty(rtxn)? {
|
||||
break;
|
||||
continue;
|
||||
}
|
||||
writer.contains_item(rtxn, item)?
|
||||
};
|
||||
@ -345,13 +464,14 @@ impl ArroyWrapper {
|
||||
let reader = reader?;
|
||||
let mut searcher = reader.nns(limit);
|
||||
if let Some(filter) = filter {
|
||||
if reader.item_ids().is_disjoint(filter) {
|
||||
continue;
|
||||
}
|
||||
searcher.candidates(filter);
|
||||
}
|
||||
|
||||
if let Some(mut ret) = searcher.by_item(rtxn, item)? {
|
||||
results.append(&mut ret);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
|
||||
@ -386,6 +506,9 @@ impl ArroyWrapper {
|
||||
let reader = reader?;
|
||||
let mut searcher = reader.nns(limit);
|
||||
if let Some(filter) = filter {
|
||||
if reader.item_ids().is_disjoint(filter) {
|
||||
continue;
|
||||
}
|
||||
searcher.candidates(filter);
|
||||
}
|
||||
|
||||
@ -404,16 +527,12 @@ impl ArroyWrapper {
|
||||
for reader in self.readers(rtxn, self.quantized_db()) {
|
||||
if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
|
||||
vectors.push(vec);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for reader in self.readers(rtxn, self.angular_db()) {
|
||||
if let Some(vec) = reader?.item_vector(rtxn, item_id)? {
|
||||
vectors.push(vec);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -465,6 +584,7 @@ pub struct ArroyStats {
|
||||
pub documents: RoaringBitmap,
|
||||
}
|
||||
/// One or multiple embeddings stored consecutively in a flat vector.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct Embeddings<F> {
|
||||
data: Vec<F>,
|
||||
dimension: usize,
|
||||
@ -615,15 +735,43 @@ impl EmbeddingConfig {
|
||||
}
|
||||
}
|
||||
|
||||
/// Map of embedder configurations.
|
||||
///
|
||||
/// Each configuration is mapped to a name.
|
||||
/// Map of runtime embedder data.
|
||||
#[derive(Clone, Default)]
|
||||
pub struct EmbeddingConfigs(HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)>);
|
||||
pub struct RuntimeEmbedders(HashMap<String, Arc<RuntimeEmbedder>>);
|
||||
|
||||
impl EmbeddingConfigs {
|
||||
pub struct RuntimeEmbedder {
|
||||
pub embedder: Arc<Embedder>,
|
||||
pub document_template: Prompt,
|
||||
fragments: Vec<RuntimeFragment>,
|
||||
pub is_quantized: bool,
|
||||
}
|
||||
|
||||
impl RuntimeEmbedder {
|
||||
pub fn new(
|
||||
embedder: Arc<Embedder>,
|
||||
document_template: Prompt,
|
||||
mut fragments: Vec<RuntimeFragment>,
|
||||
is_quantized: bool,
|
||||
) -> Self {
|
||||
fragments.sort_unstable_by(|left, right| left.name.cmp(&right.name));
|
||||
Self { embedder, document_template, fragments, is_quantized }
|
||||
}
|
||||
|
||||
/// The runtime fragments sorted by name.
|
||||
pub fn fragments(&self) -> &[RuntimeFragment] {
|
||||
self.fragments.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RuntimeFragment {
|
||||
pub name: String,
|
||||
pub id: u8,
|
||||
pub template: JsonTemplate,
|
||||
}
|
||||
|
||||
impl RuntimeEmbedders {
|
||||
/// Create the map from its internal components.
|
||||
pub fn new(data: HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)>) -> Self {
|
||||
pub fn new(data: HashMap<String, Arc<RuntimeEmbedder>>) -> Self {
|
||||
Self(data)
|
||||
}
|
||||
|
||||
@ -632,24 +780,31 @@ impl EmbeddingConfigs {
|
||||
}
|
||||
|
||||
/// Get an embedder configuration and template from its name.
|
||||
pub fn get(&self, name: &str) -> Option<(Arc<Embedder>, Arc<Prompt>, bool)> {
|
||||
self.0.get(name).cloned()
|
||||
pub fn get(&self, name: &str) -> Option<&Arc<RuntimeEmbedder>> {
|
||||
self.0.get(name)
|
||||
}
|
||||
|
||||
pub fn inner_as_ref(&self) -> &HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)> {
|
||||
pub fn inner_as_ref(&self) -> &HashMap<String, Arc<RuntimeEmbedder>> {
|
||||
&self.0
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)> {
|
||||
pub fn into_inner(self) -> HashMap<String, Arc<RuntimeEmbedder>> {
|
||||
self.0
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.0.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
}
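// Hedged sketch of the new access pattern ("default" is a hypothetical embedder
// name): callers hold one `Arc<RuntimeEmbedder>` instead of the old
// (embedder, prompt, quantized) tuple, and read its parts as needed.
fn describe_embedder(embedders: &RuntimeEmbedders) {
    if let Some(runtime) = embedders.get("default") {
        let _quantized = runtime.is_quantized;
        let _template = &runtime.document_template;
        for fragment in runtime.fragments() {
            // fragments() returns the fragments sorted by name
            println!("fragment `{}` has id {}", fragment.name, fragment.id);
        }
    }
}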
|
||||
|
||||
impl IntoIterator for EmbeddingConfigs {
|
||||
type Item = (String, (Arc<Embedder>, Arc<Prompt>, bool));
|
||||
impl IntoIterator for RuntimeEmbedders {
|
||||
type Item = (String, Arc<RuntimeEmbedder>);
|
||||
|
||||
type IntoIter =
|
||||
std::collections::hash_map::IntoIter<String, (Arc<Embedder>, Arc<Prompt>, bool)>;
|
||||
type IntoIter = std::collections::hash_map::IntoIter<String, Arc<RuntimeEmbedder>>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.0.into_iter()
|
||||
@ -667,6 +822,27 @@ pub enum EmbedderOptions {
|
||||
Composite(composite::EmbedderOptions),
|
||||
}
|
||||
|
||||
impl EmbedderOptions {
|
||||
pub fn fragment(&self, name: &str) -> Option<&serde_json::Value> {
|
||||
match &self {
|
||||
EmbedderOptions::HuggingFace(_)
|
||||
| EmbedderOptions::OpenAi(_)
|
||||
| EmbedderOptions::Ollama(_)
|
||||
| EmbedderOptions::UserProvided(_) => None,
|
||||
EmbedderOptions::Rest(embedder_options) => {
|
||||
embedder_options.indexing_fragments.get(name)
|
||||
}
|
||||
EmbedderOptions::Composite(embedder_options) => {
|
||||
if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index {
|
||||
embedder_options.indexing_fragments.get(name)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for EmbedderOptions {
|
||||
fn default() -> Self {
|
||||
Self::HuggingFace(Default::default())
|
||||
@ -707,6 +883,17 @@ impl Embedder {
|
||||
|
||||
#[tracing::instrument(level = "debug", skip_all, target = "search")]
|
||||
pub fn embed_search(
|
||||
&self,
|
||||
query: SearchQuery<'_>,
|
||||
deadline: Option<Instant>,
|
||||
) -> std::result::Result<Embedding, EmbedError> {
|
||||
match query {
|
||||
SearchQuery::Text(text) => self.embed_search_text(text, deadline),
|
||||
SearchQuery::Media { q, media } => self.embed_search_media(q, media, deadline),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn embed_search_text(
|
||||
&self,
|
||||
text: &str,
|
||||
deadline: Option<Instant>,
|
||||
@ -719,18 +906,17 @@ impl Embedder {
|
||||
}
|
||||
let embedding = match self {
|
||||
Embedder::HuggingFace(embedder) => embedder.embed_one(text),
|
||||
Embedder::OpenAi(embedder) => {
|
||||
embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding)
|
||||
}
|
||||
Embedder::Ollama(embedder) => {
|
||||
embedder.embed(&[text], deadline)?.pop().ok_or_else(EmbedError::missing_embedding)
|
||||
}
|
||||
Embedder::UserProvided(embedder) => embedder.embed_one(text),
|
||||
Embedder::Rest(embedder) => embedder
|
||||
.embed_ref(&[text], deadline)?
|
||||
Embedder::OpenAi(embedder) => embedder
|
||||
.embed(&[text], deadline, None)?
|
||||
.pop()
|
||||
.ok_or_else(EmbedError::missing_embedding),
|
||||
Embedder::Composite(embedder) => embedder.search.embed_one(text, deadline),
|
||||
Embedder::Ollama(embedder) => embedder
|
||||
.embed(&[text], deadline, None)?
|
||||
.pop()
|
||||
.ok_or_else(EmbedError::missing_embedding),
|
||||
Embedder::UserProvided(embedder) => embedder.embed_one(text),
|
||||
Embedder::Rest(embedder) => embedder.embed_one(SearchQuery::Text(text), deadline, None),
|
||||
Embedder::Composite(embedder) => embedder.search.embed_one(text, deadline, None),
|
||||
}?;
|
||||
|
||||
if let Some(cache) = self.cache() {
|
||||
@ -740,6 +926,18 @@ impl Embedder {
|
||||
Ok(embedding)
|
||||
}
|
||||
|
||||
pub fn embed_search_media(
|
||||
&self,
|
||||
q: Option<&str>,
|
||||
media: Option<&serde_json::Value>,
|
||||
deadline: Option<Instant>,
|
||||
) -> std::result::Result<Embedding, EmbedError> {
|
||||
let Embedder::Rest(embedder) = self else {
|
||||
return Err(EmbedError::rest_media_not_a_rest());
|
||||
};
|
||||
embedder.embed_one(SearchQuery::Media { q, media }, deadline, None)
|
||||
}
|
||||
|
||||
/// Embed multiple chunks of texts.
|
||||
///
|
||||
/// Each chunk is composed of one or multiple texts.
|
||||
@ -747,14 +945,21 @@ impl Embedder {
|
||||
&self,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
embedder_stats: &EmbedderStats,
|
||||
) -> std::result::Result<Vec<Vec<Embedding>>, EmbedError> {
|
||||
match self {
|
||||
Embedder::HuggingFace(embedder) => embedder.embed_index(text_chunks),
|
||||
Embedder::OpenAi(embedder) => embedder.embed_index(text_chunks, threads),
|
||||
Embedder::Ollama(embedder) => embedder.embed_index(text_chunks, threads),
|
||||
Embedder::OpenAi(embedder) => {
|
||||
embedder.embed_index(text_chunks, threads, embedder_stats)
|
||||
}
|
||||
Embedder::Ollama(embedder) => {
|
||||
embedder.embed_index(text_chunks, threads, embedder_stats)
|
||||
}
|
||||
Embedder::UserProvided(embedder) => embedder.embed_index(text_chunks),
|
||||
Embedder::Rest(embedder) => embedder.embed_index(text_chunks, threads),
|
||||
Embedder::Composite(embedder) => embedder.index.embed_index(text_chunks, threads),
|
||||
Embedder::Rest(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats),
|
||||
Embedder::Composite(embedder) => {
|
||||
embedder.index.embed_index(text_chunks, threads, embedder_stats)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -763,14 +968,37 @@ impl Embedder {
|
||||
&self,
|
||||
texts: &[&str],
|
||||
threads: &ThreadPoolNoAbort,
|
||||
embedder_stats: &EmbedderStats,
|
||||
) -> std::result::Result<Vec<Embedding>, EmbedError> {
|
||||
match self {
|
||||
Embedder::HuggingFace(embedder) => embedder.embed_index_ref(texts),
|
||||
Embedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads),
|
||||
Embedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads),
|
||||
Embedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats),
|
||||
Embedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats),
|
||||
Embedder::UserProvided(embedder) => embedder.embed_index_ref(texts),
|
||||
Embedder::Rest(embedder) => embedder.embed_index_ref(texts, threads),
|
||||
Embedder::Composite(embedder) => embedder.index.embed_index_ref(texts, threads),
|
||||
Embedder::Rest(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats),
|
||||
Embedder::Composite(embedder) => {
|
||||
embedder.index.embed_index_ref(texts, threads, embedder_stats)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn embed_index_ref_fragments(
|
||||
&self,
|
||||
fragments: &[serde_json::Value],
|
||||
threads: &ThreadPoolNoAbort,
|
||||
embedder_stats: &EmbedderStats,
|
||||
) -> std::result::Result<Vec<Embedding>, EmbedError> {
|
||||
if let Embedder::Rest(embedder) = self {
|
||||
embedder.embed_index_ref(fragments, threads, embedder_stats)
|
||||
} else {
|
||||
let Embedder::Composite(embedder) = self else {
|
||||
unimplemented!("embedding fragments is only available for rest embedders")
|
||||
};
|
||||
let crate::vector::composite::SubEmbedder::Rest(embedder) = &embedder.index else {
|
||||
unimplemented!("embedding fragments is only available for rest embedders")
|
||||
};
|
||||
|
||||
embedder.embed_index_ref(fragments, threads, embedder_stats)
|
||||
}
|
||||
}
|
||||
|
||||
@ -845,6 +1073,12 @@ impl Embedder {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum SearchQuery<'a> {
|
||||
Text(&'a str),
|
||||
Media { q: Option<&'a str>, media: Option<&'a serde_json::Value> },
|
||||
}
|
||||
|
||||
/// Describes the mean and sigma of distribution of embedding similarity in the embedding space.
|
||||
///
|
||||
/// The intended use is to make the similarity score more comparable to the regular ranking score.
|
||||
@ -974,8 +1208,11 @@ pub const fn is_cuda_enabled() -> bool {
|
||||
cfg!(feature = "cuda")
|
||||
}
|
||||
|
||||
pub fn arroy_db_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
|
||||
let embedder_id = (embedder_id as u16) << 8;
|
||||
|
||||
(0..=u8::MAX).map(move |k| embedder_id | (k as u16))
|
||||
fn arroy_store_range_for_embedder(embedder_id: u8) -> impl Iterator<Item = u16> {
|
||||
(0..=u8::MAX).map(move |store_id| arroy_store_for_embedder(embedder_id, store_id))
|
||||
}
|
||||
|
||||
fn arroy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 {
|
||||
let embedder_id = (embedder_id as u16) << 8;
|
||||
embedder_id | (store_id as u16)
|
||||
}
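// The packing scheme at a glance: the high byte selects the embedder and the
// low byte selects the fragment store, so each embedder owns 256 contiguous
// arroy indexes.
fn store_packing_example() {
    assert_eq!(arroy_store_for_embedder(3, 5), 0x0305);
    assert_eq!(arroy_store_range_for_embedder(3).next(), Some(0x0300));
    assert_eq!(arroy_store_range_for_embedder(3).last(), Some(0x03FF));
}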
|
||||
|
@ -7,6 +7,7 @@ use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErro
|
||||
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
|
||||
use super::{DistributionShift, EmbeddingCache, REQUEST_PARALLELISM};
|
||||
use crate::error::FaultSource;
|
||||
use crate::progress::EmbedderStats;
|
||||
use crate::vector::Embedding;
|
||||
use crate::ThreadPoolNoAbort;
|
||||
|
||||
@ -70,6 +71,8 @@ impl EmbedderOptions {
|
||||
request,
|
||||
response,
|
||||
headers: Default::default(),
|
||||
indexing_fragments: Default::default(),
|
||||
search_fragments: Default::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -104,8 +107,9 @@ impl Embedder {
|
||||
&self,
|
||||
texts: &[S],
|
||||
deadline: Option<Instant>,
|
||||
embedder_stats: Option<&EmbedderStats>,
|
||||
) -> Result<Vec<Embedding>, EmbedError> {
|
||||
match self.rest_embedder.embed_ref(texts, deadline) {
|
||||
match self.rest_embedder.embed_ref(texts, deadline, embedder_stats) {
|
||||
Ok(embeddings) => Ok(embeddings),
|
||||
Err(EmbedError { kind: EmbedErrorKind::RestOtherStatusCode(404, error), fault: _ }) => {
|
||||
Err(EmbedError::ollama_model_not_found(error))
|
||||
@ -118,15 +122,22 @@ impl Embedder {
|
||||
&self,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
embedder_stats: &EmbedderStats,
|
||||
) -> Result<Vec<Vec<Embedding>>, EmbedError> {
|
||||
// This condition helps reduce the number of active rayon jobs
|
||||
// so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
|
||||
if threads.active_operations() >= REQUEST_PARALLELISM {
|
||||
text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None)).collect()
|
||||
text_chunks
|
||||
.into_iter()
|
||||
.map(move |chunk| self.embed(&chunk, None, Some(embedder_stats)))
|
||||
.collect()
|
||||
} else {
|
||||
threads
|
||||
.install(move || {
|
||||
text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect()
|
||||
text_chunks
|
||||
.into_par_iter()
|
||||
.map(move |chunk| self.embed(&chunk, None, Some(embedder_stats)))
|
||||
.collect()
|
||||
})
|
||||
.map_err(|error| EmbedError {
|
||||
kind: EmbedErrorKind::PanicInThreadPool(error),
|
||||
@ -139,13 +150,14 @@ impl Embedder {
|
||||
&self,
|
||||
texts: &[&str],
|
||||
threads: &ThreadPoolNoAbort,
|
||||
embedder_stats: &EmbedderStats,
|
||||
) -> Result<Vec<Vec<f32>>, EmbedError> {
|
||||
// This condition helps reduce the number of active rayon jobs
|
||||
// so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
|
||||
if threads.active_operations() >= REQUEST_PARALLELISM {
|
||||
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
|
||||
.chunks(self.prompt_count_in_chunk_hint())
|
||||
.map(move |chunk| self.embed(chunk, None))
|
||||
.map(move |chunk| self.embed(chunk, None, Some(embedder_stats)))
|
||||
.collect();
|
||||
|
||||
let embeddings = embeddings?;
|
||||
@ -155,7 +167,7 @@ impl Embedder {
|
||||
.install(move || {
|
||||
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
|
||||
.par_chunks(self.prompt_count_in_chunk_hint())
|
||||
.map(move |chunk| self.embed(chunk, None))
|
||||
.map(move |chunk| self.embed(chunk, None, Some(embedder_stats)))
|
||||
.collect();
|
||||
|
||||
let embeddings = embeddings?;
|
||||
|
@@ -9,6 +9,7 @@ use super::error::{EmbedError, NewEmbedderError};
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
use super::{DistributionShift, EmbeddingCache, REQUEST_PARALLELISM};
use crate::error::FaultSource;
use crate::progress::EmbedderStats;
use crate::vector::error::EmbedErrorKind;
use crate::vector::Embedding;
use crate::ThreadPoolNoAbort;
@@ -200,6 +201,8 @@ impl Embedder {
                    ]
                }),
                headers: Default::default(),
                indexing_fragments: Default::default(),
                search_fragments: Default::default(),
            },
            cache_cap,
            super::rest::ConfigurationSource::OpenAi,
@@ -215,8 +218,9 @@ impl Embedder {
        &self,
        texts: &[S],
        deadline: Option<Instant>,
        embedder_stats: Option<&EmbedderStats>,
    ) -> Result<Vec<Embedding>, EmbedError> {
        match self.rest_embedder.embed_ref(texts, deadline) {
        match self.rest_embedder.embed_ref(texts, deadline, embedder_stats) {
            Ok(embeddings) => Ok(embeddings),
            Err(EmbedError { kind: EmbedErrorKind::RestBadRequest(error, _), fault: _ }) => {
                tracing::warn!(error=?error, "OpenAI: received `BAD_REQUEST`. Input was maybe too long, retrying on tokenized version. For best performance, limit the size of your document template.");
@@ -238,7 +242,11 @@ impl Embedder {
            let encoded = self.tokenizer.encode_ordinary(text);
            let len = encoded.len();
            if len < max_token_count {
                all_embeddings.append(&mut self.rest_embedder.embed_ref(&[text], deadline)?);
                all_embeddings.append(&mut self.rest_embedder.embed_ref(
                    &[text],
                    deadline,
                    None,
                )?);
                continue;
            }

@@ -255,15 +263,22 @@ impl Embedder {
        &self,
        text_chunks: Vec<Vec<String>>,
        threads: &ThreadPoolNoAbort,
        embedder_stats: &EmbedderStats,
    ) -> Result<Vec<Vec<Embedding>>, EmbedError> {
        // This condition helps reduce the number of active rayon jobs
        // so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
        if threads.active_operations() >= REQUEST_PARALLELISM {
            text_chunks.into_iter().map(move |chunk| self.embed(&chunk, None)).collect()
            text_chunks
                .into_iter()
                .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats)))
                .collect()
        } else {
            threads
                .install(move || {
                    text_chunks.into_par_iter().map(move |chunk| self.embed(&chunk, None)).collect()
                    text_chunks
                        .into_par_iter()
                        .map(move |chunk| self.embed(&chunk, None, Some(embedder_stats)))
                        .collect()
                })
                .map_err(|error| EmbedError {
                    kind: EmbedErrorKind::PanicInThreadPool(error),
@@ -276,13 +291,14 @@ impl Embedder {
        &self,
        texts: &[&str],
        threads: &ThreadPoolNoAbort,
        embedder_stats: &EmbedderStats,
    ) -> Result<Vec<Vec<f32>>, EmbedError> {
        // This condition helps reduce the number of active rayon jobs
        // so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
        if threads.active_operations() >= REQUEST_PARALLELISM {
            let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
                .chunks(self.prompt_count_in_chunk_hint())
                .map(move |chunk| self.embed(chunk, None))
                .map(move |chunk| self.embed(chunk, None, Some(embedder_stats)))
                .collect();
            let embeddings = embeddings?;
            Ok(embeddings.into_iter().flatten().collect())
@@ -291,7 +307,7 @@ impl Embedder {
                .install(move || {
                    let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
                        .par_chunks(self.prompt_count_in_chunk_hint())
                        .map(move |chunk| self.embed(chunk, None))
                        .map(move |chunk| self.embed(chunk, None, Some(embedder_stats)))
                        .collect();

                    let embeddings = embeddings?;

@@ -6,9 +6,8 @@ use serde_json::value::RawValue;
use serde_json::{from_slice, Value};

use super::Embedding;
use crate::index::IndexEmbeddingConfig;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::{DocumentId, FieldId, InternalError, UserError};
use crate::{FieldId, InternalError, UserError};

#[derive(serde::Serialize, Debug)]
#[serde(untagged)]
@@ -151,7 +150,8 @@ impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor {
                    regenerate = Some(value);
                }
                Ok(Some("embeddings")) => {
                    let value: &RawValue = match map.next_value() {
                    let value: &RawValue = match map.next_value::<&RawValue>() {
                        Ok(value) if value.get() == RawValue::NULL.get() => continue,
                        Ok(value) => value,
                        Err(error) => {
                            return Ok(Err(RawVectorsError::DeserializeEmbeddings {
@@ -374,8 +374,7 @@ pub struct ParsedVectorsDiff {

impl ParsedVectorsDiff {
    pub fn new(
        docid: DocumentId,
        embedders_configs: &[IndexEmbeddingConfig],
        regenerate_for_embedders: impl Iterator<Item = String>,
        documents_diff: &KvReader<FieldId>,
        old_vectors_fid: Option<FieldId>,
        new_vectors_fid: Option<FieldId>,
@@ -396,10 +395,8 @@ impl ParsedVectorsDiff {
            }
        }
        .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect());
        for embedding_config in embedders_configs {
            if embedding_config.user_provided.contains(docid) {
                old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual);
            }
        for name in regenerate_for_embedders {
            old.entry(name).or_insert(VectorState::Generated);
        }

        let new = 'new: {
@@ -6,13 +6,16 @@ use rand::Rng;
use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _};
use rayon::slice::ParallelSlice as _;
use serde::{Deserialize, Serialize};
use serde_json::Value;

use super::error::EmbedErrorKind;
use super::json_template::ValueTemplate;
use super::json_template::{InjectableValue, JsonTemplate};
use super::{
    DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, REQUEST_PARALLELISM,
    DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, SearchQuery,
    REQUEST_PARALLELISM,
};
use crate::error::FaultSource;
use crate::progress::EmbedderStats;
use crate::ThreadPoolNoAbort;

// retrying in case of failure
@@ -87,19 +90,61 @@ struct EmbedderData {
    bearer: Option<String>,
    headers: BTreeMap<String, String>,
    url: String,
    request: Request,
    request: RequestData,
    response: Response,
    configuration_source: ConfigurationSource,
}

#[derive(Debug)]
pub enum RequestData {
    Single(Request),
    FromFragments(RequestFromFragments),
}

impl RequestData {
    pub fn new(
        request: Value,
        indexing_fragments: BTreeMap<String, Value>,
        search_fragments: BTreeMap<String, Value>,
    ) -> Result<Self, NewEmbedderError> {
        Ok(if indexing_fragments.is_empty() && search_fragments.is_empty() {
            RequestData::Single(Request::new(request)?)
        } else {
            for (name, value) in indexing_fragments {
                JsonTemplate::new(value).map_err(|error| {
                    NewEmbedderError::rest_could_not_parse_template(
                        error.parsing(&format!(".indexingFragments.{name}")),
                    )
                })?;
            }
            RequestData::FromFragments(RequestFromFragments::new(request, search_fragments)?)
        })
    }

    fn input_type(&self) -> InputType {
        match self {
            RequestData::Single(request) => request.input_type(),
            RequestData::FromFragments(request_from_fragments) => {
                request_from_fragments.input_type()
            }
        }
    }

    fn has_fragments(&self) -> bool {
        matches!(self, RequestData::FromFragments(_))
    }
}

#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
pub struct EmbedderOptions {
    pub api_key: Option<String>,
    pub distribution: Option<DistributionShift>,
    pub dimensions: Option<usize>,
    pub url: String,
    pub request: serde_json::Value,
    pub response: serde_json::Value,
    pub request: Value,
    pub search_fragments: BTreeMap<String, Value>,
    pub indexing_fragments: BTreeMap<String, Value>,
    pub response: Value,
    pub headers: BTreeMap<String, String>,
}

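`RequestData::new` above picks between the two request shapes based purely on whether any fragment is configured. A hedged miniature of that decision (the fragment name and template below are made up):

use std::collections::BTreeMap;

use serde_json::{json, Value};

// Mirrors the branch in RequestData::new: no fragments at all means the
// classic single-template path; any fragment switches to the fragment path.
fn request_mode(
    indexing_fragments: &BTreeMap<String, Value>,
    search_fragments: &BTreeMap<String, Value>,
) -> &'static str {
    if indexing_fragments.is_empty() && search_fragments.is_empty() {
        "single template using the {{text}} placeholder"
    } else {
        "fragment templates using the {{fragment}} placeholder"
    }
}

fn main() {
    let empty = BTreeMap::new();
    assert_eq!(request_mode(&empty, &empty), "single template using the {{text}} placeholder");

    let mut search_fragments = BTreeMap::new();
    search_fragments.insert("text".to_string(), json!({ "q": "{{q}}" }));
    assert_eq!(
        request_mode(&empty, &search_fragments),
        "fragment templates using the {{fragment}} placeholder"
    );
}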
@@ -137,7 +182,12 @@ impl Embedder {
            .timeout(std::time::Duration::from_secs(30))
            .build();

        let request = Request::new(options.request)?;
        let request = RequestData::new(
            options.request,
            options.indexing_fragments,
            options.search_fragments,
        )?;

        let response = Response::new(options.response, &request)?;

        let data = EmbedderData {
@@ -168,19 +218,28 @@ impl Embedder {
        &self,
        texts: Vec<String>,
        deadline: Option<Instant>,
        embedder_stats: Option<&EmbedderStats>,
    ) -> Result<Vec<Embedding>, EmbedError> {
        embed(&self.data, texts.as_slice(), texts.len(), Some(self.dimensions), deadline)
        embed(
            &self.data,
            texts.as_slice(),
            texts.len(),
            Some(self.dimensions),
            deadline,
            embedder_stats,
        )
    }

    pub fn embed_ref<S>(
        &self,
        texts: &[S],
        deadline: Option<Instant>,
        embedder_stats: Option<&EmbedderStats>,
    ) -> Result<Vec<Embedding>, EmbedError>
    where
        S: AsRef<str> + Serialize,
        S: Serialize,
    {
        embed(&self.data, texts, texts.len(), Some(self.dimensions), deadline)
        embed(&self.data, texts, texts.len(), Some(self.dimensions), deadline, embedder_stats)
    }

    pub fn embed_tokens(
@@ -188,7 +247,7 @@ impl Embedder {
        tokens: &[u32],
        deadline: Option<Instant>,
    ) -> Result<Embedding, EmbedError> {
        let mut embeddings = embed(&self.data, tokens, 1, Some(self.dimensions), deadline)?;
        let mut embeddings = embed(&self.data, tokens, 1, Some(self.dimensions), deadline, None)?;
        // unwrap: guaranteed that embeddings.len() == 1, otherwise the previous line terminated in error
        Ok(embeddings.pop().unwrap())
    }
@@ -197,15 +256,22 @@ impl Embedder {
        &self,
        text_chunks: Vec<Vec<String>>,
        threads: &ThreadPoolNoAbort,
        embedder_stats: &EmbedderStats,
    ) -> Result<Vec<Vec<Embedding>>, EmbedError> {
        // This condition helps reduce the number of active rayon jobs
        // so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
        if threads.active_operations() >= REQUEST_PARALLELISM {
            text_chunks.into_iter().map(move |chunk| self.embed(chunk, None)).collect()
            text_chunks
                .into_iter()
                .map(move |chunk| self.embed(chunk, None, Some(embedder_stats)))
                .collect()
        } else {
            threads
                .install(move || {
                    text_chunks.into_par_iter().map(move |chunk| self.embed(chunk, None)).collect()
                    text_chunks
                        .into_par_iter()
                        .map(move |chunk| self.embed(chunk, None, Some(embedder_stats)))
                        .collect()
                })
                .map_err(|error| EmbedError {
                    kind: EmbedErrorKind::PanicInThreadPool(error),
@@ -214,17 +280,18 @@ impl Embedder {
        }
    }

    pub(crate) fn embed_index_ref(
    pub(crate) fn embed_index_ref<S: Serialize + Sync>(
        &self,
        texts: &[&str],
        texts: &[S],
        threads: &ThreadPoolNoAbort,
        embedder_stats: &EmbedderStats,
    ) -> Result<Vec<Embedding>, EmbedError> {
        // This condition helps reduce the number of active rayon jobs
        // so that we avoid consuming all the LMDB rtxns and avoid stack overflows.
        if threads.active_operations() >= REQUEST_PARALLELISM {
            let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
                .chunks(self.prompt_count_in_chunk_hint())
                .map(move |chunk| self.embed_ref(chunk, None))
                .map(move |chunk| self.embed_ref(chunk, None, Some(embedder_stats)))
                .collect();

            let embeddings = embeddings?;
@@ -234,7 +301,7 @@ impl Embedder {
                .install(move || {
                    let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
                        .par_chunks(self.prompt_count_in_chunk_hint())
                        .map(move |chunk| self.embed_ref(chunk, None))
                        .map(move |chunk| self.embed_ref(chunk, None, Some(embedder_stats)))
                        .collect();

                    let embeddings = embeddings?;
@@ -269,10 +336,45 @@ impl Embedder {
    pub(super) fn cache(&self) -> &EmbeddingCache {
        &self.cache
    }

    pub(crate) fn embed_one(
        &self,
        query: SearchQuery,
        deadline: Option<Instant>,
        embedder_stats: Option<&EmbedderStats>,
    ) -> Result<Embedding, EmbedError> {
        let mut embeddings = match (&self.data.request, query) {
            (RequestData::Single(_), SearchQuery::Text(text)) => {
                embed(&self.data, &[text], 1, Some(self.dimensions), deadline, embedder_stats)
            }
            (RequestData::Single(_), SearchQuery::Media { q: _, media: _ }) => {
                return Err(EmbedError::rest_media_not_a_fragment())
            }
            (RequestData::FromFragments(request_from_fragments), SearchQuery::Text(q)) => {
                let fragment = request_from_fragments.render_search_fragment(Some(q), None)?;

                embed(&self.data, &[fragment], 1, Some(self.dimensions), deadline, embedder_stats)
            }
            (
                RequestData::FromFragments(request_from_fragments),
                SearchQuery::Media { q, media },
            ) => {
                let fragment = request_from_fragments.render_search_fragment(q, media)?;

                embed(&self.data, &[fragment], 1, Some(self.dimensions), deadline, embedder_stats)
            }
        }?;

        // unwrap: checked by `expected_count`
        Ok(embeddings.pop().unwrap())
    }
}

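`embed_one` only accepts media queries when the embedder was configured with fragments; a plain `request` template has nowhere to inject media. A hedged mirror of that dispatch (the `SearchQuery` shape is reproduced for the sketch, the return strings are illustrative):

use serde_json::Value;

// Shape of the imported SearchQuery, reproduced for the sketch.
enum SearchQuery<'a> {
    Text(&'a str),
    Media { q: Option<&'a str>, media: Option<&'a Value> },
}

fn dispatch(query: &SearchQuery, has_fragments: bool) -> Result<&'static str, &'static str> {
    match (has_fragments, query) {
        // Single-template embedders inject the query text directly.
        (false, SearchQuery::Text(_)) => Ok("inject the text into the single request template"),
        // ...but they cannot represent media at all.
        (false, SearchQuery::Media { .. }) => Err("media queries need a fragment-based embedder"),
        // Fragment embedders first render the one matching search fragment.
        (true, _) => Ok("render the matching search fragment, then embed it"),
    }
}

fn main() {
    assert!(dispatch(&SearchQuery::Text("chien"), false).is_ok());
    assert!(dispatch(&SearchQuery::Media { q: None, media: None }, false).is_err());
}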
fn infer_dimensions(data: &EmbedderData) -> Result<usize, NewEmbedderError> {
    let v = embed(data, ["test"].as_slice(), 1, None, None)
    if data.request.has_fragments() {
        return Err(NewEmbedderError::rest_cannot_infer_dimensions_for_fragment());
    }
    let v = embed(data, ["test"].as_slice(), 1, None, None, None)
        .map_err(NewEmbedderError::could_not_determine_dimension)?;
    // unwrap: guaranteed that v.len() == 1, otherwise the previous line terminated in error
    Ok(v.first().unwrap().len())
@@ -284,10 +386,18 @@ fn embed<S>(
    expected_count: usize,
    expected_dimension: Option<usize>,
    deadline: Option<Instant>,
    embedder_stats: Option<&EmbedderStats>,
) -> Result<Vec<Embedding>, EmbedError>
where
    S: Serialize,
{
    if inputs.is_empty() {
        if expected_count != 0 {
            return Err(EmbedError::rest_response_embedding_count(expected_count, 0));
        }
        return Ok(Vec::new());
    }

    let request = data.client.post(&data.url);
    let request = if let Some(bearer) = &data.bearer {
        request.set("Authorization", bearer)
@@ -299,9 +409,17 @@ where
        request = request.set(header.as_str(), value.as_str());
    }

    let body = data.request.inject_texts(inputs);
    let body = match &data.request {
        RequestData::Single(request) => request.inject_texts(inputs),
        RequestData::FromFragments(request_from_fragments) => {
            request_from_fragments.request_from_fragments(inputs).expect("inputs was empty")
        }
    };

    for attempt in 0..10 {
        if let Some(embedder_stats) = &embedder_stats {
            embedder_stats.total_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
        }
        let response = request.clone().send_json(&body);
        let result = check_response(response, data.configuration_source).and_then(|response| {
            response_to_embedding(response, data, expected_count, expected_dimension)
@@ -311,6 +429,13 @@ where
            Ok(response) => return Ok(response),
            Err(retry) => {
                tracing::warn!("Failed: {}", retry.error);
                if let Some(embedder_stats) = &embedder_stats {
                    let stringified_error = retry.error.to_string();
                    let mut errors =
                        embedder_stats.errors.write().unwrap_or_else(|p| p.into_inner());
                    errors.0 = Some(stringified_error);
                    errors.1 += 1;
                }
                if let Some(deadline) = deadline {
                    let now = std::time::Instant::now();
                    if now > deadline {
@@ -336,12 +461,26 @@ where
        std::thread::sleep(retry_duration);
    }

    if let Some(embedder_stats) = &embedder_stats {
        embedder_stats.total_count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    }
    let response = request.send_json(&body);
    let result = check_response(response, data.configuration_source);
    result.map_err(Retry::into_error).and_then(|response| {
    let result = check_response(response, data.configuration_source).and_then(|response| {
        response_to_embedding(response, data, expected_count, expected_dimension)
            .map_err(Retry::into_error)
    })
    });

    match result {
        Ok(response) => Ok(response),
        Err(retry) => {
            if let Some(embedder_stats) = &embedder_stats {
                let stringified_error = retry.error.to_string();
                let mut errors = embedder_stats.errors.write().unwrap_or_else(|p| p.into_inner());
                errors.0 = Some(stringified_error);
                errors.1 += 1;
            };
            Err(retry.into_error())
        }
    }
}
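The retry loop above increments a request counter before every attempt and records the last error plus an error count on failure. A hedged mirror of the `EmbedderStats` surface it relies on (the real type lives in `crate::progress`; only the two fields used above are sketched):

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::RwLock;

// Minimal stand-in exposing just what the retry loop touches.
#[derive(Default)]
struct EmbedderStats {
    total_count: AtomicUsize,
    errors: RwLock<(Option<String>, u32)>, // (last error message, error count)
}

fn record_attempt(stats: &EmbedderStats) {
    stats.total_count.fetch_add(1, Ordering::Relaxed);
}

fn record_error(stats: &EmbedderStats, error: impl ToString) {
    // Like the code above, recover the lock even if a writer panicked.
    let mut errors = stats.errors.write().unwrap_or_else(|poisoned| poisoned.into_inner());
    errors.0 = Some(error.to_string());
    errors.1 += 1;
}

fn main() {
    let stats = EmbedderStats::default();
    record_attempt(&stats);
    record_error(&stats, "connection reset");
    assert_eq!(stats.total_count.load(Ordering::Relaxed), 1);
    assert_eq!(stats.errors.read().unwrap().1, 1);
}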
fn check_response(
@@ -383,7 +522,7 @@ fn response_to_embedding(
    expected_count: usize,
    expected_dimensions: Option<usize>,
) -> Result<Vec<Embedding>, Retry> {
    let response: serde_json::Value = response
    let response: Value = response
        .into_json()
        .map_err(EmbedError::rest_response_deserialization)
        .map_err(Retry::retry_later)?;
@@ -412,21 +551,24 @@ fn response_to_embedding(
}

pub(super) const REQUEST_PLACEHOLDER: &str = "{{text}}";
pub(super) const REQUEST_FRAGMENT_PLACEHOLDER: &str = "{{fragment}}";
pub(super) const RESPONSE_PLACEHOLDER: &str = "{{embedding}}";
pub(super) const REPEAT_PLACEHOLDER: &str = "{{..}}";

#[derive(Debug)]
pub struct Request {
    template: ValueTemplate,
    template: InjectableValue,
}

impl Request {
    pub fn new(template: serde_json::Value) -> Result<Self, NewEmbedderError> {
        let template = match ValueTemplate::new(template, REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER) {
    pub fn new(template: Value) -> Result<Self, NewEmbedderError> {
        let template = match InjectableValue::new(template, REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER)
        {
            Ok(template) => template,
            Err(error) => {
                let message =
                    error.error_message("request", REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER);
                let message = format!("{message}\n - Note: this template is using a document template, and so expects to contain the placeholder {REQUEST_PLACEHOLDER:?} rather than {REQUEST_FRAGMENT_PLACEHOLDER:?}");
                return Err(NewEmbedderError::rest_could_not_parse_template(message));
            }
        };
@@ -442,42 +584,120 @@ impl Request {
        }
    }

    pub fn inject_texts<S: Serialize>(
        &self,
        texts: impl IntoIterator<Item = S>,
    ) -> serde_json::Value {
    pub fn inject_texts<S: Serialize>(&self, texts: impl IntoIterator<Item = S>) -> Value {
        self.template.inject(texts.into_iter().map(|s| serde_json::json!(s))).unwrap()
    }
}

#[derive(Debug)]
pub struct Response {
    template: ValueTemplate,
pub struct RequestFromFragments {
    search_fragments: BTreeMap<String, JsonTemplate>,
    request: InjectableValue,
}

impl Response {
    pub fn new(template: serde_json::Value, request: &Request) -> Result<Self, NewEmbedderError> {
        let template = match ValueTemplate::new(template, RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER)
        {
impl RequestFromFragments {
    pub fn new(
        request: Value,
        search_fragments: impl IntoIterator<Item = (String, Value)>,
    ) -> Result<Self, NewEmbedderError> {
        let request = match InjectableValue::new(
            request,
            REQUEST_FRAGMENT_PLACEHOLDER,
            REPEAT_PLACEHOLDER,
        ) {
            Ok(template) => template,
            Err(error) => {
                let message =
                    error.error_message("response", RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER);
                let message = error.error_message(
                    "request",
                    REQUEST_FRAGMENT_PLACEHOLDER,
                    REPEAT_PLACEHOLDER,
                );
                let message = format!("{message}\n - Note: this template is using fragments, and so expects to contain the placeholder {REQUEST_FRAGMENT_PLACEHOLDER:?} rather than {REQUEST_PLACEHOLDER:?}");

                return Err(NewEmbedderError::rest_could_not_parse_template(message));
            }
        };

        match (template.has_array_value(), request.template.has_array_value()) {
        let search_fragments: Result<_, NewEmbedderError> = search_fragments
            .into_iter()
            .map(|(name, value)| {
                let json_template = JsonTemplate::new(value).map_err(|error| {
                    NewEmbedderError::rest_could_not_parse_template(
                        error.parsing(&format!(".searchFragments.{name}")),
                    )
                })?;
                Ok((name, json_template))
            })
            .collect();

        Ok(Self { request, search_fragments: search_fragments? })
    }

    fn input_type(&self) -> InputType {
        if self.request.has_array_value() {
            InputType::TextArray
        } else {
            InputType::Text
        }
    }

    pub fn render_search_fragment(
        &self,
        q: Option<&str>,
        media: Option<&Value>,
    ) -> Result<Value, EmbedError> {
        let mut it = self.search_fragments.iter().filter_map(|(name, template)| {
            let render = template.render_search(q, media).ok()?;
            Some((name, render))
        });
        let Some((name, fragment)) = it.next() else {
            return Err(EmbedError::rest_search_matches_no_fragment(q, media));
        };
        if let Some((second_name, _)) = it.next() {
            return Err(EmbedError::rest_search_matches_multiple_fragments(
                name,
                second_name,
                q,
                media,
            ));
        }

        Ok(fragment)
    }

    pub fn request_from_fragments<'a, S: Serialize + 'a>(
        &self,
        fragments: impl IntoIterator<Item = &'a S>,
    ) -> Option<Value> {
        self.request.inject(fragments.into_iter().map(|fragment| serde_json::json!(fragment))).ok()
    }
}

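`render_search_fragment` enforces that exactly one search fragment renders for a given `(q, media)` pair: none is an error, and so is more than one. The selection rule in miniature:

// Exactly-one semantics over an iterator of successfully rendered fragments.
fn pick_one<T>(mut rendered: impl Iterator<Item = T>) -> Result<T, &'static str> {
    let first = rendered.next().ok_or("no fragment matches the search query")?;
    if rendered.next().is_some() {
        return Err("several fragments match the search query");
    }
    Ok(first)
}

fn main() {
    assert!(pick_one(std::iter::empty::<u32>()).is_err());
    assert_eq!(pick_one([1].into_iter()).unwrap(), 1);
    assert!(pick_one([1, 2].into_iter()).is_err());
}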
#[derive(Debug)]
pub struct Response {
    template: InjectableValue,
}

impl Response {
    pub fn new(template: Value, request: &RequestData) -> Result<Self, NewEmbedderError> {
        let template =
            match InjectableValue::new(template, RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER) {
                Ok(template) => template,
                Err(error) => {
                    let message =
                        error.error_message("response", RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER);
                    return Err(NewEmbedderError::rest_could_not_parse_template(message));
                }
            };

        match (template.has_array_value(), request.input_type() == InputType::TextArray) {
            (true, true) | (false, false) => Ok(Self { template }),
            (true, false) => Err(NewEmbedderError::rest_could_not_parse_template("in `response`: `response` has multiple embeddings, but `request` has only one text to embed".to_string())),
            (false, true) => Err(NewEmbedderError::rest_could_not_parse_template("in `response`: `response` has a single embedding, but `request` has multiple texts to embed".to_string())),
        }
    }

    pub fn extract_embeddings(
        &self,
        response: serde_json::Value,
    ) -> Result<Vec<Embedding>, EmbedError> {
    pub fn extract_embeddings(&self, response: Value) -> Result<Vec<Embedding>, EmbedError> {
        let extracted_values: Vec<Embedding> = match self.template.extract(response) {
            Ok(extracted_values) => extracted_values,
            Err(error) => {
crates/milli/src/vector/session.rs (new file, 177 lines)
@@ -0,0 +1,177 @@
use bumpalo::collections::Vec as BVec;
use bumpalo::Bump;
use serde_json::Value;

use super::{EmbedError, Embedder, Embedding};
use crate::progress::EmbedderStats;
use crate::{DocumentId, Result, ThreadPoolNoAbort};

type ExtractorId = u8;

#[derive(Clone, Copy)]
pub struct Metadata<'doc> {
    pub docid: DocumentId,
    pub external_docid: &'doc str,
    pub extractor_id: ExtractorId,
}

pub struct EmbeddingResponse<'doc> {
    pub metadata: Metadata<'doc>,
    pub embedding: Option<Embedding>,
}

pub trait OnEmbed<'doc> {
    type ErrorMetadata;

    fn process_embedding_response(&mut self, response: EmbeddingResponse<'doc>);
    fn process_embedding_error(
        &mut self,
        error: EmbedError,
        embedder_name: &'doc str,
        unused_vectors_distribution: &Self::ErrorMetadata,
        metadata: BVec<'doc, Metadata<'doc>>,
    ) -> crate::Error;
}

pub struct EmbedSession<'doc, C, I> {
    // requests
    inputs: BVec<'doc, I>,
    metadata: BVec<'doc, Metadata<'doc>>,

    threads: &'doc ThreadPoolNoAbort,
    embedder: &'doc Embedder,

    embedder_name: &'doc str,

    embedder_stats: &'doc EmbedderStats,

    on_embed: C,
}

pub trait Input: Sized {
    fn embed_ref(
        inputs: &[Self],
        embedder: &Embedder,
        threads: &ThreadPoolNoAbort,
        embedder_stats: &EmbedderStats,
    ) -> std::result::Result<Vec<Embedding>, EmbedError>;
}

impl Input for &'_ str {
    fn embed_ref(
        inputs: &[Self],
        embedder: &Embedder,
        threads: &ThreadPoolNoAbort,
        embedder_stats: &EmbedderStats,
    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
        embedder.embed_index_ref(inputs, threads, embedder_stats)
    }
}

impl Input for Value {
    fn embed_ref(
        inputs: &[Value],
        embedder: &Embedder,
        threads: &ThreadPoolNoAbort,
        embedder_stats: &EmbedderStats,
    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
        embedder.embed_index_ref_fragments(inputs, threads, embedder_stats)
    }
}

impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> {
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        embedder: &'doc Embedder,
        embedder_name: &'doc str,
        threads: &'doc ThreadPoolNoAbort,
        doc_alloc: &'doc Bump,
        embedder_stats: &'doc EmbedderStats,
        on_embed: C,
    ) -> Self {
        let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
        let texts = BVec::with_capacity_in(capacity, doc_alloc);
        let ids = BVec::with_capacity_in(capacity, doc_alloc);
        Self {
            inputs: texts,
            metadata: ids,
            embedder,
            threads,
            embedder_name,
            embedder_stats,
            on_embed,
        }
    }

    pub fn request_embedding(
        &mut self,
        metadata: Metadata<'doc>,
        rendered: I,
        unused_vectors_distribution: &C::ErrorMetadata,
    ) -> Result<()> {
        if self.inputs.len() < self.inputs.capacity() {
            self.inputs.push(rendered);
            self.metadata.push(metadata);
            return Ok(());
        }

        self.embed_chunks(unused_vectors_distribution)
    }

    pub fn drain(mut self, unused_vectors_distribution: &C::ErrorMetadata) -> Result<C> {
        self.embed_chunks(unused_vectors_distribution)?;
        Ok(self.on_embed)
    }

    #[allow(clippy::too_many_arguments)]
    fn embed_chunks(&mut self, unused_vectors_distribution: &C::ErrorMetadata) -> Result<()> {
        if self.inputs.is_empty() {
            return Ok(());
        }
        let res = match I::embed_ref(
            self.inputs.as_slice(),
            self.embedder,
            self.threads,
            self.embedder_stats,
        ) {
            Ok(embeddings) => {
                for (metadata, embedding) in self.metadata.iter().copied().zip(embeddings) {
                    self.on_embed.process_embedding_response(EmbeddingResponse {
                        metadata,
                        embedding: Some(embedding),
                    });
                }
                Ok(())
            }
            Err(error) => {
                // reset metadata and inputs, and send metadata to the error processing.
                let doc_alloc = self.metadata.bump();
                let metadata = std::mem::replace(
                    &mut self.metadata,
                    BVec::with_capacity_in(self.inputs.capacity(), doc_alloc),
                );
                self.inputs.clear();
                return Err(self.on_embed.process_embedding_error(
                    error,
                    self.embedder_name,
                    unused_vectors_distribution,
                    metadata,
                ));
            }
        };
        self.inputs.clear();
        self.metadata.clear();
        res
    }

    pub(crate) fn embedder_name(&self) -> &'doc str {
        self.embedder_name
    }

    pub(crate) fn doc_alloc(&self) -> &'doc Bump {
        self.inputs.bump()
    }

    pub(crate) fn on_embed_mut(&mut self) -> &mut C {
        &mut self.on_embed
    }
}
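`EmbedSession` is essentially a fixed-capacity batcher: inputs and their metadata accumulate until the capacity hinted by the embedder is reached, then the whole batch is embedded at once and the results (or the buffered metadata, on error) are handed to the `OnEmbed` callback. The buffering contract in miniature (the flush ordering and types here are illustrative, not the milli API):

// A fixed-capacity buffer that flushes whole batches through a callback.
struct Batcher<T> {
    buf: Vec<T>,
    cap: usize,
}

impl<T> Batcher<T> {
    fn new(cap: usize) -> Self {
        Self { buf: Vec::with_capacity(cap), cap }
    }

    // Buffer the item, flushing the accumulated batch first when full.
    fn push(&mut self, item: T, flush: &mut impl FnMut(&[T])) {
        if self.buf.len() == self.cap {
            flush(&self.buf);
            self.buf.clear();
        }
        self.buf.push(item);
    }

    // Flush whatever remains, mirroring EmbedSession::drain.
    fn drain(mut self, flush: &mut impl FnMut(&[T])) {
        if !self.buf.is_empty() {
            flush(&self.buf);
        }
        self.buf.clear();
    }
}

fn main() {
    let mut batches = Vec::new();
    let mut flush = |batch: &[u32]| batches.push(batch.len());
    let mut batcher = Batcher::new(2);
    for item in 0..5u32 {
        batcher.push(item, &mut flush);
    }
    batcher.drain(&mut flush);
    assert_eq!(batches, vec![2, 2, 1]); // two full batches, then the remainder
}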
@@ -2,6 +2,8 @@ use std::collections::BTreeMap;
use std::num::NonZeroUsize;

use deserr::Deserr;
use either::Either;
use itertools::Itertools;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use utoipa::ToSchema;
@@ -229,6 +231,35 @@ pub struct EmbeddingSettings {
    /// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated
    pub url: Setting<String>,

    /// Template fragments that will be reassembled and sent to the remote embedder at indexing time.
    ///
    /// # Availability
    ///
    /// - This parameter is available for source `rest`.
    ///
    /// # 🔄 Reindexing
    ///
    /// - 🏗️ When a fragment is deleted by passing `null` to its name, the corresponding embeddings are removed from documents.
    /// - 🏗️ When a fragment is modified, the corresponding embeddings are regenerated if their rendered version changes.
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    #[schema(value_type = Option<BTreeMap<String, serde_json::Value>>)]
    pub indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>,

    /// Template fragments that will be reassembled and sent to the remote embedder at search time.
    ///
    /// # Availability
    ///
    /// - This parameter is available for source `rest`.
    ///
    /// # 🔄 Reindexing
    ///
    /// - 🌱 Changing the value of this parameter never regenerates embeddings
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    #[schema(value_type = Option<BTreeMap<String, serde_json::Value>>)]
    pub search_fragments: Setting<BTreeMap<String, Option<Fragment>>>,

    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    #[schema(value_type = Option<serde_json::Value>)]
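As a hedged illustration of the two maps above, here is what an embedder configuration carrying fragments could look like. The fragment names and template fields are invented; the camelCase keys come from the `MetaEmbeddingSetting` names further down, and each fragment wraps its template in the `Fragment { value }` shape defined later in this file:

use serde_json::json;

fn main() {
    // Illustrative payload only: "withText"/"withPoster" are made-up fragment
    // names and the endpoint is fictional.
    let embedder = json!({
        "source": "rest",
        "url": "https://localhost:12345/embed",
        "request": { "inputs": ["{{fragment}}", "{{..}}"] },
        "response": { "embeddings": ["{{embedding}}", "{{..}}"] },
        "indexingFragments": {
            "withText": { "value": { "text": "{{doc.description}}" } },
            "withPoster": { "value": { "image_url": "{{doc.poster}}" } }
        },
        "searchFragments": {
            "text": { "value": { "text": "{{q}}" } },
            "image": { "value": { "image_url": "{{media.image.url}}" } }
        }
    });
    println!("{embedder:#}");
}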
@@ -483,6 +514,36 @@ pub struct SubEmbeddingSettings {
    /// - 🌱 When modified for source `openAi`, embeddings are never regenerated
    /// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated
    pub url: Setting<String>,

    /// Template fragments that will be reassembled and sent to the remote embedder at indexing time.
    ///
    /// # Availability
    ///
    /// - This parameter is available for source `rest`.
    ///
    /// # 🔄 Reindexing
    ///
    /// - 🏗️ When a fragment is deleted by passing `null` to its name, the corresponding embeddings are removed from documents.
    /// - 🏗️ When a fragment is modified, the corresponding embeddings are regenerated if their rendered version changes.
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    #[schema(value_type = Option<BTreeMap<String, serde_json::Value>>)]
    pub indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>,

    /// Template fragments that will be reassembled and sent to the remote embedder at search time.
    ///
    /// # Availability
    ///
    /// - This parameter is available for source `rest`.
    ///
    /// # 🔄 Reindexing
    ///
    /// - 🌱 Changing the value of this parameter never regenerates embeddings
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    #[schema(value_type = Option<BTreeMap<String, serde_json::Value>>)]
    pub search_fragments: Setting<BTreeMap<String, Option<Fragment>>>,

    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default)]
    #[schema(value_type = Option<serde_json::Value>)]
@@ -554,17 +615,31 @@ pub struct SubEmbeddingSettings {
    pub indexing_embedder: Setting<serde_json::Value>,
}

#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum EmbeddingValidationContext {
    FullSettings,
    SettingsPartialUpdate,
}

/// Indicates what action should take place during a reindexing operation for an embedder
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ReindexAction {
    /// An indexing operation should take place for this embedder, keeping existing vectors
    /// and checking whether the document template changed or not
    RegeneratePrompts,
    RegenerateFragments(Vec<(String, RegenerateFragment)>),
    /// An indexing operation should take place for all documents for this embedder, removing existing vectors
    /// (except userProvided ones)
    FullReindex,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum RegenerateFragment {
    Update,
    Remove,
    Add,
}

pub enum SettingsDiff {
    Remove,
    Reindex { action: ReindexAction, updated_settings: EmbeddingSettings, quantize: bool },
@@ -577,6 +652,12 @@ pub struct EmbedderAction {
    pub is_being_quantized: bool,
    pub write_back: Option<WriteBackToDocuments>,
    pub reindex: Option<ReindexAction>,
    pub remove_fragments: Option<RemoveFragments>,
}

#[derive(Debug)]
pub struct RemoveFragments {
    pub fragment_ids: Vec<u8>,
}

impl EmbedderAction {
@@ -592,6 +673,10 @@ impl EmbedderAction {
        self.reindex.as_ref()
    }

    pub fn remove_fragments(&self) -> Option<&RemoveFragments> {
        self.remove_fragments.as_ref()
    }

    pub fn with_is_being_quantized(mut self, quantize: bool) -> Self {
        self.is_being_quantized = quantize;
        self
@@ -603,11 +688,23 @@ impl EmbedderAction {
            is_being_quantized: false,
            write_back: Some(write_back),
            reindex: None,
            remove_fragments: None,
        }
    }

    pub fn with_reindex(reindex: ReindexAction, was_quantized: bool) -> Self {
        Self { was_quantized, is_being_quantized: false, write_back: None, reindex: Some(reindex) }
        Self {
            was_quantized,
            is_being_quantized: false,
            write_back: None,
            reindex: Some(reindex),
            remove_fragments: None,
        }
    }

    pub fn with_remove_fragments(mut self, remove_fragments: RemoveFragments) -> Self {
        self.remove_fragments = Some(remove_fragments);
        self
    }
}

@@ -634,6 +731,8 @@ impl SettingsDiff {
            mut dimensions,
            mut document_template,
            mut url,
            mut indexing_fragments,
            mut search_fragments,
            mut request,
            mut response,
            mut search_embedder,
@@ -653,6 +752,8 @@ impl SettingsDiff {
            dimensions: new_dimensions,
            document_template: new_document_template,
            url: new_url,
            indexing_fragments: new_indexing_fragments,
            search_fragments: new_search_fragments,
            request: new_request,
            response: new_response,
            search_embedder: new_search_embedder,
@@ -684,6 +785,8 @@ impl SettingsDiff {
                    &mut document_template,
                    &mut document_template_max_bytes,
                    &mut url,
                    &mut indexing_fragments,
                    &mut search_fragments,
                    &mut request,
                    &mut response,
                    &mut headers,
@@ -696,6 +799,8 @@ impl SettingsDiff {
                    new_document_template,
                    new_document_template_max_bytes,
                    new_url,
                    new_indexing_fragments,
                    new_search_fragments,
                    new_request,
                    new_response,
                    new_headers,
@@ -722,6 +827,8 @@ impl SettingsDiff {
                    dimensions,
                    document_template,
                    url,
                    indexing_fragments,
                    search_fragments,
                    request,
                    response,
                    search_embedder,
@@ -769,6 +876,8 @@ impl SettingsDiff {
            mut document_template,
            mut document_template_max_bytes,
            mut url,
            mut indexing_fragments,
            mut search_fragments,
            mut request,
            mut response,
            mut headers,
@@ -794,6 +903,8 @@ impl SettingsDiff {
            document_template: new_document_template,
            document_template_max_bytes: new_document_template_max_bytes,
            url: new_url,
            indexing_fragments: new_indexing_fragments,
            search_fragments: new_search_fragments,
            request: new_request,
            response: new_response,
            headers: new_headers,
@@ -814,6 +925,8 @@ impl SettingsDiff {
                    &mut document_template,
                    &mut document_template_max_bytes,
                    &mut url,
                    &mut indexing_fragments,
                    &mut search_fragments,
                    &mut request,
                    &mut response,
                    &mut headers,
@@ -826,6 +939,8 @@ impl SettingsDiff {
                    new_document_template,
                    new_document_template_max_bytes,
                    new_url,
                    new_indexing_fragments,
                    new_search_fragments,
                    new_request,
                    new_response,
                    new_headers,
@@ -846,6 +961,8 @@ impl SettingsDiff {
                    dimensions,
                    document_template,
                    url,
                    indexing_fragments,
                    search_fragments,
                    request,
                    response,
                    headers,
@@ -875,6 +992,8 @@ impl SettingsDiff {
        document_template: &mut Setting<String>,
        document_template_max_bytes: &mut Setting<usize>,
        url: &mut Setting<String>,
        indexing_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>,
        search_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>,
        request: &mut Setting<serde_json::Value>,
        response: &mut Setting<serde_json::Value>,
        headers: &mut Setting<BTreeMap<String, String>>,
@@ -887,6 +1006,8 @@ impl SettingsDiff {
        new_document_template: Setting<String>,
        new_document_template_max_bytes: Setting<usize>,
        new_url: Setting<String>,
        new_indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
        new_search_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
        new_request: Setting<serde_json::Value>,
        new_response: Setting<serde_json::Value>,
        new_headers: Setting<BTreeMap<String, String>>,
@@ -902,6 +1023,8 @@ impl SettingsDiff {
            pooling,
            dimensions,
            url,
            indexing_fragments,
            search_fragments,
            request,
            response,
            document_template,
@@ -941,6 +1064,105 @@ impl SettingsDiff {
                }
            }
        }

        *search_fragments = match (std::mem::take(search_fragments), new_search_fragments) {
            (Setting::Set(search_fragments), Setting::Set(new_search_fragments)) => {
                Setting::Set(
                    search_fragments
                        .into_iter()
                        .merge_join_by(new_search_fragments, |(left, _), (right, _)| {
                            left.cmp(right)
                        })
                        .map(|eob| {
                            match eob {
                                // merge fragments
                                itertools::EitherOrBoth::Both((name, _), (_, right)) => {
                                    (name, right)
                                }
                                // unchanged fragment
                                itertools::EitherOrBoth::Left(left) => left,
                                // new fragment
                                itertools::EitherOrBoth::Right(right) => right,
                            }
                        })
                        .collect(),
                )
            }
            (_, Setting::Reset) => Setting::Reset,
            (left, Setting::NotSet) => left,
            (Setting::NotSet | Setting::Reset, Setting::Set(new_search_fragments)) => {
                Setting::Set(new_search_fragments)
            }
        };

        let mut regenerate_fragments = Vec::new();
        *indexing_fragments = match (std::mem::take(indexing_fragments), new_indexing_fragments) {
            (Setting::Set(fragments), Setting::Set(new_fragments)) => {
                Setting::Set(
                    fragments
                        .into_iter()
                        .merge_join_by(new_fragments, |(left, _), (right, _)| left.cmp(right))
                        .map(|eob| {
                            match eob {
                                // merge fragments
                                itertools::EitherOrBoth::Both(
                                    (name, left),
                                    (other_name, right),
                                ) => {
                                    if left == right {
                                        (name, left)
                                    } else {
                                        match right {
                                            Some(right) => {
                                                regenerate_fragments
                                                    .push((other_name, RegenerateFragment::Update));
                                                (name, Some(right))
                                            }
                                            None => {
                                                regenerate_fragments
                                                    .push((other_name, RegenerateFragment::Remove));
                                                (name, None)
                                            }
                                        }
                                    }
                                }
                                // unchanged fragment
                                itertools::EitherOrBoth::Left(left) => left,
                                // new fragment
                                itertools::EitherOrBoth::Right((name, right)) => {
                                    if right.is_some() {
                                        regenerate_fragments
                                            .push((name.clone(), RegenerateFragment::Add));
                                    }
                                    (name, right)
                                }
                            }
                        })
                        .collect(),
                )
            }
            // remove all fragments => move to document template
            (_, Setting::Reset) => {
                ReindexAction::push_action(reindex_action, ReindexAction::FullReindex);
                Setting::Reset
            }
            // add all fragments
            (Setting::NotSet | Setting::Reset, Setting::Set(new_fragments)) => {
                ReindexAction::push_action(reindex_action, ReindexAction::FullReindex);

                Setting::Set(new_fragments)
            }
            // no change
            (left, Setting::NotSet) => left,
        };
        if !regenerate_fragments.is_empty() {
            regenerate_fragments.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));
            ReindexAction::push_action(
                reindex_action,
                ReindexAction::RegenerateFragments(regenerate_fragments),
            );
        }

        if request.apply(new_request) {
            ReindexAction::push_action(reindex_action, ReindexAction::FullReindex);
        }
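The two merges above rely on `itertools::merge_join_by` walking two sorted key sequences in lockstep (BTreeMap iteration is key-ordered, which is what makes the name comparison valid): `Both` is a fragment present on both sides, `Left` exists only in the old settings, `Right` only in the new ones. In miniature:

use itertools::{EitherOrBoth, Itertools};

fn main() {
    // Two sorted (name, value) lists standing in for old and new fragments.
    let old = vec![("a", 1), ("b", 2)];
    let new = vec![("b", 3), ("c", 4)];
    let merged: Vec<_> = old
        .into_iter()
        .merge_join_by(new, |(left, _), (right, _)| left.cmp(right))
        .map(|eob| match eob {
            // Key on both sides: keep the name, take the new value.
            EitherOrBoth::Both((name, _), (_, new_value)) => (name, new_value),
            // Key only in the old settings: unchanged.
            EitherOrBoth::Left(old_only) => old_only,
            // Key only in the new settings: added.
            EitherOrBoth::Right(new_only) => new_only,
        })
        .collect();
    assert_eq!(merged, vec![("a", 1), ("b", 3), ("c", 4)]);
}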
@@ -972,10 +1194,16 @@ impl SettingsDiff {

impl ReindexAction {
    fn push_action(this: &mut Option<Self>, other: Self) {
        *this = match (*this, other) {
            (_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex),
            (Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex),
            (_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts),
        use ReindexAction::*;
        *this = match (this.take(), other) {
            (_, FullReindex) => Some(FullReindex),
            (Some(FullReindex), _) => Some(FullReindex),
            (_, RegenerateFragments(fragments)) => Some(RegenerateFragments(fragments)),
            (Some(RegenerateFragments(fragments)), RegeneratePrompts) => {
                Some(RegenerateFragments(fragments))
            }
            (Some(RegeneratePrompts), RegeneratePrompts) => Some(RegeneratePrompts),
            (None, RegeneratePrompts) => Some(RegeneratePrompts),
        }
    }
}
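The rewrite switches from `*this` to `this.take()` because `RegenerateFragments` now carries a `Vec` and the enum is no longer `Copy`. The resulting precedence is `FullReindex` > `RegenerateFragments` > `RegeneratePrompts`; a standalone sketch of the same ladder:

// Precedence in miniature: a stronger action always wins over a weaker one.
#[derive(Debug, Clone, PartialEq)]
enum Action {
    RegeneratePrompts,
    RegenerateFragments(Vec<String>),
    FullReindex,
}

fn push_action(this: &mut Option<Action>, other: Action) {
    use Action::*;
    *this = match (this.take(), other) {
        (_, FullReindex) | (Some(FullReindex), _) => Some(FullReindex),
        (_, RegenerateFragments(f)) | (Some(RegenerateFragments(f)), RegeneratePrompts) => {
            Some(RegenerateFragments(f))
        }
        (Some(RegeneratePrompts) | None, RegeneratePrompts) => Some(RegeneratePrompts),
    }
}

fn main() {
    let mut action = None;
    push_action(&mut action, Action::RegeneratePrompts);
    push_action(&mut action, Action::RegenerateFragments(vec!["text".into()]));
    // A weaker action does not demote a stronger one.
    push_action(&mut action, Action::RegeneratePrompts);
    assert_eq!(action, Some(Action::RegenerateFragments(vec!["text".into()])));
}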
@@ -988,6 +1216,8 @@ fn apply_default_for_source(
    pooling: &mut Setting<OverridePooling>,
    dimensions: &mut Setting<usize>,
    url: &mut Setting<String>,
    indexing_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>,
    search_fragments: &mut Setting<BTreeMap<String, Option<Fragment>>>,
    request: &mut Setting<serde_json::Value>,
    response: &mut Setting<serde_json::Value>,
    document_template: &mut Setting<String>,
@@ -1003,6 +1233,8 @@ fn apply_default_for_source(
            *pooling = Setting::Reset;
            *dimensions = Setting::NotSet;
            *url = Setting::NotSet;
            *indexing_fragments = Setting::NotSet;
            *search_fragments = Setting::NotSet;
            *request = Setting::NotSet;
            *response = Setting::NotSet;
            *headers = Setting::NotSet;
@@ -1015,6 +1247,8 @@ fn apply_default_for_source(
            *pooling = Setting::NotSet;
            *dimensions = Setting::Reset;
            *url = Setting::NotSet;
            *indexing_fragments = Setting::NotSet;
            *search_fragments = Setting::NotSet;
            *request = Setting::NotSet;
            *response = Setting::NotSet;
            *headers = Setting::NotSet;
@@ -1027,6 +1261,8 @@ fn apply_default_for_source(
            *pooling = Setting::NotSet;
            *dimensions = Setting::NotSet;
            *url = Setting::Reset;
            *indexing_fragments = Setting::NotSet;
            *search_fragments = Setting::NotSet;
            *request = Setting::NotSet;
            *response = Setting::NotSet;
            *headers = Setting::NotSet;
@@ -1039,6 +1275,8 @@ fn apply_default_for_source(
            *pooling = Setting::NotSet;
            *dimensions = Setting::Reset;
            *url = Setting::Reset;
            *indexing_fragments = Setting::Reset;
            *search_fragments = Setting::Reset;
            *request = Setting::Reset;
            *response = Setting::Reset;
            *headers = Setting::Reset;
@@ -1051,6 +1289,8 @@ fn apply_default_for_source(
            *pooling = Setting::NotSet;
            *dimensions = Setting::Reset;
            *url = Setting::NotSet;
            *indexing_fragments = Setting::NotSet;
            *search_fragments = Setting::NotSet;
            *request = Setting::NotSet;
            *response = Setting::NotSet;
            *document_template = Setting::NotSet;
@@ -1065,6 +1305,8 @@ fn apply_default_for_source(
            *pooling = Setting::NotSet;
            *dimensions = Setting::NotSet;
            *url = Setting::NotSet;
            *indexing_fragments = Setting::NotSet;
            *search_fragments = Setting::NotSet;
            *request = Setting::NotSet;
            *response = Setting::NotSet;
            *document_template = Setting::NotSet;
@@ -1131,6 +1373,8 @@ pub enum MetaEmbeddingSetting {
    DocumentTemplate,
    DocumentTemplateMaxBytes,
    Url,
    IndexingFragments,
    SearchFragments,
    Request,
    Response,
    Headers,
@@ -1153,6 +1397,8 @@ impl MetaEmbeddingSetting {
            DocumentTemplate => "documentTemplate",
            DocumentTemplateMaxBytes => "documentTemplateMaxBytes",
            Url => "url",
            IndexingFragments => "indexingFragments",
            SearchFragments => "searchFragments",
            Request => "request",
            Response => "response",
            Headers => "headers",
@@ -1176,6 +1422,8 @@ impl EmbeddingSettings {
        dimensions: &Setting<usize>,
        api_key: &Setting<String>,
        url: &Setting<String>,
        indexing_fragments: &Setting<BTreeMap<String, Option<Fragment>>>,
        search_fragments: &Setting<BTreeMap<String, Option<Fragment>>>,
        request: &Setting<serde_json::Value>,
        response: &Setting<serde_json::Value>,
        document_template: &Setting<String>,
@@ -1210,6 +1458,20 @@ impl EmbeddingSettings {
        )?;
        Self::check_setting(embedder_name, source, MetaEmbeddingSetting::ApiKey, context, api_key)?;
        Self::check_setting(embedder_name, source, MetaEmbeddingSetting::Url, context, url)?;
        Self::check_setting(
            embedder_name,
            source,
            MetaEmbeddingSetting::IndexingFragments,
            context,
            indexing_fragments,
        )?;
        Self::check_setting(
            embedder_name,
            source,
            MetaEmbeddingSetting::SearchFragments,
            context,
            search_fragments,
        )?;
        Self::check_setting(
            embedder_name,
            source,
@@ -1348,8 +1610,8 @@ impl EmbeddingSettings {
            ) => FieldStatus::Allowed,
            (
                OpenAi,
                Revision | Pooling | Request | Response | Headers | SearchEmbedder
                | IndexingEmbedder,
                Revision | Pooling | IndexingFragments | SearchFragments | Request | Response
                | Headers | SearchEmbedder | IndexingEmbedder,
                _,
            ) => FieldStatus::Disallowed,
            (
@@ -1359,8 +1621,8 @@ impl EmbeddingSettings {
            ) => FieldStatus::Allowed,
            (
                HuggingFace,
                ApiKey | Dimensions | Url | Request | Response | Headers | SearchEmbedder
                | IndexingEmbedder,
                ApiKey | Dimensions | Url | IndexingFragments | SearchFragments | Request
                | Response | Headers | SearchEmbedder | IndexingEmbedder,
                _,
            ) => FieldStatus::Disallowed,
            (Ollama, Model, _) => FieldStatus::Mandatory,
@@ -1371,8 +1633,8 @@ impl EmbeddingSettings {
            ) => FieldStatus::Allowed,
            (
                Ollama,
                Revision | Pooling | Request | Response | Headers | SearchEmbedder
                | IndexingEmbedder,
                Revision | Pooling | IndexingFragments | SearchFragments | Request | Response
                | Headers | SearchEmbedder | IndexingEmbedder,
                _,
            ) => FieldStatus::Disallowed,
            (UserProvided, Dimensions, _) => FieldStatus::Mandatory,
@@ -1386,6 +1648,8 @@ impl EmbeddingSettings {
                | DocumentTemplate
                | DocumentTemplateMaxBytes
                | Url
                | IndexingFragments
                | SearchFragments
                | Request
                | Response
                | Headers
@@ -1404,6 +1668,10 @@ impl EmbeddingSettings {
                | Headers,
                _,
            ) => FieldStatus::Allowed,
            (Rest, IndexingFragments, NotNested | Indexing) => FieldStatus::Allowed,
            (Rest, IndexingFragments, Search) => FieldStatus::Disallowed,
            (Rest, SearchFragments, NotNested | Search) => FieldStatus::Allowed,
            (Rest, SearchFragments, Indexing) => FieldStatus::Disallowed,
            (Rest, Model | Revision | Pooling | SearchEmbedder | IndexingEmbedder, _) => {
                FieldStatus::Disallowed
            }
@@ -1419,6 +1687,8 @@ impl EmbeddingSettings {
                | DocumentTemplate
                | DocumentTemplateMaxBytes
                | Url
                | IndexingFragments
                | SearchFragments
                | Request
                | Response
                | Headers,
@@ -1512,6 +1782,11 @@ impl std::fmt::Display for EmbedderSource {
    }
}

#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)]
pub struct Fragment {
    pub value: serde_json::Value,
}

impl EmbeddingSettings {
    fn from_hugging_face(
        super::hf::EmbedderOptions {
@@ -1534,6 +1809,8 @@ impl EmbeddingSettings {
            document_template,
            document_template_max_bytes,
            url: Setting::NotSet,
            indexing_fragments: Setting::NotSet,
            search_fragments: Setting::NotSet,
            request: Setting::NotSet,
            response: Setting::NotSet,
            headers: Setting::NotSet,
@@ -1566,6 +1843,8 @@ impl EmbeddingSettings {
            document_template,
            document_template_max_bytes,
            url: Setting::some_or_not_set(url),
            indexing_fragments: Setting::NotSet,
            search_fragments: Setting::NotSet,
            request: Setting::NotSet,
            response: Setting::NotSet,
            headers: Setting::NotSet,
@@ -1598,6 +1877,8 @@ impl EmbeddingSettings {
            document_template,
            document_template_max_bytes,
            url: Setting::some_or_not_set(url),
            indexing_fragments: Setting::NotSet,
            search_fragments: Setting::NotSet,
            request: Setting::NotSet,
            response: Setting::NotSet,
            headers: Setting::NotSet,
@@ -1622,6 +1903,8 @@ impl EmbeddingSettings {
            document_template: Setting::NotSet,
            document_template_max_bytes: Setting::NotSet,
            url: Setting::NotSet,
            indexing_fragments: Setting::NotSet,
            search_fragments: Setting::NotSet,
            request: Setting::NotSet,
            response: Setting::NotSet,
            headers: Setting::NotSet,
@@ -1638,6 +1921,8 @@ impl EmbeddingSettings {
            dimensions,
            url,
            request,
            indexing_fragments,
            search_fragments,
            response,
            distribution,
            headers,
@@ -1653,9 +1938,39 @@ impl EmbeddingSettings {
            pooling: Setting::NotSet,
            api_key: Setting::some_or_not_set(api_key),
            dimensions: Setting::some_or_not_set(dimensions),
            document_template,
            document_template_max_bytes,
            document_template: if indexing_fragments.is_empty() && search_fragments.is_empty() {
                document_template
            } else {
                Setting::NotSet
            },
            document_template_max_bytes: if indexing_fragments.is_empty()
                && search_fragments.is_empty()
            {
                document_template_max_bytes
            } else {
                Setting::NotSet
            },
            url: Setting::Set(url),
            indexing_fragments: if indexing_fragments.is_empty() {
                Setting::NotSet
            } else {
                Setting::Set(
                    indexing_fragments
                        .into_iter()
                        .map(|(name, fragment)| (name, Some(Fragment { value: fragment })))
                        .collect(),
                )
            },
            search_fragments: if search_fragments.is_empty() {
                Setting::NotSet
            } else {
                Setting::Set(
                    search_fragments
                        .into_iter()
                        .map(|(name, fragment)| (name, Some(Fragment { value: fragment })))
                        .collect(),
                )
            },
            request: Setting::Set(request),
            response: Setting::Set(response),
            distribution: Setting::some_or_not_set(distribution),
@@ -1714,6 +2029,8 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
            document_template: Setting::NotSet,
            document_template_max_bytes: Setting::NotSet,
            url: Setting::NotSet,
            indexing_fragments: Setting::NotSet,
            search_fragments: Setting::NotSet,
            request: Setting::NotSet,
            response: Setting::NotSet,
            headers: Setting::NotSet,
@@ -1786,6 +2103,8 @@ impl From<EmbeddingSettings> for SubEmbeddingSettings {
            document_template,
            document_template_max_bytes,
            url,
            indexing_fragments,
            search_fragments,
            request,
            response,
            headers,
@@ -1804,6 +2123,8 @@ impl From<EmbeddingSettings> for SubEmbeddingSettings {
            document_template,
            document_template_max_bytes,
            url,
            indexing_fragments,
            search_fragments,
            request,
            response,
            headers,
@@ -1828,6 +2149,8 @@ impl From<EmbeddingSettings> for EmbeddingConfig {
            document_template,
            document_template_max_bytes,
            url,
            indexing_fragments,
            search_fragments,
            request,
            response,
            distribution,
@@ -1879,6 +2202,8 @@ impl From<EmbeddingSettings> for EmbeddingConfig {
            EmbedderSource::Rest => SubEmbedderOptions::rest(
                url.set().unwrap(),
                api_key,
                indexing_fragments,
                search_fragments,
                request.set().unwrap(),
                response.set().unwrap(),
                headers,
@@ -1922,6 +2247,8 @@ impl SubEmbedderOptions {
            document_template: _,
            document_template_max_bytes: _,
            url,
            indexing_fragments,
            search_fragments,
            request,
            response,
            headers,
@@ -1944,6 +2271,8 @@ impl SubEmbedderOptions {
            EmbedderSource::Rest => Self::rest(
                url.set().unwrap(),
                api_key,
                indexing_fragments,
                search_fragments,
                request.set().unwrap(),
                response.set().unwrap(),
                headers,
@@ -2010,9 +2339,13 @@ impl SubEmbedderOptions {
            distribution: distribution.set(),
        })
    }

    #[allow(clippy::too_many_arguments)]
    fn rest(
        url: String,
        api_key: Setting<String>,
        indexing_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
        search_fragments: Setting<BTreeMap<String, Option<Fragment>>>,
        request: serde_json::Value,
        response: serde_json::Value,
        headers: Setting<BTreeMap<String, String>>,
@@ -2027,6 +2360,22 @@ impl SubEmbedderOptions {
            response,
            distribution: distribution.set(),
            headers: headers.set().unwrap_or_default(),
            search_fragments: search_fragments
                .set()
                .unwrap_or_default()
                .into_iter()
                .filter_map(|(name, fragment)| {
                    Some((name, fragment.map(|fragment| fragment.value)?))
                })
                .collect(),
            indexing_fragments: indexing_fragments
                .set()
                .unwrap_or_default()
                .into_iter()
                .filter_map(|(name, fragment)| {
                    Some((name, fragment.map(|fragment| fragment.value)?))
                })
                .collect(),
        })
    }
    fn ollama(
@@ -2066,3 +2415,29 @@ impl From<SubEmbedderOptions> for EmbedderOptions {
        }
    }
}

pub(crate) fn fragments_from_settings(
    setting: &Setting<EmbeddingSettings>,
) -> impl Iterator<Item = String> + '_ {
    let Some(setting) = setting.as_ref().set() else { return Either::Left(None.into_iter()) };

    let filter_map = |(name, fragment): (&String, &Option<Fragment>)| {
        if fragment.is_some() {
            Some(name.clone())
        } else {
            None
        }
    };

    if let Some(setting) = setting.indexing_fragments.as_ref().set() {
        Either::Right(setting.iter().filter_map(filter_map))
    } else {
        let Some(setting) = setting.indexing_embedder.as_ref().set() else {
            return Either::Left(None.into_iter());
        };
        let Some(setting) = setting.indexing_fragments.as_ref().set() else {
            return Either::Left(None.into_iter());
        };
        Either::Right(setting.iter().filter_map(filter_map))
    }
}
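`fragments_from_settings` returns one of two concrete iterator types behind a single `impl Iterator` signature: `either::Either` implements `Iterator` when both arms do, so the empty case can be expressed as `Either::Left(None.into_iter())` without boxing. The trick in isolation:

use either::Either;

// Either implements Iterator when both arms do, so one function can return
// either an empty iterator or a real one without a Box<dyn Iterator>.
fn names_or_nothing(names: Option<&Vec<String>>) -> impl Iterator<Item = String> + '_ {
    match names {
        None => Either::Left(None.into_iter()),
        Some(names) => Either::Right(names.iter().cloned()),
    }
}

fn main() {
    let names = vec!["text".to_string(), "poster".to_string()];
    assert_eq!(names_or_nothing(Some(&names)).count(), 2);
    assert_eq!(names_or_nothing(None).count(), 0);
}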