Compare commits


1 Commit

Author SHA1 Message Date
934b73142d reduce the number of computed prefix 2025-03-27 17:57:57 +01:00
7 changed files with 111 additions and 168 deletions

View File

@@ -454,10 +454,7 @@ impl ErrorCode for milli::Error {
}
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
UserError::InvalidVectorDimensions { .. }
| UserError::InvalidIndexingVectorDimensions { .. } => {
Code::InvalidVectorDimensions
}
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
UserError::InvalidVectorsMapType { .. }
| UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
UserError::TooManyVectors(_, _) => Code::TooManyVectors,

View File

@@ -164,87 +164,6 @@ async fn add_remove_user_provided() {
"###);
}
#[actix_rt::test]
async fn user_provide_mismatched_embedding_dimension() {
let server = Server::new().await;
let index = server.index("doggo");
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await.succeeded();
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"batchUid": "[batch_uid]",
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
"code": "invalid_vector_dimensions",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let new_document = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }},
]);
let (response, code) = index.add_documents(new_document, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"batchUid": "[batch_uid]",
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
"code": "invalid_vector_dimensions",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}
async fn generate_default_user_provided_documents(server: &Server) -> Index {
let index = server.index("doggo");

View File

@@ -129,14 +129,6 @@ and can not be more than 511 bytes.", .document_id.to_string()
InvalidGeoField(#[from] GeoError),
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
InvalidVectorDimensions { expected: usize, found: usize },
#[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
InvalidIndexingVectorDimensions {
embedder_name: String,
document_id: String,
embedding_index: usize,
expected: usize,
found: usize,
},
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
InvalidVectorsMapType { document_id: String, value: Value },
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]

View File

@@ -173,19 +173,17 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ranking_rule_scores.push(ScoreDetails::Skipped);
// remove candidates from the universe without adding them to result if their score is below the threshold
let is_below_threshold =
ranking_score_threshold.is_some_and(|ranking_score_threshold| {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
current_score < ranking_score_threshold
});
if is_below_threshold {
all_candidates -= &bucket;
all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
} else {
maybe_add_to_results!(bucket);
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
if current_score < ranking_score_threshold {
all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}
maybe_add_to_results!(bucket);
ranking_rule_scores.pop();
if cur_ranking_rule_index == 0 {
@@ -239,24 +237,23 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
);
// remove candidates from the universe without adding them to result if their score is below the threshold
let is_below_threshold = ranking_score_threshold.is_some_and(|ranking_score_threshold| {
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
current_score < ranking_score_threshold
});
if current_score < ranking_score_threshold {
all_candidates -=
next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}
ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
if cur_ranking_rule_index == ranking_rules_len - 1
|| (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1)
|| cur_offset + (next_bucket.candidates.len() as usize) < from
|| is_below_threshold
{
if is_below_threshold {
all_candidates -= &next_bucket.candidates;
all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
} else {
maybe_add_to_results!(next_bucket.candidates);
}
maybe_add_to_results!(next_bucket.candidates);
ranking_rule_scores.pop();
continue;
}

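The rewritten threshold handling above can be read as a small standalone routine: when a ranking score threshold is set and the bucket's global score falls below it, the bucket and the rest of the current rule's universe are discarded and the loop backtracks; otherwise the bucket goes into the results. The following is a minimal sketch of that control flow, not the crate's code: it assumes the roaring crate, uses plain f64 values in place of ScoreDetails, collects results into a Vec<u32> instead of going through maybe_add_to_results!, and signals the back!(); continue; pair with a boolean return.

use roaring::RoaringBitmap;

/// Stand-in for ScoreDetails::global_score: the per-rule scores are assumed
/// to already be plain f64 values (a simplification for this sketch).
fn global_score(ranking_rule_scores: &[f64]) -> f64 {
    ranking_rule_scores.iter().product()
}

/// Mirrors the new control flow of the threshold check in bucket_sort.
/// Returns true when the caller should backtrack to the previous rule.
fn handle_bucket(
    ranking_rule_scores: &[f64],
    ranking_score_threshold: Option<f64>,
    bucket: &RoaringBitmap,
    rule_universe: &RoaringBitmap,
    all_candidates: &mut RoaringBitmap,
    results: &mut Vec<u32>,
) -> bool {
    if let Some(threshold) = ranking_score_threshold {
        let current_score = global_score(ranking_rule_scores);
        if current_score < threshold {
            // Below the threshold: drop the bucket and the remaining universe
            // of the current rule in a single subtraction of their union.
            *all_candidates -= bucket | rule_universe;
            return true;
        }
    }
    // At or above the threshold (or no threshold configured): keep the bucket.
    results.extend(bucket.iter());
    false
}

Compared with the previous is_some_and closure plus if/else, the below-threshold case now subtracts one union (bucket | universe) and bails out immediately, so the keep path no longer sits inside an else branch.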
View File

@@ -37,12 +37,12 @@ pub struct DatabaseCache<'ctx> {
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<RoaringBitmap>>,
pub word_positions: FxHashMap<Interned<String>, Vec<u16>>,
pub word_prefix_positions: FxHashMap<Interned<String>, Vec<u16>>,
pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<RoaringBitmap>>,
pub word_fids: FxHashMap<Interned<String>, Vec<u16>>,
pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
}
@@ -562,14 +562,46 @@ impl<'ctx> SearchContext<'ctx> {
return Ok(None);
}
DatabaseCache::get_value(
self.txn,
(word_prefix, fid),
&(self.word_interner.get(word_prefix).as_str(), fid),
&mut self.db_cache.word_prefix_fid_docids,
universe,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
)
let cache = &mut self.db_cache.word_prefix_fid_docids;
let prefix_db = &self.index.word_prefix_fid_docids;
let db = &self.index.word_fid_docids;
if let Entry::Vacant(entry) = cache.entry((word_prefix, fid)) {
let word_prefix_bytes = self.word_interner.get(word_prefix).as_bytes().to_owned();
let word_prefix_str = std::str::from_utf8(&word_prefix_bytes).unwrap();
match prefix_db.get(self.txn, &(word_prefix_str, fid))? {
Some(mut bitmap) => {
if let Some(universe) = universe {
bitmap &= universe;
}
entry.insert(Some(bitmap));
}
None => {
let mut key = word_prefix_bytes.clone();
key.push(0);
let remap_key_type = db
.remap_key_type::<Bytes>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
let mut bitmap = RoaringBitmap::new();
for result in remap_key_type {
let ((_, pos), value) = result?;
if pos == fid {
if let Some(universe) = universe {
bitmap |= value & universe;
} else {
bitmap |= value;
}
}
}
entry.insert(Some(bitmap));
}
}
}
Ok(cache.get(&(word_prefix, fid)).unwrap().clone())
}
pub fn get_db_word_fids(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
@@ -605,6 +637,7 @@ impl<'ctx> SearchContext<'ctx> {
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
key.push(0);
let mut fids = vec![];
// TODO: This is no more exhaustive, we should iterate over all fids.
let remap_key_type = self
.index
.word_prefix_fid_docids
@@ -612,11 +645,7 @@ impl<'ctx> SearchContext<'ctx> {
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
let ((_, fid), value) = result?;
// filling other caches to avoid searching for them again
self.db_cache
.word_prefix_fid_docids
.insert((word_prefix, fid), Some(Cow::Borrowed(value)));
let ((_, fid), _value) = result?;
fids.push(fid);
}
entry.insert(fids.clone());
@@ -648,14 +677,46 @@ impl<'ctx> SearchContext<'ctx> {
word_prefix: Interned<String>,
position: u16,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value(
self.txn,
(word_prefix, position),
&(self.word_interner.get(word_prefix).as_str(), position),
&mut self.db_cache.word_prefix_position_docids,
universe,
self.index.word_prefix_position_docids.remap_data_type::<Bytes>(),
)
let cache = &mut self.db_cache.word_prefix_position_docids;
let prefix_db = &self.index.word_prefix_position_docids;
let db = &self.index.word_position_docids;
if let Entry::Vacant(entry) = cache.entry((word_prefix, position)) {
let word_prefix_bytes = self.word_interner.get(word_prefix).as_bytes().to_owned();
let word_prefix_str = std::str::from_utf8(&word_prefix_bytes).unwrap();
match prefix_db.get(self.txn, &(word_prefix_str, position))? {
Some(mut bitmap) => {
if let Some(universe) = universe {
bitmap &= universe;
}
entry.insert(Some(bitmap));
}
None => {
let mut key = word_prefix_bytes.clone();
key.push(0);
let remap_key_type = db
.remap_key_type::<Bytes>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
let mut bitmap = RoaringBitmap::new();
for result in remap_key_type {
let ((_, pos), value) = result?;
if pos == position {
if let Some(universe) = universe {
bitmap |= value & universe;
} else {
bitmap |= value;
}
}
}
entry.insert(Some(bitmap));
}
}
}
Ok(cache.get(&(word_prefix, position)).unwrap().clone())
}
pub fn get_db_word_positions(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
@@ -696,6 +757,7 @@ impl<'ctx> SearchContext<'ctx> {
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
key.push(0);
let mut positions = vec![];
// TODO: This is no more exhaustive, we should iterate over all positions.
let remap_key_type = self
.index
.word_prefix_position_docids
@@ -703,11 +765,7 @@ impl<'ctx> SearchContext<'ctx> {
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
let ((_, position), value) = result?;
// filling other caches to avoid searching for them again
self.db_cache
.word_prefix_position_docids
.insert((word_prefix, position), Some(Cow::Borrowed(value)));
let ((_, position), _value) = result?;
positions.push(position);
}
entry.insert(positions.clone());

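Both prefix getters above now follow the same pattern: use the precomputed (prefix, fid) or (prefix, position) entry when the prefix database has one, and otherwise union the bitmaps of every word carrying that prefix from the per-word database. Below is a minimal sketch of that fallback, not the crate's code: it assumes the roaring crate and swaps the heed/LMDB databases, codecs, transaction, and DatabaseCache for plain in-memory BTreeMaps, with a starts_with scan standing in for the keyed prefix_iter.

use std::collections::BTreeMap;

use roaring::RoaringBitmap;

// Hypothetical in-memory stand-ins for the two databases involved.
type PrefixFidDb = BTreeMap<(String, u16), RoaringBitmap>;
type WordFidDb = BTreeMap<(String, u16), RoaringBitmap>;

/// Prefer the precomputed (prefix, fid) entry; if it is absent, union the
/// bitmaps of every word starting with the prefix for the requested fid,
/// then restrict the result to the universe when one is given.
fn word_prefix_fid_docids(
    prefix_db: &PrefixFidDb,
    word_db: &WordFidDb,
    prefix: &str,
    fid: u16,
    universe: Option<&RoaringBitmap>,
) -> RoaringBitmap {
    let mut bitmap = match prefix_db.get(&(prefix.to_owned(), fid)) {
        Some(precomputed) => precomputed.clone(),
        None => word_db
            .iter()
            .filter(|((word, f), _)| *f == fid && word.starts_with(prefix))
            .fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids),
    };
    if let Some(universe) = universe {
        bitmap &= universe;
    }
    bitmap
}

The real code intersects with the universe inside each branch (bitmap &= universe for the precomputed entry, bitmap |= value & universe during the scan); intersecting once at the end, as in this sketch, produces the same result.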
View File

@@ -121,7 +121,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
// do we have set embeddings?
if let Some(embeddings) = new_vectors.embeddings {
chunks.set_vectors(
update.external_document_id(),
update.docid(),
embeddings
.into_vec(&context.doc_alloc, embedder_name)
@@ -129,7 +128,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
document_id: update.external_document_id().to_string(),
error: error.to_string(),
})?,
)?;
);
} else if new_vectors.regenerate {
let new_rendered = prompt.render_document(
update.external_document_id(),
@@ -210,7 +209,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
if let Some(embeddings) = new_vectors.embeddings {
chunks.set_vectors(
insertion.external_document_id(),
insertion.docid(),
embeddings
.into_vec(&context.doc_alloc, embedder_name)
@@ -220,7 +218,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
.to_string(),
error: error.to_string(),
})?,
)?;
);
} else if new_vectors.regenerate {
let rendered = prompt.render_document(
insertion.external_document_id(),
@@ -275,7 +273,6 @@ struct Chunks<'a, 'b, 'extractor> {
embedder: &'a Embedder,
embedder_id: u8,
embedder_name: &'a str,
dimensions: usize,
prompt: &'a Prompt,
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
@@ -300,7 +297,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
let texts = BVec::with_capacity_in(capacity, doc_alloc);
let ids = BVec::with_capacity_in(capacity, doc_alloc);
let dimensions = embedder.dimensions();
Self {
texts,
ids,
@@ -313,7 +309,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
embedder_name,
user_provided,
has_manual_generation: None,
dimensions,
}
}
@@ -495,25 +490,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
}
}
fn set_vectors(
&self,
external_docid: &'a str,
docid: DocumentId,
embeddings: Vec<Embedding>,
) -> Result<()> {
for (embedding_index, embedding) in embeddings.iter().enumerate() {
if embedding.len() != self.dimensions {
return Err(UserError::InvalidIndexingVectorDimensions {
expected: self.dimensions,
found: embedding.len(),
embedder_name: self.embedder_name.to_string(),
document_id: external_docid.to_string(),
embedding_index,
}
.into());
}
}
fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
Ok(())
}
}

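The set_vectors change above drops the per-embedding dimension validation along with the external_docid parameter and the dimensions field that supported it (and, with them, the InvalidIndexingVectorDimensions variant and its test earlier in this diff). For reference, the removed check amounted to the following sketch, with a plain String error standing in for UserError::InvalidIndexingVectorDimensions and Vec<f32> standing in for Embedding.

/// Sketch of the removed per-embedding dimension check.
fn check_embedding_dimensions(
    external_docid: &str,
    embedder_name: &str,
    expected: usize,
    embeddings: &[Vec<f32>],
) -> Result<(), String> {
    for (embedding_index, embedding) in embeddings.iter().enumerate() {
        let found = embedding.len();
        if found != expected {
            // Same shape as the error message shown in the removed test above.
            return Err(format!(
                "Invalid vector dimensions in document with id `{external_docid}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}"
            ));
        }
    }
    Ok(())
}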
View File

@@ -291,6 +291,9 @@ impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> {
let (_word, pos) = StrBEU16Codec::bytes_decode(key).map_err(Error::Decoding)?;
positions.entry(pos).or_insert_with(Vec::new).push(bytes);
}
// We remove all the positions that have less than 100 bitmaps.
positions.retain(|_, bitmaps| bitmaps.len() > 100);
assert!(prefixes_bitmaps.insert(prefix.as_str(), positions).is_none());
}
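The retain call above keeps only the prefix/position pairs backed by more than 100 bitmaps; sparser pairs are no longer precomputed and are instead resolved on the fly by the search-time cache changes earlier in this diff (the TODO notes there flag that the prefix-derived fid and position lists are no longer exhaustive). A toy example, with Vec<u8> standing in for the list of serialized bitmaps, pins down the boundary:

use std::collections::HashMap;

fn main() {
    let mut positions: HashMap<u16, Vec<u8>> = HashMap::new();
    positions.insert(0, vec![0; 101]); // kept: 101 > 100
    positions.insert(1, vec![0; 100]); // dropped: 100 is not > 100
    // Entries with 100 or fewer bitmaps are removed by the retain.
    positions.retain(|_, bitmaps| bitmaps.len() > 100);
    assert_eq!(positions.len(), 1);
    assert!(positions.contains_key(&0));
}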