Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-11-23 13:16:33 +00:00

Compare commits: 1 commit on branch prototype-...reduce-pre

| Author | SHA1 | Date |
|---|---|---|
|  | 934b73142d |  |

Cargo.lock (generated): 3

@@ -394,7 +394,8 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
[[package]]
name = "arroy"
version = "0.6.1"
source = "git+https://github.com/meilisearch/arroy.git?rev=5b748bac2c69c65a97980901b02067a3a545e357#5b748bac2c69c65a97980901b02067a3a545e357"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08e6111f351d004bd13e95ab540721272136fd3218b39d3ec95a2ea1c4e6a0a6"
dependencies = [
"bytemuck",
"byteorder",

@@ -625,8 +625,8 @@ impl IndexScheduler {
task_id: Option<TaskId>,
dry_run: bool,
) -> Result<Task> {
// if the task doesn't delete or cancel anything and 40% of the task queue is full, we must refuse to enqueue the incoming task
if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } | KindWithContent::TaskCancelation { tasks, .. } if !tasks.is_empty())
// if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task
if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty())
&& (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 40
{
return Err(Error::NoSpaceLeftInTaskQueue);

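For context on the guard in the hunk above: the scheduler refuses a new task once the tasks LMDB environment is more than 40% full, based on `non_free_pages_size()` and the environment `map_size`, unless the task itself frees space. A minimal standalone sketch of that arithmetic (hypothetical helper and made-up sizes, not the scheduler's real API):

```rust
/// Hedged sketch of the fullness guard above; `non_free_pages_size` and
/// `map_size` stand in for the values heed reports for the task database.
fn must_refuse_enqueue(non_free_pages_size: u64, map_size: u64, frees_space: bool) -> bool {
    // Integer arithmetic: used * 100 / total is the fill percentage, rounded down.
    let fill_percent = non_free_pages_size * 100 / map_size;
    // Tasks that free space (a deletion/cancelation with a non-empty selection) are always accepted.
    !frees_space && fill_percent > 40
}

fn main() {
    // 450 MiB used out of a 1 GiB map is ~43% full, so a regular task is refused...
    assert!(must_refuse_enqueue(450 << 20, 1 << 30, false));
    // ...but a task that deletes existing tasks still goes through.
    assert!(!must_refuse_enqueue(450 << 20, 1 << 30, true));
}
```
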
@@ -292,6 +292,8 @@ impl Queue {
return Ok(task);
}

// Get rid of the mutability.
let task = task;
self.tasks.register(wtxn, &task)?;

Ok(task)

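The `// Get rid of the mutability.` lines in the hunk above rely on rebinding: shadowing a `mut` binding with an immutable one of the same name freezes it for the rest of the function. A tiny illustration of the pattern (hypothetical `Task` struct, not the queue's real type):

```rust
#[derive(Debug)]
struct Task {
    uid: u32,
    enqueued: bool,
}

fn main() {
    let mut task = Task { uid: 0, enqueued: false };
    task.enqueued = true;

    // Get rid of the mutability: rebinding under the same name makes the value
    // read-only for the rest of the scope.
    let task = task;
    // task.enqueued = false; // would no longer compile
    println!("{task:?}");
}
```
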
@@ -364,7 +364,7 @@ fn test_task_queue_is_full() {
// we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code
snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice");

// Even the task deletion and cancelation that don't delete anything should be refused
// Even the task deletion that doesn't delete anything shouldn't be accepted
let result = index_scheduler
.register(
KindWithContent::TaskDeletion { query: S("test"), tasks: RoaringBitmap::new() },

@@ -373,39 +373,10 @@ fn test_task_queue_is_full() {
)
.unwrap_err();
snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");
let result = index_scheduler
.register(
KindWithContent::TaskCancelation { query: S("test"), tasks: RoaringBitmap::new() },
None,
false,
)
.unwrap_err();
snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");

// we won't be able to test this error in an integration test thus as a best effort test I still ensure the error return the expected error code
snapshot!(format!("{:?}", result.error_code()), @"NoSpaceLeftOnDevice");

// But a task cancelation that cancel something should work
index_scheduler
.register(
KindWithContent::TaskCancelation { query: S("test"), tasks: (0..100).collect() },
None,
false,
)
.unwrap();
handle.advance_one_successful_batch();

// But we should still be forbidden from enqueuing new tasks
let result = index_scheduler
.register(
KindWithContent::IndexCreation { index_uid: S("doggo"), primary_key: None },
None,
false,
)
.unwrap_err();
snapshot!(result, @"Meilisearch cannot receive write operations because the limit of the task database has been reached. Please delete tasks to continue performing write operations.");

// And a task deletion that delete something should works
// But a task deletion that delete something should works
index_scheduler
.register(
KindWithContent::TaskDeletion { query: S("test"), tasks: (0..100).collect() },

@@ -454,10 +454,7 @@ impl ErrorCode for milli::Error {
}
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
UserError::InvalidVectorDimensions { .. }
| UserError::InvalidIndexingVectorDimensions { .. } => {
Code::InvalidVectorDimensions
}
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
UserError::InvalidVectorsMapType { .. }
| UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
UserError::TooManyVectors(_, _) => Code::TooManyVectors,

@@ -164,87 +164,6 @@ async fn add_remove_user_provided() {
"###);
}

#[actix_rt::test]
async fn user_provide_mismatched_embedding_dimension() {
let server = Server::new().await;
let index = server.index("doggo");

let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await.succeeded();

let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"batchUid": "[batch_uid]",
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
"code": "invalid_vector_dimensions",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);

let new_document = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }},
]);
let (response, code) = index.add_documents(new_document, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(response.uid()).await;
snapshot!(task, @r###"
{
"uid": "[uid]",
"batchUid": "[batch_uid]",
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
"code": "invalid_vector_dimensions",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}

async fn generate_default_user_provided_documents(server: &Server) -> Index {
let index = server.index("doggo");

@@ -87,8 +87,7 @@ rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838
"no_time",
"sync",
] }
# arroy = "0.6.1"
arroy = { git = "https://github.com/meilisearch/arroy.git", rev = "5b748bac2c69c65a97980901b02067a3a545e357" } # incremental update
arroy = "0.6.1"
rand = "0.8.5"
tracing = "0.1.41"
ureq = { version = "2.12.1", features = ["json"] }

@@ -129,14 +129,6 @@ and can not be more than 511 bytes.", .document_id.to_string()
InvalidGeoField(#[from] GeoError),
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
InvalidVectorDimensions { expected: usize, found: usize },
#[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
InvalidIndexingVectorDimensions {
embedder_name: String,
document_id: String,
embedding_index: usize,
expected: usize,
found: usize,
},
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
InvalidVectorsMapType { document_id: String, value: Value },
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]

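The `InvalidIndexingVectorDimensions` variant shown above is a `thiserror`-style variant whose named fields are spliced into the message, the same wording that appears in the snapshots of the removed test earlier. A reduced sketch of how such a variant renders, assuming the `thiserror` crate:

```rust
use thiserror::Error;

/// Hedged sketch of the variant above, trimmed to the fields used in the message.
#[derive(Debug, Error)]
enum SketchError {
    #[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
    InvalidIndexingVectorDimensions {
        embedder_name: String,
        document_id: String,
        embedding_index: usize,
        expected: usize,
        found: usize,
    },
}

fn main() {
    let err = SketchError::InvalidIndexingVectorDimensions {
        embedder_name: "manual".into(),
        document_id: "0".into(),
        embedding_index: 0,
        expected: 3,
        found: 2,
    };
    // Prints the same wording as the snapshots in the test above.
    println!("{err}");
}
```
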
@@ -266,15 +266,12 @@ impl Step for arroy::MainStep {
"writing the descendants and metadata"
}
arroy::MainStep::RetrieveTheUpdatedItems => "retrieve the updated items",
arroy::MainStep::RetrievingTheTreeAndItemNodes => "retrieving the tree and item nodes",
arroy::MainStep::UpdatingTheTrees => "updating the trees",
arroy::MainStep::CreateNewTrees => "create new trees",
arroy::MainStep::WritingNodesToDatabase => "writing nodes to database",
arroy::MainStep::DeleteExtraneousTrees => "delete extraneous trees",
arroy::MainStep::WriteTheMetadata => "write the metadata",
arroy::MainStep::RetrievingTheItemsIds => "retrieving the items ids",
arroy::MainStep::RetrievingTheUsedTreeNodes => "retrieving the used tree nodes",
arroy::MainStep::DeletingExtraTrees => "deleting extra trees",
arroy::MainStep::RemoveItemsFromExistingTrees => "remove items from existing trees",
arroy::MainStep::InsertItemsInCurrentTrees => "insert items in current trees",
arroy::MainStep::IncrementalIndexLargeDescendants => {
"incremental index large descendants"
}
}
.into()
}

@@ -173,19 +173,17 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ranking_rule_scores.push(ScoreDetails::Skipped);

// remove candidates from the universe without adding them to result if their score is below the threshold
let is_below_threshold =
ranking_score_threshold.is_some_and(|ranking_score_threshold| {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
current_score < ranking_score_threshold
});

if is_below_threshold {
all_candidates -= &bucket;
all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
} else {
maybe_add_to_results!(bucket);
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
if current_score < ranking_score_threshold {
all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}

maybe_add_to_results!(bucket);

ranking_rule_scores.pop();

if cur_ranking_rule_index == 0 {

@@ -239,24 +237,23 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
);

// remove candidates from the universe without adding them to result if their score is below the threshold
let is_below_threshold = ranking_score_threshold.is_some_and(|ranking_score_threshold| {
if let Some(ranking_score_threshold) = ranking_score_threshold {
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
current_score < ranking_score_threshold
});
if current_score < ranking_score_threshold {
all_candidates -=
next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
back!();
continue;
}
}

ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;

if cur_ranking_rule_index == ranking_rules_len - 1
|| (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1)
|| cur_offset + (next_bucket.candidates.len() as usize) < from
|| is_below_threshold
{
if is_below_threshold {
all_candidates -= &next_bucket.candidates;
all_candidates -= &ranking_rule_universes[cur_ranking_rule_index];
} else {
maybe_add_to_results!(next_bucket.candidates);
}
maybe_add_to_results!(next_bucket.candidates);
ranking_rule_scores.pop();
continue;
}

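Both `bucket_sort` hunks deal with the same step: once a bucket's ranking-rule scores are known, compute a global score and, when it falls below the ranking score threshold, remove the bucket's candidates from the universe instead of adding them to the results. A simplified, self-contained sketch of that filtering decision (plain floats and `BTreeSet` standing in for `ScoreDetails` and `RoaringBitmap`):

```rust
use std::collections::BTreeSet;

/// Hedged sketch: `scores` stands in for the per-rule score details, the global
/// score here is just their product, and BTreeSet replaces the bitmap types.
fn filter_bucket(
    universe: &mut BTreeSet<u32>,
    results: &mut Vec<u32>,
    bucket: BTreeSet<u32>,
    scores: &[f64],
    ranking_score_threshold: Option<f64>,
) {
    let global_score: f64 = scores.iter().product();
    let is_below_threshold =
        ranking_score_threshold.is_some_and(|threshold| global_score < threshold);

    if is_below_threshold {
        // Remove the candidates from the universe without adding them to the results.
        for docid in &bucket {
            universe.remove(docid);
        }
    } else {
        results.extend(bucket);
    }
}

fn main() {
    let mut universe: BTreeSet<u32> = (0..10).collect();
    let mut results = Vec::new();
    filter_bucket(&mut universe, &mut results, (0..3).collect(), &[0.9, 0.4], Some(0.5));
    // 0.9 * 0.4 = 0.36 < 0.5, so docids 0..3 are dropped from the universe.
    assert!(results.is_empty());
    assert_eq!(universe.len(), 7);
}
```
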
@@ -37,12 +37,12 @@ pub struct DatabaseCache<'ctx> {

pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<RoaringBitmap>>,
pub word_positions: FxHashMap<Interned<String>, Vec<u16>>,
pub word_prefix_positions: FxHashMap<Interned<String>, Vec<u16>>,

pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<RoaringBitmap>>,
pub word_fids: FxHashMap<Interned<String>, Vec<u16>>,
pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
}

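The struct change above turns the cached values for the two prefix maps from raw LMDB bytes (`Cow<'ctx, [u8]>`) into decoded `RoaringBitmap`s, since the hunks that follow sometimes have to build a prefix bitmap by merging several non-prefix entries instead of returning one stored value. A minimal sketch of that compute-once cache shape using the same `Entry::Vacant` pattern (plain `HashMap` and `String` instead of the interner and `FxHashMap`; assumes the `roaring` crate):

```rust
use std::collections::hash_map::{Entry, HashMap};

use roaring::RoaringBitmap;

/// Hedged sketch: a compute-once cache keyed by (prefix, fid), mirroring the shape
/// of `word_prefix_fid_docids` after the change above.
struct SketchCache {
    word_prefix_fid_docids: HashMap<(String, u16), Option<RoaringBitmap>>,
}

impl SketchCache {
    fn get_or_compute(
        &mut self,
        key: (String, u16),
        compute: impl FnOnce() -> Option<RoaringBitmap>,
    ) -> Option<RoaringBitmap> {
        if let Entry::Vacant(entry) = self.word_prefix_fid_docids.entry(key.clone()) {
            // Only pay for the (potentially expensive) merge once per key.
            entry.insert(compute());
        }
        self.word_prefix_fid_docids.get(&key).unwrap().clone()
    }
}

fn main() {
    let mut cache = SketchCache { word_prefix_fid_docids: HashMap::new() };
    let bitmap = cache.get_or_compute(("he".to_string(), 0), || Some((0..10).collect()));
    assert_eq!(bitmap.map(|b| b.len()), Some(10));
}
```
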
@@ -562,14 +562,46 @@ impl<'ctx> SearchContext<'ctx> {
return Ok(None);
}

DatabaseCache::get_value(
self.txn,
(word_prefix, fid),
&(self.word_interner.get(word_prefix).as_str(), fid),
&mut self.db_cache.word_prefix_fid_docids,
universe,
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
)
let cache = &mut self.db_cache.word_prefix_fid_docids;
let prefix_db = &self.index.word_prefix_fid_docids;
let db = &self.index.word_fid_docids;
if let Entry::Vacant(entry) = cache.entry((word_prefix, fid)) {
let word_prefix_bytes = self.word_interner.get(word_prefix).as_bytes().to_owned();
let word_prefix_str = std::str::from_utf8(&word_prefix_bytes).unwrap();
match prefix_db.get(self.txn, &(word_prefix_str, fid))? {
Some(mut bitmap) => {
if let Some(universe) = universe {
bitmap &= universe;
}
entry.insert(Some(bitmap));
}
None => {
let mut key = word_prefix_bytes.clone();
key.push(0);
let remap_key_type = db
.remap_key_type::<Bytes>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();

let mut bitmap = RoaringBitmap::new();
for result in remap_key_type {
let ((_, pos), value) = result?;

if pos == fid {
if let Some(universe) = universe {
bitmap |= value & universe;
} else {
bitmap |= value;
}
}
}

entry.insert(Some(bitmap));
}
}
}

Ok(cache.get(&(word_prefix, fid)).unwrap().clone())
}

pub fn get_db_word_fids(&mut self, word: Interned<String>) -> Result<Vec<u16>> {

@@ -605,6 +637,7 @@ impl<'ctx> SearchContext<'ctx> {
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
key.push(0);
let mut fids = vec![];
// TODO: This is no more exhaustive, we should iterate over all fids.
let remap_key_type = self
.index
.word_prefix_fid_docids

@@ -612,11 +645,7 @@ impl<'ctx> SearchContext<'ctx> {
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
let ((_, fid), value) = result?;
// filling other caches to avoid searching for them again
self.db_cache
.word_prefix_fid_docids
.insert((word_prefix, fid), Some(Cow::Borrowed(value)));
let ((_, fid), _value) = result?;
fids.push(fid);
}
entry.insert(fids.clone());

@@ -648,14 +677,46 @@ impl<'ctx> SearchContext<'ctx> {
word_prefix: Interned<String>,
position: u16,
) -> Result<Option<RoaringBitmap>> {
DatabaseCache::get_value(
self.txn,
(word_prefix, position),
&(self.word_interner.get(word_prefix).as_str(), position),
&mut self.db_cache.word_prefix_position_docids,
universe,
self.index.word_prefix_position_docids.remap_data_type::<Bytes>(),
)
let cache = &mut self.db_cache.word_prefix_position_docids;
let prefix_db = &self.index.word_prefix_position_docids;
let db = &self.index.word_position_docids;
if let Entry::Vacant(entry) = cache.entry((word_prefix, position)) {
let word_prefix_bytes = self.word_interner.get(word_prefix).as_bytes().to_owned();
let word_prefix_str = std::str::from_utf8(&word_prefix_bytes).unwrap();
match prefix_db.get(self.txn, &(word_prefix_str, position))? {
Some(mut bitmap) => {
if let Some(universe) = universe {
bitmap &= universe;
}
entry.insert(Some(bitmap));
}
None => {
let mut key = word_prefix_bytes.clone();
key.push(0);
let remap_key_type = db
.remap_key_type::<Bytes>()
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();

let mut bitmap = RoaringBitmap::new();
for result in remap_key_type {
let ((_, pos), value) = result?;

if pos == position {
if let Some(universe) = universe {
bitmap |= value & universe;
} else {
bitmap |= value;
}
}
}

entry.insert(Some(bitmap));
}
}
}

Ok(cache.get(&(word_prefix, position)).unwrap().clone())
}

pub fn get_db_word_positions(&mut self, word: Interned<String>) -> Result<Vec<u16>> {

@@ -696,6 +757,7 @@ impl<'ctx> SearchContext<'ctx> {
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
key.push(0);
let mut positions = vec![];
// TODO: This is no more exhaustive, we should iterate over all positions.
let remap_key_type = self
.index
.word_prefix_position_docids

@@ -703,11 +765,7 @@ impl<'ctx> SearchContext<'ctx> {
.prefix_iter(self.txn, &key)?
.remap_key_type::<StrBEU16Codec>();
for result in remap_key_type {
let ((_, position), value) = result?;
// filling other caches to avoid searching for them again
self.db_cache
.word_prefix_position_docids
.insert((word_prefix, position), Some(Cow::Borrowed(value)));
let ((_, position), _value) = result?;
positions.push(position);
}
entry.insert(positions.clone());

@@ -121,7 +121,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
// do we have set embeddings?
if let Some(embeddings) = new_vectors.embeddings {
chunks.set_vectors(
update.external_document_id(),
update.docid(),
embeddings
.into_vec(&context.doc_alloc, embedder_name)

@@ -129,7 +128,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
document_id: update.external_document_id().to_string(),
error: error.to_string(),
})?,
)?;
);
} else if new_vectors.regenerate {
let new_rendered = prompt.render_document(
update.external_document_id(),

@@ -210,7 +209,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
if let Some(embeddings) = new_vectors.embeddings {
chunks.set_vectors(
insertion.external_document_id(),
insertion.docid(),
embeddings
.into_vec(&context.doc_alloc, embedder_name)

@@ -220,7 +218,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
.to_string(),
error: error.to_string(),
})?,
)?;
);
} else if new_vectors.regenerate {
let rendered = prompt.render_document(
insertion.external_document_id(),

@@ -275,7 +273,6 @@ struct Chunks<'a, 'b, 'extractor> {
embedder: &'a Embedder,
embedder_id: u8,
embedder_name: &'a str,
dimensions: usize,
prompt: &'a Prompt,
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,

@@ -300,7 +297,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
let texts = BVec::with_capacity_in(capacity, doc_alloc);
let ids = BVec::with_capacity_in(capacity, doc_alloc);
let dimensions = embedder.dimensions();
Self {
texts,
ids,

@@ -313,7 +309,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
embedder_name,
user_provided,
has_manual_generation: None,
dimensions,
}
}

@@ -495,25 +490,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
}
}

fn set_vectors(
&self,
external_docid: &'a str,
docid: DocumentId,
embeddings: Vec<Embedding>,
) -> Result<()> {
for (embedding_index, embedding) in embeddings.iter().enumerate() {
if embedding.len() != self.dimensions {
return Err(UserError::InvalidIndexingVectorDimensions {
expected: self.dimensions,
found: embedding.len(),
embedder_name: self.embedder_name.to_string(),
document_id: external_docid.to_string(),
embedding_index,
}
.into());
}
}
fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
Ok(())
}
}

@@ -291,6 +291,9 @@ impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> {
let (_word, pos) = StrBEU16Codec::bytes_decode(key).map_err(Error::Decoding)?;
positions.entry(pos).or_insert_with(Vec::new).push(bytes);
}

// We remove all the positions that have less than 100 bitmaps.
positions.retain(|_, bitmaps| bitmaps.len() > 100);
assert!(prefixes_bitmaps.insert(prefix.as_str(), positions).is_none());
}

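The lines added above prune the per-position map so that only positions backed by more than 100 serialized bitmaps are kept. A small standalone sketch of the same group-then-retain pattern (toy data and a threshold of 2 instead of 100):

```rust
use std::collections::HashMap;

fn main() {
    // Hedged sketch: group raw values under their position key, then keep only
    // the positions that accumulated more than the threshold, mirroring the
    // `retain` added above.
    let pairs: &[(u16, &str)] = &[(0, "a"), (0, "b"), (0, "c"), (1, "d"), (2, "e"), (2, "f")];

    let mut positions: HashMap<u16, Vec<&str>> = HashMap::new();
    for &(pos, bytes) in pairs {
        positions.entry(pos).or_insert_with(Vec::new).push(bytes);
    }

    // Keep only positions with strictly more than 2 entries.
    positions.retain(|_, bitmaps| bitmaps.len() > 2);

    assert_eq!(positions.len(), 1);
    assert!(positions.contains_key(&0));
}
```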