Fix Facet Indexing bugs

1. Handle keys with variable length correctly. This fixes https://github.com/meilisearch/meilisearch/issues/3042 and is easily reproducible with the updated fuzz tests, which now generate keys with variable lengths. 2. Prevent adding facets to the database if their encoded value does not satisfy `valid_lmdb_key`. This fixes an indexing failure when a document had a filterable attribute containing a value longer than ~500 bytes.
2025-11-04 09:56:28 +00:00 · 2022-11-14 14:16:14 +01:00
parent a651397afc
commit d95d02cb8a
4 changed files with 66 additions and 254 deletions
--- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
@@ -38,7 +38,6 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(

        let key = FacetGroupKey { field_id, level: 0, left_bound: number };
        let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
-
        facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
    }

--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -6,7 +6,7 @@ use heed::BytesEncode;
 use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
 use crate::heed_codec::StrRefCodec;
-use crate::update::index_documents::merge_cbo_roaring_bitmaps;
+use crate::update::index_documents::{merge_cbo_roaring_bitmaps, valid_lmdb_key};
 use crate::{FieldId, Result};

 /// Extracts the facet string and the documents ids where this facet string appear.
@@ -41,9 +41,10 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
        let normalised_value = std::str::from_utf8(normalized_value_bytes)?;
        let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
        let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
-
-        // document id is encoded in native-endian because of the CBO roaring bitmap codec
-        facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
+        if valid_lmdb_key(&key_bytes) {
+            // document id is encoded in native-endian because of the CBO roaring bitmap codec
+            facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
+        }
    }

    sorter_into_reader(facet_string_docids_sorter, indexer)