Fix Facet Indexing bugs

1. Handle keys with variable length correctly

This fixes https://github.com/meilisearch/meilisearch/issues/3042 and
is easily reproducible with the updated fuzz tests, which now generate
keys with variable lengths.

2. Prevent adding facets to the database if their encoded value does
not satisfy `valid_lmdb_key`.

This fixes an indexing failure when a document had a filterable
attribute containing a value whose length is higher than ~500 bytes.
This commit is contained in:
Loïc Lecrenier
2022-11-14 14:16:14 +01:00
parent a651397afc
commit d95d02cb8a
4 changed files with 66 additions and 254 deletions

View File

@ -38,7 +38,6 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
}

View File

@ -6,7 +6,7 @@ use heed::BytesEncode;
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::StrRefCodec;
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
use crate::update::index_documents::{merge_cbo_roaring_bitmaps, valid_lmdb_key};
use crate::{FieldId, Result};
/// Extracts the facet string and the documents ids where this facet string appear.
@ -41,9 +41,10 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
let normalised_value = std::str::from_utf8(normalized_value_bytes)?;
let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
// document id is encoded in native-endian because of the CBO roaring bitmap codec
facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
if valid_lmdb_key(&key_bytes) {
// document id is encoded in native-endian because of the CBO roaring bitmap codec
facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
}
}
sorter_into_reader(facet_string_docids_sorter, indexer)