mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 09:56:28 +00:00 
			
		
		
		
	Merge branch 'filter/field-exist'
This commit is contained in:
		@@ -1,15 +1,19 @@
 | 
			
		||||
use std::collections::HashSet;
 | 
			
		||||
use std::collections::{BTreeMap, HashSet};
 | 
			
		||||
use std::convert::TryInto;
 | 
			
		||||
use std::fs::File;
 | 
			
		||||
use std::io;
 | 
			
		||||
use std::mem::size_of;
 | 
			
		||||
 | 
			
		||||
use heed::zerocopy::AsBytes;
 | 
			
		||||
use heed::BytesEncode;
 | 
			
		||||
use roaring::RoaringBitmap;
 | 
			
		||||
use serde_json::Value;
 | 
			
		||||
 | 
			
		||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
 | 
			
		||||
use crate::error::InternalError;
 | 
			
		||||
use crate::facet::value_encoding::f64_into_bytes;
 | 
			
		||||
use crate::{DocumentId, FieldId, Result};
 | 
			
		||||
use crate::update::index_documents::{create_writer, writer_into_reader};
 | 
			
		||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32};
 | 
			
		||||
 | 
			
		||||
/// Extracts the facet values of each faceted field of each document.
 | 
			
		||||
///
 | 
			
		||||
@@ -20,7 +24,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 | 
			
		||||
    obkv_documents: grenad::Reader<R>,
 | 
			
		||||
    indexer: GrenadParameters,
 | 
			
		||||
    faceted_fields: &HashSet<FieldId>,
 | 
			
		||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
 | 
			
		||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
 | 
			
		||||
    let max_memory = indexer.max_memory_by_thread();
 | 
			
		||||
 | 
			
		||||
    let mut fid_docid_facet_numbers_sorter = create_sorter(
 | 
			
		||||
@@ -39,6 +43,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 | 
			
		||||
        max_memory.map(|m| m / 2),
 | 
			
		||||
    );
 | 
			
		||||
 | 
			
		||||
    let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
 | 
			
		||||
 | 
			
		||||
    let mut key_buffer = Vec::new();
 | 
			
		||||
    let mut cursor = obkv_documents.into_cursor()?;
 | 
			
		||||
    while let Some((docid_bytes, value)) = cursor.move_on_next()? {
 | 
			
		||||
@@ -46,16 +52,26 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 | 
			
		||||
 | 
			
		||||
        for (field_id, field_bytes) in obkv.iter() {
 | 
			
		||||
            if faceted_fields.contains(&field_id) {
 | 
			
		||||
                let value =
 | 
			
		||||
                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
 | 
			
		||||
                let (numbers, strings) = extract_facet_values(&value);
 | 
			
		||||
 | 
			
		||||
                key_buffer.clear();
 | 
			
		||||
 | 
			
		||||
                // prefix key with the field_id and the document_id
 | 
			
		||||
                // Set key to the field_id
 | 
			
		||||
                // Note: this encoding is consistent with FieldIdCodec
 | 
			
		||||
                key_buffer.extend_from_slice(&field_id.to_be_bytes());
 | 
			
		||||
 | 
			
		||||
                // Here, we already know that the document must be added to the “field id exists” database
 | 
			
		||||
                let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
 | 
			
		||||
                let document = BEU32::from(document).get();
 | 
			
		||||
 | 
			
		||||
                facet_exists_docids.entry(field_id).or_default().insert(document);
 | 
			
		||||
 | 
			
		||||
                // For the other extraction tasks, prefix the key with the field_id and the document_id
 | 
			
		||||
                key_buffer.extend_from_slice(&docid_bytes);
 | 
			
		||||
 | 
			
		||||
                let value =
 | 
			
		||||
                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
 | 
			
		||||
 | 
			
		||||
                let (numbers, strings) = extract_facet_values(&value);
 | 
			
		||||
 | 
			
		||||
                // insert facet numbers in sorter
 | 
			
		||||
                for number in numbers {
 | 
			
		||||
                    key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
 | 
			
		||||
@@ -77,9 +93,21 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    let mut facet_exists_docids_writer = create_writer(
 | 
			
		||||
        indexer.chunk_compression_type,
 | 
			
		||||
        indexer.chunk_compression_level,
 | 
			
		||||
        tempfile::tempfile()?,
 | 
			
		||||
    );
 | 
			
		||||
    for (fid, bitmap) in facet_exists_docids.into_iter() {
 | 
			
		||||
        let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
 | 
			
		||||
        facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
 | 
			
		||||
    }
 | 
			
		||||
    let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
 | 
			
		||||
 | 
			
		||||
    Ok((
 | 
			
		||||
        sorter_into_reader(fid_docid_facet_numbers_sorter, indexer.clone())?,
 | 
			
		||||
        sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
 | 
			
		||||
        sorter_into_reader(fid_docid_facet_strings_sorter, indexer.clone())?,
 | 
			
		||||
        facet_exists_docids_reader,
 | 
			
		||||
    ))
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -53,7 +53,7 @@ pub(crate) fn data_from_obkv_documents(
 | 
			
		||||
        })
 | 
			
		||||
        .collect::<Result<()>>()?;
 | 
			
		||||
 | 
			
		||||
    let result: Result<(Vec<_>, (Vec<_>, Vec<_>))> = flattened_obkv_chunks
 | 
			
		||||
    let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks
 | 
			
		||||
        .par_bridge()
 | 
			
		||||
        .map(|flattened_obkv_chunks| {
 | 
			
		||||
            send_and_extract_flattened_documents_data(
 | 
			
		||||
@@ -72,9 +72,28 @@ pub(crate) fn data_from_obkv_documents(
 | 
			
		||||
 | 
			
		||||
    let (
 | 
			
		||||
        docid_word_positions_chunks,
 | 
			
		||||
        (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks),
 | 
			
		||||
        (
 | 
			
		||||
            docid_fid_facet_numbers_chunks,
 | 
			
		||||
            (docid_fid_facet_strings_chunks, facet_exists_docids_chunks),
 | 
			
		||||
        ),
 | 
			
		||||
    ) = result?;
 | 
			
		||||
 | 
			
		||||
    // merge facet_exists_docids and send them as a typed chunk
 | 
			
		||||
    {
 | 
			
		||||
        let lmdb_writer_sx = lmdb_writer_sx.clone();
 | 
			
		||||
        rayon::spawn(move || {
 | 
			
		||||
            debug!("merge {} database", "facet-id-exists-docids");
 | 
			
		||||
            match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
 | 
			
		||||
                Ok(reader) => {
 | 
			
		||||
                    let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
 | 
			
		||||
                }
 | 
			
		||||
                Err(e) => {
 | 
			
		||||
                    let _ = lmdb_writer_sx.send(Err(e));
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        });
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
 | 
			
		||||
        docid_word_positions_chunks.clone(),
 | 
			
		||||
        indexer.clone(),
 | 
			
		||||
@@ -197,6 +216,7 @@ fn send_original_documents_data(
 | 
			
		||||
/// - docid_word_positions
 | 
			
		||||
/// - docid_fid_facet_numbers
 | 
			
		||||
/// - docid_fid_facet_strings
 | 
			
		||||
/// - docid_fid_facet_exists
 | 
			
		||||
fn send_and_extract_flattened_documents_data(
 | 
			
		||||
    flattened_documents_chunk: Result<grenad::Reader<File>>,
 | 
			
		||||
    indexer: GrenadParameters,
 | 
			
		||||
@@ -209,7 +229,10 @@ fn send_and_extract_flattened_documents_data(
 | 
			
		||||
    max_positions_per_attributes: Option<u32>,
 | 
			
		||||
) -> Result<(
 | 
			
		||||
    grenad::Reader<CursorClonableMmap>,
 | 
			
		||||
    (grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
 | 
			
		||||
    (
 | 
			
		||||
        grenad::Reader<CursorClonableMmap>,
 | 
			
		||||
        (grenad::Reader<CursorClonableMmap>, grenad::Reader<File>),
 | 
			
		||||
    ),
 | 
			
		||||
)> {
 | 
			
		||||
    let flattened_documents_chunk =
 | 
			
		||||
        flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
 | 
			
		||||
@@ -250,12 +273,15 @@ fn send_and_extract_flattened_documents_data(
 | 
			
		||||
                Ok(docid_word_positions_chunk)
 | 
			
		||||
            },
 | 
			
		||||
            || {
 | 
			
		||||
                let (docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk) =
 | 
			
		||||
                    extract_fid_docid_facet_values(
 | 
			
		||||
                        flattened_documents_chunk.clone(),
 | 
			
		||||
                        indexer.clone(),
 | 
			
		||||
                        faceted_fields,
 | 
			
		||||
                    )?;
 | 
			
		||||
                let (
 | 
			
		||||
                    docid_fid_facet_numbers_chunk,
 | 
			
		||||
                    docid_fid_facet_strings_chunk,
 | 
			
		||||
                    fid_facet_exists_docids_chunk,
 | 
			
		||||
                ) = extract_fid_docid_facet_values(
 | 
			
		||||
                    flattened_documents_chunk.clone(),
 | 
			
		||||
                    indexer.clone(),
 | 
			
		||||
                    faceted_fields,
 | 
			
		||||
                )?;
 | 
			
		||||
 | 
			
		||||
                // send docid_fid_facet_numbers_chunk to DB writer
 | 
			
		||||
                let docid_fid_facet_numbers_chunk =
 | 
			
		||||
@@ -273,7 +299,10 @@ fn send_and_extract_flattened_documents_data(
 | 
			
		||||
                    docid_fid_facet_strings_chunk.clone(),
 | 
			
		||||
                )));
 | 
			
		||||
 | 
			
		||||
                Ok((docid_fid_facet_numbers_chunk, docid_fid_facet_strings_chunk))
 | 
			
		||||
                Ok((
 | 
			
		||||
                    docid_fid_facet_numbers_chunk,
 | 
			
		||||
                    (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk),
 | 
			
		||||
                ))
 | 
			
		||||
            },
 | 
			
		||||
        );
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -613,6 +613,7 @@ mod tests {
 | 
			
		||||
    use super::*;
 | 
			
		||||
    use crate::documents::DocumentsBatchBuilder;
 | 
			
		||||
    use crate::update::DeleteDocuments;
 | 
			
		||||
    use crate::BEU16;
 | 
			
		||||
 | 
			
		||||
    #[test]
 | 
			
		||||
    fn simple_document_replacement() {
 | 
			
		||||
@@ -2040,6 +2041,109 @@ mod tests {
 | 
			
		||||
        assert_eq!(ids.len(), map.len());
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Checks that the `facet_id_exists_docids` database lists, for the `colour`
    /// field and for the nested `colour.green` field, the documents in which the
    /// field is present.
    ///
    /// The same expectations are verified twice, on two fresh indexes: once with
    /// the documents added before the filterable fields are set, and once with
    /// the two steps in the opposite order.
    #[test]
    fn index_documents_check_exists_database() {
        let config = IndexerConfig::default();
        let indexing_config = IndexDocumentsConfig::default();

        let faceted_fields = hashset!(S("colour"));

        // Document 5 is the only one without any `colour` field; documents 6 and 7
        // are the only ones with a nested `colour.green` field.
        let content = || {
            documents!([
                {
                    "id": 0,
                    "colour": 0,
                },
                {
                    "id": 1,
                    "colour": []
                },
                {
                    "id": 2,
                    "colour": {}
                },
                {
                    "id": 3,
                    "colour": null
                },
                {
                    "id": 4,
                    "colour": [1]
                },
                {
                    "id": 5
                },
                {
                    "id": 6,
                    "colour": {
                        "green": 1
                    }
                },
                {
                    "id": 7,
                    "colour": {
                        "green": {
                            "blue": []
                        }
                    }
                }
            ])
        };

        // Creates an empty index in a fresh temporary directory.
        //
        // The directory handle is returned together with the index: dropping a
        // `TempDir` deletes the directory, so it must outlive the index that was
        // opened inside it.
        let make_index = || {
            let dir = tempfile::tempdir().unwrap();
            let mut options = EnvOpenOptions::new();
            options.map_size(10 * 1024 * 1024); // 10 MB
            let index = Index::new(options, &dir).unwrap();
            (index, dir)
        };

        // Declares `colour` as a filterable field and commits the settings update.
        let set_filterable_fields = |index: &Index| {
            let mut wtxn = index.write_txn().unwrap();
            let mut builder = update::Settings::new(&mut wtxn, index, &config);
            builder.set_filterable_fields(faceted_fields.clone());
            builder.execute(|_| ()).unwrap();
            wtxn.commit().unwrap();
        };

        // Indexes the documents produced by `content` and commits them.
        let add_documents = |index: &Index| {
            let mut wtxn = index.write_txn().unwrap();
            let builder =
                IndexDocuments::new(&mut wtxn, index, &config, indexing_config.clone(), |_| ())
                    .unwrap();
            let (builder, user_error) = builder.add_documents(content()).unwrap();
            user_error.unwrap();
            builder.execute().unwrap();
            wtxn.commit().unwrap();
        };

        // Asserts the faceted fields and the content of the "exists" database.
        let check_ok = |index: &Index| {
            let rtxn = index.read_txn().unwrap();
            let facets = index.faceted_fields(&rtxn).unwrap();
            assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));

            // Read the map once and resolve both field ids from it.
            let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
            let colour_id = fields_ids_map.id("colour").unwrap();
            let colour_green_id = fields_ids_map.id("colour.green").unwrap();

            let bitmap_colour =
                index.facet_id_exists_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap();
            assert_eq!(bitmap_colour.into_iter().collect::<Vec<_>>(), vec![0, 1, 2, 3, 4, 6, 7]);

            let bitmap_colour_green = index
                .facet_id_exists_docids
                .get(&rtxn, &BEU16::new(colour_green_id))
                .unwrap()
                .unwrap();
            assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![6, 7]);
        };

        // Scenario 1: documents first, then the filterable fields.
        // `_dir` (not `_`) keeps the temporary directory alive until the end of the test.
        let (index, _dir) = make_index();
        add_documents(&index);
        set_filterable_fields(&index);
        check_ok(&index);

        // Scenario 2: filterable fields first, then the documents.
        let (index, _dir) = make_index();
        set_filterable_fields(&index);
        add_documents(&index);
        check_ok(&index);
    }
 | 
			
		||||
 | 
			
		||||
    #[test]
 | 
			
		||||
    fn primary_key_must_not_contain_floats() {
 | 
			
		||||
        let tmp = tempfile::tempdir().unwrap();
 | 
			
		||||
 
 | 
			
		||||
@@ -35,6 +35,7 @@ pub(crate) enum TypedChunk {
 | 
			
		||||
    WordPairProximityDocids(grenad::Reader<File>),
 | 
			
		||||
    FieldIdFacetStringDocids(grenad::Reader<File>),
 | 
			
		||||
    FieldIdFacetNumberDocids(grenad::Reader<File>),
 | 
			
		||||
    FieldIdFacetExistsDocids(grenad::Reader<File>),
 | 
			
		||||
    GeoPoints(grenad::Reader<File>),
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@@ -146,6 +147,17 @@ pub(crate) fn write_typed_chunk_into_index(
 | 
			
		||||
            )?;
 | 
			
		||||
            is_merged_database = true;
 | 
			
		||||
        }
 | 
			
		||||
        TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => {
 | 
			
		||||
            append_entries_into_database(
 | 
			
		||||
                facet_id_exists_docids,
 | 
			
		||||
                &index.facet_id_exists_docids,
 | 
			
		||||
                wtxn,
 | 
			
		||||
                index_is_empty,
 | 
			
		||||
                |value, _buffer| Ok(value),
 | 
			
		||||
                merge_cbo_roaring_bitmaps,
 | 
			
		||||
            )?;
 | 
			
		||||
            is_merged_database = true;
 | 
			
		||||
        }
 | 
			
		||||
        TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => {
 | 
			
		||||
            append_entries_into_database(
 | 
			
		||||
                word_pair_proximity_docids_iter,
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user