mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-24 20:46:27 +00:00 
			
		
		
		
	Split the update side to use the number and the strings facet databases
This commit is contained in:
		
				
					committed by
					
						 Kerollmops
						Kerollmops
					
				
			
			
				
	
			
			
			
						parent
						
							038e03a4e4
						
					
				
				
					commit
					bd7b285bae
				
			| @@ -1,4 +1,4 @@ | ||||
| use std::collections::HashMap; | ||||
| use std::collections::HashSet; | ||||
| use std::fmt; | ||||
|  | ||||
| use anyhow::{Context, bail}; | ||||
| @@ -6,8 +6,6 @@ use regex::Regex; | ||||
| use serde::{Serialize, Deserialize}; | ||||
| use once_cell::sync::Lazy; | ||||
|  | ||||
| use crate::facet::FacetType; | ||||
|  | ||||
| static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| { | ||||
|     Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap() | ||||
| }); | ||||
| @@ -33,7 +31,7 @@ pub enum Criterion { | ||||
| } | ||||
|  | ||||
| impl Criterion { | ||||
|     pub fn from_str(faceted_attributes: &HashMap<String, FacetType>, txt: &str) -> anyhow::Result<Criterion> { | ||||
|     pub fn from_str(faceted_attributes: &HashSet<String>, txt: &str) -> anyhow::Result<Criterion> { | ||||
|         match txt { | ||||
|             "words" => Ok(Criterion::Words), | ||||
|             "typo" => Ok(Criterion::Typo), | ||||
| @@ -44,7 +42,9 @@ impl Criterion { | ||||
|                 let caps = ASC_DESC_REGEX.captures(text).with_context(|| format!("unknown criterion name: {}", text))?; | ||||
|                 let order = caps.get(1).unwrap().as_str(); | ||||
|                 let field_name = caps.get(2).unwrap().as_str(); | ||||
|                 faceted_attributes.get(field_name).with_context(|| format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name))?; | ||||
|                 faceted_attributes.get(field_name).with_context(|| { | ||||
|                     format!("Can't use {:?} as a criterion as it isn't a faceted field.", field_name) | ||||
|                 })?; | ||||
|                 match order { | ||||
|                     "asc" => Ok(Criterion::Asc(field_name.to_string())), | ||||
|                     "desc" => Ok(Criterion::Desc(field_name.to_string())), | ||||
|   | ||||
| @@ -1,5 +1,5 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::HashMap; | ||||
| use std::collections::{HashMap, HashSet}; | ||||
| use std::path::Path; | ||||
|  | ||||
| use anyhow::Context; | ||||
| @@ -18,24 +18,24 @@ use crate::heed_codec::facet::{ | ||||
|     FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, | ||||
|     FacetValueStringCodec, FacetLevelValueF64Codec, | ||||
| }; | ||||
| use crate::facet::FacetType; | ||||
| use crate::fields_ids_map::FieldsIdsMap; | ||||
|  | ||||
| pub const CRITERIA_KEY: &str = "criteria"; | ||||
| pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; | ||||
| pub const DISTINCT_ATTRIBUTE_KEY: &str = "distinct-attribute-key"; | ||||
| pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; | ||||
| pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; | ||||
| pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; | ||||
| pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; | ||||
| pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; | ||||
| pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; | ||||
| pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; | ||||
| pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; | ||||
| pub const PRIMARY_KEY_KEY: &str = "primary-key"; | ||||
| pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields"; | ||||
| pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; | ||||
| pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids"; | ||||
| pub const WORDS_FST_KEY: &str = "words-fst"; | ||||
| pub const STOP_WORDS_KEY: &str = "stop-words"; | ||||
| pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids"; | ||||
| pub const SYNONYMS_KEY: &str = "synonyms"; | ||||
| pub const WORDS_FST_KEY: &str = "words-fst"; | ||||
| pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst"; | ||||
| const CREATED_AT_KEY: &str = "created-at"; | ||||
| const UPDATED_AT_KEY: &str = "updated-at"; | ||||
| @@ -321,53 +321,97 @@ impl Index { | ||||
|  | ||||
|     /* faceted fields */ | ||||
|  | ||||
|     /// Writes the facet fields associated with their facet type or `None` if | ||||
|     /// the facet type is currently unknown. | ||||
|     pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields_types: &HashMap<String, FacetType>) -> heed::Result<()> { | ||||
|         self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields_types) | ||||
|     /// Writes the set of faceted field names into `self.main` under `FACETED_FIELDS_KEY`, encoded with `SerdeJson`. | ||||
|     pub fn put_faceted_fields(&self, wtxn: &mut RwTxn, fields: &HashSet<String>) -> heed::Result<()> { | ||||
|         self.main.put::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY, fields) | ||||
|     } | ||||
|  | ||||
|     /// Deletes the facet fields ids associated with their facet type. | ||||
|     /// Deletes the `FACETED_FIELDS_KEY` entry (the faceted field names written by `put_faceted_fields`) from `self.main`. | ||||
|     pub fn delete_faceted_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> { | ||||
|         self.main.delete::<_, Str>(wtxn, FACETED_FIELDS_KEY) | ||||
|     } | ||||
|  | ||||
|     /// Returns the facet fields names associated with their facet type. | ||||
|     pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, FacetType>> { | ||||
|     /// Returns the faceted field names read from `FACETED_FIELDS_KEY`, or an empty set when nothing is stored there (`unwrap_or_default`). | ||||
|     pub fn faceted_fields(&self, rtxn: &RoTxn) -> heed::Result<HashSet<String>> { | ||||
|         Ok(self.main.get::<_, Str, SerdeJson<_>>(rtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) | ||||
|     } | ||||
|  | ||||
|     /// Same as `faceted_fields`, but returns ids instead. | ||||
|     pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result<HashMap<FieldId, FacetType>> { | ||||
|     pub fn faceted_fields_ids(&self, rtxn: &RoTxn) -> heed::Result<HashSet<FieldId>> { | ||||
|         let faceted_fields = self.faceted_fields(rtxn)?; | ||||
|         let fields_ids_map = self.fields_ids_map(rtxn)?; | ||||
|         let faceted_fields = faceted_fields | ||||
|             .iter() | ||||
|             .map(|(k, v)| { | ||||
|                 let kid = fields_ids_map | ||||
|             .map(|k| { | ||||
|                 fields_ids_map | ||||
|                     .id(k) | ||||
|                     .ok_or_else(|| format!("{:?} should be present in the field id map", k)) | ||||
|                     .expect("corrupted data: "); | ||||
|                 (kid, *v) | ||||
|                     .expect("corrupted data: ") | ||||
|             }) | ||||
|             .collect(); | ||||
|  | ||||
|         Ok(faceted_fields) | ||||
|     } | ||||
|  | ||||
|     /* faceted documents ids */ | ||||
|  | ||||
|     /// Writes the documents ids that are faceted under this field id. | ||||
|     pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: FieldId, docids: &RoaringBitmap) -> heed::Result<()> { | ||||
|         let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||
|         buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||
|     /// Writes the documents ids that are faceted with numbers under this field id. NOTE(review): the key is built from `STRING_FACETED_DOCUMENTS_IDS_PREFIX` followed by the field id byte; `number_faceted_documents_ids` reads with the same constant, so reads and writes agree, but confirm the string/number prefixes are not swapped. | ||||
|     pub fn put_number_faceted_documents_ids( | ||||
|         &self, | ||||
|         wtxn: &mut RwTxn, | ||||
|         field_id: FieldId, | ||||
|         docids: &RoaringBitmap, | ||||
|     ) -> heed::Result<()> | ||||
|     { | ||||
|         let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||
|         buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] | ||||
|             .copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||
|         *buffer.last_mut().unwrap() = field_id; | ||||
|         self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) | ||||
|     } | ||||
|  | ||||
|     /// Retrieve all the documents ids that faceted under this field id. | ||||
|     pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: FieldId) -> heed::Result<RoaringBitmap> { | ||||
|         let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||
|         buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||
|     /// Retrieves all the documents ids that are faceted with numbers under this field id, or an empty bitmap when no entry exists. NOTE(review): the key is built from `STRING_FACETED_DOCUMENTS_IDS_PREFIX`, matching `put_number_faceted_documents_ids`; confirm the string/number prefixes are not swapped. | ||||
|     pub fn number_faceted_documents_ids( | ||||
|         &self, | ||||
|         rtxn: &RoTxn, | ||||
|         field_id: FieldId, | ||||
|     ) -> heed::Result<RoaringBitmap> | ||||
|     { | ||||
|         let mut buffer = [0u8; STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||
|         buffer[..STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] | ||||
|             .copy_from_slice(STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||
|         *buffer.last_mut().unwrap() = field_id; | ||||
|         match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { | ||||
|             Some(docids) => Ok(docids), | ||||
|             None => Ok(RoaringBitmap::new()), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Writes the documents ids that are faceted with strings under this field id. NOTE(review): the key is built from `NUMBER_FACETED_DOCUMENTS_IDS_PREFIX` followed by the field id byte; `string_faceted_documents_ids` reads with the same constant, so reads and writes agree, but confirm the string/number prefixes are not swapped. | ||||
|     pub fn put_string_faceted_documents_ids( | ||||
|         &self, | ||||
|         wtxn: &mut RwTxn, | ||||
|         field_id: FieldId, | ||||
|         docids: &RoaringBitmap, | ||||
|     ) -> heed::Result<()> | ||||
|     { | ||||
|         let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||
|         buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] | ||||
|             .copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||
|         *buffer.last_mut().unwrap() = field_id; | ||||
|         self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) | ||||
|     } | ||||
|  | ||||
|     /// Retrieves all the documents ids that are faceted with strings under this field id. | ||||
|     pub fn string_faceted_documents_ids( | ||||
|         &self, | ||||
|         rtxn: &RoTxn, | ||||
|         field_id: FieldId, | ||||
|     ) -> heed::Result<RoaringBitmap> | ||||
|     { | ||||
|         let mut buffer = [0u8; NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||
|         buffer[..NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] | ||||
|             .copy_from_slice(NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||
|         *buffer.last_mut().unwrap() = field_id; | ||||
|         match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { | ||||
|             Some(docids) => Ok(docids), | ||||
|   | ||||
| @@ -1,7 +1,7 @@ | ||||
| use std::collections::HashMap; | ||||
| use std::mem::take; | ||||
|  | ||||
| use anyhow::{bail, Context as _}; | ||||
| use anyhow::Context; | ||||
| use itertools::Itertools; | ||||
| use log::debug; | ||||
| use ordered_float::OrderedFloat; | ||||
| @@ -23,7 +23,6 @@ pub struct AscDesc<'t> { | ||||
|     rtxn: &'t heed::RoTxn<'t>, | ||||
|     field_name: String, | ||||
|     field_id: FieldId, | ||||
|     facet_type: FacetType, | ||||
|     ascending: bool, | ||||
|     query_tree: Option<Operation>, | ||||
|     candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>, | ||||
| @@ -51,6 +50,7 @@ impl<'t> AscDesc<'t> { | ||||
|         Self::new(index, rtxn, parent, field_name, false) | ||||
|     } | ||||
|  | ||||
|  | ||||
|     fn new( | ||||
|         index: &'t Index, | ||||
|         rtxn: &'t heed::RoTxn, | ||||
| @@ -60,19 +60,19 @@ impl<'t> AscDesc<'t> { | ||||
|     ) -> anyhow::Result<Self> { | ||||
|         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||
|         let faceted_fields = index.faceted_fields(rtxn)?; | ||||
|         let (field_id, facet_type) = | ||||
|             field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; | ||||
|         let field_id = fields_ids_map | ||||
|             .id(&field_name) | ||||
|             .with_context(|| format!("field {:?} isn't registered", field_name))?; | ||||
|  | ||||
|         Ok(AscDesc { | ||||
|             index, | ||||
|             rtxn, | ||||
|             field_name, | ||||
|             field_id, | ||||
|             facet_type, | ||||
|             ascending, | ||||
|             query_tree: None, | ||||
|             candidates: Box::new(std::iter::empty()), | ||||
|             faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, | ||||
|             faceted_candidates: index.number_faceted_documents_ids(rtxn, field_id)?, | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|             parent, | ||||
|         }) | ||||
| @@ -165,27 +165,20 @@ fn facet_ordered<'t>( | ||||
|     index: &'t Index, | ||||
|     rtxn: &'t heed::RoTxn, | ||||
|     field_id: FieldId, | ||||
|     facet_type: FacetType, | ||||
|     ascending: bool, | ||||
|     candidates: RoaringBitmap, | ||||
| ) -> anyhow::Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> { | ||||
|     match facet_type { | ||||
|         FacetType::Number => { | ||||
|             if candidates.len() <= CANDIDATES_THRESHOLD { | ||||
|                 let iter = | ||||
|                     iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; | ||||
|                 Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>) | ||||
|             } else { | ||||
|                 let facet_fn = if ascending { | ||||
|                     FacetIter::new_reducing | ||||
|                 } else { | ||||
|                     FacetIter::new_reverse_reducing | ||||
|                 }; | ||||
|                 let iter = facet_fn(rtxn, index, field_id, candidates)?; | ||||
|                 Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) | ||||
|             } | ||||
|         } | ||||
|         FacetType::String => bail!("criteria facet type must be a number"), | ||||
|     if candidates.len() <= CANDIDATES_THRESHOLD { | ||||
|         let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; | ||||
|         Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>) | ||||
|     } else { | ||||
|         let facet_fn = if ascending { | ||||
|             FacetIter::new_reducing | ||||
|         } else { | ||||
|             FacetIter::new_reverse_reducing | ||||
|         }; | ||||
|         let iter = facet_fn(rtxn, index, field_id, candidates)?; | ||||
|         Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) | ||||
|     } | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -5,7 +5,7 @@ use roaring::RoaringBitmap; | ||||
|  | ||||
| use super::{Distinct, DocIter}; | ||||
| use crate::heed_codec::facet::*; | ||||
| use crate::{facet::FacetType, DocumentId, FieldId, Index}; | ||||
| use crate::{DocumentId, FieldId, Index}; | ||||
|  | ||||
| const FID_SIZE: usize = size_of::<FieldId>(); | ||||
| const DOCID_SIZE: usize = size_of::<DocumentId>(); | ||||
| @@ -22,7 +22,6 @@ pub struct FacetDistinct<'a> { | ||||
|     distinct: FieldId, | ||||
|     index: &'a Index, | ||||
|     txn: &'a heed::RoTxn<'a>, | ||||
|     facet_type: FacetType, | ||||
| } | ||||
|  | ||||
| impl<'a> FacetDistinct<'a> { | ||||
| @@ -30,14 +29,9 @@ impl<'a> FacetDistinct<'a> { | ||||
|         distinct: FieldId, | ||||
|         index: &'a Index, | ||||
|         txn: &'a heed::RoTxn<'a>, | ||||
|         facet_type: FacetType, | ||||
|     ) -> Self { | ||||
|         Self { | ||||
|             distinct, | ||||
|             index, | ||||
|             txn, | ||||
|             facet_type, | ||||
|         } | ||||
|     ) -> Self | ||||
|     { | ||||
|         Self { distinct, index, txn } | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -45,7 +39,6 @@ pub struct FacetDistinctIter<'a> { | ||||
|     candidates: RoaringBitmap, | ||||
|     distinct: FieldId, | ||||
|     excluded: RoaringBitmap, | ||||
|     facet_type: FacetType, | ||||
|     index: &'a Index, | ||||
|     iter_offset: usize, | ||||
|     txn: &'a heed::RoTxn<'a>, | ||||
| @@ -117,6 +110,7 @@ impl<'a> FacetDistinctIter<'a> { | ||||
|                 // increasing the offset we make sure to get the first valid value for the next | ||||
|                 // distinct document to keep. | ||||
|                 self.iter_offset += 1; | ||||
|  | ||||
|                 Ok(Some(id)) | ||||
|             } | ||||
|             // no more candidate at this offset, return. | ||||
| @@ -188,7 +182,6 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { | ||||
|             candidates, | ||||
|             distinct: self.distinct, | ||||
|             excluded, | ||||
|             facet_type: self.facet_type, | ||||
|             index: self.index, | ||||
|             iter_offset: 0, | ||||
|             txn: self.txn, | ||||
|   | ||||
| @@ -145,7 +145,7 @@ impl<'a> Search<'a> { | ||||
|                 let faceted_fields = self.index.faceted_fields(self.rtxn)?; | ||||
|                 match faceted_fields.get(name) { | ||||
|                     Some(facet_type) => { | ||||
|                         let distinct = FacetDistinct::new(id, self.index, self.rtxn, *facet_type); | ||||
|                         let distinct = FacetDistinct::new(id, self.index, self.rtxn); | ||||
|                         self.perform_sort(distinct, matching_words, criteria) | ||||
|                     } | ||||
|                     None => { | ||||
|   | ||||
| @@ -49,8 +49,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|         self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?; | ||||
|  | ||||
|         // We clean all the faceted documents ids. | ||||
|         for (field_id, _) in faceted_fields { | ||||
|             self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?; | ||||
|         let empty = RoaringBitmap::default(); | ||||
|         for field_id in faceted_fields { | ||||
|             self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?; | ||||
|             self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?; | ||||
|         } | ||||
|  | ||||
|         // Clear the other databases. | ||||
|   | ||||
| @@ -330,11 +330,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|         )?; | ||||
|  | ||||
|         // Remove the documents ids from the faceted documents ids. | ||||
|         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; | ||||
|         for (field_id, facet_type) in faceted_fields { | ||||
|             let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; | ||||
|         for field_id in self.index.faceted_fields_ids(self.wtxn)? { | ||||
|             // Remove docids from the number faceted documents ids | ||||
|             let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?; | ||||
|             docids.difference_with(&self.documents_ids); | ||||
|             self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; | ||||
|             self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?; | ||||
|  | ||||
|             remove_docids_from_field_id_docid_facet_value( | ||||
|                 self.wtxn, | ||||
| @@ -344,6 +344,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|                 |(_fid, docid, _value)| docid, | ||||
|             )?; | ||||
|  | ||||
|             // Remove docids from the string faceted documents ids | ||||
|             let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?; | ||||
|             docids.difference_with(&self.documents_ids); | ||||
|             self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?; | ||||
|  | ||||
|             remove_docids_from_field_id_docid_facet_value( | ||||
|                 self.wtxn, | ||||
|                 field_id_docid_facet_strings, | ||||
|   | ||||
| @@ -9,7 +9,6 @@ use heed::{BytesEncode, Error}; | ||||
| use log::debug; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::facet::FacetType; | ||||
| use crate::heed_codec::CboRoaringBitmapCodec; | ||||
| use crate::heed_codec::facet::FacetLevelValueF64Codec; | ||||
| use crate::Index; | ||||
| @@ -62,56 +61,51 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||
|         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; | ||||
|  | ||||
|         debug!("Computing and writing the facet values levels docids into LMDB on disk..."); | ||||
|         for (field_id, facet_type) in faceted_fields { | ||||
|             let (content, documents_ids) = match facet_type { | ||||
|                 FacetType::String => { | ||||
|                     let documents_ids = compute_faceted_documents_ids( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids, | ||||
|                         field_id, | ||||
|                     )?; | ||||
|  | ||||
|                     (None, documents_ids) | ||||
|                 }, | ||||
|                 FacetType::Number => { | ||||
|                     clear_field_number_levels( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>(), | ||||
|                         field_id, | ||||
|                     )?; | ||||
|         for field_id in faceted_fields { | ||||
|             // Compute and store the faceted strings documents ids. | ||||
|             let string_documents_ids = compute_faceted_documents_ids( | ||||
|                 self.wtxn, | ||||
|                 self.index.facet_id_string_docids.remap_key_type::<ByteSlice>(), | ||||
|                 field_id, | ||||
|             )?; | ||||
|  | ||||
|                     let documents_ids = compute_faceted_documents_ids( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids, | ||||
|                         field_id, | ||||
|                     )?; | ||||
|             // Clear the facet number levels. | ||||
|             clear_field_number_levels( | ||||
|                 self.wtxn, | ||||
|                 self.index.facet_id_f64_docids, | ||||
|                 field_id, | ||||
|             )?; | ||||
|  | ||||
|                     let content = compute_facet_number_levels( | ||||
|                         self.wtxn, | ||||
|                         self.index.facet_field_id_value_docids.remap_key_type::<FacetLevelValueF64Codec>(), | ||||
|                         self.chunk_compression_type, | ||||
|                         self.chunk_compression_level, | ||||
|                         self.chunk_fusing_shrink_size, | ||||
|                         self.level_group_size, | ||||
|                         self.min_level_size, | ||||
|                         field_id, | ||||
|                     )?; | ||||
|             // Compute and store the faceted numbers documents ids. | ||||
|             let number_documents_ids = compute_faceted_documents_ids( | ||||
|                 self.wtxn, | ||||
|                 self.index.facet_id_f64_docids.remap_key_type::<ByteSlice>(), | ||||
|                 field_id, | ||||
|             )?; | ||||
|  | ||||
|                     (Some(content), documents_ids) | ||||
|                 }, | ||||
|             }; | ||||
|             let content = compute_facet_number_levels( | ||||
|                 self.wtxn, | ||||
|                 self.index.facet_id_f64_docids, | ||||
|                 self.chunk_compression_type, | ||||
|                 self.chunk_compression_level, | ||||
|                 self.chunk_fusing_shrink_size, | ||||
|                 self.level_group_size, | ||||
|                 self.min_level_size, | ||||
|                 field_id, | ||||
|             )?; | ||||
|  | ||||
|             if let Some(content) = content { | ||||
|                 write_into_lmdb_database( | ||||
|                     self.wtxn, | ||||
|                     *self.index.facet_field_id_value_docids.as_polymorph(), | ||||
|                     content, | ||||
|                     |_, _| anyhow::bail!("invalid facet level merging"), | ||||
|                     WriteMethod::GetMergePut, | ||||
|                 )?; | ||||
|             } | ||||
|             self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?; | ||||
|             self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?; | ||||
|  | ||||
|             self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?; | ||||
|             // Store the computed facet number levels. | ||||
|             write_into_lmdb_database( | ||||
|                 self.wtxn, | ||||
|                 *self.index.facet_id_f64_docids.as_polymorph(), | ||||
|                 content, | ||||
|                 |_, _| anyhow::bail!("invalid facet number level merging"), | ||||
|                 WriteMethod::GetMergePut, | ||||
|             )?; | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
| @@ -205,10 +199,12 @@ fn compute_faceted_documents_ids( | ||||
| ) -> anyhow::Result<RoaringBitmap> | ||||
| { | ||||
|     let mut documents_ids = RoaringBitmap::new(); | ||||
|  | ||||
|     for result in db.prefix_iter(rtxn, &[field_id])? { | ||||
|         let (_key, docids) = result?; | ||||
|         documents_ids.union_with(&docids); | ||||
|         documents_ids |= docids; | ||||
|     } | ||||
|  | ||||
|     Ok(documents_ids) | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -412,7 +412,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             Main, | ||||
|             WordDocids, | ||||
|             WordLevel0PositionDocids, | ||||
|             FacetLevel0ValuesDocids, | ||||
|             FacetLevel0NumbersDocids, | ||||
|         } | ||||
|  | ||||
|         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; | ||||
| @@ -478,8 +478,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut field_id_docid_facet_strings_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut documents_readers = Vec::with_capacity(readers.len()); | ||||
|             readers.into_iter().for_each(|readers| { | ||||
|                 let Readers { | ||||
| @@ -488,17 +490,21 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                     docid_word_positions, | ||||
|                     words_pairs_proximities_docids, | ||||
|                     word_level_position_docids, | ||||
|                     facet_field_value_docids, | ||||
|                     field_id_docid_facet_values, | ||||
|                     documents | ||||
|                     facet_field_numbers_docids, | ||||
|                     facet_field_strings_docids, | ||||
|                     field_id_docid_facet_numbers, | ||||
|                     field_id_docid_facet_strings, | ||||
|                     documents, | ||||
|                 } = readers; | ||||
|                 main_readers.push(main); | ||||
|                 word_docids_readers.push(word_docids); | ||||
|                 docid_word_positions_readers.push(docid_word_positions); | ||||
|                 words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); | ||||
|                 word_level_position_docids_readers.push(word_level_position_docids); | ||||
|                 facet_field_value_docids_readers.push(facet_field_value_docids); | ||||
|                 field_id_docid_facet_values_readers.push(field_id_docid_facet_values); | ||||
|                 facet_field_numbers_docids_readers.push(facet_field_numbers_docids); | ||||
|                 facet_field_strings_docids_readers.push(facet_field_strings_docids); | ||||
|                 field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers); | ||||
|                 field_id_docid_facet_strings_readers.push(field_id_docid_facet_strings); | ||||
|                 documents_readers.push(documents); | ||||
|             }); | ||||
|  | ||||
| @@ -523,8 +529,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                     (DatabaseType::Main, main_readers, main_merge as MergeFn), | ||||
|                     (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), | ||||
|                     ( | ||||
|                         DatabaseType::FacetLevel0ValuesDocids, | ||||
|                         facet_field_value_docids_readers, | ||||
|                         DatabaseType::FacetLevel0NumbersDocids, | ||||
|                         facet_field_numbers_docids_readers, | ||||
|                         facet_field_value_docids_merge, | ||||
|                     ), | ||||
|                     ( | ||||
| @@ -547,7 +553,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                 docid_word_positions_readers, | ||||
|                 documents_readers, | ||||
|                 words_pairs_proximities_docids_readers, | ||||
|                 field_id_docid_facet_values_readers, | ||||
|                 facet_field_numbers_docids_readers, | ||||
|                 facet_field_strings_docids_readers, | ||||
|                 field_id_docid_facet_numbers_readers, | ||||
|                 field_id_docid_facet_strings_readers, | ||||
|             )) as anyhow::Result<_> | ||||
|         })?; | ||||
|  | ||||
| @@ -556,7 +565,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             docid_word_positions_readers, | ||||
|             documents_readers, | ||||
|             words_pairs_proximities_docids_readers, | ||||
|             field_id_docid_facet_values_readers, | ||||
|             facet_field_numbers_docids_readers, | ||||
|             facet_field_strings_docids_readers, | ||||
|             field_id_docid_facet_numbers_readers, | ||||
|             field_id_docid_facet_strings_readers, | ||||
|         ) = readers; | ||||
|  | ||||
|         let mut documents_ids = self.index.documents_ids(self.wtxn)?; | ||||
| @@ -624,11 +636,26 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             total_databases, | ||||
|         }); | ||||
|  | ||||
|         debug!("Writing the field id docid facet values into LMDB on disk..."); | ||||
|         debug!("Writing the field id docid facet numbers into LMDB on disk..."); | ||||
|         merge_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.field_id_docid_facet_values.as_polymorph(), | ||||
|             field_id_docid_facet_values_readers, | ||||
|             *self.index.field_id_docid_facet_f64s.as_polymorph(), | ||||
|             field_id_docid_facet_numbers_readers, | ||||
|             field_id_docid_facet_values_merge, | ||||
|             write_method, | ||||
|         )?; | ||||
|  | ||||
|         database_count += 1; | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: database_count, | ||||
|             total_databases, | ||||
|         }); | ||||
|  | ||||
|         debug!("Writing the field id docid facet strings into LMDB on disk..."); | ||||
|         merge_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.field_id_docid_facet_strings.as_polymorph(), | ||||
|             field_id_docid_facet_strings_readers, | ||||
|             field_id_docid_facet_values_merge, | ||||
|             write_method, | ||||
|         )?; | ||||
| @@ -678,9 +705,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                         write_method, | ||||
|                     )?; | ||||
|                 }, | ||||
|                 DatabaseType::FacetLevel0ValuesDocids => { | ||||
|                     debug!("Writing the facet level 0 values docids into LMDB on disk..."); | ||||
|                     let db = *self.index.facet_field_id_value_docids.as_polymorph(); | ||||
|                 DatabaseType::FacetLevel0NumbersDocids => { | ||||
|                     debug!("Writing the facet numbers docids into LMDB on disk..."); | ||||
|                     let db = *self.index.facet_id_f64_docids.as_polymorph(); | ||||
|                     write_into_lmdb_database( | ||||
|                         self.wtxn, | ||||
|                         db, | ||||
|   | ||||
| @@ -6,25 +6,24 @@ use std::iter::FromIterator; | ||||
| use std::time::Instant; | ||||
| use std::{cmp, iter}; | ||||
|  | ||||
| use anyhow::{bail, Context}; | ||||
| use anyhow::Context; | ||||
| use bstr::ByteSlice as _; | ||||
| use fst::Set; | ||||
| use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType}; | ||||
| use heed::BytesEncode; | ||||
| use linked_hash_map::LinkedHashMap; | ||||
| use log::{debug, info, warn}; | ||||
| use log::{debug, info}; | ||||
| use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind}; | ||||
| use ordered_float::OrderedFloat; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::Value; | ||||
| use tempfile::tempfile; | ||||
|  | ||||
| use crate::facet::{FacetType, FacetValue}; | ||||
| use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec}; | ||||
| use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec}; | ||||
| use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | ||||
| use crate::update::UpdateIndexingStep; | ||||
| use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; | ||||
| use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, FieldsIdsMap}; | ||||
|  | ||||
| use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; | ||||
| use super::merge_function::{ | ||||
| @@ -45,8 +44,10 @@ pub struct Readers { | ||||
|     pub docid_word_positions: Reader<FileFuse>, | ||||
|     pub words_pairs_proximities_docids: Reader<FileFuse>, | ||||
|     pub word_level_position_docids: Reader<FileFuse>, | ||||
|     pub facet_field_value_docids: Reader<FileFuse>, | ||||
|     pub field_id_docid_facet_values: Reader<FileFuse>, | ||||
|     pub facet_field_numbers_docids: Reader<FileFuse>, | ||||
|     pub facet_field_strings_docids: Reader<FileFuse>, | ||||
|     pub field_id_docid_facet_numbers: Reader<FileFuse>, | ||||
|     pub field_id_docid_facet_strings: Reader<FileFuse>, | ||||
|     pub documents: Reader<FileFuse>, | ||||
| } | ||||
|  | ||||
| @@ -55,13 +56,14 @@ pub struct Store<'s, A> { | ||||
|     primary_key: String, | ||||
|     fields_ids_map: FieldsIdsMap, | ||||
|     searchable_fields: HashSet<FieldId>, | ||||
|     faceted_fields: HashMap<FieldId, FacetType>, | ||||
|     faceted_fields: HashSet<FieldId>, | ||||
|     // Caches | ||||
|     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>, | ||||
|     word_docids_limit: usize, | ||||
|     words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>, | ||||
|     words_pairs_proximities_docids_limit: usize, | ||||
|     facet_field_value_docids: LinkedHashMap<(u8, FacetValue), RoaringBitmap>, | ||||
|     facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>, | ||||
|     facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, | ||||
|     facet_field_value_docids_limit: usize, | ||||
|     // MTBL parameters | ||||
|     chunk_compression_type: CompressionType, | ||||
| @@ -72,8 +74,10 @@ pub struct Store<'s, A> { | ||||
|     word_docids_sorter: Sorter<MergeFn>, | ||||
|     words_pairs_proximities_docids_sorter: Sorter<MergeFn>, | ||||
|     word_level_position_docids_sorter: Sorter<MergeFn>, | ||||
|     facet_field_value_docids_sorter: Sorter<MergeFn>, | ||||
|     field_id_docid_facet_values_sorter: Sorter<MergeFn>, | ||||
|     facet_field_numbers_docids_sorter: Sorter<MergeFn>, | ||||
|     facet_field_strings_docids_sorter: Sorter<MergeFn>, | ||||
|     field_id_docid_facet_numbers_sorter: Sorter<MergeFn>, | ||||
|     field_id_docid_facet_strings_sorter: Sorter<MergeFn>, | ||||
|     // MTBL writers | ||||
|     docid_word_positions_writer: Writer<File>, | ||||
|     documents_writer: Writer<File>, | ||||
| @@ -86,7 +90,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         primary_key: String, | ||||
|         fields_ids_map: FieldsIdsMap, | ||||
|         searchable_fields: HashSet<FieldId>, | ||||
|         faceted_fields: HashMap<FieldId, FacetType>, | ||||
|         faceted_fields: HashSet<FieldId>, | ||||
|         linked_hash_map_size: Option<usize>, | ||||
|         max_nb_chunks: Option<usize>, | ||||
|         max_memory: Option<usize>, | ||||
| @@ -132,7 +136,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let facet_field_value_docids_sorter = create_sorter( | ||||
|         let facet_field_numbers_docids_sorter = create_sorter( | ||||
|             facet_field_value_docids_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
| @@ -140,7 +144,23 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let field_id_docid_facet_values_sorter = create_sorter( | ||||
|         let facet_field_strings_docids_sorter = create_sorter( | ||||
|             facet_field_value_docids_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let field_id_docid_facet_numbers_sorter = create_sorter( | ||||
|             field_id_docid_facet_values_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             Some(1024 * 1024 * 1024), // 1GiB | ||||
|         ); | ||||
|         let field_id_docid_facet_strings_sorter = create_sorter( | ||||
|             field_id_docid_facet_values_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
| @@ -173,7 +193,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             word_docids_limit: linked_hash_map_size, | ||||
|             words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), | ||||
|             words_pairs_proximities_docids_limit: linked_hash_map_size, | ||||
|             facet_field_value_docids: LinkedHashMap::with_capacity(linked_hash_map_size), | ||||
|             facet_field_number_docids: LinkedHashMap::with_capacity(linked_hash_map_size), | ||||
|             facet_field_string_docids: LinkedHashMap::with_capacity(linked_hash_map_size), | ||||
|             facet_field_value_docids_limit: linked_hash_map_size, | ||||
|             // MTBL parameters | ||||
|             chunk_compression_type, | ||||
| @@ -184,8 +205,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             word_docids_sorter, | ||||
|             words_pairs_proximities_docids_sorter, | ||||
|             word_level_position_docids_sorter, | ||||
|             facet_field_value_docids_sorter, | ||||
|             field_id_docid_facet_values_sorter, | ||||
|             facet_field_numbers_docids_sorter, | ||||
|             facet_field_strings_docids_sorter, | ||||
|             field_id_docid_facet_numbers_sorter, | ||||
|             field_id_docid_facet_strings_sorter, | ||||
|             // MTBL writers | ||||
|             docid_word_positions_writer, | ||||
|             documents_writer, | ||||
| @@ -215,34 +238,68 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     // Save the document id under the facet field id and value where we have seen it. | ||||
|     fn insert_facet_values_docid( | ||||
|     fn insert_facet_number_values_docid( | ||||
|         &mut self, | ||||
|         field_id: FieldId, | ||||
|         field_value: FacetValue, | ||||
|         value: OrderedFloat<f64>, | ||||
|         id: DocumentId, | ||||
|     ) -> anyhow::Result<()> | ||||
|     { | ||||
|         Self::write_field_id_docid_facet_value(&mut self.field_id_docid_facet_values_sorter, field_id, id, &field_value)?; | ||||
|         let sorter = &mut self.field_id_docid_facet_numbers_sorter; | ||||
|         Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?; | ||||
|  | ||||
|         let key = (field_id, field_value); | ||||
|         let key = (field_id, value); | ||||
|         // if get_refresh finds the element it is assured to be at the end of the linked hash map. | ||||
|         match self.facet_field_value_docids.get_refresh(&key) { | ||||
|         match self.facet_field_number_docids.get_refresh(&key) { | ||||
|             Some(old) => { old.insert(id); }, | ||||
|             None => { | ||||
|                 // A newly inserted element is appended at the end of the linked hash map. | ||||
|                 self.facet_field_value_docids.insert(key, RoaringBitmap::from_iter(Some(id))); | ||||
|                 self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id))); | ||||
|                 // If the facet docids cache just reached its capacity we must make sure to | ||||
|                 // remove one element, this way the next time we insert we don't grow the capacity. | ||||
|                 if self.facet_field_value_docids.len() == self.facet_field_value_docids_limit { | ||||
|                 if self.facet_field_number_docids.len() == self.facet_field_value_docids_limit { | ||||
|                     // Removing the front element is equivalent to removing the LRU element. | ||||
|                     Self::write_facet_field_value_docids( | ||||
|                         &mut self.facet_field_value_docids_sorter, | ||||
|                         self.facet_field_value_docids.pop_front(), | ||||
|                     Self::write_facet_field_number_docids( | ||||
|                         &mut self.facet_field_numbers_docids_sorter, | ||||
|                         self.facet_field_number_docids.pop_front(), | ||||
|                     )?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     // Save the document id under the facet field id and value where we have seen it. | ||||
|     fn insert_facet_string_values_docid( | ||||
|         &mut self, | ||||
|         field_id: FieldId, | ||||
|         value: String, | ||||
|         id: DocumentId, | ||||
|     ) -> anyhow::Result<()> | ||||
|     { | ||||
|         let sorter = &mut self.field_id_docid_facet_strings_sorter; | ||||
|         Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; | ||||
|  | ||||
|         let key = (field_id, value); | ||||
|         // if get_refresh finds the element it is assured to be at the end of the linked hash map. | ||||
|         match self.facet_field_string_docids.get_refresh(&key) { | ||||
|             Some(old) => { old.insert(id); }, | ||||
|             None => { | ||||
|                 // A newly inserted element is appended at the end of the linked hash map. | ||||
|                 self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); | ||||
|                 // If the facet docids cache just reached its capacity we must make sure to | ||||
|                 // remove one element, this way the next time we insert we don't grow the capacity. | ||||
|                 if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit { | ||||
|                     // Removing the front element is equivalent to removing the LRU element. | ||||
|                     Self::write_facet_field_string_docids( | ||||
|                         &mut self.facet_field_strings_docids_sorter, | ||||
|                         self.facet_field_string_docids.pop_front(), | ||||
|                     )?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
| @@ -287,7 +344,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         &mut self, | ||||
|         document_id: DocumentId, | ||||
|         words_positions: &mut HashMap<String, SmallVec32<Position>>, | ||||
|         facet_values: &mut HashMap<FieldId, SmallVec8<FacetValue>>, | ||||
|         facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>, | ||||
|         facet_strings_values: &mut HashMap<FieldId, Vec<String>>, | ||||
|         record: &[u8], | ||||
|     ) -> anyhow::Result<()> | ||||
|     { | ||||
| @@ -306,10 +364,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|  | ||||
|         words_positions.clear(); | ||||
|  | ||||
|         // We store document_id associated with all the field id and values. | ||||
|         for (field, values) in facet_values.drain() { | ||||
|         // We store document_id associated with all the facet numbers fields ids and values. | ||||
|         for (field, values) in facet_numbers_values.drain() { | ||||
|             for value in values { | ||||
|                 self.insert_facet_values_docid(field, value, document_id)?; | ||||
|                 let value = OrderedFloat::from(value); | ||||
|                 self.insert_facet_number_values_docid(field, value, document_id)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // We store document_id associated with all the facet strings fields ids and values. | ||||
|         for (field, values) in facet_strings_values.drain() { | ||||
|             for value in values { | ||||
|                 self.insert_facet_string_values_docid(field, value, document_id)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
| @@ -409,20 +475,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_facet_field_value_docids<I>( | ||||
|     fn write_facet_field_string_docids<I>( | ||||
|         sorter: &mut Sorter<MergeFn>, | ||||
|         iter: I, | ||||
|     ) -> anyhow::Result<()> | ||||
|     where I: IntoIterator<Item=((FieldId, FacetValue), RoaringBitmap)> | ||||
|     where I: IntoIterator<Item=((FieldId, String), RoaringBitmap)> | ||||
|     { | ||||
|         use FacetValue::*; | ||||
|  | ||||
|         for ((field_id, value), docids) in iter { | ||||
|             let result = match value { | ||||
|                 String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), | ||||
|                 Number(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned), | ||||
|             }; | ||||
|             let key = result.context("could not serialize facet key")?; | ||||
|             let key = FacetValueStringCodec::bytes_encode(&(field_id, &value)) | ||||
|                 .map(Cow::into_owned) | ||||
|                 .context("could not serialize facet key")?; | ||||
|             let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) | ||||
|                 .context("could not serialize docids")?; | ||||
|             if lmdb_key_valid_size(&key) { | ||||
| @@ -433,21 +495,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_field_id_docid_facet_value( | ||||
|     fn write_facet_field_number_docids<I>( | ||||
|         sorter: &mut Sorter<MergeFn>, | ||||
|         iter: I, | ||||
|     ) -> anyhow::Result<()> | ||||
|     where I: IntoIterator<Item=((FieldId, OrderedFloat<f64>), RoaringBitmap)> | ||||
|     { | ||||
|         for ((field_id, value), docids) in iter { | ||||
|             let key = FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *value, *value)) | ||||
|                 .map(Cow::into_owned) | ||||
|                 .context("could not serialize facet key")?; | ||||
|             let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) | ||||
|                 .context("could not serialize docids")?; | ||||
|             if lmdb_key_valid_size(&key) { | ||||
|                 sorter.insert(&key, &bytes)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_field_id_docid_facet_number_value( | ||||
|         sorter: &mut Sorter<MergeFn>, | ||||
|         field_id: FieldId, | ||||
|         document_id: DocumentId, | ||||
|         value: &FacetValue, | ||||
|         value: OrderedFloat<f64>, | ||||
|     ) -> anyhow::Result<()> | ||||
|     { | ||||
|         use FacetValue::*; | ||||
|         let key = FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, *value)) | ||||
|             .map(Cow::into_owned) | ||||
|             .context("could not serialize facet key")?; | ||||
|  | ||||
|         let result = match value { | ||||
|             String(s) => FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, s)).map(Cow::into_owned), | ||||
|             Number(f) => FieldDocIdFacetF64Codec::bytes_encode(&(field_id, document_id, **f)).map(Cow::into_owned), | ||||
|         }; | ||||
|         if lmdb_key_valid_size(&key) { | ||||
|             sorter.insert(&key, &[])?; | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_field_id_docid_facet_string_value( | ||||
|         sorter: &mut Sorter<MergeFn>, | ||||
|         field_id: FieldId, | ||||
|         document_id: DocumentId, | ||||
|         value: &str, | ||||
|     ) -> anyhow::Result<()> | ||||
|     { | ||||
|         let key = FieldDocIdFacetStringCodec::bytes_encode(&(field_id, document_id, value)) | ||||
|             .map(Cow::into_owned) | ||||
|             .context("could not serialize facet key")?; | ||||
|  | ||||
|         let key = result.context("could not serialize facet key")?; | ||||
|         if lmdb_key_valid_size(&key) { | ||||
|             sorter.insert(&key, &[])?; | ||||
|         } | ||||
| @@ -493,7 +589,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|  | ||||
|         let mut before = Instant::now(); | ||||
|         let mut words_positions = HashMap::new(); | ||||
|         let mut facet_values = HashMap::new(); | ||||
|         let mut facet_numbers_values = HashMap::new(); | ||||
|         let mut facet_strings_values = HashMap::new(); | ||||
|  | ||||
|         let mut count: usize = 0; | ||||
|         while let Some((key, value)) = documents.next()? { | ||||
| @@ -513,32 +610,12 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|                 } | ||||
|  | ||||
|                 for (attr, content) in document.iter() { | ||||
|                     if self.faceted_fields.contains_key(&attr) || self.searchable_fields.contains(&attr) { | ||||
|                     if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) { | ||||
|                         let value = serde_json::from_slice(content)?; | ||||
|  | ||||
|                         if let Some(ftype) = self.faceted_fields.get(&attr) { | ||||
|                             let mut values = match parse_facet_value(*ftype, &value) { | ||||
|                                 Ok(values) => values, | ||||
|                                 Err(e) => { | ||||
|                                     // We extract the name of the attribute and the document id | ||||
|                                     // to help users debug a facet type conversion. | ||||
|                                     let attr_name = self.fields_ids_map.name(attr).unwrap(); | ||||
|                                     let document_id: Value = self.fields_ids_map.id(&self.primary_key) | ||||
|                                         .and_then(|fid| document.get(fid)) | ||||
|                                         .map(serde_json::from_slice) | ||||
|                                         .unwrap()?; | ||||
|  | ||||
|                                     let context = format!( | ||||
|                                         "while extracting facet from the {:?} attribute in the {} document", | ||||
|                                         attr_name, document_id, | ||||
|                                     ); | ||||
|                                     warn!("{}", e.context(context)); | ||||
|  | ||||
|                                     SmallVec8::default() | ||||
|                                 }, | ||||
|                             }; | ||||
|                             facet_values.entry(attr).or_insert_with(SmallVec8::new).extend(values.drain(..)); | ||||
|                         } | ||||
|                         let (facet_numbers, facet_strings) = extract_facet_values(&value); | ||||
|                         facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers); | ||||
|                         facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings); | ||||
|  | ||||
|                         if self.searchable_fields.contains(&attr) { | ||||
|                             let content = match json_to_string(&value) { | ||||
| @@ -558,7 +635,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|                 } | ||||
|  | ||||
|                 // We write the document in the documents store. | ||||
|                 self.write_document(document_id, &mut words_positions, &mut facet_values, value)?; | ||||
|                 self.write_document( | ||||
|                     document_id, | ||||
|                     &mut words_positions, | ||||
|                     &mut facet_numbers_values, | ||||
|                     &mut facet_strings_values, | ||||
|                     value, | ||||
|                 )?; | ||||
|             } | ||||
|  | ||||
|             // Compute the document id of the next document. | ||||
| @@ -585,9 +668,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             &mut self.words_pairs_proximities_docids_sorter, | ||||
|             self.words_pairs_proximities_docids, | ||||
|         )?; | ||||
|         Self::write_facet_field_value_docids( | ||||
|             &mut self.facet_field_value_docids_sorter, | ||||
|             self.facet_field_value_docids, | ||||
|         Self::write_facet_field_number_docids( | ||||
|             &mut self.facet_field_numbers_docids_sorter, | ||||
|             self.facet_field_number_docids, | ||||
|         )?; | ||||
|  | ||||
|         Self::write_facet_field_string_docids( | ||||
|             &mut self.facet_field_strings_docids_sorter, | ||||
|             self.facet_field_string_docids, | ||||
|         )?; | ||||
|  | ||||
|         let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
| @@ -613,18 +701,26 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; | ||||
|  | ||||
|         let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; | ||||
|         let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; | ||||
|  | ||||
|         let mut field_id_docid_facet_values_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.field_id_docid_facet_values_sorter.write_into(&mut field_id_docid_facet_values_wtr)?; | ||||
|         let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?; | ||||
|  | ||||
|         let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?; | ||||
|  | ||||
|         let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?; | ||||
|  | ||||
|         let main = writer_into_reader(main_wtr, shrink_size)?; | ||||
|         let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; | ||||
|         let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; | ||||
|         let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; | ||||
|         let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; | ||||
|         let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; | ||||
|         let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; | ||||
|         let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; | ||||
|         let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; | ||||
|         let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?; | ||||
|         let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; | ||||
|         let documents = writer_into_reader(self.documents_writer, shrink_size)?; | ||||
|  | ||||
| @@ -634,8 +730,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             docid_word_positions, | ||||
|             words_pairs_proximities_docids, | ||||
|             word_level_position_docids, | ||||
|             facet_field_value_docids, | ||||
|             field_id_docid_facet_values, | ||||
|             facet_field_numbers_docids, | ||||
|             facet_field_strings_docids, | ||||
|             field_id_docid_facet_numbers, | ||||
|             field_id_docid_facet_strings, | ||||
|             documents, | ||||
|         }) | ||||
|     } | ||||
| @@ -710,71 +808,36 @@ fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator< | ||||
|     .filter(|(_, t)| t.is_word()) | ||||
| } | ||||
|  | ||||
| fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> { | ||||
|     use FacetValue::*; | ||||
|  | ||||
|     fn inner_parse_facet_value( | ||||
|         ftype: FacetType, | ||||
| fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) { | ||||
|     fn inner_extract_facet_values( | ||||
|         value: &Value, | ||||
|         can_recurse: bool, | ||||
|         output: &mut SmallVec8<FacetValue>, | ||||
|     ) -> anyhow::Result<()> | ||||
|     { | ||||
|         output_numbers: &mut Vec<f64>, | ||||
|         output_strings: &mut Vec<String>, | ||||
|     ) { | ||||
|         match value { | ||||
|             Value::Null => Ok(()), | ||||
|             Value::Bool(b) => match ftype { | ||||
|                 FacetType::String => { | ||||
|                     output.push(String(b.to_string())); | ||||
|                     Ok(()) | ||||
|                 }, | ||||
|                 FacetType::Number => { | ||||
|                     output.push(Number(OrderedFloat(if *b { 1.0 } else { 0.0 }))); | ||||
|                     Ok(()) | ||||
|                 }, | ||||
|             }, | ||||
|             Value::Number(number) => match ftype { | ||||
|                 FacetType::String => { | ||||
|                     output.push(String(number.to_string())); | ||||
|                     Ok(()) | ||||
|                 }, | ||||
|                 FacetType::Number => match number.as_f64() { | ||||
|                     Some(float) => { | ||||
|                         output.push(Number(OrderedFloat(float))); | ||||
|                         Ok(()) | ||||
|                     }, | ||||
|                     None => bail!("invalid facet type, expecting {} found number", ftype), | ||||
|                 }, | ||||
|             Value::Null => (), | ||||
|             Value::Bool(b) => output_strings.push(b.to_string()), | ||||
|             Value::Number(number) => if let Some(float) = number.as_f64() { | ||||
|                 output_numbers.push(float); | ||||
|             }, | ||||
|             Value::String(string) => { | ||||
|                 // TODO must be normalized and not only lowercased. | ||||
|                 let string = string.trim().to_lowercase(); | ||||
|                 match ftype { | ||||
|                     FacetType::String => { | ||||
|                         output.push(String(string)); | ||||
|                         Ok(()) | ||||
|                     }, | ||||
|                     FacetType::Number => match string.parse() { | ||||
|                         Ok(float) => { | ||||
|                             output.push(Number(OrderedFloat(float))); | ||||
|                             Ok(()) | ||||
|                         }, | ||||
|                         Err(_err) => bail!("invalid facet type, expecting {} found string", ftype), | ||||
|                     }, | ||||
|                 } | ||||
|                 output_strings.push(string); | ||||
|             }, | ||||
|             Value::Array(values) => if can_recurse { | ||||
|                 values.iter().map(|v| inner_parse_facet_value(ftype, v, false, output)).collect() | ||||
|             } else { | ||||
|                 bail!( | ||||
|                     "invalid facet type, expecting {} found array (recursive arrays are not supported)", | ||||
|                     ftype, | ||||
|                 ); | ||||
|                 for value in values { | ||||
|                     inner_extract_facet_values(value, false, output_numbers, output_strings); | ||||
|                 } | ||||
|             }, | ||||
|             Value::Object(_) => bail!("invalid facet type, expecting {} found object", ftype), | ||||
|             Value::Object(_) => (), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     let mut facet_values = SmallVec8::new(); | ||||
|     inner_parse_facet_value(ftype, value, true, &mut facet_values)?; | ||||
|     Ok(facet_values) | ||||
|     let mut facet_number_values = Vec::new(); | ||||
|     let mut facet_string_values = Vec::new(); | ||||
|     inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); | ||||
|  | ||||
|     (facet_number_values, facet_string_values) | ||||
| } | ||||
|   | ||||
| @@ -1,4 +1,4 @@ | ||||
| use std::collections::{BTreeSet, HashMap}; | ||||
| use std::collections::{BTreeSet, HashMap, HashSet}; | ||||
| use std::str::FromStr; | ||||
|  | ||||
| use anyhow::Context; | ||||
| @@ -11,7 +11,6 @@ use serde::{Deserialize, Deserializer, Serialize, Serializer}; | ||||
|  | ||||
| use crate::{FieldsIdsMap, Index}; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::facet::FacetType; | ||||
| use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; | ||||
| use crate::update::index_documents::{IndexDocumentsMethod, Transform}; | ||||
|  | ||||
| @@ -68,7 +67,7 @@ pub struct Settings<'a, 't, 'u, 'i> { | ||||
|  | ||||
|     searchable_fields: Setting<Vec<String>>, | ||||
|     displayed_fields: Setting<Vec<String>>, | ||||
|     faceted_fields: Setting<HashMap<String, String>>, | ||||
|     faceted_fields: Setting<HashSet<String>>, | ||||
|     criteria: Setting<Vec<String>>, | ||||
|     stop_words: Setting<BTreeSet<String>>, | ||||
|     distinct_attribute: Setting<String>, | ||||
| @@ -123,7 +122,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|         self.faceted_fields = Setting::Reset; | ||||
|     } | ||||
|  | ||||
|     pub fn set_faceted_fields(&mut self, names_facet_types: HashMap<String, String>) { | ||||
|     pub fn set_faceted_fields(&mut self, names_facet_types: HashSet<String>) { | ||||
|         self.faceted_fields = Setting::Set(names_facet_types); | ||||
|     } | ||||
|  | ||||
| @@ -387,11 +386,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|         match self.faceted_fields { | ||||
|             Setting::Set(ref fields) => { | ||||
|                 let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; | ||||
|                 let mut new_facets = HashMap::new(); | ||||
|                 for (name, ty) in fields { | ||||
|                 let mut new_facets = HashSet::new(); | ||||
|                 for name in fields { | ||||
|                     fields_ids_map.insert(name).context("field id limit exceeded")?; | ||||
|                     let ty = FacetType::from_str(&ty)?; | ||||
|                     new_facets.insert(name.clone(), ty); | ||||
|                     new_facets.insert(name.clone()); | ||||
|                 } | ||||
|                 self.index.put_faceted_fields(self.wtxn, &new_facets)?; | ||||
|                 self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user