mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Merge #202
202: Add field id word count docids database r=Kerollmops a=LegendreM This PR introduces a new database, `field_id_word_count_docids`, that maps the number of words in an attribute to a list of document ids. This relation is limited to attributes that contain fewer than 11 words. This database is used by the exactness criterion to know whether a document has an attribute that contains exactly the query without any additional word. Fix #165 Fix #196 Related to [specifications:#36](https://github.com/meilisearch/specifications/pull/36) Co-authored-by: many <maxime@meilisearch.com> Co-authored-by: Many <legendre.maxime.isn@gmail.com>
This commit is contained in:
		| @@ -23,6 +23,7 @@ const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; | ||||
| const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; | ||||
| const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; | ||||
| const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; | ||||
| const FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME: &str = "field-id-word-count-docids"; | ||||
| const FACET_ID_F64_DOCIDS_DB_NAME: &str = "facet-id-f64-docids"; | ||||
| const FACET_ID_STRING_DOCIDS_DB_NAME: &str = "facet-id-string-docids"; | ||||
| const FIELD_ID_DOCID_FACET_F64S_DB_NAME: &str = "field-id-docid-facet-f64s"; | ||||
| @@ -39,6 +40,7 @@ const ALL_DATABASE_NAMES: &[&str] = &[ | ||||
|     WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, | ||||
|     WORD_LEVEL_POSITION_DOCIDS_DB_NAME, | ||||
|     WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, | ||||
|     FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME, | ||||
|     FACET_ID_F64_DOCIDS_DB_NAME, | ||||
|     FACET_ID_STRING_DOCIDS_DB_NAME, | ||||
|     FIELD_ID_DOCID_FACET_F64S_DB_NAME, | ||||
| @@ -155,6 +157,17 @@ enum Command { | ||||
|         prefixes: Vec<String>, | ||||
|     }, | ||||
|  | ||||
|     /// Outputs a CSV with the documents ids along with | ||||
|     /// the field id and the word count where it appears. | ||||
|     FieldIdWordCountDocids { | ||||
|         /// Display the whole documents ids in details. | ||||
|         #[structopt(long)] | ||||
|         full_display: bool, | ||||
|  | ||||
|         /// The field name in the document. | ||||
|         field_name: String, | ||||
|     }, | ||||
|  | ||||
|     /// Outputs a CSV with the documents ids, words and the positions where this word appears. | ||||
|     DocidsWordsPositions { | ||||
|         /// Display the whole positions in detail. | ||||
| @@ -271,6 +284,9 @@ fn main() -> anyhow::Result<()> { | ||||
|         WordPrefixesLevelPositionsDocids { full_display, prefixes } => { | ||||
|             word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) | ||||
|         }, | ||||
|         FieldIdWordCountDocids { full_display, field_name } => { | ||||
|             field_id_word_count_docids(&index, &rtxn, !full_display, field_name) | ||||
|         }, | ||||
|         DocidsWordsPositions { full_display, internal_documents_ids } => { | ||||
|             docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) | ||||
|         }, | ||||
| @@ -357,6 +373,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | ||||
|         word_prefix_pair_proximity_docids, | ||||
|         word_level_position_docids, | ||||
|         word_prefix_level_position_docids, | ||||
|         field_id_word_count_docids, | ||||
|         facet_id_f64_docids, | ||||
|         facet_id_string_docids, | ||||
|         field_id_docid_facet_f64s: _, | ||||
| @@ -372,6 +389,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | ||||
|     let word_pair_proximity_docids_name = "word_pair_proximity_docids"; | ||||
|     let word_level_position_docids_name = "word_level_position_docids"; | ||||
|     let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; | ||||
|     let field_id_word_count_docids_name = "field_id_word_count_docids"; | ||||
|     let facet_id_f64_docids_name = "facet_id_f64_docids"; | ||||
|     let facet_id_string_docids_name = "facet_id_string_docids"; | ||||
|     let documents_name = "documents"; | ||||
| @@ -443,6 +461,13 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|  | ||||
|         for result in field_id_word_count_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||
|             let ((field_id, word_count), docids) = result?; | ||||
|             let key = format!("{} {}", field_id, word_count); | ||||
|             heap.push(Reverse((docids.len(), key, field_id_word_count_docids_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|  | ||||
|         let faceted_fields = index.faceted_fields_ids(rtxn)?; | ||||
|         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||
|  | ||||
| @@ -676,6 +701,39 @@ fn word_prefixes_level_positions_docids( | ||||
|     Ok(wtr.flush()?) | ||||
| } | ||||
|  | ||||
| /// Writes to stdout a CSV listing, for one field, the documents ids stored | ||||
| /// under each word count in the `field_id_word_count_docids` database. | ||||
| /// | ||||
| /// The CSV header is `field_name,word_count,docids`. When `debug` is true the | ||||
| /// docids column holds the `Debug` output of the stored bitmap, otherwise it | ||||
| /// holds the `Debug` output of the complete list of ids. | ||||
| /// | ||||
| /// # Errors | ||||
| /// | ||||
| /// Fails with `unknown field name: <name>` when `field_name` has no id in the | ||||
| /// fields ids map, and forwards any database or CSV writing error. | ||||
| fn field_id_word_count_docids( | ||||
|     index: &Index, | ||||
|     rtxn: &heed::RoTxn, | ||||
|     debug: bool, | ||||
|     field_name: String, | ||||
| ) -> anyhow::Result<()> { | ||||
|     let stdout = io::stdout(); | ||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||
|     wtr.write_record(&["field_name", "word_count", "docids"])?; | ||||
|  | ||||
|     let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||
|     let field_id = fields_ids_map | ||||
|         .id(&field_name) | ||||
|         .with_context(|| format!("unknown field name: {}", &field_name))?; | ||||
|  | ||||
|     // Restrict the scan to the keys of this field, whatever their word count. | ||||
|     let bounds = (field_id, 0)..=(field_id, u8::max_value()); | ||||
|     for entry in index.field_id_word_count_docids.range(rtxn, &bounds)? { | ||||
|         let ((_, word_count), docids) = entry?; | ||||
|         let docids = if debug { | ||||
|             format!("{:?}", docids) | ||||
|         } else { | ||||
|             let ids: Vec<_> = docids.iter().collect(); | ||||
|             format!("{:?}", ids) | ||||
|         }; | ||||
|         let word_count = word_count.to_string(); | ||||
|         wtr.write_record(&[field_name.as_str(), word_count.as_str(), docids.as_str()])?; | ||||
|     } | ||||
|  | ||||
|     wtr.flush()?; | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| fn docids_words_positions( | ||||
|     index: &Index, | ||||
|     rtxn: &heed::RoTxn, | ||||
| @@ -870,6 +928,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a | ||||
|         word_prefix_pair_proximity_docids, | ||||
|         word_level_position_docids, | ||||
|         word_prefix_level_position_docids, | ||||
|         field_id_word_count_docids, | ||||
|         facet_id_f64_docids, | ||||
|         facet_id_string_docids, | ||||
|         field_id_docid_facet_f64s, | ||||
| @@ -893,6 +952,7 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a | ||||
|             WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), | ||||
|             WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), | ||||
|             WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), | ||||
|             FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => field_id_word_count_docids.as_polymorph(), | ||||
|             FACET_ID_F64_DOCIDS_DB_NAME => facet_id_f64_docids.as_polymorph(), | ||||
|             FACET_ID_STRING_DOCIDS_DB_NAME => facet_id_string_docids.as_polymorph(), | ||||
|             FIELD_ID_DOCID_FACET_F64S_DB_NAME => field_id_docid_facet_f64s.as_polymorph(), | ||||
| @@ -999,6 +1059,10 @@ fn database_stats(index: &Index, rtxn: &heed::RoTxn, name: &str) -> anyhow::Resu | ||||
|             let db = index.word_prefix_pair_proximity_docids.as_polymorph(); | ||||
|             compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) | ||||
|         }, | ||||
|         FIELD_ID_WORD_COUNT_DOCIDS_DB_NAME => { | ||||
|             let db = index.field_id_word_count_docids.as_polymorph(); | ||||
|             compute_stats::<CboRoaringBitmapCodec>(*db, rtxn, name) | ||||
|         }, | ||||
|         unknown => anyhow::bail!("unknown database {:?}", unknown), | ||||
|     } | ||||
| } | ||||
|   | ||||
							
								
								
									
										22
									
								
								milli/src/heed_codec/field_id_word_count_codec.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								milli/src/heed_codec/field_id_word_count_codec.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,22 @@ | ||||
| use std::{borrow::Cow, convert::TryInto}; | ||||
|  | ||||
| use crate::FieldId; | ||||
|  | ||||
| /// Codec for `(field id, word count)` keys. | ||||
| /// | ||||
| /// A key is stored on exactly two bytes: the field id followed by the word count. | ||||
| pub struct FieldIdWordCountCodec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec { | ||||
|     type DItem = (FieldId, u8); | ||||
|  | ||||
|     /// Reads a `(field id, word count)` pair back from `bytes`. | ||||
|     /// | ||||
|     /// Returns `None` when `bytes` is not exactly two bytes long. | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let pair: [u8; 2] = bytes.try_into().ok()?; | ||||
|         let [field_id, word_count] = pair; | ||||
|         Some((field_id, word_count)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec { | ||||
|     type EItem = (FieldId, u8); | ||||
|  | ||||
|     /// Writes the field id then the word count, one byte each; always returns `Some`. | ||||
|     fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let bytes = vec![*field_id, *word_count]; | ||||
|         Some(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
| @@ -4,6 +4,7 @@ mod roaring_bitmap; | ||||
| mod roaring_bitmap_length; | ||||
| mod str_level_position_codec; | ||||
| mod str_str_u8_codec; | ||||
| mod field_id_word_count_codec; | ||||
| pub mod facet; | ||||
|  | ||||
| pub use self::beu32_str_codec::BEU32StrCodec; | ||||
| @@ -12,3 +13,4 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar | ||||
| pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; | ||||
| pub use self::str_level_position_codec::StrLevelPositionCodec; | ||||
| pub use self::str_str_u8_codec::StrStrU8Codec; | ||||
| pub use self::field_id_word_count_codec::FieldIdWordCountCodec; | ||||
|   | ||||
| @@ -13,6 +13,7 @@ use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId}; | ||||
| use crate::{ | ||||
|     BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, | ||||
|     ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, | ||||
|     FieldIdWordCountCodec, | ||||
| }; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, | ||||
| @@ -63,6 +64,8 @@ pub struct Index { | ||||
|  | ||||
|     /// Maps the word, level and position range with the docids that corresponds to it. | ||||
|     pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the field id and the word count with the docids that corresponds to it. | ||||
|     pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the level positions of a word prefix with all the docids where this prefix appears. | ||||
|     pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>, | ||||
|  | ||||
| @@ -82,7 +85,7 @@ pub struct Index { | ||||
|  | ||||
| impl Index { | ||||
|     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> { | ||||
|         options.max_dbs(13); | ||||
|         options.max_dbs(14); | ||||
|  | ||||
|         let env = options.open(path)?; | ||||
|         let main = env.create_poly_database(Some("main"))?; | ||||
| @@ -92,6 +95,7 @@ impl Index { | ||||
|         let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; | ||||
|         let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; | ||||
|         let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; | ||||
|         let field_id_word_count_docids = env.create_database(Some("field-id-word-count-docids"))?; | ||||
|         let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; | ||||
|         let facet_id_f64_docids = env.create_database(Some("facet-id-f64-docids"))?; | ||||
|         let facet_id_string_docids = env.create_database(Some("facet-id-string-docids"))?; | ||||
| @@ -111,6 +115,7 @@ impl Index { | ||||
|             word_prefix_pair_proximity_docids, | ||||
|             word_level_position_docids, | ||||
|             word_prefix_level_position_docids, | ||||
|             field_id_word_count_docids, | ||||
|             facet_id_f64_docids, | ||||
|             facet_id_string_docids, | ||||
|             field_id_docid_facet_f64s, | ||||
|   | ||||
| @@ -23,7 +23,7 @@ use serde_json::{Map, Value}; | ||||
| pub use self::criterion::{Criterion, default_criteria}; | ||||
| pub use self::external_documents_ids::ExternalDocumentsIds; | ||||
| pub use self::fields_ids_map::FieldsIdsMap; | ||||
| pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec}; | ||||
| pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec, FieldIdWordCountCodec}; | ||||
| pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | ||||
| pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; | ||||
| pub use self::index::Index; | ||||
|   | ||||
| @@ -1,9 +1,10 @@ | ||||
| use std::convert::TryFrom; | ||||
| use std::mem::take; | ||||
| use std::ops::BitOr; | ||||
|  | ||||
| use log::debug; | ||||
| use roaring::RoaringBitmap; | ||||
| use itertools::Itertools; | ||||
| use std::ops::BitOr; | ||||
|  | ||||
| use crate::search::query_tree::{Operation, PrimitiveQueryPart}; | ||||
| use crate::search::criteria::{ | ||||
| @@ -162,23 +163,24 @@ fn resolve_state( | ||||
|     use State::*; | ||||
|     match state { | ||||
|         ExactAttribute(mut allowed_candidates) => { | ||||
|             let query_len = query.len() as u32; | ||||
|             let mut candidates = RoaringBitmap::new(); | ||||
|             let attributes_ids = ctx.searchable_fields_ids()?; | ||||
|             for id in attributes_ids { | ||||
|                 if let Some(attribute_allowed_docids) = ctx.field_id_len_docids(id, query_len)? { | ||||
|                     let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; | ||||
|                     attribute_candidates_array.push(attribute_allowed_docids); | ||||
|                     candidates |= intersection_of(attribute_candidates_array.iter().collect()); | ||||
|             if let Ok(query_len) = u8::try_from(query.len()) { | ||||
|                 let attributes_ids = ctx.searchable_fields_ids()?; | ||||
|                 for id in attributes_ids { | ||||
|                     if let Some(attribute_allowed_docids) = ctx.field_id_word_count_docids(id, query_len)? { | ||||
|                         let mut attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; | ||||
|                         attribute_candidates_array.push(attribute_allowed_docids); | ||||
|                         candidates |= intersection_of(attribute_candidates_array.iter().collect()); | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 // only keep allowed candidates | ||||
|                 candidates &= &allowed_candidates; | ||||
|                 // remove current candidates from allowed candidates | ||||
|                 allowed_candidates -= &candidates; | ||||
|             } | ||||
|  | ||||
|             // only keep allowed candidates | ||||
|             candidates &= &allowed_candidates; | ||||
|             // remove current candidates from allowed candidates | ||||
|             allowed_candidates -= &candidates; | ||||
|             Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) | ||||
|  | ||||
|         }, | ||||
|         AttributeStartsWith(mut allowed_candidates) => { | ||||
|             let mut candidates = RoaringBitmap::new(); | ||||
|   | ||||
| @@ -78,7 +78,7 @@ pub trait Context<'c> { | ||||
|     fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>>; | ||||
|     fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>; | ||||
|     fn searchable_fields_ids(&self) ->  heed::Result<Vec<FieldId>>; | ||||
|     fn field_id_len_docids(&self, field_id: FieldId, len: u32) -> heed::Result<Option<RoaringBitmap>>; | ||||
|     fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>>; | ||||
|     fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result<Option<RoaringBitmap>, heed::Error>; | ||||
| } | ||||
| pub struct CriteriaBuilder<'t> { | ||||
| @@ -181,8 +181,9 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn field_id_len_docids(&self, _field_id: FieldId, _len: u32) -> heed::Result<Option<RoaringBitmap>> { | ||||
|         Ok(None) | ||||
|     /// Looks up the documents ids stored in the index under the | ||||
|     /// `(field_id, word_count)` key, `None` when the key is absent. | ||||
|     fn field_id_word_count_docids(&self, field_id: FieldId, word_count: u8) -> heed::Result<Option<RoaringBitmap>> { | ||||
|         self.index.field_id_word_count_docids.get(self.rtxn, &(field_id, word_count)) | ||||
|     } | ||||
|  | ||||
|     fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result<Option<RoaringBitmap>, heed::Error> { | ||||
| @@ -488,7 +489,7 @@ pub mod test { | ||||
|             todo!() | ||||
|         } | ||||
|  | ||||
|         fn field_id_len_docids(&self, _field_id: FieldId, _len: u32) -> heed::Result<Option<RoaringBitmap>> { | ||||
|         // Stub for the test context: not implemented, panics through `todo!()` if called. | ||||
|         fn field_id_word_count_docids(&self, _field_id: FieldId, _word_count: u8) -> heed::Result<Option<RoaringBitmap>> { | ||||
|             todo!() | ||||
|         } | ||||
|     } | ||||
|   | ||||
| @@ -29,6 +29,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|             word_pair_proximity_docids, | ||||
|             word_prefix_pair_proximity_docids, | ||||
|             word_level_position_docids, | ||||
|             field_id_word_count_docids, | ||||
|             word_prefix_level_position_docids, | ||||
|             facet_id_f64_docids, | ||||
|             facet_id_string_docids, | ||||
| @@ -62,6 +63,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|         word_pair_proximity_docids.clear(self.wtxn)?; | ||||
|         word_prefix_pair_proximity_docids.clear(self.wtxn)?; | ||||
|         word_level_position_docids.clear(self.wtxn)?; | ||||
|         field_id_word_count_docids.clear(self.wtxn)?; | ||||
|         word_prefix_level_position_docids.clear(self.wtxn)?; | ||||
|         facet_id_f64_docids.clear(self.wtxn)?; | ||||
|         facet_id_string_docids.clear(self.wtxn)?; | ||||
| @@ -117,6 +119,7 @@ mod tests { | ||||
|         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.docid_word_positions.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); | ||||
|         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); | ||||
|   | ||||
| @@ -86,6 +86,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|             word_prefix_docids, | ||||
|             docid_word_positions, | ||||
|             word_pair_proximity_docids, | ||||
|             field_id_word_count_docids, | ||||
|             word_prefix_pair_proximity_docids, | ||||
|             word_level_position_docids, | ||||
|             word_prefix_level_position_docids, | ||||
| @@ -316,6 +317,20 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|  | ||||
|         drop(iter); | ||||
|  | ||||
|         // Remove the documents ids from the field id word count database. | ||||
|         let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?; | ||||
|         while let Some((key, mut docids)) = iter.next().transpose()? { | ||||
|             let previous_len = docids.len(); | ||||
|             docids.difference_with(&self.documents_ids); | ||||
|             if docids.is_empty() { | ||||
|                 iter.del_current()?; | ||||
|             } else if docids.len() != previous_len { | ||||
|                 iter.put_current(&key, &docids)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         drop(iter); | ||||
|  | ||||
|         // We delete the documents ids that are under the facet field id values. | ||||
|         remove_docids_from_facet_field_id_value_docids( | ||||
|             self.wtxn, | ||||
|   | ||||
| @@ -60,6 +60,10 @@ pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> an | ||||
|     cbo_roaring_bitmap_merge(values) | ||||
| } | ||||
|  | ||||
| /// Merge function for the entries of the field id word count docids database. | ||||
| /// | ||||
| /// The key is ignored; the values are handed over to `cbo_roaring_bitmap_merge`. | ||||
| pub fn field_id_word_count_docids_merge( | ||||
|     _key: &[u8], | ||||
|     values: &[Cow<[u8]>], | ||||
| ) -> anyhow::Result<Vec<u8>> { | ||||
|     cbo_roaring_bitmap_merge(values) | ||||
| } | ||||
|  | ||||
| pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { | ||||
|     cbo_roaring_bitmap_merge(values) | ||||
| } | ||||
|   | ||||
| @@ -29,6 +29,7 @@ pub use self::merge_function::{ | ||||
|     docid_word_positions_merge, documents_merge, | ||||
|     word_level_position_docids_merge, word_prefix_level_positions_docids_merge, | ||||
|     facet_field_value_docids_merge, field_id_docid_facet_values_merge, | ||||
|     field_id_word_count_docids_merge, | ||||
| }; | ||||
| pub use self::transform::{Transform, TransformOutput}; | ||||
|  | ||||
| @@ -412,6 +413,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             Main, | ||||
|             WordDocids, | ||||
|             WordLevel0PositionDocids, | ||||
|             FieldIdWordCountDocids, | ||||
|             FacetLevel0NumbersDocids, | ||||
|         } | ||||
|  | ||||
| @@ -476,6 +478,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut field_id_word_count_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut facet_field_numbers_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut facet_field_strings_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut field_id_docid_facet_numbers_readers = Vec::with_capacity(readers.len()); | ||||
| @@ -488,6 +491,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                     docid_word_positions, | ||||
|                     words_pairs_proximities_docids, | ||||
|                     word_level_position_docids, | ||||
|                     field_id_word_count_docids, | ||||
|                     facet_field_numbers_docids, | ||||
|                     facet_field_strings_docids, | ||||
|                     field_id_docid_facet_numbers, | ||||
| @@ -499,6 +503,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                 docid_word_positions_readers.push(docid_word_positions); | ||||
|                 words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); | ||||
|                 word_level_position_docids_readers.push(word_level_position_docids); | ||||
|                 field_id_word_count_docids_readers.push(field_id_word_count_docids); | ||||
|                 facet_field_numbers_docids_readers.push(facet_field_numbers_docids); | ||||
|                 facet_field_strings_docids_readers.push(facet_field_strings_docids); | ||||
|                 field_id_docid_facet_numbers_readers.push(field_id_docid_facet_numbers); | ||||
| @@ -536,6 +541,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                         word_level_position_docids_readers, | ||||
|                         word_level_position_docids_merge, | ||||
|                     ), | ||||
|                     ( | ||||
|                         DatabaseType::FieldIdWordCountDocids, | ||||
|                         field_id_word_count_docids_readers, | ||||
|                         field_id_word_count_docids_merge, | ||||
|                     ), | ||||
|                 ] | ||||
|                 .into_par_iter() | ||||
|                 .for_each(|(dbtype, readers, merge)| { | ||||
| @@ -595,7 +605,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         self.index.put_documents_ids(self.wtxn, &documents_ids)?; | ||||
|  | ||||
|         let mut database_count = 0; | ||||
|         let total_databases = 10; | ||||
|         let total_databases = 11; | ||||
|  | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: 0, | ||||
| @@ -727,6 +737,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                         write_method, | ||||
|                     )?; | ||||
|                 }, | ||||
|                 DatabaseType::FieldIdWordCountDocids => { | ||||
|                     debug!("Writing the field id word count docids into LMDB on disk..."); | ||||
|                     let db = *self.index.field_id_word_count_docids.as_polymorph(); | ||||
|                     write_into_lmdb_database( | ||||
|                         self.wtxn, | ||||
|                         db, | ||||
|                         content, | ||||
|                         field_id_word_count_docids_merge, | ||||
|                         write_method, | ||||
|                     )?; | ||||
|                 }, | ||||
|                 DatabaseType::WordLevel0PositionDocids => { | ||||
|                     debug!("Writing the word level 0 positions docids into LMDB on disk..."); | ||||
|                     let db = *self.index.word_level_position_docids.as_polymorph(); | ||||
|   | ||||
| @@ -29,7 +29,7 @@ use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; | ||||
| use super::merge_function::{ | ||||
|     main_merge, word_docids_merge, words_pairs_proximities_docids_merge, | ||||
|     word_level_position_docids_merge, facet_field_value_docids_merge, | ||||
|     field_id_docid_facet_values_merge, | ||||
|     field_id_docid_facet_values_merge, field_id_word_count_docids_merge, | ||||
| }; | ||||
|  | ||||
| const LMDB_MAX_KEY_LENGTH: usize = 511; | ||||
| @@ -44,6 +44,7 @@ pub struct Readers { | ||||
|     pub docid_word_positions: Reader<FileFuse>, | ||||
|     pub words_pairs_proximities_docids: Reader<FileFuse>, | ||||
|     pub word_level_position_docids: Reader<FileFuse>, | ||||
|     pub field_id_word_count_docids: Reader<FileFuse>, | ||||
|     pub facet_field_numbers_docids: Reader<FileFuse>, | ||||
|     pub facet_field_strings_docids: Reader<FileFuse>, | ||||
|     pub field_id_docid_facet_numbers: Reader<FileFuse>, | ||||
| @@ -58,6 +59,7 @@ pub struct Store<'s, A> { | ||||
|     // Caches | ||||
|     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>, | ||||
|     word_docids_limit: usize, | ||||
|     field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>, | ||||
|     words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>, | ||||
|     words_pairs_proximities_docids_limit: usize, | ||||
|     facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>, | ||||
| @@ -72,6 +74,7 @@ pub struct Store<'s, A> { | ||||
|     word_docids_sorter: Sorter<MergeFn>, | ||||
|     words_pairs_proximities_docids_sorter: Sorter<MergeFn>, | ||||
|     word_level_position_docids_sorter: Sorter<MergeFn>, | ||||
|     field_id_word_count_docids_sorter: Sorter<MergeFn>, | ||||
|     facet_field_numbers_docids_sorter: Sorter<MergeFn>, | ||||
|     facet_field_strings_docids_sorter: Sorter<MergeFn>, | ||||
|     field_id_docid_facet_numbers_sorter: Sorter<MergeFn>, | ||||
| @@ -132,6 +135,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let field_id_word_count_docids_sorter = create_sorter( | ||||
|             field_id_word_count_docids_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let facet_field_numbers_docids_sorter = create_sorter( | ||||
|             facet_field_value_docids_merge, | ||||
|             chunk_compression_type, | ||||
| @@ -184,6 +195,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             faceted_fields, | ||||
|             // Caches | ||||
|             word_docids: LinkedHashMap::with_capacity(linked_hash_map_size), | ||||
|             field_id_word_count_docids: HashMap::new(), | ||||
|             word_docids_limit: linked_hash_map_size, | ||||
|             words_pairs_proximities_docids: LinkedHashMap::with_capacity(linked_hash_map_size), | ||||
|             words_pairs_proximities_docids_limit: linked_hash_map_size, | ||||
| @@ -199,6 +211,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             word_docids_sorter, | ||||
|             words_pairs_proximities_docids_sorter, | ||||
|             word_level_position_docids_sorter, | ||||
|             field_id_word_count_docids_sorter, | ||||
|             facet_field_numbers_docids_sorter, | ||||
|             facet_field_strings_docids_sorter, | ||||
|             field_id_docid_facet_numbers_sorter, | ||||
| @@ -620,10 +633,17 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|                             let analyzed = self.analyzer.analyze(&content); | ||||
|                             let tokens = process_tokens(analyzed.tokens()); | ||||
|  | ||||
|                             let mut last_pos = None; | ||||
|                             for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) { | ||||
|                                 last_pos = Some(pos); | ||||
|                                 let position = (attr as usize * MAX_POSITION + pos) as u32; | ||||
|                                 words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position); | ||||
|                             } | ||||
|  | ||||
|                             if let Some(last_pos) = last_pos.filter(|p| *p <= 10) { | ||||
|                                 let key = (attr, last_pos as u8 + 1); | ||||
|                                 self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id); | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
| @@ -683,6 +703,13 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             word_docids_wtr.insert(word, val)?; | ||||
|         } | ||||
|  | ||||
|         let mut docids_buffer = Vec::new(); | ||||
|         for ((fid, count), docids) in self.field_id_word_count_docids { | ||||
|             docids_buffer.clear(); | ||||
|             CboRoaringBitmapCodec::serialize_into(&docids, &mut docids_buffer)?; | ||||
|             self.field_id_word_count_docids_sorter.insert([fid, count], &docids_buffer)?; | ||||
|         } | ||||
|  | ||||
|         let fst = builder.into_set(); | ||||
|         self.main_sorter.insert(WORDS_FST_KEY, fst.as_fst().as_bytes())?; | ||||
|  | ||||
| @@ -695,6 +722,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; | ||||
|  | ||||
|         let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?; | ||||
|  | ||||
|         let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?; | ||||
|  | ||||
| @@ -711,6 +741,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; | ||||
|         let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; | ||||
|         let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; | ||||
|         let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?; | ||||
|         let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?; | ||||
|         let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?; | ||||
|         let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?; | ||||
| @@ -724,6 +755,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             docid_word_positions, | ||||
|             words_pairs_proximities_docids, | ||||
|             word_level_position_docids, | ||||
|             field_id_word_count_docids, | ||||
|             facet_field_numbers_docids, | ||||
|             facet_field_strings_docids, | ||||
|             field_id_docid_facet_numbers, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user