mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	Merge pull request #122 from meilisearch/attribute-criterion
Introduce the Attribute criterion
This commit is contained in:
		
							
								
								
									
										11
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										11
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -122,6 +122,12 @@ version = "0.13.0" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "904dfeac50f3cdaba28fc6f57fdcddb75f49ed61346676a78c4ffe55877802fd" | ||||
|  | ||||
| [[package]] | ||||
| name = "big_s" | ||||
| version = "1.0.2" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "199edb7b90631283b10c2422e6a0bc8b7d987bf732995ba1de53b576c97e51a8" | ||||
|  | ||||
| [[package]] | ||||
| name = "bincode" | ||||
| version = "1.3.1" | ||||
| @@ -1251,6 +1257,7 @@ name = "milli" | ||||
| version = "0.1.1" | ||||
| dependencies = [ | ||||
|  "anyhow", | ||||
|  "big_s", | ||||
|  "bstr", | ||||
|  "byteorder", | ||||
|  "chrono", | ||||
| @@ -1957,9 +1964,9 @@ checksum = "53552c6c49e1e13f1a203ef0080ab3bbef0beb570a528993e83df057a9d9bba1" | ||||
|  | ||||
| [[package]] | ||||
| name = "roaring" | ||||
| version = "0.6.5" | ||||
| version = "0.6.6" | ||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | ||||
| checksum = "c6744a4a918e91359ad1d356a91e2e943a86d9fb9ae77f715d617032ea2af88f" | ||||
| checksum = "a4b2e7ab0bbb2d144558ae3f4761a0db06d21463b45756fc64c3393cdba3d447" | ||||
| dependencies = [ | ||||
|  "bytemuck", | ||||
|  "byteorder", | ||||
|   | ||||
| @@ -3,7 +3,7 @@ use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; | ||||
| use std::fmt::Display; | ||||
| use std::fs::{create_dir_all, File}; | ||||
| use std::net::SocketAddr; | ||||
| use std::num::NonZeroUsize; | ||||
| use std::num::{NonZeroU32, NonZeroUsize}; | ||||
| use std::path::PathBuf; | ||||
| use std::str::FromStr; | ||||
| use std::sync::Arc; | ||||
| @@ -228,7 +228,6 @@ enum UpdateMeta { | ||||
|     ClearDocuments, | ||||
|     Settings(Settings), | ||||
|     Facets(Facets), | ||||
|     WordsPrefixes(WordsPrefixes), | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Serialize, Deserialize)] | ||||
| @@ -281,6 +280,14 @@ struct WordsPrefixes { | ||||
|     max_prefix_length: Option<usize>, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Serialize, Deserialize)] | ||||
| #[serde(deny_unknown_fields)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| struct WordsLevelPositions { | ||||
|     level_group_size: Option<NonZeroU32>, | ||||
|     min_level_size: Option<NonZeroU32>, | ||||
| } | ||||
|  | ||||
| #[tokio::main] | ||||
| async fn main() -> anyhow::Result<()> { | ||||
|     let opt = Opt::from_args(); | ||||
| @@ -479,21 +486,6 @@ async fn main() -> anyhow::Result<()> { | ||||
|                         Err(e) => Err(e) | ||||
|                     } | ||||
|                 } | ||||
|                 UpdateMeta::WordsPrefixes(settings) => { | ||||
|                     // We must use the write transaction of the update here. | ||||
|                     let mut wtxn = index_cloned.write_txn()?; | ||||
|                     let mut builder = update_builder.words_prefixes(&mut wtxn, &index_cloned); | ||||
|                     if let Some(value) = settings.threshold { | ||||
|                         builder.threshold(value); | ||||
|                     } | ||||
|                     if let Some(value) = settings.max_prefix_length { | ||||
|                         builder.max_prefix_length(value); | ||||
|                     } | ||||
|                     match builder.execute() { | ||||
|                         Ok(()) => wtxn.commit().map_err(Into::into), | ||||
|                         Err(e) => Err(e) | ||||
|                     } | ||||
|                 } | ||||
|             }; | ||||
|  | ||||
|             let meta = match result { | ||||
| @@ -910,19 +902,6 @@ async fn main() -> anyhow::Result<()> { | ||||
|             warp::reply() | ||||
|         }); | ||||
|  | ||||
|     let update_store_cloned = update_store.clone(); | ||||
|     let update_status_sender_cloned = update_status_sender.clone(); | ||||
|     let change_words_prefixes_route = warp::filters::method::post() | ||||
|         .and(warp::path!("words-prefixes")) | ||||
|         .and(warp::body::json()) | ||||
|         .map(move |settings: WordsPrefixes| { | ||||
|             let meta = UpdateMeta::WordsPrefixes(settings); | ||||
|             let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); | ||||
|             let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); | ||||
|             eprintln!("update {} registered", update_id); | ||||
|             warp::reply() | ||||
|         }); | ||||
|  | ||||
|     let update_store_cloned = update_store.clone(); | ||||
|     let update_status_sender_cloned = update_status_sender.clone(); | ||||
|     let abort_update_id_route = warp::filters::method::delete() | ||||
| @@ -997,7 +976,6 @@ async fn main() -> anyhow::Result<()> { | ||||
|         .or(clearing_route) | ||||
|         .or(change_settings_route) | ||||
|         .or(change_facet_levels_route) | ||||
|         .or(change_words_prefixes_route) | ||||
|         .or(update_ws_route); | ||||
|  | ||||
|     let addr = SocketAddr::from_str(&opt.http_listen_addr)?; | ||||
|   | ||||
| @@ -11,7 +11,7 @@ csv = "1.1.5" | ||||
| heed = "0.10.6" | ||||
| jemallocator = "0.3.2" | ||||
| milli = { path = "../milli" } | ||||
| roaring = "0.6.5" | ||||
| roaring = "0.6.6" | ||||
| serde_json = "1.0.62" | ||||
| stderrlog = "0.5.1" | ||||
| structopt = { version = "0.3.21", default-features = false } | ||||
|   | ||||
| @@ -5,7 +5,7 @@ use std::{str, io, fmt}; | ||||
| use anyhow::Context; | ||||
| use byte_unit::Byte; | ||||
| use heed::EnvOpenOptions; | ||||
| use milli::Index; | ||||
| use milli::{Index, TreeLevel}; | ||||
| use structopt::StructOpt; | ||||
|  | ||||
| use Command::*; | ||||
| @@ -19,9 +19,11 @@ const WORD_DOCIDS_DB_NAME: &str = "word-docids"; | ||||
| const WORD_PREFIX_DOCIDS_DB_NAME: &str = "word-prefix-docids"; | ||||
| const DOCID_WORD_POSITIONS_DB_NAME: &str = "docid-word-positions"; | ||||
| const WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-pair-proximity-docids"; | ||||
| const FACET_FIELD_ID_VALUE_DOCIDS_NAME: &str = "facet-field-id-value-docids"; | ||||
| const FIELD_ID_DOCID_FACET_VALUES_NAME: &str = "field-id-docid-facet-values"; | ||||
| const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME: &str = "word-prefix-pair-proximity-docids"; | ||||
| const WORD_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-level-position-docids"; | ||||
| const WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME: &str = "word-prefix-level-position-docids"; | ||||
| const FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME: &str = "facet-field-id-value-docids"; | ||||
| const FIELD_ID_DOCID_FACET_VALUES_DB_NAME: &str = "field-id-docid-facet-values"; | ||||
| const DOCUMENTS_DB_NAME: &str = "documents"; | ||||
|  | ||||
| const ALL_DATABASE_NAMES: &[&str] = &[ | ||||
| @@ -31,8 +33,10 @@ const ALL_DATABASE_NAMES: &[&str] = &[ | ||||
|     DOCID_WORD_POSITIONS_DB_NAME, | ||||
|     WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME, | ||||
|     WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME, | ||||
|     FACET_FIELD_ID_VALUE_DOCIDS_NAME, | ||||
|     FIELD_ID_DOCID_FACET_VALUES_NAME, | ||||
|     WORD_LEVEL_POSITION_DOCIDS_DB_NAME, | ||||
|     WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME, | ||||
|     FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME, | ||||
|     FIELD_ID_DOCID_FACET_VALUES_DB_NAME, | ||||
|     DOCUMENTS_DB_NAME, | ||||
| ]; | ||||
|  | ||||
| @@ -114,6 +118,27 @@ enum Command { | ||||
|         field_name: String, | ||||
|     }, | ||||
|  | ||||
|     /// Outputs a CSV with the documents ids along with the word level positions where it appears. | ||||
|     WordsLevelPositionsDocids { | ||||
|         /// Display the whole documents ids in details. | ||||
|         #[structopt(long)] | ||||
|         full_display: bool, | ||||
|  | ||||
|         /// Words appearing in the documents. | ||||
|         words: Vec<String>, | ||||
|     }, | ||||
|  | ||||
|     /// Outputs a CSV with the documents ids along with | ||||
|     /// the word prefix level positions where it appears. | ||||
|     WordPrefixesLevelPositionsDocids { | ||||
|         /// Display the whole documents ids in details. | ||||
|         #[structopt(long)] | ||||
|         full_display: bool, | ||||
|  | ||||
|         /// Prefixes of words appearing in the documents. | ||||
|         prefixes: Vec<String>, | ||||
|     }, | ||||
|  | ||||
|     /// Outputs a CSV with the documents ids, words and the positions where this word appears. | ||||
|     DocidsWordsPositions { | ||||
|         /// Display the whole positions in detail. | ||||
| @@ -221,6 +246,12 @@ fn main() -> anyhow::Result<()> { | ||||
|         FacetValuesDocids { full_display, field_name } => { | ||||
|             facet_values_docids(&index, &rtxn, !full_display, field_name) | ||||
|         }, | ||||
|         WordsLevelPositionsDocids { full_display, words } => { | ||||
|             words_level_positions_docids(&index, &rtxn, !full_display, words) | ||||
|         }, | ||||
|         WordPrefixesLevelPositionsDocids { full_display, prefixes } => { | ||||
|             word_prefixes_level_positions_docids(&index, &rtxn, !full_display, prefixes) | ||||
|         }, | ||||
|         DocidsWordsPositions { full_display, internal_documents_ids } => { | ||||
|             docids_words_positions(&index, &rtxn, !full_display, internal_documents_ids) | ||||
|         }, | ||||
| @@ -319,9 +350,11 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | ||||
|         docid_word_positions, | ||||
|         word_pair_proximity_docids, | ||||
|         word_prefix_pair_proximity_docids, | ||||
|         word_level_position_docids, | ||||
|         word_prefix_level_position_docids, | ||||
|         facet_field_id_value_docids, | ||||
|         field_id_docid_facet_values: _, | ||||
|         documents, | ||||
|         documents | ||||
|     } = index; | ||||
|  | ||||
|     let main_name = "main"; | ||||
| @@ -330,6 +363,8 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | ||||
|     let docid_word_positions_name = "docid_word_positions"; | ||||
|     let word_prefix_pair_proximity_docids_name = "word_prefix_pair_proximity_docids"; | ||||
|     let word_pair_proximity_docids_name = "word_pair_proximity_docids"; | ||||
|     let word_level_position_docids_name = "word_level_position_docids"; | ||||
|     let word_prefix_level_position_docids_name = "word_prefix_level_position_docids"; | ||||
|     let facet_field_id_value_docids_name = "facet_field_id_value_docids"; | ||||
|     let documents_name = "documents"; | ||||
|  | ||||
| @@ -386,6 +421,20 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|  | ||||
|         for result in word_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||
|             let ((word, level, left, right), value) = result?; | ||||
|             let key = format!("{} {} {:?}", word, level, left..=right); | ||||
|             heap.push(Reverse((value.len(), key, word_level_position_docids_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|  | ||||
|         for result in word_prefix_level_position_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||
|             let ((word, level, left, right), value) = result?; | ||||
|             let key = format!("{} {} {:?}", word, level, left..=right); | ||||
|             heap.push(Reverse((value.len(), key, word_prefix_level_position_docids_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
|         } | ||||
|  | ||||
|         let faceted_fields = index.faceted_fields_ids(rtxn)?; | ||||
|         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||
|         for (field_id, field_type) in faceted_fields { | ||||
| @@ -524,6 +573,84 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam | ||||
|     Ok(wtr.flush()?) | ||||
| } | ||||
|  | ||||
| fn words_level_positions_docids( | ||||
|     index: &Index, | ||||
|     rtxn: &heed::RoTxn, | ||||
|     debug: bool, | ||||
|     words: Vec<String>, | ||||
| ) -> anyhow::Result<()> | ||||
| { | ||||
|     let stdout = io::stdout(); | ||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||
|     wtr.write_record(&["word", "level", "positions", "documents_count", "documents_ids"])?; | ||||
|  | ||||
|     for word in words.iter().map(AsRef::as_ref) { | ||||
|         let range = { | ||||
|             let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); | ||||
|             let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); | ||||
|             left..=right | ||||
|         }; | ||||
|         for result in index.word_level_position_docids.range(rtxn, &range)? { | ||||
|             let ((w, level, left, right), docids) = result?; | ||||
|  | ||||
|             let count = docids.len().to_string(); | ||||
|             let docids = if debug { | ||||
|                 format!("{:?}", docids) | ||||
|             } else { | ||||
|                 format!("{:?}", docids.iter().collect::<Vec<_>>()) | ||||
|             }; | ||||
|             let position_range = if level == TreeLevel::min_value() { | ||||
|                 format!("{:?}", left) | ||||
|             } else { | ||||
|                 format!("{:?}", left..=right) | ||||
|             }; | ||||
|             let level = level.to_string(); | ||||
|             wtr.write_record(&[w, &level, &position_range, &count, &docids])?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(wtr.flush()?) | ||||
| } | ||||
|  | ||||
| fn word_prefixes_level_positions_docids( | ||||
|     index: &Index, | ||||
|     rtxn: &heed::RoTxn, | ||||
|     debug: bool, | ||||
|     prefixes: Vec<String>, | ||||
| ) -> anyhow::Result<()> | ||||
| { | ||||
|     let stdout = io::stdout(); | ||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||
|     wtr.write_record(&["prefix", "level", "positions", "documents_count", "documents_ids"])?; | ||||
|  | ||||
|     for word in prefixes.iter().map(AsRef::as_ref) { | ||||
|         let range = { | ||||
|             let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); | ||||
|             let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); | ||||
|             left..=right | ||||
|         }; | ||||
|         for result in index.word_prefix_level_position_docids.range(rtxn, &range)? { | ||||
|             let ((w, level, left, right), docids) = result?; | ||||
|  | ||||
|             let count = docids.len().to_string(); | ||||
|             let docids = if debug { | ||||
|                 format!("{:?}", docids) | ||||
|             } else { | ||||
|                 format!("{:?}", docids.iter().collect::<Vec<_>>()) | ||||
|             }; | ||||
|             let position_range = if level == TreeLevel::min_value() { | ||||
|                 format!("{:?}", left) | ||||
|             } else { | ||||
|                 format!("{:?}", left..=right) | ||||
|             }; | ||||
|             let level = level.to_string(); | ||||
|             wtr.write_record(&[w, &level, &position_range, &count, &docids])?; | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(wtr.flush()?) | ||||
| } | ||||
|  | ||||
| fn docids_words_positions( | ||||
|     index: &Index, | ||||
|     rtxn: &heed::RoTxn, | ||||
| @@ -715,6 +842,21 @@ fn average_number_of_positions_by_word(index: &Index, rtxn: &heed::RoTxn) -> any | ||||
| fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> anyhow::Result<()> { | ||||
|     use heed::types::ByteSlice; | ||||
|  | ||||
|     let Index { | ||||
|         env: _, | ||||
|         main, | ||||
|         word_docids, | ||||
|         word_prefix_docids, | ||||
|         docid_word_positions, | ||||
|         word_pair_proximity_docids, | ||||
|         word_prefix_pair_proximity_docids, | ||||
|         word_level_position_docids, | ||||
|         word_prefix_level_position_docids, | ||||
|         facet_field_id_value_docids, | ||||
|         field_id_docid_facet_values, | ||||
|         documents, | ||||
|     } = index; | ||||
|  | ||||
|     let names = if names.is_empty() { | ||||
|         ALL_DATABASE_NAMES.iter().map(|s| s.to_string()).collect() | ||||
|     } else { | ||||
| @@ -723,30 +865,35 @@ fn size_of_databases(index: &Index, rtxn: &heed::RoTxn, names: Vec<String>) -> a | ||||
|  | ||||
|     for name in names { | ||||
|         let database = match name.as_str() { | ||||
|             MAIN_DB_NAME => &index.main, | ||||
|             WORD_PREFIX_DOCIDS_DB_NAME => index.word_prefix_docids.as_polymorph(), | ||||
|             WORD_DOCIDS_DB_NAME => index.word_docids.as_polymorph(), | ||||
|             DOCID_WORD_POSITIONS_DB_NAME => index.docid_word_positions.as_polymorph(), | ||||
|             WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_pair_proximity_docids.as_polymorph(), | ||||
|             WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => index.word_prefix_pair_proximity_docids.as_polymorph(), | ||||
|             FACET_FIELD_ID_VALUE_DOCIDS_NAME => index.facet_field_id_value_docids.as_polymorph(), | ||||
|             FIELD_ID_DOCID_FACET_VALUES_NAME => index.field_id_docid_facet_values.as_polymorph(), | ||||
|             DOCUMENTS_DB_NAME => index.documents.as_polymorph(), | ||||
|             MAIN_DB_NAME => &main, | ||||
|             WORD_PREFIX_DOCIDS_DB_NAME => word_prefix_docids.as_polymorph(), | ||||
|             WORD_DOCIDS_DB_NAME => word_docids.as_polymorph(), | ||||
|             DOCID_WORD_POSITIONS_DB_NAME => docid_word_positions.as_polymorph(), | ||||
|             WORD_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_pair_proximity_docids.as_polymorph(), | ||||
|             WORD_PREFIX_PAIR_PROXIMITY_DOCIDS_DB_NAME => word_prefix_pair_proximity_docids.as_polymorph(), | ||||
|             WORD_LEVEL_POSITION_DOCIDS_DB_NAME => word_level_position_docids.as_polymorph(), | ||||
|             WORD_PREFIX_LEVEL_POSITION_DOCIDS_DB_NAME => word_prefix_level_position_docids.as_polymorph(), | ||||
|             FACET_FIELD_ID_VALUE_DOCIDS_DB_NAME => facet_field_id_value_docids.as_polymorph(), | ||||
|             FIELD_ID_DOCID_FACET_VALUES_DB_NAME => field_id_docid_facet_values.as_polymorph(), | ||||
|             DOCUMENTS_DB_NAME => documents.as_polymorph(), | ||||
|             unknown => anyhow::bail!("unknown database {:?}", unknown), | ||||
|         }; | ||||
|  | ||||
|         let mut key_size: u64 = 0; | ||||
|         let mut val_size: u64 = 0; | ||||
|         let mut number_entries: u64 = 0; | ||||
|         for result in database.iter::<_, ByteSlice, ByteSlice>(rtxn)? { | ||||
|             let (k, v) = result?; | ||||
|             key_size += k.len() as u64; | ||||
|             val_size += v.len() as u64; | ||||
|             number_entries += 1; | ||||
|         } | ||||
|  | ||||
|         println!("The {} database weigh:", name); | ||||
|         println!("\ttotal key size: {}", Byte::from(key_size).get_appropriate_unit(true)); | ||||
|         println!("\ttotal val size: {}", Byte::from(val_size).get_appropriate_unit(true)); | ||||
|         println!("\ttotal size: {}", Byte::from(key_size + val_size).get_appropriate_unit(true)); | ||||
|         println!("\tnumber of entries: {}", number_entries); | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
|   | ||||
| @@ -27,7 +27,7 @@ once_cell = "1.5.2" | ||||
| ordered-float = "2.1.1" | ||||
| rayon = "1.5.0" | ||||
| regex = "1.4.3" | ||||
| roaring = "0.6.5" | ||||
| roaring = "0.6.6" | ||||
| serde = { version = "1.0.123", features = ["derive"] } | ||||
| serde_json = { version = "1.0.62", features = ["preserve_order"] } | ||||
| slice-group-by = "0.2.6" | ||||
| @@ -52,13 +52,11 @@ logging_timer = "1.0.0" | ||||
| tinytemplate = "=1.1.0" | ||||
|  | ||||
| [dev-dependencies] | ||||
| big_s = "1.0.2" | ||||
| criterion = "0.3.4" | ||||
| maplit = "1.0.2" | ||||
| rand = "0.8.3" | ||||
|  | ||||
| [build-dependencies] | ||||
| fst = "0.4.5" | ||||
|  | ||||
| [features] | ||||
| default = [] | ||||
|  | ||||
|   | ||||
| @@ -2,6 +2,7 @@ mod beu32_str_codec; | ||||
| mod obkv_codec; | ||||
| mod roaring_bitmap; | ||||
| mod roaring_bitmap_length; | ||||
| mod str_level_position_codec; | ||||
| mod str_str_u8_codec; | ||||
| pub mod facet; | ||||
|  | ||||
| @@ -9,4 +10,5 @@ pub use self::beu32_str_codec::BEU32StrCodec; | ||||
| pub use self::obkv_codec::ObkvCodec; | ||||
| pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; | ||||
| pub use self::roaring_bitmap_length::{BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec}; | ||||
| pub use self::str_level_position_codec::StrLevelPositionCodec; | ||||
| pub use self::str_str_u8_codec::StrStrU8Codec; | ||||
|   | ||||
							
								
								
									
										45
									
								
								milli/src/heed_codec/str_level_position_codec.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								milli/src/heed_codec/str_level_position_codec.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,45 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::convert::{TryFrom, TryInto}; | ||||
| use std::mem::size_of; | ||||
| use std::str; | ||||
|  | ||||
| use crate::TreeLevel; | ||||
|  | ||||
| pub struct StrLevelPositionCodec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec { | ||||
|     type DItem = (&'a str, TreeLevel, u32, u32); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let footer_len = size_of::<u8>() + size_of::<u32>() * 2; | ||||
|  | ||||
|         if bytes.len() < footer_len { return None } | ||||
|  | ||||
|         let (word, bytes) = bytes.split_at(bytes.len() - footer_len); | ||||
|         let word = str::from_utf8(word).ok()?; | ||||
|  | ||||
|         let (level, bytes) = bytes.split_first()?; | ||||
|         let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?; | ||||
|         let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?; | ||||
|         let level = TreeLevel::try_from(*level).ok()?; | ||||
|  | ||||
|         Some((word, level, left, right)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec { | ||||
|     type EItem = (&'a str, TreeLevel, u32, u32); | ||||
|  | ||||
|     fn bytes_encode((word, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let left = left.to_be_bytes(); | ||||
|         let right = right.to_be_bytes(); | ||||
|  | ||||
|         let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len()); | ||||
|         bytes.extend_from_slice(word.as_bytes()); | ||||
|         bytes.push((*level).into()); | ||||
|         bytes.extend_from_slice(&left[..]); | ||||
|         bytes.extend_from_slice(&right[..]); | ||||
|  | ||||
|         Some(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
| @@ -12,7 +12,7 @@ use crate::{Criterion, default_criteria, FacetDistribution, FieldsDistribution, | ||||
| use crate::{BEU32, DocumentId, ExternalDocumentsIds, FieldId}; | ||||
| use crate::{ | ||||
|     BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, | ||||
|     ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrStrU8Codec, | ||||
|     ObkvCodec, RoaringBitmapCodec, RoaringBitmapLenCodec, StrLevelPositionCodec, StrStrU8Codec, | ||||
| }; | ||||
| use crate::facet::FacetType; | ||||
| use crate::fields_ids_map::FieldsIdsMap; | ||||
| @@ -52,6 +52,10 @@ pub struct Index { | ||||
|     pub word_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. | ||||
|     pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the word, level and position range with the docids that corresponds to it. | ||||
|     pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the level positions of a word prefix with all the docids where this prefix appears. | ||||
|     pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the facet field id and the globally ordered value with the docids that corresponds to it. | ||||
|     pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>, | ||||
|     /// Maps the document id, the facet field id and the globally ordered value. | ||||
| @@ -62,7 +66,7 @@ pub struct Index { | ||||
|  | ||||
| impl Index { | ||||
|     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> { | ||||
|         options.max_dbs(9); | ||||
|         options.max_dbs(11); | ||||
|  | ||||
|         let env = options.open(path)?; | ||||
|         let main = env.create_poly_database(Some("main"))?; | ||||
| @@ -71,6 +75,8 @@ impl Index { | ||||
|         let docid_word_positions = env.create_database(Some("docid-word-positions"))?; | ||||
|         let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?; | ||||
|         let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?; | ||||
|         let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?; | ||||
|         let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?; | ||||
|         let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?; | ||||
|         let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?; | ||||
|         let documents = env.create_database(Some("documents"))?; | ||||
| @@ -94,6 +100,8 @@ impl Index { | ||||
|             docid_word_positions, | ||||
|             word_pair_proximity_docids, | ||||
|             word_prefix_pair_proximity_docids, | ||||
|             word_level_position_docids, | ||||
|             word_prefix_level_position_docids, | ||||
|             facet_field_id_value_docids, | ||||
|             field_id_docid_facet_values, | ||||
|             documents, | ||||
|   | ||||
| @@ -9,6 +9,7 @@ pub mod facet; | ||||
| pub mod heed_codec; | ||||
| pub mod index; | ||||
| pub mod proximity; | ||||
| pub mod tree_level; | ||||
| pub mod update; | ||||
|  | ||||
| use std::borrow::Cow; | ||||
| @@ -22,11 +23,12 @@ use serde_json::{Map, Value}; | ||||
| pub use self::criterion::{Criterion, default_criteria}; | ||||
| pub use self::external_documents_ids::ExternalDocumentsIds; | ||||
| pub use self::fields_ids_map::FieldsIdsMap; | ||||
| pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; | ||||
| pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, StrLevelPositionCodec, ObkvCodec}; | ||||
| pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | ||||
| pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; | ||||
| pub use self::index::Index; | ||||
| pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords}; | ||||
| pub use self::tree_level::TreeLevel; | ||||
| pub use self::update_store::UpdateStore; | ||||
|  | ||||
| pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; | ||||
|   | ||||
| @@ -31,32 +31,10 @@ pub struct AscDesc<'t> { | ||||
|     candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>, | ||||
|     bucket_candidates: RoaringBitmap, | ||||
|     faceted_candidates: RoaringBitmap, | ||||
|     parent: Option<Box<dyn Criterion + 't>>, | ||||
|     parent: Box<dyn Criterion + 't>, | ||||
| } | ||||
|  | ||||
| impl<'t> AscDesc<'t> { | ||||
|     pub fn initial_asc( | ||||
|         index: &'t Index, | ||||
|         rtxn: &'t heed::RoTxn, | ||||
|         query_tree: Option<Operation>, | ||||
|         candidates: Option<RoaringBitmap>, | ||||
|         field_name: String, | ||||
|     ) -> anyhow::Result<Self> | ||||
|     { | ||||
|         Self::initial(index, rtxn, query_tree, candidates, field_name, true) | ||||
|     } | ||||
|  | ||||
|     pub fn initial_desc( | ||||
|         index: &'t Index, | ||||
|         rtxn: &'t heed::RoTxn, | ||||
|         query_tree: Option<Operation>, | ||||
|         candidates: Option<RoaringBitmap>, | ||||
|         field_name: String, | ||||
|     ) -> anyhow::Result<Self> | ||||
|     { | ||||
|         Self::initial(index, rtxn, query_tree, candidates, field_name, false) | ||||
|     } | ||||
|  | ||||
|     pub fn asc( | ||||
|         index: &'t Index, | ||||
|         rtxn: &'t heed::RoTxn, | ||||
| @@ -77,47 +55,6 @@ impl<'t> AscDesc<'t> { | ||||
|         Self::new(index, rtxn, parent, field_name, false) | ||||
|     } | ||||
|  | ||||
|     fn initial( | ||||
|         index: &'t Index, | ||||
|         rtxn: &'t heed::RoTxn, | ||||
|         query_tree: Option<Operation>, | ||||
|         candidates: Option<RoaringBitmap>, | ||||
|         field_name: String, | ||||
|         ascending: bool, | ||||
|     ) -> anyhow::Result<Self> | ||||
|     { | ||||
|         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||
|         let faceted_fields = index.faceted_fields(rtxn)?; | ||||
|         let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; | ||||
|  | ||||
|         let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; | ||||
|         let candidates = match &query_tree { | ||||
|             Some(qt) => { | ||||
|                 let context = CriteriaBuilder::new(rtxn, index)?; | ||||
|                 let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), &mut WordDerivationsCache::new())?; | ||||
|                 if let Some(candidates) = candidates { | ||||
|                     qt_candidates.intersect_with(&candidates); | ||||
|                 } | ||||
|                 qt_candidates | ||||
|             }, | ||||
|             None => candidates.unwrap_or(faceted_candidates.clone()), | ||||
|         }; | ||||
|  | ||||
|         Ok(AscDesc { | ||||
|             index, | ||||
|             rtxn, | ||||
|             field_name, | ||||
|             field_id, | ||||
|             facet_type, | ||||
|             ascending, | ||||
|             query_tree, | ||||
|             candidates: facet_ordered(index, rtxn, field_id, facet_type, ascending, candidates)?, | ||||
|             faceted_candidates, | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|             parent: None, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     fn new( | ||||
|         index: &'t Index, | ||||
|         rtxn: &'t heed::RoTxn, | ||||
| @@ -141,7 +78,7 @@ impl<'t> AscDesc<'t> { | ||||
|             candidates: Box::new(std::iter::empty()), | ||||
|             faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|             parent: Some(parent), | ||||
|             parent, | ||||
|         }) | ||||
|     } | ||||
| } | ||||
| @@ -156,64 +93,56 @@ impl<'t> Criterion for AscDesc<'t> { | ||||
|  | ||||
|             match self.candidates.next().transpose()? { | ||||
|                 None => { | ||||
|                     let query_tree = self.query_tree.take(); | ||||
|                     let bucket_candidates = take(&mut self.bucket_candidates); | ||||
|                     match self.parent.as_mut() { | ||||
|                         Some(parent) => { | ||||
|                             match parent.next(wdcache)? { | ||||
|                                 Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                                     self.query_tree = query_tree; | ||||
|                                     let candidates = match (&self.query_tree, candidates) { | ||||
|                                         (_, Some(mut candidates)) => { | ||||
|                                             candidates.intersect_with(&self.faceted_candidates); | ||||
|                                             candidates | ||||
|                                         }, | ||||
|                                         (Some(qt), None) => { | ||||
|                                             let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; | ||||
|                                             let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; | ||||
|                                             candidates.intersect_with(&self.faceted_candidates); | ||||
|                                             candidates | ||||
|                                         }, | ||||
|                                         (None, None) => take(&mut self.faceted_candidates), | ||||
|                                     }; | ||||
|                                     if bucket_candidates.is_empty() { | ||||
|                                         self.bucket_candidates.union_with(&candidates); | ||||
|                                     } else { | ||||
|                                         self.bucket_candidates.union_with(&bucket_candidates); | ||||
|                                     } | ||||
|                                     self.candidates = facet_ordered( | ||||
|                                         self.index, | ||||
|                                         self.rtxn, | ||||
|                                         self.field_id, | ||||
|                                         self.facet_type, | ||||
|                                         self.ascending, | ||||
|                                         candidates, | ||||
|                                     )?; | ||||
|                     match self.parent.next(wdcache)? { | ||||
|                         Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                             let candidates_is_some = candidates.is_some(); | ||||
|                             self.query_tree = query_tree; | ||||
|                             let candidates = match (&self.query_tree, candidates) { | ||||
|                                 (_, Some(mut candidates)) => { | ||||
|                                     candidates.intersect_with(&self.faceted_candidates); | ||||
|                                     candidates | ||||
|                                 }, | ||||
|                                 None => return Ok(None), | ||||
|                             } | ||||
|                         }, | ||||
|                         None => if query_tree.is_none() && bucket_candidates.is_empty() { | ||||
|                             return Ok(None) | ||||
|                         }, | ||||
|                     } | ||||
|                                 (Some(qt), None) => { | ||||
|                                     let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; | ||||
|                                     let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; | ||||
|                                     candidates.intersect_with(&self.faceted_candidates); | ||||
|                                     candidates | ||||
|                                 }, | ||||
|                                 (None, None) => take(&mut self.faceted_candidates), | ||||
|                             }; | ||||
|  | ||||
|                     return Ok(Some(CriterionResult { | ||||
|                         query_tree, | ||||
|                         candidates: Some(RoaringBitmap::new()), | ||||
|                         bucket_candidates, | ||||
|                     })); | ||||
|                             // If our parent returns candidates it means that the bucket | ||||
|                             // candidates were already computed before and we can use them. | ||||
|                             // | ||||
|                             // If not, we must use the just computed candidates as our bucket | ||||
|                             // candidates. | ||||
|                             if candidates_is_some { | ||||
|                                 self.bucket_candidates.union_with(&bucket_candidates); | ||||
|                             } else { | ||||
|                                 self.bucket_candidates.union_with(&candidates); | ||||
|                             } | ||||
|  | ||||
|                             if candidates.is_empty() { | ||||
|                                 continue; | ||||
|                             } | ||||
|  | ||||
|                             self.candidates = facet_ordered( | ||||
|                                 self.index, | ||||
|                                 self.rtxn, | ||||
|                                 self.field_id, | ||||
|                                 self.facet_type, | ||||
|                                 self.ascending, | ||||
|                                 candidates, | ||||
|                             )?; | ||||
|                         }, | ||||
|                         None => return Ok(None), | ||||
|                     } | ||||
|                 }, | ||||
|                 Some(candidates) => { | ||||
|                     let bucket_candidates = match self.parent { | ||||
|                         Some(_) => take(&mut self.bucket_candidates), | ||||
|                         None => candidates.clone(), | ||||
|                     }; | ||||
|  | ||||
|                     return Ok(Some(CriterionResult { | ||||
|                         query_tree: self.query_tree.clone(), | ||||
|                         candidates: Some(candidates), | ||||
|                         bucket_candidates, | ||||
|                         bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                     })); | ||||
|                 }, | ||||
|             } | ||||
|   | ||||
							
								
								
									
										737
									
								
								milli/src/search/criteria/attribute.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										737
									
								
								milli/src/search/criteria/attribute.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,737 @@ | ||||
| use std::{borrow::Cow, cmp::{self, Ordering}, collections::BinaryHeap}; | ||||
| use std::collections::{BTreeMap, HashMap, btree_map}; | ||||
| use std::collections::binary_heap::PeekMut; | ||||
| use std::mem::take; | ||||
|  | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::{TreeLevel, search::build_dfa}; | ||||
| use crate::search::criteria::Query; | ||||
| use crate::search::query_tree::{Operation, QueryKind}; | ||||
| use crate::search::{word_derivations, WordDerivationsCache}; | ||||
| use super::{Criterion, CriterionResult, Context, resolve_query_tree}; | ||||
|  | ||||
| /// To be able to divide integers by the number of words in the query | ||||
| /// we want to find a multiplier that allow us to divide by any number between 1 and 10. | ||||
| /// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). | ||||
| const LCM_10_FIRST_NUMBERS: u32 = 2520; | ||||
|  | ||||
| /// To compute the interval size of a level, | ||||
| /// we use 4 as the exponentiation base and the level as the exponent. | ||||
| const LEVEL_EXPONENTIATION_BASE: u32 = 4; | ||||
|  | ||||
| pub struct Attribute<'t> { | ||||
|     ctx: &'t dyn Context<'t>, | ||||
|     query_tree: Option<Operation>, | ||||
|     candidates: Option<RoaringBitmap>, | ||||
|     bucket_candidates: RoaringBitmap, | ||||
|     parent: Box<dyn Criterion + 't>, | ||||
|     flattened_query_tree: Option<Vec<Vec<Vec<Query>>>>, | ||||
|     current_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>, | ||||
| } | ||||
|  | ||||
| impl<'t> Attribute<'t> { | ||||
|     pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self { | ||||
|         Attribute { | ||||
|             ctx, | ||||
|             query_tree: None, | ||||
|             candidates: None, | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|             parent, | ||||
|             flattened_query_tree: None, | ||||
|             current_buckets: None, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'t> Criterion for Attribute<'t> { | ||||
|     #[logging_timer::time("Attribute::{}")] | ||||
|     fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> { | ||||
|         loop { | ||||
|             match (&self.query_tree, &mut self.candidates) { | ||||
|                 (_, Some(candidates)) if candidates.is_empty() => { | ||||
|                     return Ok(Some(CriterionResult { | ||||
|                         query_tree: self.query_tree.take(), | ||||
|                         candidates: self.candidates.take(), | ||||
|                         bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                     })); | ||||
|                 }, | ||||
|                 (Some(qt), Some(candidates)) => { | ||||
|                     let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| { | ||||
|                         flatten_query_tree(&qt) | ||||
|                     }); | ||||
|  | ||||
|                     let found_candidates = if candidates.len() < 1000 { | ||||
|                         let current_buckets = match self.current_buckets.as_mut() { | ||||
|                             Some(current_buckets) => current_buckets, | ||||
|                             None => { | ||||
|                                 let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; | ||||
|                                 self.current_buckets.get_or_insert(new_buckets.into_iter()) | ||||
|                             }, | ||||
|                         }; | ||||
|  | ||||
|                         match current_buckets.next() { | ||||
|                             Some((_score, candidates)) => candidates, | ||||
|                             None => { | ||||
|                                 return Ok(Some(CriterionResult { | ||||
|                                     query_tree: self.query_tree.take(), | ||||
|                                     candidates: self.candidates.take(), | ||||
|                                     bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                                 })); | ||||
|                             }, | ||||
|                         } | ||||
|                     } else { | ||||
|                         match set_compute_candidates(self.ctx, flattened_query_tree, candidates, wdcache)? { | ||||
|                             Some(candidates) => candidates, | ||||
|                             None => { | ||||
|                                 return Ok(Some(CriterionResult { | ||||
|                                     query_tree: self.query_tree.take(), | ||||
|                                     candidates: self.candidates.take(), | ||||
|                                     bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                                 })); | ||||
|                             }, | ||||
|                         } | ||||
|                     }; | ||||
|  | ||||
|                     candidates.difference_with(&found_candidates); | ||||
|  | ||||
|                     return Ok(Some(CriterionResult { | ||||
|                         query_tree: self.query_tree.clone(), | ||||
|                         candidates: Some(found_candidates), | ||||
|                         bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                     })); | ||||
|                 }, | ||||
|                 (Some(qt), None) => { | ||||
|                     let query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; | ||||
|                     self.bucket_candidates |= &query_tree_candidates; | ||||
|                     self.candidates = Some(query_tree_candidates); | ||||
|                 }, | ||||
|                 (None, Some(_)) => { | ||||
|                     return Ok(Some(CriterionResult { | ||||
|                         query_tree: self.query_tree.take(), | ||||
|                         candidates: self.candidates.take(), | ||||
|                         bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                     })); | ||||
|                 }, | ||||
|                 (None, None) => { | ||||
|                     match self.parent.next(wdcache)? { | ||||
|                         Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { | ||||
|                             return Ok(Some(CriterionResult { | ||||
|                                 query_tree: None, | ||||
|                                 candidates: None, | ||||
|                                 bucket_candidates, | ||||
|                             })); | ||||
|                         }, | ||||
|                         Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                             self.query_tree = query_tree; | ||||
|                             self.candidates = candidates; | ||||
|                             self.bucket_candidates |= bucket_candidates; | ||||
|                             self.flattened_query_tree = None; | ||||
|                             self.current_buckets = None; | ||||
|                         }, | ||||
|                         None => return Ok(None), | ||||
|                     } | ||||
|                 }, | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// WordLevelIterator is an pseudo-Iterator over intervals of word-position for one word, | ||||
| /// it will begin at the first non-empty interval and will return every interval without | ||||
| /// jumping over empty intervals. | ||||
| struct WordLevelIterator<'t, 'q> { | ||||
|     inner: Box<dyn Iterator<Item =heed::Result<((&'t str, TreeLevel, u32, u32), RoaringBitmap)>> + 't>, | ||||
|     level: TreeLevel, | ||||
|     interval_size: u32, | ||||
|     word: Cow<'q, str>, | ||||
|     in_prefix_cache: bool, | ||||
|     inner_next: Option<(u32, u32, RoaringBitmap)>, | ||||
|     current_interval: Option<(u32, u32)>, | ||||
| } | ||||
|  | ||||
| impl<'t, 'q> WordLevelIterator<'t, 'q> { | ||||
|     fn new(ctx: &'t dyn Context<'t>, word: Cow<'q, str>, in_prefix_cache: bool) -> heed::Result<Option<Self>> { | ||||
|         match ctx.word_position_last_level(&word, in_prefix_cache)? { | ||||
|             Some(level) =>  { | ||||
|                 let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32); | ||||
|                 let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, None, None)?; | ||||
|                 Ok(Some(Self { inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None })) | ||||
|             }, | ||||
|             None => Ok(None), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn dig(&self, ctx: &'t dyn Context<'t>, level: &TreeLevel, left_interval: Option<u32>) -> heed::Result<Self> { | ||||
|         let level = level.min(&self.level).clone(); | ||||
|         let interval_size = LEVEL_EXPONENTIATION_BASE.pow(Into::<u8>::into(level.clone()) as u32); | ||||
|         let word = self.word.clone(); | ||||
|         let in_prefix_cache = self.in_prefix_cache; | ||||
|         let inner = ctx.word_position_iterator(&word, level, in_prefix_cache, left_interval, None)?; | ||||
|  | ||||
|         Ok(Self {inner, level, interval_size, word, in_prefix_cache, inner_next: None, current_interval: None}) | ||||
|     } | ||||
|  | ||||
|     fn next(&mut self) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { | ||||
|         fn is_next_interval(last_right: u32, next_left: u32) -> bool { last_right + 1 == next_left } | ||||
|  | ||||
|         let inner_next = match self.inner_next.take() { | ||||
|             Some(inner_next) => Some(inner_next), | ||||
|             None => self.inner.next().transpose()?.map(|((_, _, left, right), docids)| (left, right, docids)), | ||||
|         }; | ||||
|  | ||||
|         match inner_next { | ||||
|             Some((left, right, docids)) => { | ||||
|                 match self.current_interval { | ||||
|                     Some((last_left, last_right)) if !is_next_interval(last_right, left) => { | ||||
|                         let blank_left = last_left + self.interval_size; | ||||
|                         let blank_right = last_right + self.interval_size; | ||||
|                         self.current_interval = Some((blank_left, blank_right)); | ||||
|                         self.inner_next = Some((left, right, docids)); | ||||
|                         Ok(Some((blank_left, blank_right, RoaringBitmap::new()))) | ||||
|                     }, | ||||
|                     _ => { | ||||
|                         self.current_interval = Some((left, right)); | ||||
|                         Ok(Some((left, right, docids))) | ||||
|                     } | ||||
|                 } | ||||
|             }, | ||||
|             None => Ok(None), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// QueryLevelIterator is an pseudo-Iterator for a Query, | ||||
| /// It contains WordLevelIterators and is chainned with other QueryLevelIterator. | ||||
| struct QueryLevelIterator<'t, 'q> { | ||||
|     parent: Option<Box<QueryLevelIterator<'t, 'q>>>, | ||||
|     inner: Vec<WordLevelIterator<'t, 'q>>, | ||||
|     level: TreeLevel, | ||||
|     accumulator: Vec<Option<(u32, u32, RoaringBitmap)>>, | ||||
|     parent_accumulator: Vec<Option<(u32, u32, RoaringBitmap)>>, | ||||
|     interval_to_skip: usize, | ||||
| } | ||||
|  | ||||
| impl<'t, 'q> QueryLevelIterator<'t, 'q> { | ||||
|     fn new(ctx: &'t dyn Context<'t>, queries: &'q Vec<Query>, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<Self>> { | ||||
|         let mut inner = Vec::with_capacity(queries.len()); | ||||
|         for query in queries { | ||||
|             match &query.kind { | ||||
|                 QueryKind::Exact { word, .. } => { | ||||
|                     if !query.prefix || ctx.in_prefix_cache(&word) { | ||||
|                         let word = Cow::Borrowed(query.kind.word()); | ||||
|                         if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, query.prefix)? { | ||||
|                             inner.push(word_level_iterator); | ||||
|                         } | ||||
|                     } else { | ||||
|                         for (word, _) in word_derivations(&word, true, 0, ctx.words_fst(), wdcache)? { | ||||
|                             let word = Cow::Owned(word.to_owned()); | ||||
|                             if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { | ||||
|                                 inner.push(word_level_iterator); | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                 }, | ||||
|                 QueryKind::Tolerant { typo, word } => { | ||||
|                     for (word, _) in word_derivations(&word, query.prefix, *typo, ctx.words_fst(), wdcache)? { | ||||
|                         let word = Cow::Owned(word.to_owned()); | ||||
|                         if let Some(word_level_iterator) = WordLevelIterator::new(ctx, word, false)? { | ||||
|                             inner.push(word_level_iterator); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         let highest = inner.iter().max_by_key(|wli| wli.level).map(|wli| wli.level.clone()); | ||||
|         match highest { | ||||
|             Some(level) => Ok(Some(Self { | ||||
|                 parent: None, | ||||
|                 inner, | ||||
|                 level, | ||||
|                 accumulator: vec![], | ||||
|                 parent_accumulator: vec![], | ||||
|                 interval_to_skip: 0, | ||||
|             })), | ||||
|             None => Ok(None), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn parent(&mut self, parent: QueryLevelIterator<'t, 'q>) -> &Self { | ||||
|         self.parent = Some(Box::new(parent)); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     /// create a new QueryLevelIterator with a lower level than the current one. | ||||
|     fn dig(&self, ctx: &'t dyn Context<'t>) -> heed::Result<Self> { | ||||
|         let (level, parent) = match &self.parent { | ||||
|             Some(parent) => { | ||||
|                 let parent = parent.dig(ctx)?; | ||||
|                 (parent.level.min(self.level), Some(Box::new(parent))) | ||||
|             }, | ||||
|             None => (self.level.saturating_sub(1), None), | ||||
|         }; | ||||
|  | ||||
|         let left_interval = self.accumulator.get(self.interval_to_skip).map(|opt| opt.as_ref().map(|(left, _, _)| *left)).flatten(); | ||||
|         let mut inner = Vec::with_capacity(self.inner.len()); | ||||
|         for word_level_iterator in self.inner.iter() { | ||||
|             inner.push(word_level_iterator.dig(ctx, &level, left_interval)?); | ||||
|         } | ||||
|  | ||||
|         Ok(Self {parent, inner, level, accumulator: vec![], parent_accumulator: vec![], interval_to_skip: 0}) | ||||
|     } | ||||
|  | ||||
|     fn inner_next(&mut self, level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { | ||||
|         let mut accumulated: Option<(u32, u32, RoaringBitmap)> = None; | ||||
|         let u8_level = Into::<u8>::into(level); | ||||
|         let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); | ||||
|         for wli in self.inner.iter_mut() { | ||||
|             let wli_u8_level = Into::<u8>::into(wli.level.clone()); | ||||
|             let accumulated_count = LEVEL_EXPONENTIATION_BASE.pow((u8_level - wli_u8_level) as u32); | ||||
|             for _ in 0..accumulated_count { | ||||
|                 if let Some((next_left, _, next_docids)) =  wli.next()? { | ||||
|                     accumulated = match accumulated.take(){ | ||||
|                         Some((acc_left, acc_right, mut acc_docids)) => { | ||||
|                             acc_docids |= next_docids; | ||||
|                             Some((acc_left, acc_right, acc_docids)) | ||||
|                         }, | ||||
|                         None => Some((next_left, next_left + interval_size, next_docids)), | ||||
|                     }; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(accumulated) | ||||
|     } | ||||
|  | ||||
|     /// return the next meta-interval created from inner WordLevelIterators, | ||||
|     /// and from eventual chainned QueryLevelIterator. | ||||
|     fn next(&mut self, allowed_candidates: &RoaringBitmap, tree_level: TreeLevel) -> heed::Result<Option<(u32, u32, RoaringBitmap)>> { | ||||
|         let parent_result = match self.parent.as_mut() { | ||||
|             Some(parent) => Some(parent.next(allowed_candidates, tree_level)?), | ||||
|             None => None, | ||||
|         }; | ||||
|  | ||||
|         match parent_result { | ||||
|             Some(parent_next) => { | ||||
|                 let inner_next = self.inner_next(tree_level)?; | ||||
|                 self.interval_to_skip += interval_to_skip( | ||||
|                     &self.parent_accumulator, | ||||
|                     &self.accumulator, | ||||
|                     self.interval_to_skip, | ||||
|                     allowed_candidates | ||||
|                 ); | ||||
|                 self.accumulator.push(inner_next); | ||||
|                 self.parent_accumulator.push(parent_next); | ||||
|                 let mut merged_interval: Option<(u32, u32, RoaringBitmap)> = None; | ||||
|  | ||||
|                 for current in self.accumulator.iter().rev().zip(self.parent_accumulator.iter()).skip(self.interval_to_skip) { | ||||
|                     if let (Some((left_a, right_a, a)), Some((left_b, right_b, b))) = current { | ||||
|                         match merged_interval.as_mut() { | ||||
|                             Some((_, _, merged_docids)) => *merged_docids |= a & b, | ||||
|                             None => merged_interval = Some((left_a + left_b, right_a + right_b, a & b)), | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 Ok(merged_interval) | ||||
|             }, | ||||
|             None => { | ||||
|                 let level = self.level; | ||||
|                 match self.inner_next(level)? { | ||||
|                     Some((left, right, mut candidates)) => { | ||||
|                         self.accumulator = vec![Some((left, right, RoaringBitmap::new()))]; | ||||
|                         candidates &= allowed_candidates; | ||||
|                         Ok(Some((left, right, candidates))) | ||||
|  | ||||
|                     }, | ||||
|                     None => { | ||||
|                         self.accumulator = vec![None]; | ||||
|                         Ok(None) | ||||
|                     }, | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Count the number of interval that can be skiped when we make the cross-intersections | ||||
| /// in order to compute the next meta-interval. | ||||
| /// A pair of intervals is skiped when both intervals doesn't contain any allowed docids. | ||||
| fn interval_to_skip( | ||||
|     parent_accumulator: &[Option<(u32, u32, RoaringBitmap)>], | ||||
|     current_accumulator: &[Option<(u32, u32, RoaringBitmap)>], | ||||
|     already_skiped: usize, | ||||
|     allowed_candidates: &RoaringBitmap, | ||||
| ) -> usize { | ||||
|     parent_accumulator.into_iter() | ||||
|         .zip(current_accumulator.into_iter()) | ||||
|         .skip(already_skiped) | ||||
|         .take_while(|(parent, current)| { | ||||
|             let skip_parent = parent.as_ref().map_or(true, |(_, _, docids)| docids.is_empty()); | ||||
|             let skip_current = current.as_ref().map_or(true, |(_, _, docids)| docids.is_disjoint(allowed_candidates)); | ||||
|             skip_parent && skip_current | ||||
|         }) | ||||
|         .count() | ||||
|  | ||||
| } | ||||
|  | ||||
| /// A Branch is represent a possible alternative of the original query and is build with the Query Tree, | ||||
| /// This branch allows us to iterate over meta-interval of position and to dig in it if it contains interesting candidates. | ||||
| struct Branch<'t, 'q> { | ||||
|     query_level_iterator: QueryLevelIterator<'t, 'q>, | ||||
|     last_result: (u32, u32, RoaringBitmap), | ||||
|     tree_level: TreeLevel, | ||||
|     branch_size: u32, | ||||
| } | ||||
|  | ||||
| impl<'t, 'q> Branch<'t, 'q> { | ||||
|     /// return the next meta-interval of the branch, | ||||
|     /// and update inner interval in order to be ranked by the BinaryHeap. | ||||
|     fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result<bool> { | ||||
|         let tree_level = self.query_level_iterator.level; | ||||
|         match self.query_level_iterator.next(allowed_candidates, tree_level)? { | ||||
|             Some(last_result) => { | ||||
|                 self.last_result = last_result; | ||||
|                 self.tree_level = tree_level; | ||||
|                 Ok(true) | ||||
|             }, | ||||
|             None => Ok(false), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// make the current Branch iterate over smaller intervals. | ||||
|     fn dig(&mut self, ctx: &'t dyn Context<'t>) -> heed::Result<()> { | ||||
|         self.query_level_iterator = self.query_level_iterator.dig(ctx)?; | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     /// because next() method could be time consuming, | ||||
|     /// update inner interval in order to be ranked by the binary_heap without computing it, | ||||
|     /// the next() method should be called when the real interval is needed. | ||||
|     fn lazy_next(&mut self) { | ||||
|         let u8_level = Into::<u8>::into(self.tree_level.clone()); | ||||
|         let interval_size = LEVEL_EXPONENTIATION_BASE.pow(u8_level as u32); | ||||
|         let (left, right, _) = self.last_result; | ||||
|  | ||||
|         self.last_result = (left + interval_size,  right + interval_size, RoaringBitmap::new()); | ||||
|     } | ||||
|  | ||||
|     /// return the score of the current inner interval. | ||||
|     fn compute_rank(&self) -> u32 { | ||||
|         // we compute a rank from the left interval. | ||||
|         let (left, _, _) = self.last_result; | ||||
|         left.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size | ||||
|     } | ||||
|  | ||||
|     fn cmp(&self, other: &Self) -> Ordering { | ||||
|         let self_rank = self.compute_rank(); | ||||
|         let other_rank = other.compute_rank(); | ||||
|         let left_cmp = self_rank.cmp(&other_rank).reverse(); | ||||
|         // on level: lower is better, | ||||
|         // we want to dig faster into levels on interesting branches. | ||||
|         let level_cmp = self.tree_level.cmp(&other.tree_level).reverse(); | ||||
|  | ||||
|         left_cmp.then(level_cmp).then(self.last_result.2.len().cmp(&other.last_result.2.len())) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'t, 'q> Ord for Branch<'t, 'q> { | ||||
|     fn cmp(&self, other: &Self) -> Ordering { | ||||
|         self.cmp(other) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'t, 'q> PartialOrd for Branch<'t, 'q> { | ||||
|     fn partial_cmp(&self, other: &Self) -> Option<Ordering> { | ||||
|         Some(self.cmp(other)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'t, 'q> PartialEq for Branch<'t, 'q> { | ||||
|     fn eq(&self, other: &Self) -> bool { | ||||
|         self.cmp(other) == Ordering::Equal | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'t, 'q> Eq for Branch<'t, 'q> {} | ||||
|  | ||||
| fn initialize_query_level_iterators<'t, 'q>( | ||||
|     ctx: &'t dyn Context<'t>, | ||||
|     branches: &'q Vec<Vec<Vec<Query>>>, | ||||
|     allowed_candidates: &RoaringBitmap, | ||||
|     wdcache: &mut WordDerivationsCache, | ||||
| ) -> anyhow::Result<BinaryHeap<Branch<'t, 'q>>> { | ||||
|  | ||||
|     let mut positions = BinaryHeap::with_capacity(branches.len()); | ||||
|     for branch in branches { | ||||
|         let mut branch_positions = Vec::with_capacity(branch.len()); | ||||
|         for queries in  branch { | ||||
|             match QueryLevelIterator::new(ctx, queries, wdcache)? { | ||||
|                 Some(qli) => branch_positions.push(qli), | ||||
|                 None => { | ||||
|                     // the branch seems to be invalid, so we skip it. | ||||
|                     branch_positions.clear(); | ||||
|                     break; | ||||
|                 }, | ||||
|             } | ||||
|         } | ||||
|         // QueryLevelIterator need to be sorted by level and folded in descending order. | ||||
|         branch_positions.sort_unstable_by_key(|qli| qli.level); | ||||
|         let folded_query_level_iterators = branch_positions | ||||
|             .into_iter() | ||||
|             .fold(None, |fold: Option<QueryLevelIterator>, mut qli| match fold { | ||||
|                 Some(fold) => { | ||||
|                     qli.parent(fold); | ||||
|                     Some(qli) | ||||
|                 }, | ||||
|                 None => Some(qli), | ||||
|         }); | ||||
|  | ||||
|         if let Some(mut folded_query_level_iterators) = folded_query_level_iterators { | ||||
|             let tree_level = folded_query_level_iterators.level; | ||||
|             let last_result = folded_query_level_iterators.next(allowed_candidates, tree_level)?; | ||||
|             if let Some(last_result) = last_result { | ||||
|                 let branch = Branch { | ||||
|                     last_result, | ||||
|                     tree_level, | ||||
|                     query_level_iterator: folded_query_level_iterators, | ||||
|                     branch_size: branch.len() as u32, | ||||
|                 }; | ||||
|                 positions.push(branch); | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     Ok(positions) | ||||
| } | ||||
|  | ||||
| fn set_compute_candidates<'t>( | ||||
|     ctx: &'t dyn Context<'t>, | ||||
|     branches: &Vec<Vec<Vec<Query>>>, | ||||
|     allowed_candidates: &RoaringBitmap, | ||||
|     wdcache: &mut WordDerivationsCache, | ||||
| ) -> anyhow::Result<Option<RoaringBitmap>> | ||||
| { | ||||
|     let mut branches_heap = initialize_query_level_iterators(ctx, branches, allowed_candidates, wdcache)?; | ||||
|     let lowest_level = TreeLevel::min_value(); | ||||
|     let mut final_candidates: Option<(u32, RoaringBitmap)> = None; | ||||
|     let mut allowed_candidates = allowed_candidates.clone(); | ||||
|  | ||||
|     while let Some(mut branch) = branches_heap.peek_mut() { | ||||
|         let is_lowest_level = branch.tree_level == lowest_level; | ||||
|         let branch_rank = branch.compute_rank(); | ||||
|         // if current is worst than best we break to return | ||||
|         // candidates that correspond to the best rank | ||||
|         if let Some((best_rank, _)) = final_candidates { | ||||
|             if branch_rank > best_rank { break } | ||||
|         } | ||||
|         let _left = branch.last_result.0; | ||||
|         let candidates = take(&mut branch.last_result.2); | ||||
|         if candidates.is_empty() { | ||||
|             // we don't have candidates, get next interval. | ||||
|             if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } | ||||
|         } | ||||
|         else if is_lowest_level { | ||||
|             // we have candidates, but we can't dig deeper. | ||||
|             allowed_candidates -= &candidates; | ||||
|             final_candidates = match final_candidates.take() { | ||||
|                 // we add current candidates to best candidates | ||||
|                 Some((best_rank, mut best_candidates)) => { | ||||
|                     best_candidates |= candidates; | ||||
|                     branch.lazy_next(); | ||||
|                     Some((best_rank, best_candidates)) | ||||
|                 }, | ||||
|                 // we take current candidates as best candidates | ||||
|                 None => { | ||||
|                     branch.lazy_next(); | ||||
|                     Some((branch_rank, candidates)) | ||||
|                 }, | ||||
|             }; | ||||
|         } else { | ||||
|             // we have candidates, lets dig deeper in levels. | ||||
|             branch.dig(ctx)?; | ||||
|             if !branch.next(&allowed_candidates)? { PeekMut::pop(branch); } | ||||
|         } | ||||
|  | ||||
|     } | ||||
|  | ||||
|     Ok(final_candidates.map(|(_rank, candidates)| candidates)) | ||||
| } | ||||
|  | ||||
| fn linear_compute_candidates( | ||||
|     ctx: &dyn Context, | ||||
|     branches: &Vec<Vec<Vec<Query>>>, | ||||
|     allowed_candidates: &RoaringBitmap, | ||||
| ) -> anyhow::Result<BTreeMap<u64, RoaringBitmap>> | ||||
| { | ||||
|     fn compute_candidate_rank(branches: &Vec<Vec<Vec<Query>>>, words_positions: HashMap<String, RoaringBitmap>) -> u64 { | ||||
|         let mut min_rank = u64::max_value(); | ||||
|         for branch in branches { | ||||
|  | ||||
|             let branch_len = branch.len(); | ||||
|             let mut branch_rank = Vec::with_capacity(branch_len); | ||||
|             for derivates in branch { | ||||
|                 let mut position = None; | ||||
|                 for Query { prefix, kind } in derivates { | ||||
|                     // find the best position of the current word in the document. | ||||
|                     let current_position = match kind { | ||||
|                         QueryKind::Exact { word, .. } => { | ||||
|                             if *prefix { | ||||
|                                 word_derivations(word, true, 0, &words_positions) | ||||
|                                     .flat_map(|positions| positions.iter().next()).min() | ||||
|                             } else { | ||||
|                                 words_positions.get(word) | ||||
|                                     .map(|positions| positions.iter().next()) | ||||
|                                     .flatten() | ||||
|                             } | ||||
|                         }, | ||||
|                         QueryKind::Tolerant { typo, word } => { | ||||
|                             word_derivations(word, *prefix, *typo, &words_positions) | ||||
|                                 .flat_map(|positions| positions.iter().next()).min() | ||||
|                         }, | ||||
|                     }; | ||||
|  | ||||
|                     match (position, current_position) { | ||||
|                         (Some(p), Some(cp)) => position = Some(cmp::min(p, cp)), | ||||
|                         (None, Some(cp)) => position = Some(cp), | ||||
|                         _ => (), | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 // if a position is found, we add it to the branch score, | ||||
|                 // otherwise the branch is considered as unfindable in this document and we break. | ||||
|                 if let Some(position) = position { | ||||
|                     branch_rank.push(position as u64); | ||||
|                 } else { | ||||
|                     branch_rank.clear(); | ||||
|                     break; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             if !branch_rank.is_empty() { | ||||
|                 branch_rank.sort_unstable(); | ||||
|                 // because several words in same query can't match all a the position 0, | ||||
|                 // we substract the word index to the position. | ||||
|                 let branch_rank: u64 = branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); | ||||
|                 // here we do the means of the words of the branch | ||||
|                 min_rank = min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         min_rank | ||||
|     } | ||||
|  | ||||
|     fn word_derivations<'a>( | ||||
|         word: &str, | ||||
|         is_prefix: bool, | ||||
|         max_typo: u8, | ||||
|         words_positions: &'a HashMap<String, RoaringBitmap>, | ||||
|     ) -> impl Iterator<Item = &'a RoaringBitmap> | ||||
|     { | ||||
|         let dfa = build_dfa(word, max_typo, is_prefix); | ||||
|         words_positions.iter().filter_map(move |(document_word, positions)| { | ||||
|             use levenshtein_automata::Distance; | ||||
|             match dfa.eval(document_word) { | ||||
|                 Distance::Exact(_) => Some(positions), | ||||
|                 Distance::AtLeast(_) => None, | ||||
|             } | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|     let mut candidates = BTreeMap::new(); | ||||
|     for docid in allowed_candidates { | ||||
|         let words_positions = ctx.docid_words_positions(docid)?; | ||||
|         let rank = compute_candidate_rank(branches, words_positions); | ||||
|         candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid); | ||||
|     } | ||||
|  | ||||
|     Ok(candidates) | ||||
| } | ||||
|  | ||||
| // TODO can we keep refs of Query | ||||
| fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Vec<Query>>> { | ||||
|     use crate::search::criteria::Operation::{And, Or, Consecutive}; | ||||
|  | ||||
|     fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec<Vec<Vec<Query>>> { | ||||
|         match tail.split_first() { | ||||
|             Some((thead, tail)) => { | ||||
|                 let tail = and_recurse(thead, tail); | ||||
|                 let mut out = Vec::new(); | ||||
|                 for array in recurse(head) { | ||||
|                     for tail_array in &tail { | ||||
|                         let mut array = array.clone(); | ||||
|                         array.extend(tail_array.iter().cloned()); | ||||
|                         out.push(array); | ||||
|                     } | ||||
|                 } | ||||
|                 out | ||||
|             }, | ||||
|             None => recurse(head), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn recurse(op: &Operation) -> Vec<Vec<Vec<Query>>> { | ||||
|         match op { | ||||
|             And(ops) | Consecutive(ops) => { | ||||
|                 ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) | ||||
|             }, | ||||
|             Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) { | ||||
|                 vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] | ||||
|             } else { | ||||
|                 ops.into_iter().map(recurse).flatten().collect() | ||||
|             }, | ||||
|             Operation::Query(query) => vec![vec![vec![query.clone()]]], | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     recurse(query_tree) | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use big_s::S; | ||||
|  | ||||
|     use crate::search::criteria::QueryKind; | ||||
|     use super::*; | ||||
|  | ||||
|     #[test] | ||||
|     fn simple_flatten_query_tree() { | ||||
|         let query_tree = Operation::Or(false, vec![ | ||||
|             Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), | ||||
|             Operation::And(vec![ | ||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), | ||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), | ||||
|             ]), | ||||
|             Operation::And(vec![ | ||||
|                 Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), | ||||
|                 Operation::Or(false, vec![ | ||||
|                     Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("thefish")) }), | ||||
|                     Operation::And(vec![ | ||||
|                         Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("the")) }), | ||||
|                         Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), | ||||
|                     ]), | ||||
|                 ]), | ||||
|             ]), | ||||
|         ]); | ||||
|  | ||||
|         let expected = vec![ | ||||
|             vec![vec![Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }]], | ||||
|             vec![ | ||||
|                 vec![Query { prefix: false, kind: QueryKind::exact(S("manythe")) }], | ||||
|                 vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], | ||||
|             ], | ||||
|             vec![ | ||||
|                 vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], | ||||
|                 vec![Query { prefix: false, kind: QueryKind::exact(S("thefish")) }], | ||||
|             ], | ||||
|             vec![ | ||||
|                 vec![Query { prefix: false, kind: QueryKind::exact(S("many")) }], | ||||
|                 vec![Query { prefix: false, kind: QueryKind::exact(S("the")) }], | ||||
|                 vec![Query { prefix: false, kind: QueryKind::exact(S("fish")) }], | ||||
|             ], | ||||
|         ]; | ||||
|  | ||||
|         let result = flatten_query_tree(&query_tree); | ||||
|         assert_eq!(expected, result); | ||||
|     } | ||||
| } | ||||
| @@ -1,135 +0,0 @@ | ||||
| use std::collections::HashMap; | ||||
| use std::mem::take; | ||||
|  | ||||
| use log::debug; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::search::query_tree::Operation; | ||||
| use crate::search::WordDerivationsCache; | ||||
| use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; | ||||
|  | ||||
| /// The result of a call to the fetcher. | ||||
| #[derive(Debug, Clone, PartialEq)] | ||||
| pub struct FetcherResult { | ||||
|     /// The query tree corresponding to the current bucket of the last criterion. | ||||
|     pub query_tree: Option<Operation>, | ||||
|     /// The candidates of the current bucket of the last criterion. | ||||
|     pub candidates: RoaringBitmap, | ||||
|     /// Candidates that comes from the current bucket of the initial criterion. | ||||
|     pub bucket_candidates: RoaringBitmap, | ||||
| } | ||||
|  | ||||
| pub struct Fetcher<'t> { | ||||
|     ctx: &'t dyn Context, | ||||
|     query_tree: Option<Operation>, | ||||
|     candidates: Candidates, | ||||
|     parent: Option<Box<dyn Criterion + 't>>, | ||||
|     should_get_documents_ids: bool, | ||||
|     wdcache: WordDerivationsCache, | ||||
| } | ||||
|  | ||||
| impl<'t> Fetcher<'t> { | ||||
|     pub fn initial( | ||||
|         ctx: &'t dyn Context, | ||||
|         query_tree: Option<Operation>, | ||||
|         candidates: Option<RoaringBitmap>, | ||||
|     ) -> Self | ||||
|     { | ||||
|         Fetcher { | ||||
|             ctx, | ||||
|             query_tree, | ||||
|             candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), | ||||
|             parent: None, | ||||
|             should_get_documents_ids: true, | ||||
|             wdcache: WordDerivationsCache::new(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn new( | ||||
|         ctx: &'t dyn Context, | ||||
|         parent: Box<dyn Criterion + 't>, | ||||
|     ) -> Self | ||||
|     { | ||||
|         Fetcher { | ||||
|             ctx, | ||||
|             query_tree: None, | ||||
|             candidates: Candidates::default(), | ||||
|             parent: Some(parent), | ||||
|             should_get_documents_ids: true, | ||||
|             wdcache: WordDerivationsCache::new(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("Fetcher::{}")] | ||||
|     pub fn next(&mut self) -> anyhow::Result<Option<FetcherResult>> { | ||||
|         use Candidates::{Allowed, Forbidden}; | ||||
|         loop { | ||||
|             debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", | ||||
|                 self.should_get_documents_ids, self.candidates, | ||||
|             ); | ||||
|  | ||||
|             let should_get_documents_ids = take(&mut self.should_get_documents_ids); | ||||
|             match &mut self.candidates { | ||||
|                 Allowed(_) => { | ||||
|                     let candidates = take(&mut self.candidates).into_inner(); | ||||
|                     let candidates = match &self.query_tree { | ||||
|                         Some(qt) if should_get_documents_ids => { | ||||
|                             let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?; | ||||
|                             docids.intersect_with(&candidates); | ||||
|                             docids | ||||
|                         }, | ||||
|                         _ => candidates, | ||||
|                     }; | ||||
|  | ||||
|                     return Ok(Some(FetcherResult { | ||||
|                         query_tree: self.query_tree.take(), | ||||
|                         candidates: candidates.clone(), | ||||
|                         bucket_candidates: candidates, | ||||
|                     })); | ||||
|                 }, | ||||
|                 Forbidden(_) => { | ||||
|                     match self.parent.as_mut() { | ||||
|                         Some(parent) => { | ||||
|                             match parent.next(&mut self.wdcache)? { | ||||
|                                 Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                                     let candidates = match (&query_tree, candidates) { | ||||
|                                         (_, Some(candidates)) => candidates, | ||||
|                                         (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, | ||||
|                                         (None, None) => RoaringBitmap::new(), | ||||
|                                     }; | ||||
|  | ||||
|                                     return Ok(Some(FetcherResult { query_tree, candidates, bucket_candidates })) | ||||
|                                 }, | ||||
|                                 None => if should_get_documents_ids { | ||||
|                                     let candidates = match &self.query_tree { | ||||
|                                         Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, | ||||
|                                         None => self.ctx.documents_ids()?, | ||||
|                                     }; | ||||
|  | ||||
|                                     return Ok(Some(FetcherResult { | ||||
|                                         query_tree: self.query_tree.clone(), | ||||
|                                         candidates: candidates.clone(), | ||||
|                                         bucket_candidates: candidates, | ||||
|                                     })); | ||||
|                                 }, | ||||
|                             } | ||||
|                         }, | ||||
|                         None => if should_get_documents_ids { | ||||
|                             let candidates = match &self.query_tree { | ||||
|                                 Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, | ||||
|                                 None => self.ctx.documents_ids()?, | ||||
|                             }; | ||||
|  | ||||
|                             return Ok(Some(FetcherResult { | ||||
|                                 query_tree: self.query_tree.clone(), | ||||
|                                 candidates: candidates.clone(), | ||||
|                                 bucket_candidates: candidates, | ||||
|                             })); | ||||
|                         }, | ||||
|                     } | ||||
|                     return Ok(None); | ||||
|                 }, | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
							
								
								
									
										53
									
								
								milli/src/search/criteria/final.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								milli/src/search/criteria/final.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,53 @@ | ||||
| use std::collections::HashMap; | ||||
|  | ||||
| use log::debug; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::search::query_tree::Operation; | ||||
| use crate::search::WordDerivationsCache; | ||||
| use super::{resolve_query_tree, Criterion, CriterionResult, Context}; | ||||
|  | ||||
| /// The result of a call to the fetcher. | ||||
| #[derive(Debug, Clone, PartialEq)] | ||||
| pub struct FinalResult { | ||||
|     /// The query tree corresponding to the current bucket of the last criterion. | ||||
|     pub query_tree: Option<Operation>, | ||||
|     /// The candidates of the current bucket of the last criterion. | ||||
|     pub candidates: RoaringBitmap, | ||||
|     /// Candidates that comes from the current bucket of the initial criterion. | ||||
|     pub bucket_candidates: RoaringBitmap, | ||||
| } | ||||
|  | ||||
| pub struct Final<'t> { | ||||
|     ctx: &'t dyn Context<'t>, | ||||
|     parent: Box<dyn Criterion + 't>, | ||||
|     wdcache: WordDerivationsCache, | ||||
| } | ||||
|  | ||||
| impl<'t> Final<'t> { | ||||
|     pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> { | ||||
|         Final { ctx, parent, wdcache: WordDerivationsCache::new() } | ||||
|     } | ||||
|  | ||||
|     #[logging_timer::time("Final::{}")] | ||||
|     pub fn next(&mut self) -> anyhow::Result<Option<FinalResult>> { | ||||
|         loop { | ||||
|             debug!("Final iteration"); | ||||
|  | ||||
|             match self.parent.next(&mut self.wdcache)? { | ||||
|                 Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => { | ||||
|                     let candidates = match (&query_tree, candidates) { | ||||
|                         (_, Some(candidates)) => candidates, | ||||
|                         (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, | ||||
|                         (None, None) => self.ctx.documents_ids()?, | ||||
|                     }; | ||||
|  | ||||
|                     bucket_candidates.union_with(&candidates); | ||||
|  | ||||
|                     return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); | ||||
|                 }, | ||||
|                 None => return Ok(None), | ||||
|             } | ||||
|         } | ||||
|     } | ||||
| } | ||||
							
								
								
									
										28
									
								
								milli/src/search/criteria/initial.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								milli/src/search/criteria/initial.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,28 @@ | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::search::query_tree::Operation; | ||||
| use crate::search::WordDerivationsCache; | ||||
|  | ||||
| use super::{Criterion, CriterionResult}; | ||||
|  | ||||
| pub struct Initial { | ||||
|     answer: Option<CriterionResult> | ||||
| } | ||||
|  | ||||
| impl Initial { | ||||
|     pub fn new(query_tree: Option<Operation>, mut candidates: Option<RoaringBitmap>) -> Initial { | ||||
|         let answer = CriterionResult { | ||||
|             query_tree, | ||||
|             candidates: candidates.clone(), | ||||
|             bucket_candidates: candidates.take().unwrap_or_default(), | ||||
|         }; | ||||
|         Initial { answer: Some(answer) } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Criterion for Initial { | ||||
|     #[logging_timer::time("Initial::{}")] | ||||
|     fn next(&mut self, _: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> { | ||||
|         Ok(self.answer.take()) | ||||
|     } | ||||
| } | ||||
| @@ -4,21 +4,25 @@ use std::borrow::Cow; | ||||
| use anyhow::bail; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::search::{word_derivations, WordDerivationsCache}; | ||||
| use crate::{TreeLevel, search::{word_derivations, WordDerivationsCache}}; | ||||
| use crate::{Index, DocumentId}; | ||||
|  | ||||
| use super::query_tree::{Operation, Query, QueryKind}; | ||||
| use self::asc_desc::AscDesc; | ||||
| use self::attribute::Attribute; | ||||
| use self::r#final::Final; | ||||
| use self::initial::Initial; | ||||
| use self::proximity::Proximity; | ||||
| use self::typo::Typo; | ||||
| use self::words::Words; | ||||
| use self::asc_desc::AscDesc; | ||||
| use self::proximity::Proximity; | ||||
| use self::fetcher::Fetcher; | ||||
|  | ||||
| mod asc_desc; | ||||
| mod attribute; | ||||
| mod initial; | ||||
| mod proximity; | ||||
| mod typo; | ||||
| mod words; | ||||
| mod asc_desc; | ||||
| mod proximity; | ||||
| pub mod fetcher; | ||||
| pub mod r#final; | ||||
|  | ||||
| pub trait Criterion { | ||||
|     fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>>; | ||||
| @@ -59,7 +63,8 @@ impl Default for Candidates { | ||||
|         Self::Forbidden(RoaringBitmap::new()) | ||||
|     } | ||||
| } | ||||
| pub trait Context { | ||||
|  | ||||
| pub trait Context<'c> { | ||||
|     fn documents_ids(&self) -> heed::Result<RoaringBitmap>; | ||||
|     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; | ||||
|     fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; | ||||
| @@ -68,6 +73,8 @@ pub trait Context { | ||||
|     fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>; | ||||
|     fn in_prefix_cache(&self, word: &str) -> bool; | ||||
|     fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>; | ||||
|     fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option<u32>, right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>>; | ||||
|     fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>>; | ||||
| } | ||||
| pub struct CriteriaBuilder<'t> { | ||||
|     rtxn: &'t heed::RoTxn<'t>, | ||||
| @@ -76,7 +83,7 @@ pub struct CriteriaBuilder<'t> { | ||||
|     words_prefixes_fst: fst::Set<Cow<'t, [u8]>>, | ||||
| } | ||||
|  | ||||
| impl<'a> Context for CriteriaBuilder<'a> { | ||||
| impl<'c> Context<'c> for CriteriaBuilder<'c> { | ||||
|     fn documents_ids(&self) -> heed::Result<RoaringBitmap> { | ||||
|         self.index.documents_ids(self.rtxn) | ||||
|     } | ||||
| @@ -115,6 +122,48 @@ impl<'a> Context for CriteriaBuilder<'a> { | ||||
|         } | ||||
|         Ok(words_positions) | ||||
|     } | ||||
|  | ||||
|     fn word_position_iterator( | ||||
|         &self, | ||||
|         word: &str, | ||||
|         level: TreeLevel, | ||||
|         in_prefix_cache: bool, | ||||
|         left: Option<u32>, | ||||
|         right: Option<u32> | ||||
|     ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>> | ||||
|     { | ||||
|         let range = { | ||||
|             let left = left.unwrap_or(u32::min_value()); | ||||
|             let right = right.unwrap_or(u32::max_value()); | ||||
|             let left = (word, level, left, left); | ||||
|             let right = (word, level, right, right); | ||||
|             left..=right | ||||
|         }; | ||||
|         let db = match in_prefix_cache { | ||||
|             true => self.index.word_prefix_level_position_docids, | ||||
|             false => self.index.word_level_position_docids, | ||||
|         }; | ||||
|  | ||||
|         Ok(Box::new(db.range(self.rtxn, &range)?)) | ||||
|     } | ||||
|  | ||||
|     fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> { | ||||
|         let range = { | ||||
|             let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); | ||||
|             let right = (word, TreeLevel::max_value(), u32::max_value(), u32::max_value()); | ||||
|             left..=right | ||||
|         }; | ||||
|         let db = match in_prefix_cache { | ||||
|             true => self.index.word_prefix_level_position_docids, | ||||
|             false => self.index.word_level_position_docids, | ||||
|         }; | ||||
|         let last_level = db | ||||
|             .remap_data_type::<heed::types::DecodeIgnore>() | ||||
|             .range(self.rtxn, &range)?.last().transpose()? | ||||
|             .map(|((_, level, _, _), _)| level); | ||||
|  | ||||
|         Ok(last_level) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'t> CriteriaBuilder<'t> { | ||||
| @@ -126,42 +175,26 @@ impl<'t> CriteriaBuilder<'t> { | ||||
|  | ||||
|     pub fn build( | ||||
|         &'t self, | ||||
|         mut query_tree: Option<Operation>, | ||||
|         mut facet_candidates: Option<RoaringBitmap>, | ||||
|     ) -> anyhow::Result<Fetcher<'t>> | ||||
|         query_tree: Option<Operation>, | ||||
|         facet_candidates: Option<RoaringBitmap>, | ||||
|     ) -> anyhow::Result<Final<'t>> | ||||
|     { | ||||
|         use crate::criterion::Criterion as Name; | ||||
|  | ||||
|         let mut criterion = None as Option<Box<dyn Criterion>>; | ||||
|         let mut criterion = Box::new(Initial::new(query_tree, facet_candidates)) as Box<dyn Criterion>; | ||||
|         for name in self.index.criteria(&self.rtxn)? { | ||||
|             criterion = Some(match criterion.take() { | ||||
|                 Some(father) => match name { | ||||
|                     Name::Typo => Box::new(Typo::new(self, father)), | ||||
|                     Name::Words => Box::new(Words::new(self, father)), | ||||
|                     Name::Proximity => Box::new(Proximity::new(self, father)), | ||||
|                     Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, father, field)?), | ||||
|                     Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, father, field)?), | ||||
|                     _otherwise => father, | ||||
|                 }, | ||||
|                 None => match name { | ||||
|                     Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), | ||||
|                     Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), | ||||
|                     Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), | ||||
|                     Name::Asc(field) => { | ||||
|                         Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) | ||||
|                     }, | ||||
|                     Name::Desc(field) => { | ||||
|                         Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), field)?) | ||||
|                     }, | ||||
|                     _otherwise => continue, | ||||
|                 }, | ||||
|             }); | ||||
|             criterion = match name { | ||||
|                 Name::Typo => Box::new(Typo::new(self, criterion)), | ||||
|                 Name::Words => Box::new(Words::new(self, criterion)), | ||||
|                 Name::Proximity => Box::new(Proximity::new(self, criterion)), | ||||
|                 Name::Attribute => Box::new(Attribute::new(self, criterion)), | ||||
|                 Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), | ||||
|                 Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), | ||||
|                 _otherwise => criterion, | ||||
|             }; | ||||
|         } | ||||
|  | ||||
|         match criterion { | ||||
|             Some(criterion) => Ok(Fetcher::new(self, criterion)), | ||||
|             None => Ok(Fetcher::initial(self, query_tree, facet_candidates)), | ||||
|         } | ||||
|         Ok(Final::new(self, criterion)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| @@ -362,9 +395,10 @@ pub mod test { | ||||
|         word_prefix_docids: HashMap<String, RoaringBitmap>, | ||||
|         word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, | ||||
|         word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, | ||||
|         docid_words: HashMap<u32, Vec<String>>, | ||||
|     } | ||||
|  | ||||
|     impl<'a> Context for TestContext<'a> { | ||||
|     impl<'c> Context<'c> for TestContext<'c> { | ||||
|         fn documents_ids(&self) -> heed::Result<RoaringBitmap> { | ||||
|             Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) | ||||
|         } | ||||
| @@ -395,7 +429,24 @@ pub mod test { | ||||
|             self.word_prefix_docids.contains_key(&word.to_string()) | ||||
|         } | ||||
|  | ||||
|         fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> { | ||||
|         fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> { | ||||
|             if let Some(docid_words) = self.docid_words.get(&docid) { | ||||
|                 Ok(docid_words | ||||
|                     .iter() | ||||
|                     .enumerate() | ||||
|                     .map(|(i,w)| (w.clone(), RoaringBitmap::from_sorted_iter(std::iter::once(i as u32)))) | ||||
|                     .collect() | ||||
|                 ) | ||||
|             } else { | ||||
|                 Ok(HashMap::new()) | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         fn word_position_iterator(&self, _word: &str, _level: TreeLevel, _in_prefix_cache: bool, _left: Option<u32>, _right: Option<u32>) -> heed::Result<Box<dyn Iterator<Item =heed::Result<((&'c str, TreeLevel, u32, u32), RoaringBitmap)>> + 'c>> { | ||||
|             todo!() | ||||
|         } | ||||
|  | ||||
|         fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result<Option<TreeLevel>> { | ||||
|             todo!() | ||||
|         } | ||||
|     } | ||||
| @@ -431,50 +482,58 @@ pub mod test { | ||||
|                 s("morning")    => random_postings(rng,    125), | ||||
|             }; | ||||
|  | ||||
|             let mut docid_words = HashMap::new(); | ||||
|             for (word, docids) in word_docids.iter() { | ||||
|                 for docid in docids { | ||||
|                     let words = docid_words.entry(docid).or_insert(vec![]); | ||||
|                     words.push(word.clone()); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             let word_prefix_docids = hashmap!{ | ||||
|                 s("h")   => &word_docids[&s("hello")] | &word_docids[&s("hi")], | ||||
|                 s("wor") => &word_docids[&s("word")]  | &word_docids[&s("world")], | ||||
|                 s("20")  => &word_docids[&s("2020")]  | &word_docids[&s("2021")], | ||||
|             }; | ||||
|  | ||||
|             let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")]; | ||||
|             let hello_world_split = (hello_world.len() / 2) as usize; | ||||
|             let hello_world_1 = hello_world.iter().take(hello_world_split).collect(); | ||||
|             let hello_world_2 = hello_world.iter().skip(hello_world_split).collect(); | ||||
|  | ||||
|             let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")]; | ||||
|             let hello_word_split = (hello_word.len() / 2) as usize; | ||||
|             let hello_word_4 = hello_word.iter().take(hello_word_split).collect(); | ||||
|             let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect(); | ||||
|             let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect(); | ||||
|             let word_pair_proximity_docids = hashmap!{ | ||||
|                 (s("good"), s("morning"), 1)   => &word_docids[&s("good")] & &word_docids[&s("morning")], | ||||
|                 (s("hello"), s("world"), 1)   => hello_world_1, | ||||
|                 (s("hello"), s("world"), 4)   => hello_world_2, | ||||
|                 (s("this"), s("is"), 1)   => &word_docids[&s("this")] & &word_docids[&s("is")], | ||||
|                 (s("is"), s("2021"), 1)   => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], | ||||
|                 (s("is"), s("2020"), 1)   => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), | ||||
|                 (s("this"), s("2021"), 2)   => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], | ||||
|                 (s("this"), s("2020"), 2)   => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), | ||||
|                 (s("word"), s("split"), 1)   => &word_docids[&s("word")] & &word_docids[&s("split")], | ||||
|                 (s("world"), s("split"), 1)   => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")], | ||||
|                 (s("hello"), s("word"), 4) => hello_word_4, | ||||
|                 (s("hello"), s("word"), 6) => hello_word_6, | ||||
|                 (s("hello"), s("word"), 7) => hello_word_7, | ||||
|                 (s("split"), s("ngrams"), 3)   => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")], | ||||
|                 (s("split"), s("ngrams"), 5)   => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], | ||||
|                 (s("this"), s("ngrams"), 1)   => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")], | ||||
|                 (s("this"), s("ngrams"), 2)   => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], | ||||
|             }; | ||||
|  | ||||
|             let word_prefix_pair_proximity_docids = hashmap!{ | ||||
|                 (s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 1)).unwrap().clone(), | ||||
|                 (s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(), | ||||
|                 (s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(), | ||||
|                 (s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(), | ||||
|                 (s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(), | ||||
|                 (s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(), | ||||
|             }; | ||||
|             let mut word_pair_proximity_docids = HashMap::new(); | ||||
|             let mut word_prefix_pair_proximity_docids = HashMap::new(); | ||||
|             for (lword, lcandidates) in &word_docids { | ||||
|                 for (rword, rcandidates) in &word_docids { | ||||
|                     if lword == rword { continue } | ||||
|                     let candidates = lcandidates & rcandidates; | ||||
|                     for candidate in candidates { | ||||
|                         if let Some(docid_words) = docid_words.get(&candidate) { | ||||
|                             let lposition = docid_words.iter().position(|w| w == lword).unwrap(); | ||||
|                             let rposition = docid_words.iter().position(|w| w == rword).unwrap(); | ||||
|                             let key = if lposition < rposition { | ||||
|                                 (s(lword), s(rword), (rposition - lposition) as i32) | ||||
|                             } else { | ||||
|                                 (s(lword), s(rword), (lposition - rposition + 1) as i32) | ||||
|                             }; | ||||
|                             let docids = word_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); | ||||
|                             docids.push(candidate); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 for (pword, pcandidates) in &word_prefix_docids { | ||||
|                     if lword.starts_with(pword) { continue } | ||||
|                     let candidates = lcandidates & pcandidates; | ||||
|                     for candidate in candidates { | ||||
|                         if let Some(docid_words) = docid_words.get(&candidate) { | ||||
|                             let lposition = docid_words.iter().position(|w| w == lword).unwrap(); | ||||
|                             let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); | ||||
|                             let key = if lposition < rposition { | ||||
|                                 (s(lword), s(pword), (rposition - lposition) as i32) | ||||
|                             } else { | ||||
|                                 (s(lword), s(pword), (lposition - rposition + 1) as i32) | ||||
|                             }; | ||||
|                             let docids = word_prefix_pair_proximity_docids.entry(key).or_insert(RoaringBitmap::new()); | ||||
|                             docids.push(candidate); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             let mut keys = word_docids.keys().collect::<Vec<_>>(); | ||||
|             keys.sort_unstable(); | ||||
| @@ -486,6 +545,7 @@ pub mod test { | ||||
|                 word_prefix_docids, | ||||
|                 word_pair_proximity_docids, | ||||
|                 word_prefix_pair_proximity_docids, | ||||
|                 docid_words, | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|   | ||||
| @@ -8,48 +8,29 @@ use log::debug; | ||||
| use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; | ||||
| use crate::search::query_tree::{maximum_proximity, Operation, Query}; | ||||
| use crate::search::{build_dfa, WordDerivationsCache}; | ||||
| use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; | ||||
| use super::{Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; | ||||
|  | ||||
| type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; | ||||
|  | ||||
| pub struct Proximity<'t> { | ||||
|     ctx: &'t dyn Context, | ||||
|     query_tree: Option<(usize, Operation)>, | ||||
|     ctx: &'t dyn Context<'t>, | ||||
|     /// ((max_proximity, query_tree), allowed_candidates) | ||||
|     state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, | ||||
|     proximity: u8, | ||||
|     candidates: Candidates, | ||||
|     bucket_candidates: RoaringBitmap, | ||||
|     parent: Option<Box<dyn Criterion + 't>>, | ||||
|     parent: Box<dyn Criterion + 't>, | ||||
|     candidates_cache: Cache, | ||||
|     plane_sweep_cache: Option<btree_map::IntoIter<u8, RoaringBitmap>>, | ||||
| } | ||||
|  | ||||
| impl<'t> Proximity<'t> { | ||||
|     pub fn initial( | ||||
|         ctx: &'t dyn Context, | ||||
|         query_tree: Option<Operation>, | ||||
|         candidates: Option<RoaringBitmap>, | ||||
|     ) -> Self | ||||
|     { | ||||
|     pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self { | ||||
|         Proximity { | ||||
|             ctx, | ||||
|             query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), | ||||
|             state: None, | ||||
|             proximity: 0, | ||||
|             candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|             parent: None, | ||||
|             candidates_cache: Cache::new(), | ||||
|             plane_sweep_cache: None, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self { | ||||
|         Proximity { | ||||
|             ctx, | ||||
|             query_tree: None, | ||||
|             proximity: 0, | ||||
|             candidates: Candidates::default(), | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|             parent: Some(parent), | ||||
|             parent, | ||||
|             candidates_cache: Cache::new(), | ||||
|             plane_sweep_cache: None, | ||||
|         } | ||||
| @@ -59,27 +40,20 @@ impl<'t> Proximity<'t> { | ||||
| impl<'t> Criterion for Proximity<'t> { | ||||
|     #[logging_timer::time("Proximity::{}")] | ||||
|     fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result<Option<CriterionResult>> { | ||||
|         use Candidates::{Allowed, Forbidden}; | ||||
|         loop { | ||||
|             debug!("Proximity at iteration {} (max {:?}) ({:?})", | ||||
|             debug!("Proximity at iteration {} (max prox {:?}) ({:?})", | ||||
|                 self.proximity, | ||||
|                 self.query_tree.as_ref().map(|(mp, _)| mp), | ||||
|                 self.candidates, | ||||
|                 self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)), | ||||
|                 self.state.as_ref().map(|(_, cd)| cd), | ||||
|             ); | ||||
|  | ||||
|             match (&mut self.query_tree, &mut self.candidates) { | ||||
|                 (_, Allowed(candidates)) if candidates.is_empty() => { | ||||
|                     return Ok(Some(CriterionResult { | ||||
|                         query_tree: self.query_tree.take().map(|(_, qt)| qt), | ||||
|                         candidates: Some(take(&mut self.candidates).into_inner()), | ||||
|                         bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                     })); | ||||
|             match &mut self.state { | ||||
|                 Some((_, candidates)) if candidates.is_empty() => { | ||||
|                     self.state = None; // reset state | ||||
|                 }, | ||||
|                 (Some((max_prox, query_tree)), Allowed(candidates)) => { | ||||
|                 Some((Some((max_prox, query_tree)), candidates)) => { | ||||
|                     if self.proximity as usize > *max_prox { | ||||
|                         // reset state to (None, Forbidden(_)) | ||||
|                         self.query_tree = None; | ||||
|                         self.candidates = Candidates::default(); | ||||
|                         self.state = None; // reset state | ||||
|                     } else { | ||||
|                         let mut new_candidates = if candidates.len() <= 1000 { | ||||
|                             if let Some(cache) = self.plane_sweep_cache.as_mut() { | ||||
| @@ -89,9 +63,7 @@ impl<'t> Criterion for Proximity<'t> { | ||||
|                                         candidates | ||||
|                                     }, | ||||
|                                     None => { | ||||
|                                         // reset state to (None, Forbidden(_)) | ||||
|                                         self.query_tree = None; | ||||
|                                         self.candidates = Candidates::default(); | ||||
|                                         self.state = None; // reset state | ||||
|                                         continue | ||||
|                                     }, | ||||
|                                 } | ||||
| @@ -120,79 +92,54 @@ impl<'t> Criterion for Proximity<'t> { | ||||
|                         candidates.difference_with(&new_candidates); | ||||
|                         self.proximity += 1; | ||||
|  | ||||
|                         let bucket_candidates = match self.parent { | ||||
|                             Some(_) => take(&mut self.bucket_candidates), | ||||
|                             None => new_candidates.clone(), | ||||
|                         }; | ||||
|  | ||||
|                         return Ok(Some(CriterionResult { | ||||
|                             query_tree: Some(query_tree.clone()), | ||||
|                             candidates: Some(new_candidates), | ||||
|                             bucket_candidates, | ||||
|                             bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                         })); | ||||
|                     } | ||||
|                 }, | ||||
|                 (Some((max_prox, query_tree)), Forbidden(candidates)) => { | ||||
|                     if self.proximity as usize > *max_prox { | ||||
|                         self.query_tree = None; | ||||
|                         self.candidates = Candidates::default(); | ||||
|                     } else { | ||||
|                         let mut new_candidates = resolve_candidates( | ||||
|                             self.ctx, | ||||
|                             &query_tree, | ||||
|                             self.proximity, | ||||
|                             &mut self.candidates_cache, | ||||
|                             wdcache, | ||||
|                         )?; | ||||
|  | ||||
|                         new_candidates.difference_with(&candidates); | ||||
|                         candidates.union_with(&new_candidates); | ||||
|                         self.proximity += 1; | ||||
|  | ||||
|                         let bucket_candidates = match self.parent { | ||||
|                             Some(_) => take(&mut self.bucket_candidates), | ||||
|                             None => new_candidates.clone(), | ||||
|                         }; | ||||
|  | ||||
|                         return Ok(Some(CriterionResult { | ||||
|                             query_tree: Some(query_tree.clone()), | ||||
|                             candidates: Some(new_candidates), | ||||
|                             bucket_candidates, | ||||
|                         })); | ||||
|                     } | ||||
|                 }, | ||||
|                 (None, Allowed(_)) => { | ||||
|                     let candidates = take(&mut self.candidates).into_inner(); | ||||
|                 Some((None, candidates)) => { | ||||
|                     let candidates = take(candidates); | ||||
|                     self.state = None; // reset state | ||||
|                     return Ok(Some(CriterionResult { | ||||
|                         query_tree: None, | ||||
|                         candidates: Some(candidates.clone()), | ||||
|                         bucket_candidates: candidates, | ||||
|                     })); | ||||
|                 }, | ||||
|                 (None, Forbidden(_)) => { | ||||
|                     match self.parent.as_mut() { | ||||
|                         Some(parent) => { | ||||
|                             match parent.next(wdcache)? { | ||||
|                                 Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                                     let candidates = match (&query_tree, candidates) { | ||||
|                                         (_, Some(candidates)) => candidates, | ||||
|                                         (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, | ||||
|                                         (None, None) => RoaringBitmap::new(), | ||||
|                                     }; | ||||
|                 None => { | ||||
|                     match self.parent.next(wdcache)? { | ||||
|                         Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { | ||||
|                             return Ok(Some(CriterionResult { | ||||
|                                 query_tree: None, | ||||
|                                 candidates: None, | ||||
|                                 bucket_candidates, | ||||
|                             })); | ||||
|                         }, | ||||
|                         Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                             let candidates_is_some = candidates.is_some(); | ||||
|                             let candidates = match (&query_tree, candidates) { | ||||
|                                 (_, Some(candidates)) => candidates, | ||||
|                                 (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, | ||||
|                                 (None, None) => RoaringBitmap::new(), | ||||
|                             }; | ||||
|  | ||||
|                                     if bucket_candidates.is_empty() { | ||||
|                                         self.bucket_candidates.union_with(&candidates); | ||||
|                                     } else { | ||||
|                                         self.bucket_candidates.union_with(&bucket_candidates); | ||||
|                                     } | ||||
|  | ||||
|                                     self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); | ||||
|                                     self.proximity = 0; | ||||
|                                     self.candidates = Candidates::Allowed(candidates); | ||||
|                                     self.plane_sweep_cache = None; | ||||
|                                 }, | ||||
|                                 None => return Ok(None), | ||||
|                             // If our parent returns candidates it means that the bucket | ||||
|                             // candidates were already computed before and we can use them. | ||||
|                             // | ||||
|                             // If not, we must use the just computed candidates as our bucket | ||||
|                             // candidates. | ||||
|                             if candidates_is_some { | ||||
|                                 self.bucket_candidates.union_with(&bucket_candidates); | ||||
|                             } else { | ||||
|                                 self.bucket_candidates.union_with(&candidates); | ||||
|                             } | ||||
|  | ||||
|                             let query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); | ||||
|                             self.state = Some((query_tree, candidates)); | ||||
|                             self.proximity = 0; | ||||
|                             self.plane_sweep_cache = None; | ||||
|                         }, | ||||
|                         None => return Ok(None), | ||||
|                     } | ||||
|   | ||||
| @@ -9,41 +9,24 @@ use crate::search::{word_derivations, WordDerivationsCache}; | ||||
| use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; | ||||
|  | ||||
| pub struct Typo<'t> { | ||||
|     ctx: &'t dyn Context, | ||||
|     ctx: &'t dyn Context<'t>, | ||||
|     query_tree: Option<(usize, Operation)>, | ||||
|     number_typos: u8, | ||||
|     candidates: Candidates, | ||||
|     bucket_candidates: RoaringBitmap, | ||||
|     parent: Option<Box<dyn Criterion + 't>>, | ||||
|     parent: Box<dyn Criterion + 't>, | ||||
|     candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, | ||||
| } | ||||
|  | ||||
| impl<'t> Typo<'t> { | ||||
|     pub fn initial( | ||||
|         ctx: &'t dyn Context, | ||||
|         query_tree: Option<Operation>, | ||||
|         candidates: Option<RoaringBitmap>, | ||||
|     ) -> Self | ||||
|     { | ||||
|         Typo { | ||||
|             ctx, | ||||
|             query_tree: query_tree.map(|op| (maximum_typo(&op), op)), | ||||
|             number_typos: 0, | ||||
|             candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|             parent: None, | ||||
|             candidates_cache: HashMap::new(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self { | ||||
|     pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self { | ||||
|         Typo { | ||||
|             ctx, | ||||
|             query_tree: None, | ||||
|             number_typos: 0, | ||||
|             candidates: Candidates::default(), | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|             parent: Some(parent), | ||||
|             parent, | ||||
|             candidates_cache: HashMap::new(), | ||||
|         } | ||||
|     } | ||||
| @@ -90,15 +73,10 @@ impl<'t> Criterion for Typo<'t> { | ||||
|                         candidates.difference_with(&new_candidates); | ||||
|                         self.number_typos += 1; | ||||
|  | ||||
|                         let bucket_candidates = match self.parent { | ||||
|                             Some(_) => take(&mut self.bucket_candidates), | ||||
|                             None => new_candidates.clone(), | ||||
|                         }; | ||||
|  | ||||
|                         return Ok(Some(CriterionResult { | ||||
|                             query_tree: Some(new_query_tree), | ||||
|                             candidates: Some(new_candidates), | ||||
|                             bucket_candidates, | ||||
|                             bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                         })); | ||||
|                     } | ||||
|                 }, | ||||
| @@ -145,17 +123,19 @@ impl<'t> Criterion for Typo<'t> { | ||||
|                     })); | ||||
|                 }, | ||||
|                 (None, Forbidden(_)) => { | ||||
|                     match self.parent.as_mut() { | ||||
|                         Some(parent) => { | ||||
|                             match parent.next(wdcache)? { | ||||
|                                 Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                                     self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); | ||||
|                                     self.number_typos = 0; | ||||
|                                     self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); | ||||
|                                     self.bucket_candidates.union_with(&bucket_candidates); | ||||
|                                 }, | ||||
|                                 None => return Ok(None), | ||||
|                             } | ||||
|                     match self.parent.next(wdcache)? { | ||||
|                         Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { | ||||
|                             return Ok(Some(CriterionResult { | ||||
|                                 query_tree: None, | ||||
|                                 candidates: None, | ||||
|                                 bucket_candidates, | ||||
|                             })); | ||||
|                         }, | ||||
|                         Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                             self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); | ||||
|                             self.number_typos = 0; | ||||
|                             self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); | ||||
|                             self.bucket_candidates.union_with(&bucket_candidates); | ||||
|                         }, | ||||
|                         None => return Ok(None), | ||||
|                     } | ||||
| @@ -334,8 +314,8 @@ fn resolve_candidates<'t>( | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod test { | ||||
|  | ||||
|     use super::*; | ||||
|     use super::super::initial::Initial; | ||||
|     use super::super::test::TestContext; | ||||
|  | ||||
|     #[test] | ||||
| @@ -345,8 +325,10 @@ mod test { | ||||
|         let facet_candidates = None; | ||||
|  | ||||
|         let mut wdcache = WordDerivationsCache::new(); | ||||
|         let mut criteria = Typo::initial(&context, query_tree, facet_candidates); | ||||
|         let parent = Initial::new(query_tree, facet_candidates); | ||||
|         let mut criteria = Typo::new(&context, Box::new(parent)); | ||||
|  | ||||
|         assert!(criteria.next(&mut wdcache).unwrap().unwrap().candidates.is_none()); | ||||
|         assert!(criteria.next(&mut wdcache).unwrap().is_none()); | ||||
|     } | ||||
|  | ||||
| @@ -364,7 +346,8 @@ mod test { | ||||
|         let facet_candidates = None; | ||||
|  | ||||
|         let mut wdcache = WordDerivationsCache::new(); | ||||
|         let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates); | ||||
|         let parent = Initial::new(Some(query_tree), facet_candidates); | ||||
|         let mut criteria = Typo::new(&context, Box::new(parent)); | ||||
|  | ||||
|         let candidates_1 = context.word_docids("split").unwrap().unwrap() | ||||
|             & context.word_docids("this").unwrap().unwrap() | ||||
| @@ -413,7 +396,8 @@ mod test { | ||||
|         let facet_candidates = context.word_docids("earth").unwrap().unwrap(); | ||||
|  | ||||
|         let mut wdcache = WordDerivationsCache::new(); | ||||
|         let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())); | ||||
|         let parent = Initial::new(query_tree, Some(facet_candidates.clone())); | ||||
|         let mut criteria = Typo::new(&context, Box::new(parent)); | ||||
|  | ||||
|         let expected = CriterionResult { | ||||
|             query_tree: None, | ||||
| @@ -442,7 +426,8 @@ mod test { | ||||
|         let facet_candidates = context.word_docids("earth").unwrap().unwrap(); | ||||
|  | ||||
|         let mut wdcache = WordDerivationsCache::new(); | ||||
|         let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())); | ||||
|         let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone())); | ||||
|         let mut criteria = Typo::new(&context, Box::new(parent)); | ||||
|  | ||||
|         let candidates_1 = context.word_docids("split").unwrap().unwrap() | ||||
|             & context.word_docids("this").unwrap().unwrap() | ||||
| @@ -456,7 +441,7 @@ mod test { | ||||
|                 ]), | ||||
|             ])), | ||||
|             candidates: Some(&candidates_1 & &facet_candidates), | ||||
|             bucket_candidates: candidates_1 & &facet_candidates, | ||||
|             bucket_candidates: facet_candidates.clone(), | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_1)); | ||||
| @@ -478,7 +463,7 @@ mod test { | ||||
|                 ]), | ||||
|             ])), | ||||
|             candidates: Some(&candidates_2 & &facet_candidates), | ||||
|             bucket_candidates: candidates_2 & &facet_candidates, | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|         }; | ||||
|  | ||||
|         assert_eq!(criteria.next(&mut wdcache).unwrap(), Some(expected_2)); | ||||
|   | ||||
| @@ -8,38 +8,22 @@ use crate::search::query_tree::Operation; | ||||
| use super::{resolve_query_tree, Criterion, CriterionResult, Context, WordDerivationsCache}; | ||||
|  | ||||
| pub struct Words<'t> { | ||||
|     ctx: &'t dyn Context, | ||||
|     ctx: &'t dyn Context<'t>, | ||||
|     query_trees: Vec<Operation>, | ||||
|     candidates: Option<RoaringBitmap>, | ||||
|     bucket_candidates: RoaringBitmap, | ||||
|     parent: Option<Box<dyn Criterion + 't>>, | ||||
|     parent: Box<dyn Criterion + 't>, | ||||
|     candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, | ||||
| } | ||||
|  | ||||
| impl<'t> Words<'t> { | ||||
|     pub fn initial( | ||||
|         ctx: &'t dyn Context, | ||||
|         query_tree: Option<Operation>, | ||||
|         candidates: Option<RoaringBitmap>, | ||||
|     ) -> Self | ||||
|     { | ||||
|         Words { | ||||
|             ctx, | ||||
|             query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), | ||||
|             candidates, | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|             parent: None, | ||||
|             candidates_cache: HashMap::default(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self { | ||||
|     pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self { | ||||
|         Words { | ||||
|             ctx, | ||||
|             query_trees: Vec::default(), | ||||
|             candidates: None, | ||||
|             bucket_candidates: RoaringBitmap::new(), | ||||
|             parent: Some(parent), | ||||
|             parent, | ||||
|             candidates_cache: HashMap::default(), | ||||
|         } | ||||
|     } | ||||
| @@ -65,27 +49,17 @@ impl<'t> Criterion for Words<'t> { | ||||
|                     found_candidates.intersect_with(&candidates); | ||||
|                     candidates.difference_with(&found_candidates); | ||||
|  | ||||
|                     let bucket_candidates = match self.parent { | ||||
|                         Some(_) => take(&mut self.bucket_candidates), | ||||
|                         None => found_candidates.clone(), | ||||
|                     }; | ||||
|  | ||||
|                     return Ok(Some(CriterionResult { | ||||
|                         query_tree: Some(qt), | ||||
|                         candidates: Some(found_candidates), | ||||
|                         bucket_candidates, | ||||
|                         bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                     })); | ||||
|                 }, | ||||
|                 (Some(qt), None) => { | ||||
|                     let bucket_candidates = match self.parent { | ||||
|                         Some(_) => take(&mut self.bucket_candidates), | ||||
|                         None => RoaringBitmap::new(), | ||||
|                     }; | ||||
|  | ||||
|                     return Ok(Some(CriterionResult { | ||||
|                         query_tree: Some(qt), | ||||
|                         candidates: None, | ||||
|                         bucket_candidates, | ||||
|                         bucket_candidates: take(&mut self.bucket_candidates), | ||||
|                     })); | ||||
|                 }, | ||||
|                 (None, Some(_)) => { | ||||
| @@ -97,16 +71,18 @@ impl<'t> Criterion for Words<'t> { | ||||
|                     })); | ||||
|                 }, | ||||
|                 (None, None) => { | ||||
|                     match self.parent.as_mut() { | ||||
|                         Some(parent) => { | ||||
|                             match parent.next(wdcache)? { | ||||
|                                 Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                                     self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); | ||||
|                                     self.candidates = candidates; | ||||
|                                     self.bucket_candidates.union_with(&bucket_candidates); | ||||
|                                 }, | ||||
|                                 None => return Ok(None), | ||||
|                             } | ||||
|                     match self.parent.next(wdcache)? { | ||||
|                         Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { | ||||
|                             return Ok(Some(CriterionResult { | ||||
|                                 query_tree: None, | ||||
|                                 candidates: None, | ||||
|                                 bucket_candidates, | ||||
|                             })); | ||||
|                         }, | ||||
|                         Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { | ||||
|                             self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); | ||||
|                             self.candidates = candidates; | ||||
|                             self.bucket_candidates.union_with(&bucket_candidates); | ||||
|                         }, | ||||
|                         None => return Ok(None), | ||||
|                     } | ||||
|   | ||||
| @@ -13,9 +13,8 @@ use once_cell::sync::Lazy; | ||||
| use roaring::bitmap::RoaringBitmap; | ||||
|  | ||||
| use distinct::{Distinct, DocIter, FacetDistinct, MapDistinct, NoopDistinct}; | ||||
|  | ||||
| use crate::search::criteria::fetcher::{Fetcher, FetcherResult}; | ||||
| use crate::{DocumentId, Index}; | ||||
| use crate::search::criteria::r#final::{Final, FinalResult}; | ||||
| use crate::{Index, DocumentId}; | ||||
|  | ||||
| pub use self::facet::{ | ||||
|     FacetCondition, FacetDistribution, FacetIter, FacetNumberOperator, FacetStringOperator, | ||||
| @@ -162,14 +161,14 @@ impl<'a> Search<'a> { | ||||
|         &self, | ||||
|         mut distinct: impl for<'c> Distinct<'c>, | ||||
|         matching_words: MatchingWords, | ||||
|         mut criteria: Fetcher, | ||||
|         mut criteria: Final, | ||||
|     ) -> anyhow::Result<SearchResult> { | ||||
|         let mut offset = self.offset; | ||||
|         let mut initial_candidates = RoaringBitmap::new(); | ||||
|         let mut excluded_documents = RoaringBitmap::new(); | ||||
|         let mut documents_ids = Vec::with_capacity(self.limit); | ||||
|  | ||||
|         while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { | ||||
|         while let Some(FinalResult { candidates, bucket_candidates, .. }) = criteria.next()? { | ||||
|             debug!("Number of candidates found {}", candidates.len()); | ||||
|  | ||||
|             let excluded = take(&mut excluded_documents); | ||||
|   | ||||
							
								
								
									
										51
									
								
								milli/src/tree_level.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										51
									
								
								milli/src/tree_level.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,51 @@ | ||||
| use std::convert::TryFrom; | ||||
| use std::fmt; | ||||
|  | ||||
| /// This is just before the lowest printable character (space, sp, 32) | ||||
| const MAX_VALUE: u8 = 31; | ||||
|  | ||||
| #[derive(Debug, Copy, Clone)] | ||||
| pub enum Error { | ||||
|     LevelTooHigh(u8), | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||
| #[repr(transparent)] | ||||
| pub struct TreeLevel(u8); | ||||
|  | ||||
| impl TreeLevel { | ||||
|     pub const fn max_value() -> TreeLevel { | ||||
|         TreeLevel(MAX_VALUE) | ||||
|     } | ||||
|  | ||||
|     pub const fn min_value() -> TreeLevel { | ||||
|         TreeLevel(0) | ||||
|     } | ||||
|  | ||||
|     pub fn saturating_sub(&self, lhs: u8) -> TreeLevel { | ||||
|         TreeLevel(self.0.saturating_sub(lhs)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Into<u8> for TreeLevel { | ||||
|     fn into(self) -> u8 { | ||||
|         self.0 | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl TryFrom<u8> for TreeLevel { | ||||
|     type Error = Error; | ||||
|  | ||||
|     fn try_from(value: u8) -> Result<TreeLevel, Error> { | ||||
|         match value { | ||||
|             0..=MAX_VALUE => Ok(TreeLevel(value)), | ||||
|             _ => Err(Error::LevelTooHigh(value)), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl fmt::Display for TreeLevel { | ||||
|     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||
|         write!(f, "{}", self.0) | ||||
|     } | ||||
| } | ||||
| @@ -28,6 +28,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|             docid_word_positions, | ||||
|             word_pair_proximity_docids, | ||||
|             word_prefix_pair_proximity_docids, | ||||
|             word_level_position_docids, | ||||
|             word_prefix_level_position_docids, | ||||
|             facet_field_id_value_docids, | ||||
|             field_id_docid_facet_values, | ||||
|             documents, | ||||
| @@ -55,6 +57,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|         docid_word_positions.clear(self.wtxn)?; | ||||
|         word_pair_proximity_docids.clear(self.wtxn)?; | ||||
|         word_prefix_pair_proximity_docids.clear(self.wtxn)?; | ||||
|         word_level_position_docids.clear(self.wtxn)?; | ||||
|         word_prefix_level_position_docids.clear(self.wtxn)?; | ||||
|         facet_field_id_value_docids.clear(self.wtxn)?; | ||||
|         field_id_docid_facet_values.clear(self.wtxn)?; | ||||
|         documents.clear(self.wtxn)?; | ||||
|   | ||||
| @@ -88,6 +88,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|             docid_word_positions, | ||||
|             word_pair_proximity_docids, | ||||
|             word_prefix_pair_proximity_docids, | ||||
|             word_level_position_docids, | ||||
|             word_prefix_level_position_docids, | ||||
|             facet_field_id_value_docids, | ||||
|             field_id_docid_facet_values, | ||||
|             documents, | ||||
| @@ -329,6 +331,36 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|  | ||||
|         drop(iter); | ||||
|  | ||||
|         // We delete the documents ids that are under the word level position docids. | ||||
|         let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); | ||||
|         while let Some(result) = iter.next() { | ||||
|             let (bytes, mut docids) = result?; | ||||
|             let previous_len = docids.len(); | ||||
|             docids.difference_with(&self.documents_ids); | ||||
|             if docids.is_empty() { | ||||
|                 iter.del_current()?; | ||||
|             } else if docids.len() != previous_len { | ||||
|                 iter.put_current(bytes, &docids)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         drop(iter); | ||||
|  | ||||
|         // We delete the documents ids that are under the word prefix level position docids. | ||||
|         let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>(); | ||||
|         while let Some(result) = iter.next() { | ||||
|             let (bytes, mut docids) = result?; | ||||
|             let previous_len = docids.len(); | ||||
|             docids.difference_with(&self.documents_ids); | ||||
|             if docids.is_empty() { | ||||
|                 iter.del_current()?; | ||||
|             } else if docids.len() != previous_len { | ||||
|                 iter.put_current(bytes, &docids)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         drop(iter); | ||||
|  | ||||
|         Ok(self.documents_ids.len()) | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -52,6 +52,14 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) - | ||||
|     cbo_roaring_bitmap_merge(values) | ||||
| } | ||||
|  | ||||
| pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { | ||||
|     cbo_roaring_bitmap_merge(values) | ||||
| } | ||||
|  | ||||
| pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { | ||||
|     cbo_roaring_bitmap_merge(values) | ||||
| } | ||||
|  | ||||
| pub fn facet_field_value_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> { | ||||
|     cbo_roaring_bitmap_merge(values) | ||||
| } | ||||
|   | ||||
| @@ -2,7 +2,8 @@ use std::borrow::Cow; | ||||
| use std::collections::HashSet; | ||||
| use std::fs::File; | ||||
| use std::io::{self, Seek, SeekFrom}; | ||||
| use std::num::NonZeroUsize; | ||||
| use std::num::{NonZeroU32, NonZeroUsize}; | ||||
| use std::str; | ||||
| use std::sync::mpsc::sync_channel; | ||||
| use std::time::Instant; | ||||
|  | ||||
| @@ -13,17 +14,21 @@ use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionTy | ||||
| use heed::types::ByteSlice; | ||||
| use log::{debug, info, error}; | ||||
| use memmap::Mmap; | ||||
| use rayon::ThreadPool; | ||||
| use rayon::prelude::*; | ||||
| use rayon::ThreadPool; | ||||
| use serde::{Serialize, Deserialize}; | ||||
|  | ||||
| use crate::index::Index; | ||||
| use crate::update::{Facets, WordsPrefixes, UpdateIndexingStep}; | ||||
| use crate::update::{ | ||||
|     Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep, | ||||
|     WordPrefixPairProximityDocids, | ||||
| }; | ||||
| use self::store::{Store, Readers}; | ||||
| pub use self::merge_function::{ | ||||
|     main_merge, word_docids_merge, words_pairs_proximities_docids_merge, | ||||
|     docid_word_positions_merge, documents_merge, facet_field_value_docids_merge, | ||||
|     field_id_docid_facet_values_merge, | ||||
|     docid_word_positions_merge, documents_merge, | ||||
|     word_level_position_docids_merge, word_prefix_level_positions_docids_merge, | ||||
|     facet_field_value_docids_merge, field_id_docid_facet_values_merge, | ||||
| }; | ||||
| pub use self::transform::{Transform, TransformOutput}; | ||||
|  | ||||
| @@ -262,6 +267,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|     facet_min_level_size: Option<NonZeroUsize>, | ||||
|     words_prefix_threshold: Option<f64>, | ||||
|     max_prefix_length: Option<usize>, | ||||
|     words_positions_level_group_size: Option<NonZeroU32>, | ||||
|     words_positions_min_level_size: Option<NonZeroU32>, | ||||
|     update_method: IndexDocumentsMethod, | ||||
|     update_format: UpdateFormat, | ||||
|     autogenerate_docids: bool, | ||||
| @@ -289,6 +296,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             facet_min_level_size: None, | ||||
|             words_prefix_threshold: None, | ||||
|             max_prefix_length: None, | ||||
|             words_positions_level_group_size: None, | ||||
|             words_positions_min_level_size: None, | ||||
|             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||
|             update_format: UpdateFormat::Json, | ||||
|             autogenerate_docids: true, | ||||
| @@ -402,6 +411,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         enum DatabaseType { | ||||
|             Main, | ||||
|             WordDocids, | ||||
|             WordLevel0PositionDocids, | ||||
|             FacetLevel0ValuesDocids, | ||||
|         } | ||||
|  | ||||
| @@ -467,6 +477,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|             let mut word_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut docid_word_positions_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut words_pairs_proximities_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut word_level_position_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut facet_field_value_docids_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut field_id_docid_facet_values_readers = Vec::with_capacity(readers.len()); | ||||
|             let mut documents_readers = Vec::with_capacity(readers.len()); | ||||
| @@ -476,6 +487,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                     word_docids, | ||||
|                     docid_word_positions, | ||||
|                     words_pairs_proximities_docids, | ||||
|                     word_level_position_docids, | ||||
|                     facet_field_value_docids, | ||||
|                     field_id_docid_facet_values, | ||||
|                     documents | ||||
| @@ -484,6 +496,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                 word_docids_readers.push(word_docids); | ||||
|                 docid_word_positions_readers.push(docid_word_positions); | ||||
|                 words_pairs_proximities_docids_readers.push(words_pairs_proximities_docids); | ||||
|                 word_level_position_docids_readers.push(word_level_position_docids); | ||||
|                 facet_field_value_docids_readers.push(facet_field_value_docids); | ||||
|                 field_id_docid_facet_values_readers.push(field_id_docid_facet_values); | ||||
|                 documents_readers.push(documents); | ||||
| @@ -514,6 +527,11 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                         facet_field_value_docids_readers, | ||||
|                         facet_field_value_docids_merge, | ||||
|                     ), | ||||
|                     ( | ||||
|                         DatabaseType::WordLevel0PositionDocids, | ||||
|                         word_level_position_docids_readers, | ||||
|                         word_level_position_docids_merge, | ||||
|                     ), | ||||
|                 ] | ||||
|                 .into_par_iter() | ||||
|                 .for_each(|(dbtype, readers, merge)| { | ||||
| @@ -569,7 +587,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         self.index.put_documents_ids(self.wtxn, &documents_ids)?; | ||||
|  | ||||
|         let mut database_count = 0; | ||||
|         let total_databases = 7; | ||||
|         let total_databases = 8; | ||||
|  | ||||
|         progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase { | ||||
|             databases_seen: 0, | ||||
| @@ -661,7 +679,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                     )?; | ||||
|                 }, | ||||
|                 DatabaseType::FacetLevel0ValuesDocids => { | ||||
|                     debug!("Writing the facet values docids into LMDB on disk..."); | ||||
|                     debug!("Writing the facet level 0 values docids into LMDB on disk..."); | ||||
|                     let db = *self.index.facet_field_id_value_docids.as_polymorph(); | ||||
|                     write_into_lmdb_database( | ||||
|                         self.wtxn, | ||||
| @@ -671,6 +689,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|                         write_method, | ||||
|                     )?; | ||||
|                 }, | ||||
|                 DatabaseType::WordLevel0PositionDocids => { | ||||
|                     debug!("Writing the word level 0 positions docids into LMDB on disk..."); | ||||
|                     let db = *self.index.word_level_position_docids.as_polymorph(); | ||||
|                     write_into_lmdb_database( | ||||
|                         self.wtxn, | ||||
|                         db, | ||||
|                         content, | ||||
|                         word_level_position_docids_merge, | ||||
|                         write_method, | ||||
|                     )?; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             database_count += 1; | ||||
| @@ -694,10 +723,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         builder.execute()?; | ||||
|  | ||||
|         // Run the words prefixes update operation. | ||||
|         let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id); | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id); | ||||
|         if let Some(value) = self.words_prefix_threshold { | ||||
|             builder.threshold(value); | ||||
|         } | ||||
| @@ -706,6 +732,37 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | ||||
|         } | ||||
|         builder.execute()?; | ||||
|  | ||||
|         // Run the word prefix docids update operation. | ||||
|         let mut builder = WordPrefixDocids::new(self.wtxn, self.index); | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         builder.max_nb_chunks = self.max_nb_chunks; | ||||
|         builder.max_memory = self.max_memory; | ||||
|         builder.execute()?; | ||||
|  | ||||
|         // Run the word prefix pair proximity docids update operation. | ||||
|         let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         builder.max_nb_chunks = self.max_nb_chunks; | ||||
|         builder.max_memory = self.max_memory; | ||||
|         builder.execute()?; | ||||
|  | ||||
|         // Run the words level positions update operation. | ||||
|         let mut builder = WordsLevelPositions::new(self.wtxn, self.index); | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|         if let Some(value) = self.words_positions_level_group_size { | ||||
|             builder.level_group_size(value); | ||||
|         } | ||||
|         if let Some(value) = self.words_positions_min_level_size { | ||||
|             builder.min_level_size(value); | ||||
|         } | ||||
|         builder.execute()?; | ||||
|  | ||||
|         debug_assert_eq!(database_count, total_databases); | ||||
|  | ||||
|         info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); | ||||
|   | ||||
| @@ -29,7 +29,8 @@ use crate::{json_to_string, SmallVec8, SmallVec32, Position, DocumentId, FieldId | ||||
| use super::{MergeFn, create_writer, create_sorter, writer_into_reader}; | ||||
| use super::merge_function::{ | ||||
|     main_merge, word_docids_merge, words_pairs_proximities_docids_merge, | ||||
|     facet_field_value_docids_merge, field_id_docid_facet_values_merge, | ||||
|     word_level_position_docids_merge, facet_field_value_docids_merge, | ||||
|     field_id_docid_facet_values_merge, | ||||
| }; | ||||
|  | ||||
| const LMDB_MAX_KEY_LENGTH: usize = 511; | ||||
| @@ -43,6 +44,7 @@ pub struct Readers { | ||||
|     pub word_docids: Reader<FileFuse>, | ||||
|     pub docid_word_positions: Reader<FileFuse>, | ||||
|     pub words_pairs_proximities_docids: Reader<FileFuse>, | ||||
|     pub word_level_position_docids: Reader<FileFuse>, | ||||
|     pub facet_field_value_docids: Reader<FileFuse>, | ||||
|     pub field_id_docid_facet_values: Reader<FileFuse>, | ||||
|     pub documents: Reader<FileFuse>, | ||||
| @@ -69,6 +71,7 @@ pub struct Store<'s, A> { | ||||
|     main_sorter: Sorter<MergeFn>, | ||||
|     word_docids_sorter: Sorter<MergeFn>, | ||||
|     words_pairs_proximities_docids_sorter: Sorter<MergeFn>, | ||||
|     word_level_position_docids_sorter: Sorter<MergeFn>, | ||||
|     facet_field_value_docids_sorter: Sorter<MergeFn>, | ||||
|     field_id_docid_facet_values_sorter: Sorter<MergeFn>, | ||||
|     // MTBL writers | ||||
| @@ -94,7 +97,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|     ) -> anyhow::Result<Self> | ||||
|     { | ||||
|         // We divide the max memory by the number of sorter the Store have. | ||||
|         let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 4)); | ||||
|         let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5)); | ||||
|         let linked_hash_map_size = linked_hash_map_size.unwrap_or(500); | ||||
|  | ||||
|         let main_sorter = create_sorter( | ||||
| @@ -121,6 +124,14 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let word_level_position_docids_sorter = create_sorter( | ||||
|             word_level_position_docids_merge, | ||||
|             chunk_compression_type, | ||||
|             chunk_compression_level, | ||||
|             chunk_fusing_shrink_size, | ||||
|             max_nb_chunks, | ||||
|             max_memory, | ||||
|         ); | ||||
|         let facet_field_value_docids_sorter = create_sorter( | ||||
|             facet_field_value_docids_merge, | ||||
|             chunk_compression_type, | ||||
| @@ -172,6 +183,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             main_sorter, | ||||
|             word_docids_sorter, | ||||
|             words_pairs_proximities_docids_sorter, | ||||
|             word_level_position_docids_sorter, | ||||
|             facet_field_value_docids_sorter, | ||||
|             field_id_docid_facet_values_sorter, | ||||
|             // MTBL writers | ||||
| @@ -290,6 +302,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|  | ||||
|         self.documents_writer.insert(document_id.to_be_bytes(), record)?; | ||||
|         Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; | ||||
|         Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?; | ||||
|  | ||||
|         words_positions.clear(); | ||||
|  | ||||
| @@ -360,6 +373,42 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_word_position_docids( | ||||
|         writer: &mut Sorter<MergeFn>, | ||||
|         document_id: DocumentId, | ||||
|         words_positions: &HashMap<String, SmallVec32<Position>>, | ||||
|     ) -> anyhow::Result<()> | ||||
|     { | ||||
|         let mut key_buffer = Vec::new(); | ||||
|         let mut data_buffer = Vec::new(); | ||||
|  | ||||
|         for (word, positions) in words_positions { | ||||
|             key_buffer.clear(); | ||||
|             key_buffer.extend_from_slice(word.as_bytes()); | ||||
|             key_buffer.push(0); // level 0 | ||||
|  | ||||
|             for position in positions { | ||||
|                 key_buffer.truncate(word.len() + 1); | ||||
|                 let position_bytes = position.to_be_bytes(); | ||||
|                 key_buffer.extend_from_slice(position_bytes.as_bytes()); | ||||
|                 key_buffer.extend_from_slice(position_bytes.as_bytes()); | ||||
|  | ||||
|                 data_buffer.clear(); | ||||
|                 let positions = RoaringBitmap::from_iter(Some(document_id)); | ||||
|                 // We serialize the positions into a buffer. | ||||
|                 CboRoaringBitmapCodec::serialize_into(&positions, &mut data_buffer) | ||||
|                     .with_context(|| "could not serialize positions")?; | ||||
|  | ||||
|                 // that we write under the generated key into MTBL | ||||
|                 if lmdb_key_valid_size(&key_buffer) { | ||||
|                     writer.insert(&key_buffer, &data_buffer)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_facet_field_value_docids<I>( | ||||
|         sorter: &mut Sorter<MergeFn>, | ||||
|         iter: I, | ||||
| @@ -561,6 +610,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?; | ||||
|  | ||||
|         let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?; | ||||
|  | ||||
|         let mut facet_field_value_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?; | ||||
|         self.facet_field_value_docids_sorter.write_into(&mut facet_field_value_docids_wtr)?; | ||||
|  | ||||
| @@ -570,6 +622,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|         let main = writer_into_reader(main_wtr, shrink_size)?; | ||||
|         let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?; | ||||
|         let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?; | ||||
|         let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?; | ||||
|         let facet_field_value_docids = writer_into_reader(facet_field_value_docids_wtr, shrink_size)?; | ||||
|         let field_id_docid_facet_values = writer_into_reader(field_id_docid_facet_values_wtr, shrink_size)?; | ||||
|         let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?; | ||||
| @@ -580,6 +633,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { | ||||
|             word_docids, | ||||
|             docid_word_positions, | ||||
|             words_pairs_proximities_docids, | ||||
|             word_level_position_docids, | ||||
|             facet_field_value_docids, | ||||
|             field_id_docid_facet_values, | ||||
|             documents, | ||||
|   | ||||
| @@ -6,7 +6,10 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc | ||||
| pub use self::settings::{Setting, Settings}; | ||||
| pub use self::update_builder::UpdateBuilder; | ||||
| pub use self::update_step::UpdateIndexingStep; | ||||
| pub use self::words_prefixes::WordsPrefixes; | ||||
| pub use self::word_prefix_docids::WordPrefixDocids; | ||||
| pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; | ||||
| pub use self::words_level_positions::WordsLevelPositions; | ||||
| pub use self::words_prefixes_fst::WordsPrefixesFst; | ||||
|  | ||||
| mod available_documents_ids; | ||||
| mod clear_documents; | ||||
| @@ -16,5 +19,7 @@ mod index_documents; | ||||
| mod settings; | ||||
| mod update_builder; | ||||
| mod update_step; | ||||
| mod words_prefixes; | ||||
|  | ||||
| mod word_prefix_docids; | ||||
| mod word_prefix_pair_proximity_docids; | ||||
| mod words_level_positions; | ||||
| mod words_prefixes_fst; | ||||
|   | ||||
| @@ -2,7 +2,7 @@ use grenad::CompressionType; | ||||
| use rayon::ThreadPool; | ||||
|  | ||||
| use crate::Index; | ||||
| use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets, WordsPrefixes}; | ||||
| use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; | ||||
|  | ||||
| pub struct UpdateBuilder<'a> { | ||||
|     pub(crate) log_every_n: Option<usize>, | ||||
| @@ -135,19 +135,4 @@ impl<'a> UpdateBuilder<'a> { | ||||
|  | ||||
|         builder | ||||
|     } | ||||
|  | ||||
|     pub fn words_prefixes<'t, 'u, 'i>( | ||||
|         self, | ||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|         index: &'i Index, | ||||
|     ) -> WordsPrefixes<'t, 'u, 'i> | ||||
|     { | ||||
|         let mut builder = WordsPrefixes::new(wtxn, index, self.update_id); | ||||
|  | ||||
|         builder.chunk_compression_type = self.chunk_compression_type; | ||||
|         builder.chunk_compression_level = self.chunk_compression_level; | ||||
|         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||
|  | ||||
|         builder | ||||
|     } | ||||
| } | ||||
|   | ||||
							
								
								
									
										75
									
								
								milli/src/update/word_prefix_docids.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										75
									
								
								milli/src/update/word_prefix_docids.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,75 @@ | ||||
| use std::str; | ||||
|  | ||||
| use crate::Index; | ||||
| use fst::Streamer; | ||||
| use grenad::CompressionType; | ||||
| use heed::types::ByteSlice; | ||||
|  | ||||
| use crate::update::index_documents::WriteMethod; | ||||
| use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database}; | ||||
|  | ||||
| pub struct WordPrefixDocids<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
| } | ||||
|  | ||||
| impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { | ||||
|     pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> { | ||||
|         WordPrefixDocids { | ||||
|             wtxn, | ||||
|             index, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn execute(self) -> anyhow::Result<()> { | ||||
|         // Clear the word prefix docids database. | ||||
|         self.index.word_prefix_docids.clear(self.wtxn)?; | ||||
|  | ||||
|         let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; | ||||
|  | ||||
|         // It is forbidden to keep a mutable reference into the database | ||||
|         // and write into it at the same time, therefore we write into another file. | ||||
|         let mut prefix_docids_sorter = create_sorter( | ||||
|             word_docids_merge, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
|  | ||||
|         // We iterate over all the prefixes and retrieve the corresponding docids. | ||||
|         let mut prefix_stream = prefix_fst.stream(); | ||||
|         while let Some(bytes) = prefix_stream.next() { | ||||
|             let prefix = str::from_utf8(bytes)?; | ||||
|             let db = self.index.word_docids.remap_data_type::<ByteSlice>(); | ||||
|             for result in db.prefix_iter(self.wtxn, prefix)? { | ||||
|                 let (_word, data) = result?; | ||||
|                 prefix_docids_sorter.insert(prefix, data)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         drop(prefix_fst); | ||||
|  | ||||
|         // We finally write the word prefix docids into the LMDB database. | ||||
|         sorter_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.word_prefix_docids.as_polymorph(), | ||||
|             prefix_docids_sorter, | ||||
|             word_docids_merge, | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										89
									
								
								milli/src/update/word_prefix_pair_proximity_docids.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										89
									
								
								milli/src/update/word_prefix_pair_proximity_docids.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,89 @@ | ||||
| use std::str; | ||||
|  | ||||
| use fst::automaton::{Automaton, Str}; | ||||
| use fst::{Streamer, IntoStreamer}; | ||||
| use grenad::CompressionType; | ||||
| use heed::BytesEncode; | ||||
| use heed::types::ByteSlice; | ||||
| use log::debug; | ||||
|  | ||||
| use crate::Index; | ||||
| use crate::heed_codec::StrStrU8Codec; | ||||
| use crate::update::index_documents::{ | ||||
|     WriteMethod, create_sorter, sorter_into_lmdb_database, | ||||
|     words_pairs_proximities_docids_merge, | ||||
| }; | ||||
|  | ||||
| pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
| } | ||||
|  | ||||
| impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { | ||||
|     pub fn new( | ||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|         index: &'i Index, | ||||
|     ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> | ||||
|     { | ||||
|         WordPrefixPairProximityDocids { | ||||
|             wtxn, | ||||
|             index, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn execute(self) -> anyhow::Result<()> { | ||||
|         debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); | ||||
|  | ||||
|         self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; | ||||
|  | ||||
|         let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; | ||||
|  | ||||
|         // Here we create a sorter akin to the previous one. | ||||
|         let mut word_prefix_pair_proximity_docids_sorter = create_sorter( | ||||
|             words_pairs_proximities_docids_merge, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
|  | ||||
|         // We insert all the word pairs corresponding to the word-prefix pairs | ||||
|         // where the prefixes appears in the prefix FST previously constructed. | ||||
|         let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(); | ||||
|         for result in db.iter(self.wtxn)? { | ||||
|             let ((word1, word2, prox), data) = result?; | ||||
|             let automaton = Str::new(word2).starts_with(); | ||||
|             let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); | ||||
|             while let Some(prefix) = matching_prefixes.next() { | ||||
|                 let prefix = str::from_utf8(prefix)?; | ||||
|                 let pair = (word1, prefix, prox); | ||||
|                 let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); | ||||
|                 word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         drop(prefix_fst); | ||||
|  | ||||
|         // We finally write the word prefix pair proximity docids into the LMDB database. | ||||
|         sorter_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.word_prefix_pair_proximity_docids.as_polymorph(), | ||||
|             word_prefix_pair_proximity_docids_sorter, | ||||
|             words_pairs_proximities_docids_merge, | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										261
									
								
								milli/src/update/words_level_positions.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										261
									
								
								milli/src/update/words_level_positions.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,261 @@ | ||||
| use std::{cmp, str}; | ||||
| use std::convert::TryFrom; | ||||
| use std::fs::File; | ||||
| use std::num::NonZeroU32; | ||||
|  | ||||
| use fst::automaton::{self, Automaton}; | ||||
| use fst::{Streamer, IntoStreamer}; | ||||
| use grenad::{CompressionType, Reader, Writer, FileFuse}; | ||||
| use heed::types::{ByteSlice, DecodeIgnore, Str}; | ||||
| use heed::{BytesEncode, Error}; | ||||
| use log::debug; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec}; | ||||
| use crate::update::index_documents::WriteMethod; | ||||
| use crate::update::index_documents::{ | ||||
|     create_writer, create_sorter, writer_into_reader, write_into_lmdb_database, | ||||
|     word_prefix_level_positions_docids_merge, sorter_into_lmdb_database | ||||
| }; | ||||
| use crate::{Index, TreeLevel}; | ||||
|  | ||||
| pub struct WordsLevelPositions<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
|     level_group_size: NonZeroU32, | ||||
|     min_level_size: NonZeroU32, | ||||
| } | ||||
|  | ||||
| impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||
|     pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> { | ||||
|         WordsLevelPositions { | ||||
|             wtxn, | ||||
|             index, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|             level_group_size: NonZeroU32::new(4).unwrap(), | ||||
|             min_level_size: NonZeroU32::new(5).unwrap(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn level_group_size(&mut self, value: NonZeroU32) -> &mut Self { | ||||
|         self.level_group_size = NonZeroU32::new(cmp::max(value.get(), 2)).unwrap(); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn min_level_size(&mut self, value: NonZeroU32) -> &mut Self { | ||||
|         self.min_level_size = value; | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn execute(self) -> anyhow::Result<()> { | ||||
|         debug!("Computing and writing the word levels positions docids into LMDB on disk..."); | ||||
|  | ||||
|         let entries = compute_positions_levels( | ||||
|             self.wtxn, | ||||
|             self.index.word_docids.remap_data_type::<DecodeIgnore>(), | ||||
|             self.index.word_level_position_docids, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.level_group_size, | ||||
|             self.min_level_size, | ||||
|         )?; | ||||
|  | ||||
|         // The previously computed entries also defines the level 0 entries | ||||
|         // so we can clear the database and append all of these entries. | ||||
|         self.index.word_level_position_docids.clear(self.wtxn)?; | ||||
|  | ||||
|         write_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.word_level_position_docids.as_polymorph(), | ||||
|             entries, | ||||
|             |_, _| anyhow::bail!("invalid word level position merging"), | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
|         // We compute the word prefix level positions database. | ||||
|         self.index.word_prefix_level_position_docids.clear(self.wtxn)?; | ||||
|  | ||||
|         let mut word_prefix_level_positions_docids_sorter = create_sorter( | ||||
|             word_prefix_level_positions_docids_merge, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
|  | ||||
|         // We insert the word prefix level positions where the level is equal to 0 and | ||||
|         // corresponds to the word-prefix level positions where the prefixes appears | ||||
|         // in the prefix FST previously constructed. | ||||
|         let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; | ||||
|         let db = self.index.word_level_position_docids.remap_data_type::<ByteSlice>(); | ||||
|         for result in db.iter(self.wtxn)? { | ||||
|             let ((word, level, left, right), data) = result?; | ||||
|             if level == TreeLevel::min_value() { | ||||
|                 let automaton = automaton::Str::new(word).starts_with(); | ||||
|                 let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); | ||||
|                 while let Some(prefix) = matching_prefixes.next() { | ||||
|                     let prefix = str::from_utf8(prefix)?; | ||||
|                     let key = (prefix, level, left, right); | ||||
|                     let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap(); | ||||
|                     word_prefix_level_positions_docids_sorter.insert(bytes, data)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // We finally write all the word prefix level positions docids with | ||||
|         // a level equal to 0 into the LMDB database. | ||||
|         sorter_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.word_prefix_level_position_docids.as_polymorph(), | ||||
|             word_prefix_level_positions_docids_sorter, | ||||
|             word_prefix_level_positions_docids_merge, | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
|         let entries = compute_positions_levels( | ||||
|             self.wtxn, | ||||
|             self.index.word_prefix_docids.remap_data_type::<DecodeIgnore>(), | ||||
|             self.index.word_prefix_level_position_docids, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.level_group_size, | ||||
|             self.min_level_size, | ||||
|         )?; | ||||
|  | ||||
|         // The previously computed entries also defines the level 0 entries | ||||
|         // so we can clear the database and append all of these entries. | ||||
|         self.index.word_prefix_level_position_docids.clear(self.wtxn)?; | ||||
|  | ||||
|         write_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.word_prefix_level_position_docids.as_polymorph(), | ||||
|             entries, | ||||
|             |_, _| anyhow::bail!("invalid word prefix level position merging"), | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Returns the next number after or equal to `x` that is divisible by `d`. | ||||
| fn next_divisible(x: u32, d: u32) -> u32 { | ||||
|     (x.saturating_sub(1) | (d - 1)) + 1 | ||||
| } | ||||
|  | ||||
| /// Returns the previous number after or equal to `x` that is divisible by `d`, | ||||
| /// saturates on zero. | ||||
| fn previous_divisible(x: u32, d: u32) -> u32 { | ||||
|     match x.checked_sub(d - 1) { | ||||
|         Some(0) | None => 0, | ||||
|         Some(x) => next_divisible(x, d), | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Generates all the words positions levels based on the levels zero (including the level zero). | ||||
| fn compute_positions_levels( | ||||
|     rtxn: &heed::RoTxn, | ||||
|     words_db: heed::Database<Str, DecodeIgnore>, | ||||
|     words_positions_db: heed::Database<StrLevelPositionCodec, CboRoaringBitmapCodec>, | ||||
|     compression_type: CompressionType, | ||||
|     compression_level: Option<u32>, | ||||
|     shrink_size: Option<u64>, | ||||
|     level_group_size: NonZeroU32, | ||||
|     min_level_size: NonZeroU32, | ||||
| ) -> anyhow::Result<Reader<FileFuse>> | ||||
| { | ||||
|     // It is forbidden to keep a cursor and write in a database at the same time with LMDB | ||||
|     // therefore we write the facet levels entries into a grenad file before transfering them. | ||||
|     let mut writer = tempfile::tempfile().and_then(|file| { | ||||
|         create_writer(compression_type, compression_level, file) | ||||
|     })?; | ||||
|  | ||||
|     for result in words_db.iter(rtxn)? { | ||||
|         let (word, ()) = result?; | ||||
|  | ||||
|         let level_0_range = { | ||||
|             let left = (word, TreeLevel::min_value(), u32::min_value(), u32::min_value()); | ||||
|             let right = (word, TreeLevel::min_value(), u32::max_value(), u32::max_value()); | ||||
|             left..=right | ||||
|         }; | ||||
|  | ||||
|         let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>() | ||||
|             .range(rtxn, &level_0_range)? | ||||
|             .fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?; | ||||
|  | ||||
|         // Groups sizes are always a power of the original level_group_size and therefore a group | ||||
|         // always maps groups of the previous level and never splits previous levels groups in half. | ||||
|         let group_size_iter = (1u8..) | ||||
|             .map(|l| (TreeLevel::try_from(l).unwrap(), level_group_size.get().pow(l as u32))) | ||||
|             .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); | ||||
|  | ||||
|         // As specified in the documentation, we also write the level 0 entries. | ||||
|         for result in words_positions_db.range(rtxn, &level_0_range)? { | ||||
|             let ((word, level, left, right), docids) = result?; | ||||
|             write_level_entry(&mut writer, word, level, left, right, &docids)?; | ||||
|         } | ||||
|  | ||||
|         for (level, group_size) in group_size_iter { | ||||
|             let mut left = 0; | ||||
|             let mut right = 0; | ||||
|             let mut group_docids = RoaringBitmap::new(); | ||||
|  | ||||
|             for (i, result) in words_positions_db.range(rtxn, &level_0_range)?.enumerate() { | ||||
|                 let ((_word, _level, value, _right), docids) = result?; | ||||
|  | ||||
|                 if i == 0 { | ||||
|                     left = previous_divisible(value, group_size); | ||||
|                     right = left + (group_size - 1); | ||||
|                 } | ||||
|  | ||||
|                 if value > right { | ||||
|                     // we found the first bound of the next group, we must store the left | ||||
|                     // and right bounds associated with the docids. | ||||
|                     write_level_entry(&mut writer, word, level, left, right, &group_docids)?; | ||||
|  | ||||
|                     // We save the left bound for the new group and also reset the docids. | ||||
|                     group_docids = RoaringBitmap::new(); | ||||
|                     left = previous_divisible(value, group_size); | ||||
|                     right = left + (group_size - 1); | ||||
|                 } | ||||
|  | ||||
|                 // The right bound is always the bound we run through. | ||||
|                 group_docids.union_with(&docids); | ||||
|             } | ||||
|  | ||||
|             if !group_docids.is_empty() { | ||||
|                 write_level_entry(&mut writer, word, level, left, right, &group_docids)?; | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     writer_into_reader(writer, shrink_size) | ||||
| } | ||||
|  | ||||
| fn write_level_entry( | ||||
|     writer: &mut Writer<File>, | ||||
|     word: &str, | ||||
|     level: TreeLevel, | ||||
|     left: u32, | ||||
|     right: u32, | ||||
|     ids: &RoaringBitmap, | ||||
| ) -> anyhow::Result<()> | ||||
| { | ||||
|     let key = (word, level, left, right); | ||||
|     let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?; | ||||
|     let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; | ||||
|     writer.insert(&key, &data)?; | ||||
|     Ok(()) | ||||
| } | ||||
| @@ -1,196 +0,0 @@ | ||||
| use std::iter::FromIterator; | ||||
| use std::str; | ||||
|  | ||||
| use chrono::Utc; | ||||
| use fst::automaton::Str; | ||||
| use fst::{Automaton, Streamer, IntoStreamer}; | ||||
| use grenad::CompressionType; | ||||
| use heed::BytesEncode; | ||||
| use heed::types::ByteSlice; | ||||
|  | ||||
| use crate::heed_codec::StrStrU8Codec; | ||||
| use crate::update::index_documents::WriteMethod; | ||||
| use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database}; | ||||
| use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge}; | ||||
| use crate::{Index, SmallString32}; | ||||
|  | ||||
| pub struct WordsPrefixes<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     pub(crate) chunk_compression_type: CompressionType, | ||||
|     pub(crate) chunk_compression_level: Option<u32>, | ||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||
|     pub(crate) max_nb_chunks: Option<usize>, | ||||
|     pub(crate) max_memory: Option<usize>, | ||||
|     threshold: f64, | ||||
|     max_prefix_length: usize, | ||||
|     _update_id: u64, | ||||
| } | ||||
|  | ||||
| impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> { | ||||
|     pub fn new( | ||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|         index: &'i Index, | ||||
|         update_id: u64, | ||||
|     ) -> WordsPrefixes<'t, 'u, 'i> | ||||
|     { | ||||
|         WordsPrefixes { | ||||
|             wtxn, | ||||
|             index, | ||||
|             chunk_compression_type: CompressionType::None, | ||||
|             chunk_compression_level: None, | ||||
|             chunk_fusing_shrink_size: None, | ||||
|             max_nb_chunks: None, | ||||
|             max_memory: None, | ||||
|             threshold: 0.1 / 100.0, // .01% | ||||
|             max_prefix_length: 4, | ||||
|             _update_id: update_id, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Set the ratio of concerned words required to make a prefix be part of the words prefixes | ||||
|     /// database. If a word prefix is supposed to match more than this number of words in the | ||||
|     /// dictionnary, therefore this prefix is added to the words prefixes datastructures. | ||||
|     /// | ||||
|     /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped | ||||
|     /// to these bounds otherwise. | ||||
|     pub fn threshold(&mut self, value: f64) -> &mut Self { | ||||
|         self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     /// Set the maximum length of prefixes in bytes. | ||||
|     /// | ||||
|     /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped | ||||
|     /// to these bounds, otherwise. | ||||
|     pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { | ||||
|         self.max_prefix_length = value.min(25).max(1); // clamp [1, 25] | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn execute(self) -> anyhow::Result<()> { | ||||
|         self.index.set_updated_at(self.wtxn, &Utc::now())?; | ||||
|         // Clear the words prefixes datastructures. | ||||
|         self.index.word_prefix_docids.clear(self.wtxn)?; | ||||
|         self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?; | ||||
|  | ||||
|         let words_fst = self.index.words_fst(&self.wtxn)?; | ||||
|         let number_of_words = words_fst.len(); | ||||
|         let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; | ||||
|  | ||||
|         // It is forbidden to keep a mutable reference into the database | ||||
|         // and write into it at the same time, therefore we write into another file. | ||||
|         let mut prefix_docids_sorter = create_sorter( | ||||
|             word_docids_merge, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
|  | ||||
|         let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); | ||||
|         for n in 1..=self.max_prefix_length { | ||||
|  | ||||
|             let mut current_prefix = SmallString32::new(); | ||||
|             let mut current_prefix_count = 0; | ||||
|             let mut builder = fst::SetBuilder::memory(); | ||||
|  | ||||
|             let mut stream = words_fst.stream(); | ||||
|             while let Some(bytes) = stream.next() { | ||||
|                 // We try to get the first n bytes out of this string but we only want | ||||
|                 // to split at valid characters bounds. If we try to split in the middle of | ||||
|                 // a character we ignore this word and go to the next one. | ||||
|                 let word = str::from_utf8(bytes)?; | ||||
|                 let prefix = match word.get(..n) { | ||||
|                     Some(prefix) => prefix, | ||||
|                     None => continue, | ||||
|                 }; | ||||
|  | ||||
|                 // This is the first iteration of the loop, | ||||
|                 // or the current word doesn't starts with the current prefix. | ||||
|                 if current_prefix_count == 0 || prefix != current_prefix.as_str() { | ||||
|                     current_prefix = SmallString32::from(prefix); | ||||
|                     current_prefix_count = 0; | ||||
|                 } | ||||
|  | ||||
|                 current_prefix_count += 1; | ||||
|  | ||||
|                 // There is enough words corresponding to this prefix to add it to the cache. | ||||
|                 if current_prefix_count == min_number_of_words { | ||||
|                     builder.insert(prefix)?; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             // We construct the final set for prefixes of size n. | ||||
|             prefix_fsts.push(builder.into_set()); | ||||
|         } | ||||
|  | ||||
|         // We merge all of the previously computed prefixes into on final set. | ||||
|         let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); | ||||
|         let mut builder = fst::SetBuilder::memory(); | ||||
|         builder.extend_stream(op.r#union())?; | ||||
|         let prefix_fst = builder.into_set(); | ||||
|  | ||||
|         // We iterate over all the prefixes and retrieve the corresponding docids. | ||||
|         let mut prefix_stream = prefix_fst.stream(); | ||||
|         while let Some(bytes) = prefix_stream.next() { | ||||
|             let prefix = str::from_utf8(bytes)?; | ||||
|             let db = self.index.word_docids.remap_data_type::<ByteSlice>(); | ||||
|             for result in db.prefix_iter(self.wtxn, prefix)? { | ||||
|                 let (_word, data) = result?; | ||||
|                 prefix_docids_sorter.insert(prefix, data)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // Set the words prefixes FST in the dtabase. | ||||
|         self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; | ||||
|  | ||||
|         // We finally write the word prefix docids into the LMDB database. | ||||
|         sorter_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.word_prefix_docids.as_polymorph(), | ||||
|             prefix_docids_sorter, | ||||
|             word_docids_merge, | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
|         // We compute the word prefix pair proximity database. | ||||
|  | ||||
|         // Here we create a sorter akin to the previous one. | ||||
|         let mut word_prefix_pair_proximity_docids_sorter = create_sorter( | ||||
|             words_pairs_proximities_docids_merge, | ||||
|             self.chunk_compression_type, | ||||
|             self.chunk_compression_level, | ||||
|             self.chunk_fusing_shrink_size, | ||||
|             self.max_nb_chunks, | ||||
|             self.max_memory, | ||||
|         ); | ||||
|  | ||||
|         // We insert all the word pairs corresponding to the word-prefix pairs | ||||
|         // where the prefixes appears in the prefix FST previously constructed. | ||||
|         let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(); | ||||
|         for result in db.iter(self.wtxn)? { | ||||
|             let ((word1, word2, prox), data) = result?; | ||||
|             let automaton = Str::new(word2).starts_with(); | ||||
|             let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); | ||||
|             while let Some(prefix) = matching_prefixes.next() { | ||||
|                 let prefix = str::from_utf8(prefix)?; | ||||
|                 let pair = (word1, prefix, prox); | ||||
|                 let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap(); | ||||
|                 word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         // We finally write the word prefix pair proximity docids into the LMDB database. | ||||
|         sorter_into_lmdb_database( | ||||
|             self.wtxn, | ||||
|             *self.index.word_prefix_pair_proximity_docids.as_polymorph(), | ||||
|             word_prefix_pair_proximity_docids_sorter, | ||||
|             words_pairs_proximities_docids_merge, | ||||
|             WriteMethod::Append, | ||||
|         )?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										104
									
								
								milli/src/update/words_prefixes_fst.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										104
									
								
								milli/src/update/words_prefixes_fst.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,104 @@ | ||||
| use std::iter::FromIterator; | ||||
| use std::str; | ||||
|  | ||||
| use fst::Streamer; | ||||
| use crate::{Index, SmallString32}; | ||||
|  | ||||
| pub struct WordsPrefixesFst<'t, 'u, 'i> { | ||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|     index: &'i Index, | ||||
|     threshold: f64, | ||||
|     max_prefix_length: usize, | ||||
|     _update_id: u64, | ||||
| } | ||||
|  | ||||
| impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> { | ||||
|     pub fn new( | ||||
|         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||
|         index: &'i Index, | ||||
|         update_id: u64, | ||||
|     ) -> WordsPrefixesFst<'t, 'u, 'i> | ||||
|     { | ||||
|         WordsPrefixesFst { | ||||
|             wtxn, | ||||
|             index, | ||||
|             threshold: 0.1 / 100.0, // .01% | ||||
|             max_prefix_length: 4, | ||||
|             _update_id: update_id, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Set the ratio of concerned words required to make a prefix be part of the words prefixes | ||||
|     /// database. If a word prefix is supposed to match more than this number of words in the | ||||
|     /// dictionnary, therefore this prefix is added to the words prefixes datastructures. | ||||
|     /// | ||||
|     /// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped | ||||
|     /// to these bounds otherwise. | ||||
|     pub fn threshold(&mut self, value: f64) -> &mut Self { | ||||
|         self.threshold = value.min(1.0).max(0.0); // clamp [0, 1] | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     /// Set the maximum length of prefixes in bytes. | ||||
|     /// | ||||
|     /// Default value is `4` bytes. This value must be between 1 and 25 will be clamped | ||||
|     /// to these bounds, otherwise. | ||||
|     pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { | ||||
|         self.max_prefix_length = value.min(25).max(1); // clamp [1, 25] | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn execute(self) -> anyhow::Result<()> { | ||||
|         let words_fst = self.index.words_fst(&self.wtxn)?; | ||||
|         let number_of_words = words_fst.len(); | ||||
|         let min_number_of_words = (number_of_words as f64 * self.threshold) as usize; | ||||
|  | ||||
|         let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length); | ||||
|         for n in 1..=self.max_prefix_length { | ||||
|  | ||||
|             let mut current_prefix = SmallString32::new(); | ||||
|             let mut current_prefix_count = 0; | ||||
|             let mut builder = fst::SetBuilder::memory(); | ||||
|  | ||||
|             let mut stream = words_fst.stream(); | ||||
|             while let Some(bytes) = stream.next() { | ||||
|                 // We try to get the first n bytes out of this string but we only want | ||||
|                 // to split at valid characters bounds. If we try to split in the middle of | ||||
|                 // a character we ignore this word and go to the next one. | ||||
|                 let word = str::from_utf8(bytes)?; | ||||
|                 let prefix = match word.get(..n) { | ||||
|                     Some(prefix) => prefix, | ||||
|                     None => continue, | ||||
|                 }; | ||||
|  | ||||
|                 // This is the first iteration of the loop, | ||||
|                 // or the current word doesn't starts with the current prefix. | ||||
|                 if current_prefix_count == 0 || prefix != current_prefix.as_str() { | ||||
|                     current_prefix = SmallString32::from(prefix); | ||||
|                     current_prefix_count = 0; | ||||
|                 } | ||||
|  | ||||
|                 current_prefix_count += 1; | ||||
|  | ||||
|                 // There is enough words corresponding to this prefix to add it to the cache. | ||||
|                 if current_prefix_count == min_number_of_words { | ||||
|                     builder.insert(prefix)?; | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             // We construct the final set for prefixes of size n. | ||||
|             prefix_fsts.push(builder.into_set()); | ||||
|         } | ||||
|  | ||||
|         // We merge all of the previously computed prefixes into on final set. | ||||
|         let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter()); | ||||
|         let mut builder = fst::SetBuilder::memory(); | ||||
|         builder.extend_stream(op.r#union())?; | ||||
|         let prefix_fst = builder.into_set(); | ||||
|  | ||||
|         // Set the words prefixes FST in the dtabase. | ||||
|         self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
		Reference in New Issue
	
	Block a user