mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Add exactness ranking rules
This commit is contained in:
		
							
								
								
									
										175
									
								
								milli/src/search/new/exact_attribute.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										175
									
								
								milli/src/search/new/exact_attribute.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,175 @@ | |||||||
|  | use heed::BytesDecode; | ||||||
|  | use roaring::MultiOps; | ||||||
|  |  | ||||||
|  | use super::query_graph::QueryGraph; | ||||||
|  | use super::ranking_rules::{RankingRule, RankingRuleOutput}; | ||||||
|  | use crate::search::new::query_graph::QueryNodeData; | ||||||
|  | use crate::search::new::query_term::ExactTerm; | ||||||
|  | use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; | ||||||
|  |  | ||||||
|  | /// FIXME: | ||||||
|  | /// | ||||||
|  | /// - A lot of work done in next_bucket that start_iteration could do. | ||||||
|  | /// - Consider calling the graph based rule directly from this one. | ||||||
|  | /// - Currently only exact terms are handled; don't forget about prefixes. | ||||||
|  | /// - some tests | ||||||
|  | pub struct ExactAttribute { | ||||||
|  |     query_graph: Option<QueryGraph>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl ExactAttribute { | ||||||
|  |     pub fn new() -> Self { | ||||||
|  |         Self { query_graph: None } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { | ||||||
|  |     fn id(&self) -> String { | ||||||
|  |         "exact_attribute".to_owned() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn start_iteration( | ||||||
|  |         &mut self, | ||||||
|  |         _ctx: &mut SearchContext<'ctx>, | ||||||
|  |         _logger: &mut dyn SearchLogger<QueryGraph>, | ||||||
|  |         _universe: &roaring::RoaringBitmap, | ||||||
|  |         query: &QueryGraph, | ||||||
|  |     ) -> Result<()> { | ||||||
|  |         self.query_graph = Some(query.clone()); | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn next_bucket( | ||||||
|  |         &mut self, | ||||||
|  |         ctx: &mut SearchContext<'ctx>, | ||||||
|  |         _logger: &mut dyn SearchLogger<QueryGraph>, | ||||||
|  |         universe: &roaring::RoaringBitmap, | ||||||
|  |     ) -> Result<Option<RankingRuleOutput<QueryGraph>>> { | ||||||
|  |         // iterate on the nodes of the graph, retain LocatedQueryTermSubset | ||||||
|  |         let query_graph = self.query_graph.as_ref().unwrap(); | ||||||
|  |         let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = | ||||||
|  |             Vec::with_capacity(query_graph.nodes.len() as usize); | ||||||
|  |         for (_, node) in query_graph.nodes.iter() { | ||||||
|  |             match &node.data { | ||||||
|  |                 QueryNodeData::Term(term) => { | ||||||
|  |                     let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { | ||||||
|  |                         exact_term | ||||||
|  |                     } else { | ||||||
|  |                         // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules | ||||||
|  |                         return Ok(Some(RankingRuleOutput { | ||||||
|  |                             query: query_graph.clone(), | ||||||
|  |                             candidates: universe.clone(), | ||||||
|  |                         })); | ||||||
|  |                     }; | ||||||
|  |                     exact_term_position_ids.push(( | ||||||
|  |                         exact_term, | ||||||
|  |                         *term.positions.start(), | ||||||
|  |                         *term.term_ids.start(), | ||||||
|  |                     )) | ||||||
|  |                 } | ||||||
|  |                 QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         exact_term_position_ids.sort_by_key(|(_, _, id)| *id); | ||||||
|  |         // bail if there is a "hole" (missing word) in remaining query graph | ||||||
|  |         let mut previous_id = 0; | ||||||
|  |         for (_, _, id) in exact_term_position_ids.iter().copied() { | ||||||
|  |             if id < previous_id || id - previous_id > 1 { | ||||||
|  |                 // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules | ||||||
|  |                 return Ok(Some(RankingRuleOutput { | ||||||
|  |                     query: query_graph.clone(), | ||||||
|  |                     candidates: universe.clone(), | ||||||
|  |                 })); | ||||||
|  |             } else { | ||||||
|  |                 previous_id = id; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // sample query: "sunflower are pretty" | ||||||
|  |         // sunflower at pos 0 in attr A | ||||||
|  |         // are at pos 1 in attr B | ||||||
|  |         // pretty at pos 2 in attr C | ||||||
|  |         // We want to eliminate such documents | ||||||
|  |  | ||||||
|  |         // first check that for each term, there exists some attribute that has this term at the correct position | ||||||
|  |         //"word-position-docids"; | ||||||
|  |         let mut candidates = universe.clone(); | ||||||
|  |         let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids | ||||||
|  |             .iter() | ||||||
|  |             .copied() | ||||||
|  |             .map(|(term, position, _)| (term.interned_words(ctx).collect(), position)) | ||||||
|  |             .collect(); | ||||||
|  |         for (words, position) in &words_positions { | ||||||
|  |             if candidates.is_empty() { | ||||||
|  |                 // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules | ||||||
|  |                 return Ok(Some(RankingRuleOutput { | ||||||
|  |                     query: query_graph.clone(), | ||||||
|  |                     candidates: universe.clone(), | ||||||
|  |                 })); | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             'words: for (offset, word) in words.iter().enumerate() { | ||||||
|  |                 let offset = offset as u16; | ||||||
|  |                 let word = if let Some(word) = word { | ||||||
|  |                     word | ||||||
|  |                 } else { | ||||||
|  |                     continue 'words; | ||||||
|  |                 }; | ||||||
|  |                 let word_position_docids = CboRoaringBitmapCodec::bytes_decode( | ||||||
|  |                     ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), | ||||||
|  |                 ) | ||||||
|  |                 .unwrap_or_default(); | ||||||
|  |                 candidates &= word_position_docids; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let candidates = candidates; | ||||||
|  |  | ||||||
|  |         if candidates.is_empty() { | ||||||
|  |             // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules | ||||||
|  |             return Ok(Some(RankingRuleOutput { | ||||||
|  |                 query: query_graph.clone(), | ||||||
|  |                 candidates: universe.clone(), | ||||||
|  |             })); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); | ||||||
|  |  | ||||||
|  |         let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); | ||||||
|  |  | ||||||
|  |         // then check that there exists at least one attribute that has all of the terms | ||||||
|  |         for fid in searchable_fields_ids { | ||||||
|  |             let mut intersection = MultiOps::intersection( | ||||||
|  |                 words_positions | ||||||
|  |                     .iter() | ||||||
|  |                     .flat_map(|(words, ..)| words.iter()) | ||||||
|  |                     // ignore stop words in phrases | ||||||
|  |                     .flatten() | ||||||
|  |                     .map(|word| -> Result<_> { | ||||||
|  |                         Ok(ctx | ||||||
|  |                             .get_db_word_fid_docids(*word, fid)? | ||||||
|  |                             .map(CboRoaringBitmapCodec::bytes_decode) | ||||||
|  |                             .unwrap_or_default() | ||||||
|  |                             .unwrap_or_default()) | ||||||
|  |                     }), | ||||||
|  |             )?; | ||||||
|  |             intersection &= &candidates; | ||||||
|  |             if !intersection.is_empty() { | ||||||
|  |                 candidates_per_attributes.push(intersection); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         // Note: we could have "false positives" when a document has both different attributes that collectively | ||||||
|  |         // have the terms in the correct order, and a single attribute that has all the terms but in the incorrect order. | ||||||
|  |  | ||||||
|  |         let candidates = MultiOps::union(candidates_per_attributes.into_iter()); | ||||||
|  |         Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn end_iteration( | ||||||
|  |         &mut self, | ||||||
|  |         _ctx: &mut SearchContext<'ctx>, | ||||||
|  |         _logger: &mut dyn SearchLogger<QueryGraph>, | ||||||
|  |     ) { | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner}; | |||||||
| use super::logger::SearchLogger; | use super::logger::SearchLogger; | ||||||
| use super::query_graph::QueryNode; | use super::query_graph::QueryNode; | ||||||
| use super::ranking_rule_graph::{ | use super::ranking_rule_graph::{ | ||||||
|     ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, |     ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph, | ||||||
|     TypoGraph, |     RankingRuleGraphTrait, TypoGraph, | ||||||
| }; | }; | ||||||
| use super::small_bitmap::SmallBitmap; | use super::small_bitmap::SmallBitmap; | ||||||
| use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; | use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; | ||||||
| @@ -65,6 +65,12 @@ impl GraphBasedRankingRule<TypoGraph> { | |||||||
|         Self::new_with_id("typo".to_owned(), terms_matching_strategy) |         Self::new_with_id("typo".to_owned(), terms_matching_strategy) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | pub type Exactness = GraphBasedRankingRule<ExactnessGraph>; | ||||||
|  | impl GraphBasedRankingRule<ExactnessGraph> { | ||||||
|  |     pub fn new() -> Self { | ||||||
|  |         Self::new_with_id("exactness".to_owned(), None) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| /// A generic graph-based ranking rule | /// A generic graph-based ranking rule | ||||||
| pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> { | pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> { | ||||||
|   | |||||||
| @@ -9,8 +9,9 @@ mod query_term; | |||||||
| mod ranking_rule_graph; | mod ranking_rule_graph; | ||||||
| mod ranking_rules; | mod ranking_rules; | ||||||
| mod resolve_query_graph; | mod resolve_query_graph; | ||||||
| // TODO: documentation + comments |  | ||||||
| mod small_bitmap; | mod small_bitmap; | ||||||
|  |  | ||||||
|  | mod exact_attribute; | ||||||
| // TODO: documentation + comments | // TODO: documentation + comments | ||||||
| // implementation is currently an adaptation of the previous implementation to fit with the new model | // implementation is currently an adaptation of the previous implementation to fit with the new model | ||||||
| mod sort; | mod sort; | ||||||
| @@ -33,6 +34,8 @@ use resolve_query_graph::PhraseDocIdsCache; | |||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use words::Words; | use words::Words; | ||||||
|  |  | ||||||
|  | use self::exact_attribute::ExactAttribute; | ||||||
|  | use self::graph_based_ranking_rule::Exactness; | ||||||
| use self::interner::Interner; | use self::interner::Interner; | ||||||
| use self::ranking_rules::{BoxRankingRule, RankingRule}; | use self::ranking_rules::{BoxRankingRule, RankingRule}; | ||||||
| use self::resolve_query_graph::compute_query_graph_docids; | use self::resolve_query_graph::compute_query_graph_docids; | ||||||
| @@ -150,7 +153,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( | |||||||
|     let mut proximity = false; |     let mut proximity = false; | ||||||
|     let mut sort = false; |     let mut sort = false; | ||||||
|     let attribute = false; |     let attribute = false; | ||||||
|     let exactness = false; |     let mut exactness = false; | ||||||
|     let mut asc = HashSet::new(); |     let mut asc = HashSet::new(); | ||||||
|     let mut desc = HashSet::new(); |     let mut desc = HashSet::new(); | ||||||
|  |  | ||||||
| @@ -211,8 +214,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( | |||||||
|                 if exactness { |                 if exactness { | ||||||
|                     continue; |                     continue; | ||||||
|                 } |                 } | ||||||
|                 // todo!(); |                 ranking_rules.push(Box::new(ExactAttribute::new())); | ||||||
|                 // exactness = false; |                 ranking_rules.push(Box::new(Exactness::new())); | ||||||
|  |                 exactness = true; | ||||||
|             } |             } | ||||||
|             crate::Criterion::Asc(field_name) => { |             crate::Criterion::Asc(field_name) => { | ||||||
|                 if asc.contains(&field_name) { |                 if asc.contains(&field_name) { | ||||||
|   | |||||||
							
								
								
									
										107
									
								
								milli/src/search/new/ranking_rule_graph/exactness/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										107
									
								
								milli/src/search/new/ranking_rule_graph/exactness/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,107 @@ | |||||||
|  | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  | use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; | ||||||
|  | use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; | ||||||
|  | use crate::search::new::query_graph::{QueryGraph, QueryNode}; | ||||||
|  | use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; | ||||||
|  | use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; | ||||||
|  |  | ||||||
|  | /// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other | ||||||
|  | /// word than a doc that matches 9 words non exactly but none exactly | ||||||
|  | /// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider | ||||||
|  | /// | ||||||
|  | /// "naive vision" | ||||||
|  | /// condition from one node to another: | ||||||
|  | /// - word exactly present: cost 0 | ||||||
|  | /// - word typo/ngram/prefix/missing: cost 1, not removed from the query graph, edge between the two nodes, return the universe without condition when resolving, destination query term is inside | ||||||
|  | /// | ||||||
|  | /// Three strategies: | ||||||
|  | /// 1. ExactAttribute: word position / word_fid_docid | ||||||
|  | /// 2. AttributeStart: | ||||||
|  | /// 3. AttributeContainsExact => implementable via `RankingRuleGraphTrait` | ||||||
|  |  | ||||||
|  | #[derive(Clone, PartialEq, Eq, Hash)] | ||||||
|  | pub enum ExactnessCondition { | ||||||
|  |     ExactInAttribute(LocatedQueryTermSubset), | ||||||
|  |     Skip(LocatedQueryTermSubset), | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub enum ExactnessGraph {} | ||||||
|  |  | ||||||
|  | fn compute_docids( | ||||||
|  |     ctx: &mut SearchContext, | ||||||
|  |     dest_node: &LocatedQueryTermSubset, | ||||||
|  |     universe: &RoaringBitmap, | ||||||
|  | ) -> Result<RoaringBitmap> { | ||||||
|  |     let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) { | ||||||
|  |         exact_term | ||||||
|  |     } else { | ||||||
|  |         return Ok(Default::default()); | ||||||
|  |     }; | ||||||
|  |     let mut candidates = match exact_term { | ||||||
|  |         ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), | ||||||
|  |         ExactTerm::Word(word) => { | ||||||
|  |             if let Some(word_candidates) = ctx.get_db_word_docids(word)? { | ||||||
|  |                 CboRoaringBitmapCodec::deserialize_from(word_candidates)? | ||||||
|  |             } else { | ||||||
|  |                 return Ok(Default::default()); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     }; | ||||||
|  |     // TODO: synonyms? | ||||||
|  |     candidates &= universe; | ||||||
|  |     Ok(candidates) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl RankingRuleGraphTrait for ExactnessGraph { | ||||||
|  |     type Condition = ExactnessCondition; | ||||||
|  |  | ||||||
|  |     fn resolve_condition( | ||||||
|  |         ctx: &mut SearchContext, | ||||||
|  |         condition: &Self::Condition, | ||||||
|  |         universe: &RoaringBitmap, | ||||||
|  |     ) -> Result<ComputedCondition> { | ||||||
|  |         let (docids, dest_node) = match condition { | ||||||
|  |             ExactnessCondition::ExactInAttribute(dest_node) => { | ||||||
|  |                 (compute_docids(ctx, dest_node, universe)?, dest_node) | ||||||
|  |             } | ||||||
|  |             ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node), | ||||||
|  |         }; | ||||||
|  |         Ok(ComputedCondition { | ||||||
|  |             docids, | ||||||
|  |             universe_len: universe.len(), | ||||||
|  |             start_term_subset: None, | ||||||
|  |             end_term_subset: dest_node.clone(), | ||||||
|  |         }) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn build_edges( | ||||||
|  |         _ctx: &mut SearchContext, | ||||||
|  |         conditions_interner: &mut DedupInterner<Self::Condition>, | ||||||
|  |         _source_node: Option<&LocatedQueryTermSubset>, | ||||||
|  |         dest_node: &LocatedQueryTermSubset, | ||||||
|  |     ) -> Result<Vec<(u32, Interned<Self::Condition>)>> { | ||||||
|  |         let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone()); | ||||||
|  |         let exact_condition = conditions_interner.insert(exact_condition); | ||||||
|  |  | ||||||
|  |         let skip_condition = ExactnessCondition::Skip(dest_node.clone()); | ||||||
|  |         let skip_condition = conditions_interner.insert(skip_condition); | ||||||
|  |         Ok(vec![(0, exact_condition), (1, skip_condition)]) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn log_state( | ||||||
|  |         graph: &RankingRuleGraph<Self>, | ||||||
|  |         paths: &[Vec<Interned<Self::Condition>>], | ||||||
|  |         dead_ends_cache: &DeadEndsCache<Self::Condition>, | ||||||
|  |         universe: &RoaringBitmap, | ||||||
|  |         costs: &MappedInterner<QueryNode, Vec<u64>>, | ||||||
|  |         cost: u64, | ||||||
|  |         logger: &mut dyn SearchLogger<QueryGraph>, | ||||||
|  |     ) { | ||||||
|  |         todo!() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> { | ||||||
|  |         todo!() | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -10,6 +10,8 @@ mod cheapest_paths; | |||||||
| mod condition_docids_cache; | mod condition_docids_cache; | ||||||
| mod dead_ends_cache; | mod dead_ends_cache; | ||||||
|  |  | ||||||
|  | /// Implementation of the `exactness` ranking rule | ||||||
|  | mod exactness; | ||||||
| /// Implementation of the `proximity` ranking rule | /// Implementation of the `proximity` ranking rule | ||||||
| mod proximity; | mod proximity; | ||||||
| /// Implementation of the `typo` ranking rule | /// Implementation of the `typo` ranking rule | ||||||
| @@ -20,6 +22,7 @@ use std::hash::Hash; | |||||||
| pub use cheapest_paths::PathVisitor; | pub use cheapest_paths::PathVisitor; | ||||||
| pub use condition_docids_cache::ConditionDocIdsCache; | pub use condition_docids_cache::ConditionDocIdsCache; | ||||||
| pub use dead_ends_cache::DeadEndsCache; | pub use dead_ends_cache::DeadEndsCache; | ||||||
|  | pub use exactness::{ExactnessCondition, ExactnessGraph}; | ||||||
| pub use proximity::{ProximityCondition, ProximityGraph}; | pub use proximity::{ProximityCondition, ProximityGraph}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| pub use typo::{TypoCondition, TypoGraph}; | pub use typo::{TypoCondition, TypoGraph}; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user