mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 21:46:27 +00:00 
			
		
		
		
	Merge #3768
3768: Fix bugs in graph-based ranking rules + make `words` a graph-based ranking rule r=dureuill a=loiclec

This PR contains three changes:

## 1. Don't call the `words` ranking rule if the term matching strategy is `All`

This is because the purpose of `words` is only to remove nodes from the query graph. It would never do any useful work when the matching strategy was `All`. Remember that the universe was already computed before by computing all the docids corresponding to the "maximally reduced" query graph, which, in the case of `All`, is equal to the original graph.

## 2. The `words` ranking rule is replaced by a graph-based ranking rule.

This is for three reasons:

1. **performance**: graph-based ranking rules benefit from a lot of optimisations by default, which ensures that they are never too slow. The previous implementation of `words` could call `compute_query_graph_docids` many times if some words had to be removed from the query, which would be quite expensive. I was especially worried about its performance in cases where it is placed right after the `sort` ranking rule. Furthermore, `compute_query_graph_docids` would clone a lot of bitmaps many times unnecessarily.
2. **consistency**: every other ranking rule (except `sort`) is graph-based. It makes sense to implement `words` like that as well. It will automatically benefit from all the features, optimisations, and bug fixes that all the other ranking rules get.
3. **surfacing bugs**: as the first ranking rule to be called (most of the time), I'd like `words` to behave the same as the other ranking rules so that we can quickly detect bugs in our graph algorithms. This actually already happened, which is why this PR also contains a bug fix.

## 3. Fix the `update_all_costs_before_nodes` function

It is a bit difficult to explain what was wrong, but I'll try.
The bug happened when we had graphs like:

<img width="730" alt="Screenshot 2023-05-16 at 10 58 57" src="https://github.com/meilisearch/meilisearch/assets/6040237/40db1a68-d852-4e89-99d5-0d65757242a7">

and we gave the node `is` as the argument. Then, we'd walk backwards from the node breadth-first. We'd update the costs of:

1. `sun`
2. `thesun`
3. `start`
4. `the`

which is an incorrect order. The correct order is:

1. `sun`
2. `thesun`
3. `the`
4. `start`

That is, we can only update the cost of a node when all of its successors have either already been visited or were not affected by the update to the node passed as argument.

To solve this bug, I factored out the graph-traversal logic into a `traverse_breadth_first_backward` function.

Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
		| @@ -46,7 +46,7 @@ use super::logger::SearchLogger; | |||||||
| use super::query_graph::QueryNode; | use super::query_graph::QueryNode; | ||||||
| use super::ranking_rule_graph::{ | use super::ranking_rule_graph::{ | ||||||
|     ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph, |     ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph, | ||||||
|     RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, |     RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, WordsGraph, | ||||||
| }; | }; | ||||||
| use super::small_bitmap::SmallBitmap; | use super::small_bitmap::SmallBitmap; | ||||||
| use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; | use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; | ||||||
| @@ -54,6 +54,12 @@ use crate::search::new::query_term::LocatedQueryTermSubset; | |||||||
| use crate::search::new::ranking_rule_graph::PathVisitor; | use crate::search::new::ranking_rule_graph::PathVisitor; | ||||||
| use crate::{Result, TermsMatchingStrategy}; | use crate::{Result, TermsMatchingStrategy}; | ||||||
|  |  | ||||||
|  | pub type Words = GraphBasedRankingRule<WordsGraph>; | ||||||
|  | impl GraphBasedRankingRule<WordsGraph> { | ||||||
|  |     pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self { | ||||||
|  |         Self::new_with_id("words".to_owned(), Some(terms_matching_strategy)) | ||||||
|  |     } | ||||||
|  | } | ||||||
| pub type Proximity = GraphBasedRankingRule<ProximityGraph>; | pub type Proximity = GraphBasedRankingRule<ProximityGraph>; | ||||||
| impl GraphBasedRankingRule<ProximityGraph> { | impl GraphBasedRankingRule<ProximityGraph> { | ||||||
|     pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self { |     pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self { | ||||||
|   | |||||||
| @@ -4,7 +4,6 @@ use std::io::{BufWriter, Write}; | |||||||
| use std::path::{Path, PathBuf}; | use std::path::{Path, PathBuf}; | ||||||
| use std::time::Instant; | use std::time::Instant; | ||||||
|  |  | ||||||
| // use rand::random; |  | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::search::new::interner::Interned; | use crate::search::new::interner::Interned; | ||||||
| @@ -13,6 +12,7 @@ use crate::search::new::query_term::LocatedQueryTermSubset; | |||||||
| use crate::search::new::ranking_rule_graph::{ | use crate::search::new::ranking_rule_graph::{ | ||||||
|     Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition, |     Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition, | ||||||
|     ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph, |     ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph, | ||||||
|  |     WordsCondition, WordsGraph, | ||||||
| }; | }; | ||||||
| use crate::search::new::ranking_rules::BoxRankingRule; | use crate::search::new::ranking_rules::BoxRankingRule; | ||||||
| use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger}; | use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger}; | ||||||
| @@ -24,11 +24,12 @@ pub enum SearchEvents { | |||||||
|     RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 }, |     RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 }, | ||||||
|     RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 }, |     RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 }, | ||||||
|     ExtendResults { new: Vec<u32> }, |     ExtendResults { new: Vec<u32> }, | ||||||
|     WordsGraph { query_graph: QueryGraph }, |  | ||||||
|     ProximityGraph { graph: RankingRuleGraph<ProximityGraph> }, |     ProximityGraph { graph: RankingRuleGraph<ProximityGraph> }, | ||||||
|     ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> }, |     ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> }, | ||||||
|     TypoGraph { graph: RankingRuleGraph<TypoGraph> }, |     TypoGraph { graph: RankingRuleGraph<TypoGraph> }, | ||||||
|     TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> }, |     TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> }, | ||||||
|  |     WordsGraph { graph: RankingRuleGraph<WordsGraph> }, | ||||||
|  |     WordsPaths { paths: Vec<Vec<Interned<WordsCondition>>> }, | ||||||
|     FidGraph { graph: RankingRuleGraph<FidGraph> }, |     FidGraph { graph: RankingRuleGraph<FidGraph> }, | ||||||
|     FidPaths { paths: Vec<Vec<Interned<FidCondition>>> }, |     FidPaths { paths: Vec<Vec<Interned<FidCondition>>> }, | ||||||
|     PositionGraph { graph: RankingRuleGraph<PositionGraph> }, |     PositionGraph { graph: RankingRuleGraph<PositionGraph> }, | ||||||
| @@ -139,8 +140,11 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger { | |||||||
|         let Some(location) = self.location.last() else { return }; |         let Some(location) = self.location.last() else { return }; | ||||||
|         match location { |         match location { | ||||||
|             Location::Words => { |             Location::Words => { | ||||||
|                 if let Some(query_graph) = state.downcast_ref::<QueryGraph>() { |                 if let Some(graph) = state.downcast_ref::<RankingRuleGraph<WordsGraph>>() { | ||||||
|                     self.events.push(SearchEvents::WordsGraph { query_graph: query_graph.clone() }); |                     self.events.push(SearchEvents::WordsGraph { graph: graph.clone() }); | ||||||
|  |                 } | ||||||
|  |                 if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<WordsCondition>>>>() { | ||||||
|  |                     self.events.push(SearchEvents::WordsPaths { paths: paths.clone() }); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|             Location::Typo => { |             Location::Typo => { | ||||||
| @@ -329,7 +333,6 @@ impl<'ctx> DetailedLoggerFinish<'ctx> { | |||||||
|             SearchEvents::ExtendResults { new } => { |             SearchEvents::ExtendResults { new } => { | ||||||
|                 self.write_extend_results(new)?; |                 self.write_extend_results(new)?; | ||||||
|             } |             } | ||||||
|             SearchEvents::WordsGraph { query_graph } => self.write_words_graph(query_graph)?, |  | ||||||
|             SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?, |             SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?, | ||||||
|             SearchEvents::ProximityPaths { paths } => { |             SearchEvents::ProximityPaths { paths } => { | ||||||
|                 self.write_rr_graph_paths::<ProximityGraph>(paths)?; |                 self.write_rr_graph_paths::<ProximityGraph>(paths)?; | ||||||
| @@ -338,6 +341,10 @@ impl<'ctx> DetailedLoggerFinish<'ctx> { | |||||||
|             SearchEvents::TypoPaths { paths } => { |             SearchEvents::TypoPaths { paths } => { | ||||||
|                 self.write_rr_graph_paths::<TypoGraph>(paths)?; |                 self.write_rr_graph_paths::<TypoGraph>(paths)?; | ||||||
|             } |             } | ||||||
|  |             SearchEvents::WordsGraph { graph } => self.write_rr_graph(&graph)?, | ||||||
|  |             SearchEvents::WordsPaths { paths } => { | ||||||
|  |                 self.write_rr_graph_paths::<WordsGraph>(paths)?; | ||||||
|  |             } | ||||||
|             SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?, |             SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?, | ||||||
|             SearchEvents::FidPaths { paths } => { |             SearchEvents::FidPaths { paths } => { | ||||||
|                 self.write_rr_graph_paths::<FidGraph>(paths)?; |                 self.write_rr_graph_paths::<FidGraph>(paths)?; | ||||||
| @@ -455,7 +462,7 @@ fill: \"#B6E2D3\" | |||||||
|                 shape: class |                 shape: class | ||||||
|                 max_nbr_typo: {}", |                 max_nbr_typo: {}", | ||||||
|                     term_subset.description(ctx), |                     term_subset.description(ctx), | ||||||
|                     term_subset.max_nbr_typos(ctx) |                     term_subset.max_typo_cost(ctx) | ||||||
|                 )?; |                 )?; | ||||||
|  |  | ||||||
|                 for w in term_subset.all_single_words_except_prefix_db(ctx)? { |                 for w in term_subset.all_single_words_except_prefix_db(ctx)? { | ||||||
| @@ -482,13 +489,6 @@ fill: \"#B6E2D3\" | |||||||
|         } |         } | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|     fn write_words_graph(&mut self, qg: QueryGraph) -> Result<()> { |  | ||||||
|         self.make_new_file_for_internal_state_if_needed()?; |  | ||||||
|  |  | ||||||
|         self.write_query_graph(&qg)?; |  | ||||||
|  |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
|     fn write_rr_graph<R: RankingRuleGraphTrait>( |     fn write_rr_graph<R: RankingRuleGraphTrait>( | ||||||
|         &mut self, |         &mut self, | ||||||
|         graph: &RankingRuleGraph<R>, |         graph: &RankingRuleGraph<R>, | ||||||
|   | |||||||
| @@ -15,11 +15,7 @@ mod resolve_query_graph; | |||||||
| mod small_bitmap; | mod small_bitmap; | ||||||
|  |  | ||||||
| mod exact_attribute; | mod exact_attribute; | ||||||
| // TODO: documentation + comments |  | ||||||
| // implementation is currently an adaptation of the previous implementation to fit with the new model |  | ||||||
| mod sort; | mod sort; | ||||||
| // TODO: documentation + comments |  | ||||||
| mod words; |  | ||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod tests; | mod tests; | ||||||
| @@ -43,10 +39,10 @@ use ranking_rules::{ | |||||||
| use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache}; | use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use sort::Sort; | use sort::Sort; | ||||||
| use words::Words; |  | ||||||
|  |  | ||||||
| use self::geo_sort::GeoSort; | use self::geo_sort::GeoSort; | ||||||
| pub use self::geo_sort::Strategy as GeoSortStrategy; | pub use self::geo_sort::Strategy as GeoSortStrategy; | ||||||
|  | use self::graph_based_ranking_rule::Words; | ||||||
| use self::interner::Interned; | use self::interner::Interned; | ||||||
| use crate::search::new::distinct::apply_distinct_rule; | use crate::search::new::distinct::apply_distinct_rule; | ||||||
| use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; | use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; | ||||||
| @@ -202,6 +198,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( | |||||||
|     let mut sorted_fields = HashSet::new(); |     let mut sorted_fields = HashSet::new(); | ||||||
|     let mut geo_sorted = false; |     let mut geo_sorted = false; | ||||||
|  |  | ||||||
|  |     // Don't add the `words` ranking rule if the term matching strategy is `All` | ||||||
|  |     if matches!(terms_matching_strategy, TermsMatchingStrategy::All) { | ||||||
|  |         words = true; | ||||||
|  |     } | ||||||
|  |  | ||||||
|     let mut ranking_rules: Vec<BoxRankingRule<QueryGraph>> = vec![]; |     let mut ranking_rules: Vec<BoxRankingRule<QueryGraph>> = vec![]; | ||||||
|     let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; |     let settings_ranking_rules = ctx.index.criteria(ctx.txn)?; | ||||||
|     for rr in settings_ranking_rules { |     for rr in settings_ranking_rules { | ||||||
|   | |||||||
| @@ -28,14 +28,14 @@ pub enum ZeroOrOneTypo { | |||||||
| impl Interned<QueryTerm> { | impl Interned<QueryTerm> { | ||||||
|     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { |     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { | ||||||
|         let s = ctx.term_interner.get_mut(self); |         let s = ctx.term_interner.get_mut(self); | ||||||
|         if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() { |         if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() { | ||||||
|             assert!(s.two_typo.is_uninit()); |             assert!(s.two_typo.is_uninit()); | ||||||
|             // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words |             // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words | ||||||
|             self.initialize_one_typo_subterm(ctx)?; |             self.initialize_one_typo_subterm(ctx)?; | ||||||
|             let s = ctx.term_interner.get_mut(self); |             let s = ctx.term_interner.get_mut(self); | ||||||
|             assert!(s.one_typo.is_init()); |             assert!(s.one_typo.is_init()); | ||||||
|             s.two_typo = Lazy::Init(TwoTypoTerm::default()); |             s.two_typo = Lazy::Init(TwoTypoTerm::default()); | ||||||
|         } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { |         } else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() { | ||||||
|             assert!(s.two_typo.is_uninit()); |             assert!(s.two_typo.is_uninit()); | ||||||
|             self.initialize_one_and_two_typo_subterm(ctx)?; |             self.initialize_one_and_two_typo_subterm(ctx)?; | ||||||
|             let s = ctx.term_interner.get_mut(self); |             let s = ctx.term_interner.get_mut(self); | ||||||
| @@ -185,7 +185,7 @@ pub fn partially_initialized_term_from_word( | |||||||
|                 original: ctx.word_interner.insert(word.to_owned()), |                 original: ctx.word_interner.insert(word.to_owned()), | ||||||
|                 ngram_words: None, |                 ngram_words: None, | ||||||
|                 is_prefix: false, |                 is_prefix: false, | ||||||
|                 max_nbr_typos: 0, |                 max_levenshtein_distance: 0, | ||||||
|                 zero_typo: <_>::default(), |                 zero_typo: <_>::default(), | ||||||
|                 one_typo: Lazy::Init(<_>::default()), |                 one_typo: Lazy::Init(<_>::default()), | ||||||
|                 two_typo: Lazy::Init(<_>::default()), |                 two_typo: Lazy::Init(<_>::default()), | ||||||
| @@ -256,7 +256,7 @@ pub fn partially_initialized_term_from_word( | |||||||
|     Ok(QueryTerm { |     Ok(QueryTerm { | ||||||
|         original: word_interned, |         original: word_interned, | ||||||
|         ngram_words: None, |         ngram_words: None, | ||||||
|         max_nbr_typos: max_typo, |         max_levenshtein_distance: max_typo, | ||||||
|         is_prefix, |         is_prefix, | ||||||
|         zero_typo, |         zero_typo, | ||||||
|         one_typo: Lazy::Uninit, |         one_typo: Lazy::Uninit, | ||||||
| @@ -275,7 +275,16 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern | |||||||
| impl Interned<QueryTerm> { | impl Interned<QueryTerm> { | ||||||
|     fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { |     fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { | ||||||
|         let self_mut = ctx.term_interner.get_mut(self); |         let self_mut = ctx.term_interner.get_mut(self); | ||||||
|         let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut; |  | ||||||
|  |         let allows_split_words = self_mut.allows_split_words(); | ||||||
|  |         let QueryTerm { | ||||||
|  |             original, | ||||||
|  |             is_prefix, | ||||||
|  |             one_typo, | ||||||
|  |             max_levenshtein_distance: max_nbr_typos, | ||||||
|  |             .. | ||||||
|  |         } = self_mut; | ||||||
|  |  | ||||||
|         let original = *original; |         let original = *original; | ||||||
|         let is_prefix = *is_prefix; |         let is_prefix = *is_prefix; | ||||||
|         // let original_str = ctx.word_interner.get(*original).to_owned(); |         // let original_str = ctx.word_interner.get(*original).to_owned(); | ||||||
| @@ -300,13 +309,17 @@ impl Interned<QueryTerm> { | |||||||
|             })?; |             })?; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let original_str = ctx.word_interner.get(original).to_owned(); |         let split_words = if allows_split_words { | ||||||
|         let split_words = find_split_words(ctx, original_str.as_str())?; |             let original_str = ctx.word_interner.get(original).to_owned(); | ||||||
|  |             find_split_words(ctx, original_str.as_str())? | ||||||
|  |         } else { | ||||||
|  |             None | ||||||
|  |         }; | ||||||
|  |  | ||||||
|         let self_mut = ctx.term_interner.get_mut(self); |         let self_mut = ctx.term_interner.get_mut(self); | ||||||
|  |  | ||||||
|         // Only add the split words to the derivations if: |         // Only add the split words to the derivations if: | ||||||
|         // 1. the term is not an ngram; OR |         // 1. the term is neither an ngram nor a phrase; OR | ||||||
|         // 2. the term is an ngram, but the split words are different from the ngram's component words |         // 2. the term is an ngram, but the split words are different from the ngram's component words | ||||||
|         let split_words = if let Some((ngram_words, split_words)) = |         let split_words = if let Some((ngram_words, split_words)) = | ||||||
|             self_mut.ngram_words.as_ref().zip(split_words.as_ref()) |             self_mut.ngram_words.as_ref().zip(split_words.as_ref()) | ||||||
| @@ -328,7 +341,13 @@ impl Interned<QueryTerm> { | |||||||
|     } |     } | ||||||
|     fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { |     fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { | ||||||
|         let self_mut = ctx.term_interner.get_mut(self); |         let self_mut = ctx.term_interner.get_mut(self); | ||||||
|         let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut; |         let QueryTerm { | ||||||
|  |             original, | ||||||
|  |             is_prefix, | ||||||
|  |             two_typo, | ||||||
|  |             max_levenshtein_distance: max_nbr_typos, | ||||||
|  |             .. | ||||||
|  |         } = self_mut; | ||||||
|         let original_str = ctx.word_interner.get(*original).to_owned(); |         let original_str = ctx.word_interner.get(*original).to_owned(); | ||||||
|         if two_typo.is_init() { |         if two_typo.is_init() { | ||||||
|             return Ok(()); |             return Ok(()); | ||||||
|   | |||||||
| @@ -43,7 +43,7 @@ pub struct QueryTermSubset { | |||||||
| pub struct QueryTerm { | pub struct QueryTerm { | ||||||
|     original: Interned<String>, |     original: Interned<String>, | ||||||
|     ngram_words: Option<Vec<Interned<String>>>, |     ngram_words: Option<Vec<Interned<String>>>, | ||||||
|     max_nbr_typos: u8, |     max_levenshtein_distance: u8, | ||||||
|     is_prefix: bool, |     is_prefix: bool, | ||||||
|     zero_typo: ZeroTypoTerm, |     zero_typo: ZeroTypoTerm, | ||||||
|     // May not be computed yet |     // May not be computed yet | ||||||
| @@ -342,10 +342,16 @@ impl QueryTermSubset { | |||||||
|         } |         } | ||||||
|         None |         None | ||||||
|     } |     } | ||||||
|     pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 { |     pub fn max_typo_cost(&self, ctx: &SearchContext) -> u8 { | ||||||
|         let t = ctx.term_interner.get(self.original); |         let t = ctx.term_interner.get(self.original); | ||||||
|         match t.max_nbr_typos { |         match t.max_levenshtein_distance { | ||||||
|             0 => 0, |             0 => { | ||||||
|  |                 if t.allows_split_words() { | ||||||
|  |                     1 | ||||||
|  |                 } else { | ||||||
|  |                     0 | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|             1 => { |             1 => { | ||||||
|                 if self.one_typo_subset.is_empty() { |                 if self.one_typo_subset.is_empty() { | ||||||
|                     0 |                     0 | ||||||
| @@ -438,6 +444,9 @@ impl QueryTerm { | |||||||
|  |  | ||||||
|         self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty() |         self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty() | ||||||
|     } |     } | ||||||
|  |     fn allows_split_words(&self) -> bool { | ||||||
|  |         self.zero_typo.phrase.is_none() | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl Interned<QueryTerm> { | impl Interned<QueryTerm> { | ||||||
|   | |||||||
| @@ -217,7 +217,7 @@ pub fn make_ngram( | |||||||
|         original: ngram_str_interned, |         original: ngram_str_interned, | ||||||
|         ngram_words: Some(words_interned), |         ngram_words: Some(words_interned), | ||||||
|         is_prefix, |         is_prefix, | ||||||
|         max_nbr_typos, |         max_levenshtein_distance: max_nbr_typos, | ||||||
|         zero_typo: term.zero_typo, |         zero_typo: term.zero_typo, | ||||||
|         one_typo: Lazy::Uninit, |         one_typo: Lazy::Uninit, | ||||||
|         two_typo: Lazy::Uninit, |         two_typo: Lazy::Uninit, | ||||||
| @@ -271,7 +271,7 @@ impl PhraseBuilder { | |||||||
|                 QueryTerm { |                 QueryTerm { | ||||||
|                     original: ctx.word_interner.insert(phrase_desc), |                     original: ctx.word_interner.insert(phrase_desc), | ||||||
|                     ngram_words: None, |                     ngram_words: None, | ||||||
|                     max_nbr_typos: 0, |                     max_levenshtein_distance: 0, | ||||||
|                     is_prefix: false, |                     is_prefix: false, | ||||||
|                     zero_typo: ZeroTypoTerm { |                     zero_typo: ZeroTypoTerm { | ||||||
|                         phrase: Some(phrase), |                         phrase: Some(phrase), | ||||||
|   | |||||||
| @@ -205,18 +205,12 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> { | |||||||
| impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> { | impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> { | ||||||
|     pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> { |     pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> { | ||||||
|         let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]); |         let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]); | ||||||
|         let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); |  | ||||||
|  |  | ||||||
|         let mut node_stack = VecDeque::new(); |         self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| { | ||||||
|  |             if cur_node == self.query_graph.end_node { | ||||||
|         *costs_to_end.get_mut(self.query_graph.end_node) = vec![0]; |                 *costs_to_end.get_mut(self.query_graph.end_node) = vec![0]; | ||||||
|  |                 return; | ||||||
|         for prev_node in self.query_graph.nodes.get(self.query_graph.end_node).predecessors.iter() { |             } | ||||||
|             node_stack.push_back(prev_node); |  | ||||||
|             enqueued.insert(prev_node); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         while let Some(cur_node) = node_stack.pop_front() { |  | ||||||
|             let mut self_costs = Vec::<u64>::new(); |             let mut self_costs = Vec::<u64>::new(); | ||||||
|  |  | ||||||
|             let cur_node_edges = &self.edges_of_node.get(cur_node); |             let cur_node_edges = &self.edges_of_node.get(cur_node); | ||||||
| @@ -232,13 +226,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> { | |||||||
|             self_costs.dedup(); |             self_costs.dedup(); | ||||||
|  |  | ||||||
|             *costs_to_end.get_mut(cur_node) = self_costs; |             *costs_to_end.get_mut(cur_node) = self_costs; | ||||||
|             for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { |         }); | ||||||
|                 if !enqueued.contains(prev_node) { |  | ||||||
|                     node_stack.push_back(prev_node); |  | ||||||
|                     enqueued.insert(prev_node); |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|         costs_to_end |         costs_to_end | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -247,17 +235,12 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> { | |||||||
|         node_with_removed_outgoing_conditions: Interned<QueryNode>, |         node_with_removed_outgoing_conditions: Interned<QueryNode>, | ||||||
|         costs: &mut MappedInterner<QueryNode, Vec<u64>>, |         costs: &mut MappedInterner<QueryNode, Vec<u64>>, | ||||||
|     ) { |     ) { | ||||||
|         let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len()); |         // Traverse the graph backward from the target node, recomputing the cost for each of its predecessors. | ||||||
|         let mut node_stack = VecDeque::new(); |         // We first check that no other node is contributing the same total cost to a predecessor before removing | ||||||
|  |         // the cost from the predecessor. | ||||||
|         enqueued.insert(node_with_removed_outgoing_conditions); |         self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| { | ||||||
|         node_stack.push_back(node_with_removed_outgoing_conditions); |  | ||||||
|  |  | ||||||
|         'main_loop: while let Some(cur_node) = node_stack.pop_front() { |  | ||||||
|             let mut costs_to_remove = FxHashSet::default(); |             let mut costs_to_remove = FxHashSet::default(); | ||||||
|             for c in costs.get(cur_node) { |             costs_to_remove.extend(costs.get(cur_node).iter().copied()); | ||||||
|                 costs_to_remove.insert(*c); |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             let cur_node_edges = &self.edges_of_node.get(cur_node); |             let cur_node_edges = &self.edges_of_node.get(cur_node); | ||||||
|             for edge_idx in cur_node_edges.iter() { |             for edge_idx in cur_node_edges.iter() { | ||||||
| @@ -265,22 +248,75 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> { | |||||||
|                 for cost in costs.get(edge.dest_node).iter() { |                 for cost in costs.get(edge.dest_node).iter() { | ||||||
|                     costs_to_remove.remove(&(*cost + edge.cost as u64)); |                     costs_to_remove.remove(&(*cost + edge.cost as u64)); | ||||||
|                     if costs_to_remove.is_empty() { |                     if costs_to_remove.is_empty() { | ||||||
|                         continue 'main_loop; |                         return; | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|             if costs_to_remove.is_empty() { |             if costs_to_remove.is_empty() { | ||||||
|                 continue 'main_loop; |                 return; | ||||||
|             } |             } | ||||||
|             let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied()); |             let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied()); | ||||||
|             for c in costs_to_remove { |             for c in costs_to_remove { | ||||||
|                 new_costs.remove(&c); |                 new_costs.remove(&c); | ||||||
|             } |             } | ||||||
|             *costs.get_mut(cur_node) = new_costs.into_iter().collect(); |             *costs.get_mut(cur_node) = new_costs.into_iter().collect(); | ||||||
|  |         }); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Traverse the graph backwards from the given node such that every time | ||||||
|  |     /// a node is visited, we are guaranteed that all its successors either: | ||||||
|  |     /// 1. have already been visited; OR | ||||||
|  |     /// 2. were not reachable from the given node | ||||||
|  |     pub fn traverse_breadth_first_backward( | ||||||
|  |         &self, | ||||||
|  |         from: Interned<QueryNode>, | ||||||
|  |         mut visit: impl FnMut(Interned<QueryNode>), | ||||||
|  |     ) { | ||||||
|  |         let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes); | ||||||
|  |         { | ||||||
|  |             // go backward to get the set of all reachable nodes from the given node | ||||||
|  |             // the nodes that are not reachable will be set as `visited` | ||||||
|  |             let mut stack = VecDeque::new(); | ||||||
|  |             let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes); | ||||||
|  |             enqueued.insert(from); | ||||||
|  |             stack.push_back(from); | ||||||
|  |             while let Some(n) = stack.pop_front() { | ||||||
|  |                 if reachable.contains(n) { | ||||||
|  |                     continue; | ||||||
|  |                 } | ||||||
|  |                 reachable.insert(n); | ||||||
|  |                 for prev_node in self.query_graph.nodes.get(n).predecessors.iter() { | ||||||
|  |                     if !enqueued.contains(prev_node) && !reachable.contains(prev_node) { | ||||||
|  |                         stack.push_back(prev_node); | ||||||
|  |                         enqueued.insert(prev_node); | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  |         let mut unreachable_or_visited = | ||||||
|  |             SmallBitmap::for_interned_values_in(&self.query_graph.nodes); | ||||||
|  |         for (n, _) in self.query_graph.nodes.iter() { | ||||||
|  |             if !reachable.contains(n) { | ||||||
|  |                 unreachable_or_visited.insert(n); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes); | ||||||
|  |         let mut stack = VecDeque::new(); | ||||||
|  |  | ||||||
|  |         enqueued.insert(from); | ||||||
|  |         stack.push_back(from); | ||||||
|  |  | ||||||
|  |         while let Some(cur_node) = stack.pop_front() { | ||||||
|  |             if !self.query_graph.nodes.get(cur_node).successors.is_subset(&unreachable_or_visited) { | ||||||
|  |                 stack.push_back(cur_node); | ||||||
|  |                 continue; | ||||||
|  |             } | ||||||
|  |             unreachable_or_visited.insert(cur_node); | ||||||
|  |             visit(cur_node); | ||||||
|             for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { |             for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() { | ||||||
|                 if !enqueued.contains(prev_node) { |                 if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) { | ||||||
|                     node_stack.push_back(prev_node); |                     stack.push_back(prev_node); | ||||||
|                     enqueued.insert(prev_node); |                     enqueued.insert(prev_node); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|   | |||||||
| @@ -20,6 +20,8 @@ mod position; | |||||||
| mod proximity; | mod proximity; | ||||||
| /// Implementation of the `typo` ranking rule | /// Implementation of the `typo` ranking rule | ||||||
| mod typo; | mod typo; | ||||||
|  | /// Implementation of the `words` ranking rule | ||||||
|  | mod words; | ||||||
|  |  | ||||||
| use std::collections::BTreeSet; | use std::collections::BTreeSet; | ||||||
| use std::hash::Hash; | use std::hash::Hash; | ||||||
| @@ -33,6 +35,7 @@ pub use position::{PositionCondition, PositionGraph}; | |||||||
| pub use proximity::{ProximityCondition, ProximityGraph}; | pub use proximity::{ProximityCondition, ProximityGraph}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| pub use typo::{TypoCondition, TypoGraph}; | pub use typo::{TypoCondition, TypoGraph}; | ||||||
|  | pub use words::{WordsCondition, WordsGraph}; | ||||||
|  |  | ||||||
| use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner}; | use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner}; | ||||||
| use super::query_term::LocatedQueryTermSubset; | use super::query_term::LocatedQueryTermSubset; | ||||||
|   | |||||||
| @@ -50,7 +50,7 @@ impl RankingRuleGraphTrait for TypoGraph { | |||||||
|         // 3-gram -> equivalent to 2 typos |         // 3-gram -> equivalent to 2 typos | ||||||
|         let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; |         let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; | ||||||
|  |  | ||||||
|         for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) { |         for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) { | ||||||
|             let mut term = term.clone(); |             let mut term = term.clone(); | ||||||
|             match nbr_typos { |             match nbr_typos { | ||||||
|                 0 => { |                 0 => { | ||||||
|   | |||||||
							
								
								
									
										49
									
								
								milli/src/search/new/ranking_rule_graph/words/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										49
									
								
								milli/src/search/new/ranking_rule_graph/words/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,49 @@ | |||||||
|  | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  | use super::{ComputedCondition, RankingRuleGraphTrait}; | ||||||
|  | use crate::search::new::interner::{DedupInterner, Interned}; | ||||||
|  | use crate::search::new::query_term::LocatedQueryTermSubset; | ||||||
|  | use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; | ||||||
|  | use crate::search::new::SearchContext; | ||||||
|  | use crate::Result; | ||||||
|  |  | ||||||
|  | #[derive(Clone, PartialEq, Eq, Hash)] | ||||||
|  | pub struct WordsCondition { | ||||||
|  |     term: LocatedQueryTermSubset, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | pub enum WordsGraph {} | ||||||
|  |  | ||||||
|  | impl RankingRuleGraphTrait for WordsGraph { | ||||||
|  |     type Condition = WordsCondition; | ||||||
|  |  | ||||||
|  |     fn resolve_condition( | ||||||
|  |         ctx: &mut SearchContext, | ||||||
|  |         condition: &Self::Condition, | ||||||
|  |         universe: &RoaringBitmap, | ||||||
|  |     ) -> Result<ComputedCondition> { | ||||||
|  |         let WordsCondition { term, .. } = condition; | ||||||
|  |         // maybe compute_query_term_subset_docids should accept a universe as argument | ||||||
|  |         let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?; | ||||||
|  |         docids &= universe; | ||||||
|  |  | ||||||
|  |         Ok(ComputedCondition { | ||||||
|  |             docids, | ||||||
|  |             universe_len: universe.len(), | ||||||
|  |             start_term_subset: None, | ||||||
|  |             end_term_subset: term.clone(), | ||||||
|  |         }) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn build_edges( | ||||||
|  |         _ctx: &mut SearchContext, | ||||||
|  |         conditions_interner: &mut DedupInterner<Self::Condition>, | ||||||
|  |         _from: Option<&LocatedQueryTermSubset>, | ||||||
|  |         to_term: &LocatedQueryTermSubset, | ||||||
|  |     ) -> Result<Vec<(u32, Interned<Self::Condition>)>> { | ||||||
|  |         Ok(vec![( | ||||||
|  |             to_term.term_ids.len() as u32, | ||||||
|  |             conditions_interner.insert(WordsCondition { term: to_term.clone() }), | ||||||
|  |         )]) | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -1,87 +0,0 @@ | |||||||
| use roaring::RoaringBitmap; |  | ||||||
|  |  | ||||||
| use super::logger::SearchLogger; |  | ||||||
| use super::query_graph::QueryNode; |  | ||||||
| use super::resolve_query_graph::compute_query_graph_docids; |  | ||||||
| use super::small_bitmap::SmallBitmap; |  | ||||||
| use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; |  | ||||||
| use crate::{Result, TermsMatchingStrategy}; |  | ||||||
|  |  | ||||||
| pub struct Words { |  | ||||||
|     exhausted: bool, // TODO: remove |  | ||||||
|     query_graph: Option<QueryGraph>, |  | ||||||
|     nodes_to_remove: Vec<SmallBitmap<QueryNode>>, |  | ||||||
|     terms_matching_strategy: TermsMatchingStrategy, |  | ||||||
| } |  | ||||||
| impl Words { |  | ||||||
|     pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self { |  | ||||||
|         Self { |  | ||||||
|             exhausted: true, |  | ||||||
|             query_graph: None, |  | ||||||
|             nodes_to_remove: vec![], |  | ||||||
|             terms_matching_strategy, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { |  | ||||||
|     fn id(&self) -> String { |  | ||||||
|         "words".to_owned() |  | ||||||
|     } |  | ||||||
|     fn start_iteration( |  | ||||||
|         &mut self, |  | ||||||
|         ctx: &mut SearchContext<'ctx>, |  | ||||||
|         _logger: &mut dyn SearchLogger<QueryGraph>, |  | ||||||
|         _universe: &RoaringBitmap, |  | ||||||
|         parent_query_graph: &QueryGraph, |  | ||||||
|     ) -> Result<()> { |  | ||||||
|         self.exhausted = false; |  | ||||||
|         self.query_graph = Some(parent_query_graph.clone()); |  | ||||||
|         self.nodes_to_remove = match self.terms_matching_strategy { |  | ||||||
|             TermsMatchingStrategy::Last => { |  | ||||||
|                 let mut ns = parent_query_graph.removal_order_for_terms_matching_strategy_last(ctx); |  | ||||||
|                 ns.reverse(); |  | ||||||
|                 ns |  | ||||||
|             } |  | ||||||
|             TermsMatchingStrategy::All => { |  | ||||||
|                 vec![] |  | ||||||
|             } |  | ||||||
|         }; |  | ||||||
|         Ok(()) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn next_bucket( |  | ||||||
|         &mut self, |  | ||||||
|         ctx: &mut SearchContext<'ctx>, |  | ||||||
|         logger: &mut dyn SearchLogger<QueryGraph>, |  | ||||||
|         universe: &RoaringBitmap, |  | ||||||
|     ) -> Result<Option<RankingRuleOutput<QueryGraph>>> { |  | ||||||
|         if self.exhausted { |  | ||||||
|             return Ok(None); |  | ||||||
|         } |  | ||||||
|         let Some(query_graph) = &mut self.query_graph else { panic!() }; |  | ||||||
|         logger.log_internal_state(query_graph); |  | ||||||
|  |  | ||||||
|         let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?; |  | ||||||
|  |  | ||||||
|         let child_query_graph = query_graph.clone(); |  | ||||||
|  |  | ||||||
|         if self.nodes_to_remove.is_empty() { |  | ||||||
|             self.exhausted = true; |  | ||||||
|         } else { |  | ||||||
|             let nodes_to_remove = self.nodes_to_remove.pop().unwrap(); |  | ||||||
|             query_graph.remove_nodes_keep_edges(&nodes_to_remove.iter().collect::<Vec<_>>()); |  | ||||||
|         } |  | ||||||
|         Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket })) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn end_iteration( |  | ||||||
|         &mut self, |  | ||||||
|         _ctx: &mut SearchContext<'ctx>, |  | ||||||
|         _logger: &mut dyn SearchLogger<QueryGraph>, |  | ||||||
|     ) { |  | ||||||
|         self.exhausted = true; |  | ||||||
|         self.nodes_to_remove = vec![]; |  | ||||||
|         self.query_graph = None; |  | ||||||
|     } |  | ||||||
| } |  | ||||||
		Reference in New Issue
	
	Block a user