Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-11-04 09:56:28 +00:00
Merge #3768

3768: Fix bugs in graph-based ranking rules + make `words` a graph-based ranking rule r=dureuill a=loiclec

This PR contains three changes:

## 1. Don't call the `words` ranking rule if the term matching strategy is `All`

This is because the purpose of `words` is only to remove nodes from the query graph. It would never do any useful work when the matching strategy was `All`. Remember that the universe was already computed before by computing all the docids corresponding to the "maximally reduced" query graph, which, in the case of `All`, is equal to the original graph.

## 2. The `words` ranking rule is replaced by a graph-based ranking rule.

This is for three reasons:

1. **performance**: graph-based ranking rules benefit from a lot of optimisations by default, which ensures that they are never too slow. The previous implementation of `words` could call `compute_query_graph_docids` many times if some words had to be removed from the query, which would be quite expensive. I was especially worried about its performance in cases where it is placed right after the `sort` ranking rule. Furthermore, `compute_query_graph_docids` would clone a lot of bitmaps many times unnecessarily.

2. **consistency**: every other ranking rule (except `sort`) is graph-based. It makes sense to implement `words` like that as well. It will automatically benefit from all the features, optimisations, and bug fixes that all the other ranking rules get.

3. **surfacing bugs**: as the first ranking rule to be called (most of the time), I'd like `words` to behave the same as the other ranking rules so that we can quickly detect bugs in our graph algorithms. This actually already happened, which is why this PR also contains a bug fix.

## 3. Fix the `update_all_costs_before_nodes` function

It is a bit difficult to explain what was wrong, but I'll try. The bug happened when we had graphs like:

<img width="730" alt="Screenshot 2023-05-16 at 10 58 57" src="https://github.com/meilisearch/meilisearch/assets/6040237/40db1a68-d852-4e89-99d5-0d65757242a7">

and we gave the node `is` as argument. Then, we'd walk backwards from the node breadth-first. We'd update the costs of:

1. `sun`
2. `thesun`
3. `start`
4. `the`

which is an incorrect order. The correct order is:

1. `sun`
2. `thesun`
3. `the`
4. `start`

That is, we can only update the cost of a node when all of its successors have either already been visited or were not affected by the update to the node passed as argument. To solve this bug, I factored out the graph-traversal logic into a `traverse_breadth_first_backward` function.

Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
Co-authored-by: Louis Dureuil <louis@meilisearch.com>
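To make the ordering guarantee from point 3 concrete, here is a minimal, self-contained sketch of the same idea. It is not the Meilisearch implementation: it uses plain adjacency lists and `usize` node ids instead of the crate's `Interned<QueryNode>` and `SmallBitmap`, and it assumes the graph is a DAG. A node is visited only once every successor of it that is backward-reachable from the start node has been visited.

```rust
use std::collections::{HashSet, VecDeque};

/// Walk the graph backwards from `from`, calling `visit` on a node only once
/// all of its successors have either been visited already or are not
/// backward-reachable from `from`. Assumes the graph is acyclic.
fn traverse_breadth_first_backward(
    predecessors: &[Vec<usize>],
    successors: &[Vec<usize>],
    from: usize,
    mut visit: impl FnMut(usize),
) {
    let n = predecessors.len();

    // 1. Collect every node reachable from `from` by walking backwards.
    let mut reachable = vec![false; n];
    let mut queue = VecDeque::from([from]);
    while let Some(node) = queue.pop_front() {
        if reachable[node] {
            continue;
        }
        reachable[node] = true;
        queue.extend(predecessors[node].iter().copied());
    }

    // 2. Unreachable nodes count as "already handled" so that they never
    //    block one of their predecessors from being visited.
    let mut done: Vec<bool> = reachable.iter().map(|r| !r).collect();

    // 3. Backward BFS that re-enqueues a node whose successors are not all done yet.
    let mut enqueued = HashSet::from([from]);
    let mut queue = VecDeque::from([from]);
    while let Some(node) = queue.pop_front() {
        if successors[node].iter().any(|&s| !done[s]) {
            // A successor still has to be visited first: retry this node later.
            queue.push_back(node);
            continue;
        }
        done[node] = true;
        visit(node);
        for &prev in &predecessors[node] {
            if !done[prev] && enqueued.insert(prev) {
                queue.push_back(prev);
            }
        }
    }
}

fn main() {
    // Graph from the description: start -> the -> sun -> is, start -> thesun -> is.
    // Indices: 0 = start, 1 = the, 2 = thesun, 3 = sun, 4 = is.
    let successors = vec![vec![1, 2], vec![3], vec![4], vec![4], vec![]];
    let predecessors = vec![vec![], vec![0], vec![0], vec![1], vec![2, 3]];
    let names = ["start", "the", "thesun", "sun", "is"];

    // Prints: is, thesun, sun, the, start -- crucially, `the` comes before `start`.
    traverse_breadth_first_backward(&predecessors, &successors, 4, |node| {
        println!("{}", names[node]);
    });
}
```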
@@ -46,7 +46,7 @@ use super::logger::SearchLogger;
use super::query_graph::QueryNode;
use super::ranking_rule_graph::{
    ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph,
    RankingRuleGraph, RankingRuleGraphTrait, TypoGraph,
    RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, WordsGraph,
};
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
@@ -54,6 +54,12 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::ranking_rule_graph::PathVisitor;
use crate::{Result, TermsMatchingStrategy};

pub type Words = GraphBasedRankingRule<WordsGraph>;
impl GraphBasedRankingRule<WordsGraph> {
    pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
        Self::new_with_id("words".to_owned(), Some(terms_matching_strategy))
    }
}
pub type Proximity = GraphBasedRankingRule<ProximityGraph>;
impl GraphBasedRankingRule<ProximityGraph> {
    pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
@@ -4,7 +4,6 @@ use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};
use std::time::Instant;

// use rand::random;
use roaring::RoaringBitmap;

use crate::search::new::interner::Interned;
@@ -13,6 +12,7 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::ranking_rule_graph::{
    Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition,
    ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph,
    WordsCondition, WordsGraph,
};
use crate::search::new::ranking_rules::BoxRankingRule;
use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger};
@@ -24,11 +24,12 @@ pub enum SearchEvents {
    RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
    RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
    ExtendResults { new: Vec<u32> },
    WordsGraph { query_graph: QueryGraph },
    ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
    ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
    TypoGraph { graph: RankingRuleGraph<TypoGraph> },
    TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> },
    WordsGraph { graph: RankingRuleGraph<WordsGraph> },
    WordsPaths { paths: Vec<Vec<Interned<WordsCondition>>> },
    FidGraph { graph: RankingRuleGraph<FidGraph> },
    FidPaths { paths: Vec<Vec<Interned<FidCondition>>> },
    PositionGraph { graph: RankingRuleGraph<PositionGraph> },
@@ -139,8 +140,11 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
        let Some(location) = self.location.last() else { return };
        match location {
            Location::Words => {
                if let Some(query_graph) = state.downcast_ref::<QueryGraph>() {
                    self.events.push(SearchEvents::WordsGraph { query_graph: query_graph.clone() });
                if let Some(graph) = state.downcast_ref::<RankingRuleGraph<WordsGraph>>() {
                    self.events.push(SearchEvents::WordsGraph { graph: graph.clone() });
                }
                if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<WordsCondition>>>>() {
                    self.events.push(SearchEvents::WordsPaths { paths: paths.clone() });
                }
            }
            Location::Typo => {
@@ -329,7 +333,6 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
            SearchEvents::ExtendResults { new } => {
                self.write_extend_results(new)?;
            }
            SearchEvents::WordsGraph { query_graph } => self.write_words_graph(query_graph)?,
            SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?,
            SearchEvents::ProximityPaths { paths } => {
                self.write_rr_graph_paths::<ProximityGraph>(paths)?;
@@ -338,6 +341,10 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
            SearchEvents::TypoPaths { paths } => {
                self.write_rr_graph_paths::<TypoGraph>(paths)?;
            }
            SearchEvents::WordsGraph { graph } => self.write_rr_graph(&graph)?,
            SearchEvents::WordsPaths { paths } => {
                self.write_rr_graph_paths::<WordsGraph>(paths)?;
            }
            SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?,
            SearchEvents::FidPaths { paths } => {
                self.write_rr_graph_paths::<FidGraph>(paths)?;
@@ -455,7 +462,7 @@ fill: \"#B6E2D3\"
                shape: class
                max_nbr_typo: {}",
                    term_subset.description(ctx),
                    term_subset.max_nbr_typos(ctx)
                    term_subset.max_typo_cost(ctx)
                )?;

                for w in term_subset.all_single_words_except_prefix_db(ctx)? {
@@ -482,13 +489,6 @@ fill: \"#B6E2D3\"
        }
        Ok(())
    }
    fn write_words_graph(&mut self, qg: QueryGraph) -> Result<()> {
        self.make_new_file_for_internal_state_if_needed()?;

        self.write_query_graph(&qg)?;

        Ok(())
    }
    fn write_rr_graph<R: RankingRuleGraphTrait>(
        &mut self,
        graph: &RankingRuleGraph<R>,
@@ -15,11 +15,7 @@ mod resolve_query_graph;
mod small_bitmap;

mod exact_attribute;
// TODO: documentation + comments
// implementation is currently an adaptation of the previous implementation to fit with the new model
mod sort;
// TODO: documentation + comments
mod words;

#[cfg(test)]
mod tests;
@@ -43,10 +39,10 @@ use ranking_rules::{
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
use roaring::RoaringBitmap;
use sort::Sort;
use words::Words;

use self::geo_sort::GeoSort;
pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use crate::search::new::distinct::apply_distinct_rule;
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
@@ -202,6 +198,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
    let mut sorted_fields = HashSet::new();
    let mut geo_sorted = false;

    // Don't add the `words` ranking rule if the term matching strategy is `All`
    if matches!(terms_matching_strategy, TermsMatchingStrategy::All) {
        words = true;
    }

    let mut ranking_rules: Vec<BoxRankingRule<QueryGraph>> = vec![];
    let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
    for rr in settings_ranking_rules {
@@ -28,14 +28,14 @@ pub enum ZeroOrOneTypo {
impl Interned<QueryTerm> {
    pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
        let s = ctx.term_interner.get_mut(self);
        if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() {
        if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() {
            assert!(s.two_typo.is_uninit());
            // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
            self.initialize_one_typo_subterm(ctx)?;
            let s = ctx.term_interner.get_mut(self);
            assert!(s.one_typo.is_init());
            s.two_typo = Lazy::Init(TwoTypoTerm::default());
        } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
        } else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() {
            assert!(s.two_typo.is_uninit());
            self.initialize_one_and_two_typo_subterm(ctx)?;
            let s = ctx.term_interner.get_mut(self);
@@ -185,7 +185,7 @@ pub fn partially_initialized_term_from_word(
                original: ctx.word_interner.insert(word.to_owned()),
                ngram_words: None,
                is_prefix: false,
                max_nbr_typos: 0,
                max_levenshtein_distance: 0,
                zero_typo: <_>::default(),
                one_typo: Lazy::Init(<_>::default()),
                two_typo: Lazy::Init(<_>::default()),
@@ -256,7 +256,7 @@ pub fn partially_initialized_term_from_word(
    Ok(QueryTerm {
        original: word_interned,
        ngram_words: None,
        max_nbr_typos: max_typo,
        max_levenshtein_distance: max_typo,
        is_prefix,
        zero_typo,
        one_typo: Lazy::Uninit,
@@ -275,7 +275,16 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern
impl Interned<QueryTerm> {
    fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
        let self_mut = ctx.term_interner.get_mut(self);
        let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut;

        let allows_split_words = self_mut.allows_split_words();
        let QueryTerm {
            original,
            is_prefix,
            one_typo,
            max_levenshtein_distance: max_nbr_typos,
            ..
        } = self_mut;

        let original = *original;
        let is_prefix = *is_prefix;
        // let original_str = ctx.word_interner.get(*original).to_owned();
@@ -300,13 +309,17 @@ impl Interned<QueryTerm> {
            })?;
        }

        let original_str = ctx.word_interner.get(original).to_owned();
        let split_words = find_split_words(ctx, original_str.as_str())?;
        let split_words = if allows_split_words {
            let original_str = ctx.word_interner.get(original).to_owned();
            find_split_words(ctx, original_str.as_str())?
        } else {
            None
        };

        let self_mut = ctx.term_interner.get_mut(self);

        // Only add the split words to the derivations if:
        // 1. the term is not an ngram; OR
        // 1. the term is neither an ngram nor a phrase; OR
        // 2. the term is an ngram, but the split words are different from the ngram's component words
        let split_words = if let Some((ngram_words, split_words)) =
            self_mut.ngram_words.as_ref().zip(split_words.as_ref())
@@ -328,7 +341,13 @@ impl Interned<QueryTerm> {
    }
    fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
        let self_mut = ctx.term_interner.get_mut(self);
        let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut;
        let QueryTerm {
            original,
            is_prefix,
            two_typo,
            max_levenshtein_distance: max_nbr_typos,
            ..
        } = self_mut;
        let original_str = ctx.word_interner.get(*original).to_owned();
        if two_typo.is_init() {
            return Ok(());
@@ -43,7 +43,7 @@ pub struct QueryTermSubset {
pub struct QueryTerm {
    original: Interned<String>,
    ngram_words: Option<Vec<Interned<String>>>,
    max_nbr_typos: u8,
    max_levenshtein_distance: u8,
    is_prefix: bool,
    zero_typo: ZeroTypoTerm,
    // May not be computed yet
@@ -342,10 +342,16 @@ impl QueryTermSubset {
        }
        None
    }
    pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
    pub fn max_typo_cost(&self, ctx: &SearchContext) -> u8 {
        let t = ctx.term_interner.get(self.original);
        match t.max_nbr_typos {
            0 => 0,
        match t.max_levenshtein_distance {
            0 => {
                if t.allows_split_words() {
                    1
                } else {
                    0
                }
            }
            1 => {
                if self.one_typo_subset.is_empty() {
                    0
@@ -438,6 +444,9 @@ impl QueryTerm {

        self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
    }
    fn allows_split_words(&self) -> bool {
        self.zero_typo.phrase.is_none()
    }
}

impl Interned<QueryTerm> {
@@ -217,7 +217,7 @@ pub fn make_ngram(
        original: ngram_str_interned,
        ngram_words: Some(words_interned),
        is_prefix,
        max_nbr_typos,
        max_levenshtein_distance: max_nbr_typos,
        zero_typo: term.zero_typo,
        one_typo: Lazy::Uninit,
        two_typo: Lazy::Uninit,
@@ -271,7 +271,7 @@ impl PhraseBuilder {
                QueryTerm {
                    original: ctx.word_interner.insert(phrase_desc),
                    ngram_words: None,
                    max_nbr_typos: 0,
                    max_levenshtein_distance: 0,
                    is_prefix: false,
                    zero_typo: ZeroTypoTerm {
                        phrase: Some(phrase),
@@ -205,18 +205,12 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
    pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> {
        let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]);
        let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len());

        let mut node_stack = VecDeque::new();

        *costs_to_end.get_mut(self.query_graph.end_node) = vec![0];

        for prev_node in self.query_graph.nodes.get(self.query_graph.end_node).predecessors.iter() {
            node_stack.push_back(prev_node);
            enqueued.insert(prev_node);
        }

        while let Some(cur_node) = node_stack.pop_front() {
        self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| {
            if cur_node == self.query_graph.end_node {
                *costs_to_end.get_mut(self.query_graph.end_node) = vec![0];
                return;
            }
            let mut self_costs = Vec::<u64>::new();

            let cur_node_edges = &self.edges_of_node.get(cur_node);
@@ -232,13 +226,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
            self_costs.dedup();

            *costs_to_end.get_mut(cur_node) = self_costs;
            for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
                if !enqueued.contains(prev_node) {
                    node_stack.push_back(prev_node);
                    enqueued.insert(prev_node);
                }
            }
        }
        });
        costs_to_end
    }

@@ -247,17 +235,12 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
        node_with_removed_outgoing_conditions: Interned<QueryNode>,
        costs: &mut MappedInterner<QueryNode, Vec<u64>>,
    ) {
        let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len());
        let mut node_stack = VecDeque::new();

        enqueued.insert(node_with_removed_outgoing_conditions);
        node_stack.push_back(node_with_removed_outgoing_conditions);

        'main_loop: while let Some(cur_node) = node_stack.pop_front() {
        // Traverse the graph backward from the target node, recomputing the cost for each of its predecessors.
        // We first check that no other node is contributing the same total cost to a predecessor before removing
        // the cost from the predecessor.
        self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| {
            let mut costs_to_remove = FxHashSet::default();
            for c in costs.get(cur_node) {
                costs_to_remove.insert(*c);
            }
            costs_to_remove.extend(costs.get(cur_node).iter().copied());

            let cur_node_edges = &self.edges_of_node.get(cur_node);
            for edge_idx in cur_node_edges.iter() {
@@ -265,22 +248,75 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
                for cost in costs.get(edge.dest_node).iter() {
                    costs_to_remove.remove(&(*cost + edge.cost as u64));
                    if costs_to_remove.is_empty() {
                        continue 'main_loop;
                        return;
                    }
                }
            }
            if costs_to_remove.is_empty() {
                continue 'main_loop;
                return;
            }
            let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied());
            for c in costs_to_remove {
                new_costs.remove(&c);
            }
            *costs.get_mut(cur_node) = new_costs.into_iter().collect();
        });
    }

    /// Traverse the graph backwards from the given node such that every time
    /// a node is visited, we are guaranteed that all its successors either:
    /// 1. have already been visited; OR
    /// 2. were not reachable from the given node
    pub fn traverse_breadth_first_backward(
        &self,
        from: Interned<QueryNode>,
        mut visit: impl FnMut(Interned<QueryNode>),
    ) {
        let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
        {
            // go backward to get the set of all reachable nodes from the given node
            // the nodes that are not reachable will be set as `visited`
            let mut stack = VecDeque::new();
            let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
            enqueued.insert(from);
            stack.push_back(from);
            while let Some(n) = stack.pop_front() {
                if reachable.contains(n) {
                    continue;
                }
                reachable.insert(n);
                for prev_node in self.query_graph.nodes.get(n).predecessors.iter() {
                    if !enqueued.contains(prev_node) && !reachable.contains(prev_node) {
                        stack.push_back(prev_node);
                        enqueued.insert(prev_node);
                    }
                }
            }
        };
        let mut unreachable_or_visited =
            SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
        for (n, _) in self.query_graph.nodes.iter() {
            if !reachable.contains(n) {
                unreachable_or_visited.insert(n);
            }
        }

        let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
        let mut stack = VecDeque::new();

        enqueued.insert(from);
        stack.push_back(from);

        while let Some(cur_node) = stack.pop_front() {
            if !self.query_graph.nodes.get(cur_node).successors.is_subset(&unreachable_or_visited) {
                stack.push_back(cur_node);
                continue;
            }
            unreachable_or_visited.insert(cur_node);
            visit(cur_node);
            for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
                if !enqueued.contains(prev_node) {
                    node_stack.push_back(prev_node);
                if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) {
                    stack.push_back(prev_node);
                    enqueued.insert(prev_node);
                }
            }
@@ -20,6 +20,8 @@ mod position;
mod proximity;
/// Implementation of the `typo` ranking rule
mod typo;
/// Implementation of the `words` ranking rule
mod words;

use std::collections::BTreeSet;
use std::hash::Hash;
@@ -33,6 +35,7 @@ pub use position::{PositionCondition, PositionGraph};
pub use proximity::{ProximityCondition, ProximityGraph};
use roaring::RoaringBitmap;
pub use typo::{TypoCondition, TypoGraph};
pub use words::{WordsCondition, WordsGraph};

use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner};
use super::query_term::LocatedQueryTermSubset;
@@ -50,7 +50,7 @@ impl RankingRuleGraphTrait for TypoGraph {
        // 3-gram -> equivalent to 2 typos
        let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };

        for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) {
        for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) {
            let mut term = term.clone();
            match nbr_typos {
                0 => {
milli/src/search/new/ranking_rule_graph/words/mod.rs (new file, 49 lines)
@@ -0,0 +1,49 @@
use roaring::RoaringBitmap;

use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::search::new::SearchContext;
use crate::Result;

#[derive(Clone, PartialEq, Eq, Hash)]
pub struct WordsCondition {
    term: LocatedQueryTermSubset,
}

pub enum WordsGraph {}

impl RankingRuleGraphTrait for WordsGraph {
    type Condition = WordsCondition;

    fn resolve_condition(
        ctx: &mut SearchContext,
        condition: &Self::Condition,
        universe: &RoaringBitmap,
    ) -> Result<ComputedCondition> {
        let WordsCondition { term, .. } = condition;
        // maybe compute_query_term_subset_docids should accept a universe as argument
        let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?;
        docids &= universe;

        Ok(ComputedCondition {
            docids,
            universe_len: universe.len(),
            start_term_subset: None,
            end_term_subset: term.clone(),
        })
    }

    fn build_edges(
        _ctx: &mut SearchContext,
        conditions_interner: &mut DedupInterner<Self::Condition>,
        _from: Option<&LocatedQueryTermSubset>,
        to_term: &LocatedQueryTermSubset,
    ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
        Ok(vec![(
            to_term.term_ids.len() as u32,
            conditions_interner.insert(WordsCondition { term: to_term.clone() }),
        )])
    }
}
@@ -1,87 +0,0 @@
use roaring::RoaringBitmap;

use super::logger::SearchLogger;
use super::query_graph::QueryNode;
use super::resolve_query_graph::compute_query_graph_docids;
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
use crate::{Result, TermsMatchingStrategy};

pub struct Words {
    exhausted: bool, // TODO: remove
    query_graph: Option<QueryGraph>,
    nodes_to_remove: Vec<SmallBitmap<QueryNode>>,
    terms_matching_strategy: TermsMatchingStrategy,
}
impl Words {
    pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
        Self {
            exhausted: true,
            query_graph: None,
            nodes_to_remove: vec![],
            terms_matching_strategy,
        }
    }
}

impl<'ctx> RankingRule<'ctx, QueryGraph> for Words {
    fn id(&self) -> String {
        "words".to_owned()
    }
    fn start_iteration(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
        _universe: &RoaringBitmap,
        parent_query_graph: &QueryGraph,
    ) -> Result<()> {
        self.exhausted = false;
        self.query_graph = Some(parent_query_graph.clone());
        self.nodes_to_remove = match self.terms_matching_strategy {
            TermsMatchingStrategy::Last => {
                let mut ns = parent_query_graph.removal_order_for_terms_matching_strategy_last(ctx);
                ns.reverse();
                ns
            }
            TermsMatchingStrategy::All => {
                vec![]
            }
        };
        Ok(())
    }

    fn next_bucket(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        logger: &mut dyn SearchLogger<QueryGraph>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
        if self.exhausted {
            return Ok(None);
        }
        let Some(query_graph) = &mut self.query_graph else { panic!() };
        logger.log_internal_state(query_graph);

        let this_bucket = compute_query_graph_docids(ctx, query_graph, universe)?;

        let child_query_graph = query_graph.clone();

        if self.nodes_to_remove.is_empty() {
            self.exhausted = true;
        } else {
            let nodes_to_remove = self.nodes_to_remove.pop().unwrap();
            query_graph.remove_nodes_keep_edges(&nodes_to_remove.iter().collect::<Vec<_>>());
        }
        Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket }))
    }

    fn end_iteration(
        &mut self,
        _ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
    ) {
        self.exhausted = true;
        self.nodes_to_remove = vec![];
        self.query_graph = None;
    }
}