mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 01:46:28 +00:00 
			
		
		
		
	Add exactness ranking rules
This commit is contained in:
		
							
								
								
									
										175
									
								
								milli/src/search/new/exact_attribute.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										175
									
								
								milli/src/search/new/exact_attribute.rs
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,175 @@
 | 
			
		||||
use heed::BytesDecode;
 | 
			
		||||
use roaring::MultiOps;
 | 
			
		||||
 | 
			
		||||
use super::query_graph::QueryGraph;
 | 
			
		||||
use super::ranking_rules::{RankingRule, RankingRuleOutput};
 | 
			
		||||
use crate::search::new::query_graph::QueryNodeData;
 | 
			
		||||
use crate::search::new::query_term::ExactTerm;
 | 
			
		||||
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
 | 
			
		||||
 | 
			
		||||
/// FIXME:
 | 
			
		||||
///
 | 
			
		||||
/// - A lot of work done in next_bucket that start_iteration could do.
 | 
			
		||||
/// - Consider calling the graph based rule directly from this one.
 | 
			
		||||
/// - currently we did exact term, don't forget about prefix
 | 
			
		||||
/// - some tests
 | 
			
		||||
pub struct ExactAttribute {
 | 
			
		||||
    query_graph: Option<QueryGraph>,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl ExactAttribute {
    /// Creates the rule with no query graph attached yet; the graph is provided later through
    /// `start_iteration`.
    pub fn new() -> Self {
        Self { query_graph: None }
    }
}

impl Default for ExactAttribute {
    /// Equivalent to [`ExactAttribute::new`].
    fn default() -> Self {
        Self::new()
    }
}
 | 
			
		||||
 | 
			
		||||
impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
 | 
			
		||||
    fn id(&self) -> String {
 | 
			
		||||
        "exact_attribute".to_owned()
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn start_iteration(
 | 
			
		||||
        &mut self,
 | 
			
		||||
        _ctx: &mut SearchContext<'ctx>,
 | 
			
		||||
        _logger: &mut dyn SearchLogger<QueryGraph>,
 | 
			
		||||
        _universe: &roaring::RoaringBitmap,
 | 
			
		||||
        query: &QueryGraph,
 | 
			
		||||
    ) -> Result<()> {
 | 
			
		||||
        self.query_graph = Some(query.clone());
 | 
			
		||||
        Ok(())
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn next_bucket(
 | 
			
		||||
        &mut self,
 | 
			
		||||
        ctx: &mut SearchContext<'ctx>,
 | 
			
		||||
        _logger: &mut dyn SearchLogger<QueryGraph>,
 | 
			
		||||
        universe: &roaring::RoaringBitmap,
 | 
			
		||||
    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
 | 
			
		||||
        // iterate on the nodes of the graph, retain LocatedQueryTermSubset
 | 
			
		||||
        let query_graph = self.query_graph.as_ref().unwrap();
 | 
			
		||||
        let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
 | 
			
		||||
            Vec::with_capacity(query_graph.nodes.len() as usize);
 | 
			
		||||
        for (_, node) in query_graph.nodes.iter() {
 | 
			
		||||
            match &node.data {
 | 
			
		||||
                QueryNodeData::Term(term) => {
 | 
			
		||||
                    let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
 | 
			
		||||
                        exact_term
 | 
			
		||||
                    } else {
 | 
			
		||||
                        // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
 | 
			
		||||
                        return Ok(Some(RankingRuleOutput {
 | 
			
		||||
                            query: query_graph.clone(),
 | 
			
		||||
                            candidates: universe.clone(),
 | 
			
		||||
                        }));
 | 
			
		||||
                    };
 | 
			
		||||
                    exact_term_position_ids.push((
 | 
			
		||||
                        exact_term,
 | 
			
		||||
                        *term.positions.start(),
 | 
			
		||||
                        *term.term_ids.start(),
 | 
			
		||||
                    ))
 | 
			
		||||
                }
 | 
			
		||||
                QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
 | 
			
		||||
        // bail if there is a "hole" (missing word) in remaining query graph
 | 
			
		||||
        let mut previous_id = 0;
 | 
			
		||||
        for (_, _, id) in exact_term_position_ids.iter().copied() {
 | 
			
		||||
            if id < previous_id || id - previous_id > 1 {
 | 
			
		||||
                // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
 | 
			
		||||
                return Ok(Some(RankingRuleOutput {
 | 
			
		||||
                    query: query_graph.clone(),
 | 
			
		||||
                    candidates: universe.clone(),
 | 
			
		||||
                }));
 | 
			
		||||
            } else {
 | 
			
		||||
                previous_id = id;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        // sample query: "sunflower are pretty"
 | 
			
		||||
        // sunflower at pos 0 in attr A
 | 
			
		||||
        // are at pos 1 in attr B
 | 
			
		||||
        // pretty at pos 2 in attr C
 | 
			
		||||
        // We want to eliminate such document
 | 
			
		||||
 | 
			
		||||
        // first check that for each term, there exists some attribute that has this term at the correct position
 | 
			
		||||
        //"word-position-docids";
 | 
			
		||||
        let mut candidates = universe.clone();
 | 
			
		||||
        let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids
 | 
			
		||||
            .iter()
 | 
			
		||||
            .copied()
 | 
			
		||||
            .map(|(term, position, _)| (term.interned_words(ctx).collect(), position))
 | 
			
		||||
            .collect();
 | 
			
		||||
        for (words, position) in &words_positions {
 | 
			
		||||
            if candidates.is_empty() {
 | 
			
		||||
                // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
 | 
			
		||||
                return Ok(Some(RankingRuleOutput {
 | 
			
		||||
                    query: query_graph.clone(),
 | 
			
		||||
                    candidates: universe.clone(),
 | 
			
		||||
                }));
 | 
			
		||||
            }
 | 
			
		||||
 | 
			
		||||
            'words: for (offset, word) in words.iter().enumerate() {
 | 
			
		||||
                let offset = offset as u16;
 | 
			
		||||
                let word = if let Some(word) = word {
 | 
			
		||||
                    word
 | 
			
		||||
                } else {
 | 
			
		||||
                    continue 'words;
 | 
			
		||||
                };
 | 
			
		||||
                let word_position_docids = CboRoaringBitmapCodec::bytes_decode(
 | 
			
		||||
                    ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(),
 | 
			
		||||
                )
 | 
			
		||||
                .unwrap_or_default();
 | 
			
		||||
                candidates &= word_position_docids;
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        let candidates = candidates;
 | 
			
		||||
 | 
			
		||||
        if candidates.is_empty() {
 | 
			
		||||
            // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
 | 
			
		||||
            return Ok(Some(RankingRuleOutput {
 | 
			
		||||
                query: query_graph.clone(),
 | 
			
		||||
                candidates: universe.clone(),
 | 
			
		||||
            }));
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default();
 | 
			
		||||
 | 
			
		||||
        let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len());
 | 
			
		||||
 | 
			
		||||
        // then check that there exists at least one attribute that has all of the terms
 | 
			
		||||
        for fid in searchable_fields_ids {
 | 
			
		||||
            let mut intersection = MultiOps::intersection(
 | 
			
		||||
                words_positions
 | 
			
		||||
                    .iter()
 | 
			
		||||
                    .flat_map(|(words, ..)| words.iter())
 | 
			
		||||
                    // ignore stop words words in phrases
 | 
			
		||||
                    .flatten()
 | 
			
		||||
                    .map(|word| -> Result<_> {
 | 
			
		||||
                        Ok(ctx
 | 
			
		||||
                            .get_db_word_fid_docids(*word, fid)?
 | 
			
		||||
                            .map(CboRoaringBitmapCodec::bytes_decode)
 | 
			
		||||
                            .unwrap_or_default()
 | 
			
		||||
                            .unwrap_or_default())
 | 
			
		||||
                    }),
 | 
			
		||||
            )?;
 | 
			
		||||
            intersection &= &candidates;
 | 
			
		||||
            if !intersection.is_empty() {
 | 
			
		||||
                candidates_per_attributes.push(intersection);
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
        // note we could have "false positives" where there both exist different attributes that collectively
 | 
			
		||||
        // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order.
 | 
			
		||||
 | 
			
		||||
        let candidates = MultiOps::union(candidates_per_attributes.into_iter());
 | 
			
		||||
        Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates }))
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn end_iteration(
 | 
			
		||||
        &mut self,
 | 
			
		||||
        _ctx: &mut SearchContext<'ctx>,
 | 
			
		||||
        _logger: &mut dyn SearchLogger<QueryGraph>,
 | 
			
		||||
    ) {
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
@@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner};
 | 
			
		||||
use super::logger::SearchLogger;
 | 
			
		||||
use super::query_graph::QueryNode;
 | 
			
		||||
use super::ranking_rule_graph::{
 | 
			
		||||
    ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait,
 | 
			
		||||
    TypoGraph,
 | 
			
		||||
    ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph,
 | 
			
		||||
    RankingRuleGraphTrait, TypoGraph,
 | 
			
		||||
};
 | 
			
		||||
use super::small_bitmap::SmallBitmap;
 | 
			
		||||
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
 | 
			
		||||
@@ -65,6 +65,12 @@ impl GraphBasedRankingRule<TypoGraph> {
 | 
			
		||||
        Self::new_with_id("typo".to_owned(), terms_matching_strategy)
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
pub type Exactness = GraphBasedRankingRule<ExactnessGraph>;
 | 
			
		||||
impl GraphBasedRankingRule<ExactnessGraph> {
 | 
			
		||||
    pub fn new() -> Self {
 | 
			
		||||
        Self::new_with_id("exactness".to_owned(), None)
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// A generic graph-based ranking rule
 | 
			
		||||
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
 | 
			
		||||
 
 | 
			
		||||
@@ -9,8 +9,9 @@ mod query_term;
 | 
			
		||||
mod ranking_rule_graph;
 | 
			
		||||
mod ranking_rules;
 | 
			
		||||
mod resolve_query_graph;
 | 
			
		||||
// TODO: documentation + comments
 | 
			
		||||
mod small_bitmap;
 | 
			
		||||
 | 
			
		||||
mod exact_attribute;
 | 
			
		||||
// TODO: documentation + comments
 | 
			
		||||
// implementation is currently an adaptation of the previous implementation to fit with the new model
 | 
			
		||||
mod sort;
 | 
			
		||||
@@ -33,6 +34,8 @@ use resolve_query_graph::PhraseDocIdsCache;
 | 
			
		||||
use roaring::RoaringBitmap;
 | 
			
		||||
use words::Words;
 | 
			
		||||
 | 
			
		||||
use self::exact_attribute::ExactAttribute;
 | 
			
		||||
use self::graph_based_ranking_rule::Exactness;
 | 
			
		||||
use self::interner::Interner;
 | 
			
		||||
use self::ranking_rules::{BoxRankingRule, RankingRule};
 | 
			
		||||
use self::resolve_query_graph::compute_query_graph_docids;
 | 
			
		||||
@@ -150,7 +153,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
 | 
			
		||||
    let mut proximity = false;
 | 
			
		||||
    let mut sort = false;
 | 
			
		||||
    let attribute = false;
 | 
			
		||||
    let exactness = false;
 | 
			
		||||
    let mut exactness = false;
 | 
			
		||||
    let mut asc = HashSet::new();
 | 
			
		||||
    let mut desc = HashSet::new();
 | 
			
		||||
 | 
			
		||||
@@ -211,8 +214,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
 | 
			
		||||
                if exactness {
 | 
			
		||||
                    continue;
 | 
			
		||||
                }
 | 
			
		||||
                // todo!();
 | 
			
		||||
                // exactness = false;
 | 
			
		||||
                ranking_rules.push(Box::new(ExactAttribute::new()));
 | 
			
		||||
                ranking_rules.push(Box::new(Exactness::new()));
 | 
			
		||||
                exactness = true;
 | 
			
		||||
            }
 | 
			
		||||
            crate::Criterion::Asc(field_name) => {
 | 
			
		||||
                if asc.contains(&field_name) {
 | 
			
		||||
 
 | 
			
		||||
							
								
								
									
										107
									
								
								milli/src/search/new/ranking_rule_graph/exactness/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										107
									
								
								milli/src/search/new/ranking_rule_graph/exactness/mod.rs
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,107 @@
 | 
			
		||||
use roaring::RoaringBitmap;
 | 
			
		||||
 | 
			
		||||
use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
 | 
			
		||||
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
 | 
			
		||||
use crate::search::new::query_graph::{QueryGraph, QueryNode};
 | 
			
		||||
use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
 | 
			
		||||
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
 | 
			
		||||
 | 
			
		||||
/// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other
 | 
			
		||||
/// word than a doc that matches 9 words non exactly but none exactly
 | 
			
		||||
/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider
 | 
			
		||||
///
 | 
			
		||||
/// "naive vision"
 | 
			
		||||
/// condition from one node to another:
 | 
			
		||||
/// - word exactly present: cost 0
 | 
			
		||||
/// - word typo/ngram/prefix/missing: cost 1, not remove from query graph, edge btwn the two nodes, return the universe without condition when resolving, destination query term is inside
 | 
			
		||||
///
 | 
			
		||||
/// Three strategies:
 | 
			
		||||
/// 1. ExactAttribute: word position / word_fid_docid
 | 
			
		||||
/// 2. AttributeStart:
 | 
			
		||||
/// 3. AttributeContainsExact => implementable via `RankingRuleGraphTrait`
 | 
			
		||||
 | 
			
		||||
#[derive(Clone, PartialEq, Eq, Hash)]
 | 
			
		||||
pub enum ExactnessCondition {
 | 
			
		||||
    ExactInAttribute(LocatedQueryTermSubset),
 | 
			
		||||
    Skip(LocatedQueryTermSubset),
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Uninhabited marker type: it has no variants and only serves as the type implementing
/// `RankingRuleGraphTrait` for the exactness ranking rule.
pub enum ExactnessGraph {}
 | 
			
		||||
 | 
			
		||||
fn compute_docids(
 | 
			
		||||
    ctx: &mut SearchContext,
 | 
			
		||||
    dest_node: &LocatedQueryTermSubset,
 | 
			
		||||
    universe: &RoaringBitmap,
 | 
			
		||||
) -> Result<RoaringBitmap> {
 | 
			
		||||
    let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) {
 | 
			
		||||
        exact_term
 | 
			
		||||
    } else {
 | 
			
		||||
        return Ok(Default::default());
 | 
			
		||||
    };
 | 
			
		||||
    let mut candidates = match exact_term {
 | 
			
		||||
        ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
 | 
			
		||||
        ExactTerm::Word(word) => {
 | 
			
		||||
            if let Some(word_candidates) = ctx.get_db_word_docids(word)? {
 | 
			
		||||
                CboRoaringBitmapCodec::deserialize_from(word_candidates)?
 | 
			
		||||
            } else {
 | 
			
		||||
                return Ok(Default::default());
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    };
 | 
			
		||||
    // TODO: synonyms?
 | 
			
		||||
    candidates &= universe;
 | 
			
		||||
    Ok(candidates)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl RankingRuleGraphTrait for ExactnessGraph {
 | 
			
		||||
    type Condition = ExactnessCondition;
 | 
			
		||||
 | 
			
		||||
    fn resolve_condition(
 | 
			
		||||
        ctx: &mut SearchContext,
 | 
			
		||||
        condition: &Self::Condition,
 | 
			
		||||
        universe: &RoaringBitmap,
 | 
			
		||||
    ) -> Result<ComputedCondition> {
 | 
			
		||||
        let (docids, dest_node) = match condition {
 | 
			
		||||
            ExactnessCondition::ExactInAttribute(dest_node) => {
 | 
			
		||||
                (compute_docids(ctx, dest_node, universe)?, dest_node)
 | 
			
		||||
            }
 | 
			
		||||
            ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node),
 | 
			
		||||
        };
 | 
			
		||||
        Ok(ComputedCondition {
 | 
			
		||||
            docids,
 | 
			
		||||
            universe_len: universe.len(),
 | 
			
		||||
            start_term_subset: None,
 | 
			
		||||
            end_term_subset: dest_node.clone(),
 | 
			
		||||
        })
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn build_edges(
 | 
			
		||||
        _ctx: &mut SearchContext,
 | 
			
		||||
        conditions_interner: &mut DedupInterner<Self::Condition>,
 | 
			
		||||
        _source_node: Option<&LocatedQueryTermSubset>,
 | 
			
		||||
        dest_node: &LocatedQueryTermSubset,
 | 
			
		||||
    ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
 | 
			
		||||
        let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone());
 | 
			
		||||
        let exact_condition = conditions_interner.insert(exact_condition);
 | 
			
		||||
 | 
			
		||||
        let skip_condition = ExactnessCondition::Skip(dest_node.clone());
 | 
			
		||||
        let skip_condition = conditions_interner.insert(skip_condition);
 | 
			
		||||
        Ok(vec![(0, exact_condition), (1, skip_condition)])
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn log_state(
 | 
			
		||||
        graph: &RankingRuleGraph<Self>,
 | 
			
		||||
        paths: &[Vec<Interned<Self::Condition>>],
 | 
			
		||||
        dead_ends_cache: &DeadEndsCache<Self::Condition>,
 | 
			
		||||
        universe: &RoaringBitmap,
 | 
			
		||||
        costs: &MappedInterner<QueryNode, Vec<u64>>,
 | 
			
		||||
        cost: u64,
 | 
			
		||||
        logger: &mut dyn SearchLogger<QueryGraph>,
 | 
			
		||||
    ) {
 | 
			
		||||
        todo!()
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
 | 
			
		||||
        todo!()
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
@@ -10,6 +10,8 @@ mod cheapest_paths;
 | 
			
		||||
mod condition_docids_cache;
 | 
			
		||||
mod dead_ends_cache;
 | 
			
		||||
 | 
			
		||||
/// Implementation of the `exactness` ranking rule
 | 
			
		||||
mod exactness;
 | 
			
		||||
/// Implementation of the `proximity` ranking rule
 | 
			
		||||
mod proximity;
 | 
			
		||||
/// Implementation of the `typo` ranking rule
 | 
			
		||||
@@ -20,6 +22,7 @@ use std::hash::Hash;
 | 
			
		||||
pub use cheapest_paths::PathVisitor;
 | 
			
		||||
pub use condition_docids_cache::ConditionDocIdsCache;
 | 
			
		||||
pub use dead_ends_cache::DeadEndsCache;
 | 
			
		||||
pub use exactness::{ExactnessCondition, ExactnessGraph};
 | 
			
		||||
pub use proximity::{ProximityCondition, ProximityGraph};
 | 
			
		||||
use roaring::RoaringBitmap;
 | 
			
		||||
pub use typo::{TypoCondition, TypoGraph};
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user