Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-25 04:56:28 +00:00)
			
		
		
		
	Introduce a special `word_derivations` function for Proximity
This commit is contained in:
		| @@ -67,7 +67,7 @@ pub trait Context { | |||||||
|     fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>; |     fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>; | ||||||
|     fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>; |     fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>; | ||||||
|     fn in_prefix_cache(&self, word: &str) -> bool; |     fn in_prefix_cache(&self, word: &str) -> bool; | ||||||
|     fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>>; |     fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>>; | ||||||
| } | } | ||||||
| pub struct CriteriaBuilder<'t> { | pub struct CriteriaBuilder<'t> { | ||||||
|     rtxn: &'t heed::RoTxn<'t>, |     rtxn: &'t heed::RoTxn<'t>, | ||||||
| @@ -107,9 +107,13 @@ impl<'a> Context for CriteriaBuilder<'a> { | |||||||
|         self.words_prefixes_fst.contains(word) |         self.words_prefixes_fst.contains(word) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result<Option<RoaringBitmap>> { |     fn docid_words_positions(&self, docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> { | ||||||
|         let key = (docid, word); |         let mut words_positions = HashMap::new(); | ||||||
|         self.index.docid_word_positions.get(self.rtxn, &key) |         for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { | ||||||
|  |             let ((_, word), positions) = result?; | ||||||
|  |             words_positions.insert(word.to_string(), positions); | ||||||
|  |         } | ||||||
|  |         Ok(words_positions) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -391,7 +395,7 @@ pub mod test { | |||||||
|             self.word_prefix_docids.contains_key(&word.to_string()) |             self.word_prefix_docids.contains_key(&word.to_string()) | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         fn docid_word_positions(&self, _docid: DocumentId, _word: &str) -> heed::Result<Option<RoaringBitmap>> { |         fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result<HashMap<String, RoaringBitmap>> { | ||||||
|             todo!() |             todo!() | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -1,14 +1,13 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::collections::btree_map::{self, BTreeMap}; | use std::collections::btree_map::{self, BTreeMap}; | ||||||
| use std::collections::hash_map::{HashMap, Entry}; | use std::collections::hash_map::HashMap; | ||||||
| use std::mem::take; | use std::mem::take; | ||||||
|  |  | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use log::debug; | use log::debug; | ||||||
|  |  | ||||||
| use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}}; | use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; | ||||||
| use crate::search::query_tree::{maximum_proximity, Operation, Query}; | use crate::search::query_tree::{maximum_proximity, Operation, Query}; | ||||||
| use crate::search::WordDerivationsCache; | use crate::search::{build_dfa, WordDerivationsCache}; | ||||||
| use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; | use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; | ||||||
|  |  | ||||||
| pub struct Proximity<'t> { | pub struct Proximity<'t> { | ||||||
| @@ -358,7 +357,7 @@ fn resolve_plane_sweep_candidates( | |||||||
|         docid: DocumentId, |         docid: DocumentId, | ||||||
|         consecutive: bool, |         consecutive: bool, | ||||||
|         rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, |         rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, | ||||||
|         dwpcache: &mut HashMap<String, Option<RoaringBitmap>>, |         words_positions: &HashMap<String, RoaringBitmap>, | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> anyhow::Result<Vec<(Position, u8, Position)>> |     ) -> anyhow::Result<Vec<(Position, u8, Position)>> | ||||||
|     { |     { | ||||||
| @@ -400,7 +399,7 @@ fn resolve_plane_sweep_candidates( | |||||||
|         let mut groups_positions = Vec::with_capacity(groups_len); |         let mut groups_positions = Vec::with_capacity(groups_len); | ||||||
|  |  | ||||||
|         for operation in operations { |         for operation in operations { | ||||||
|             let positions = resolve_operation(ctx, operation, docid, rocache, dwpcache, wdcache)?; |             let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; | ||||||
|             groups_positions.push(positions.into_iter()); |             groups_positions.push(positions.into_iter()); | ||||||
|         } |         } | ||||||
|  |  | ||||||
| @@ -476,7 +475,7 @@ fn resolve_plane_sweep_candidates( | |||||||
|         query_tree: &'a Operation, |         query_tree: &'a Operation, | ||||||
|         docid: DocumentId, |         docid: DocumentId, | ||||||
|         rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, |         rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, | ||||||
|         dwpcache: &mut HashMap<String, Option<RoaringBitmap>>, |         words_positions: &HashMap<String, RoaringBitmap>, | ||||||
|         wdcache: &mut WordDerivationsCache, |         wdcache: &mut WordDerivationsCache, | ||||||
|     ) -> anyhow::Result<Vec<(Position, u8, Position)>> |     ) -> anyhow::Result<Vec<(Position, u8, Position)>> | ||||||
|     { |     { | ||||||
| @@ -487,44 +486,34 @@ fn resolve_plane_sweep_candidates( | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         let result = match query_tree { |         let result = match query_tree { | ||||||
|             And(ops) => plane_sweep(ctx, ops, docid, false, rocache, dwpcache, wdcache)?, |             And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?, | ||||||
|             Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, dwpcache, wdcache)?, |             Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?, | ||||||
|             Or(_, ops) => { |             Or(_, ops) => { | ||||||
|                 let mut result = Vec::new(); |                 let mut result = Vec::new(); | ||||||
|                 for op in ops { |                 for op in ops { | ||||||
|                     result.extend(resolve_operation(ctx, op, docid, rocache, dwpcache, wdcache)?) |                     result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?) | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 result.sort_unstable(); |                 result.sort_unstable(); | ||||||
|                 result |                 result | ||||||
|             }, |             }, | ||||||
|             Operation::Query(Query { prefix, kind }) => { |             Operation::Query(Query { prefix, kind }) => { | ||||||
|                 let fst = ctx.words_fst(); |                 let mut result = Vec::new(); | ||||||
|                 let words = match kind { |                 match kind { | ||||||
|                     QueryKind::Exact { word, .. } => { |                     QueryKind::Exact { word, .. } => { | ||||||
|                         if *prefix { |                         if *prefix { | ||||||
|                             Cow::Borrowed(word_derivations(word, true, 0, fst, wdcache)?) |                             let iter = word_derivations(word, true, 0, &words_positions) | ||||||
|  |                                 .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); | ||||||
|  |                             result.extend(iter); | ||||||
|                         } else { |                         } else { | ||||||
|                             Cow::Owned(vec![(word.to_string(), 0)]) |                             if let Some(positions) = words_positions.get(word) { | ||||||
|  |                                 result.extend(positions.iter().map(|p| (p, 0, p))); | ||||||
|  |                             } | ||||||
|                         } |                         } | ||||||
|                     }, |                     }, | ||||||
|                     QueryKind::Tolerant { typo, word } => { |                     QueryKind::Tolerant { typo, word } => { | ||||||
|                         Cow::Borrowed(word_derivations(word, *prefix, *typo, fst, wdcache)?) |                         let iter = word_derivations(word, *prefix, *typo, &words_positions) | ||||||
|                     } |                             .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); | ||||||
|                 }; |  | ||||||
|  |  | ||||||
|                 let mut result = Vec::new(); |  | ||||||
|                 for (word, _) in words.as_ref() { |  | ||||||
|                     let positions = match dwpcache.entry(word.to_string()) { |  | ||||||
|                         Entry::Occupied(entry) => entry.into_mut(), |  | ||||||
|                         Entry::Vacant(entry) => { |  | ||||||
|                             let positions = ctx.docid_word_positions(docid, word)?; |  | ||||||
|                             entry.insert(positions) |  | ||||||
|                         } |  | ||||||
|                     }; |  | ||||||
|  |  | ||||||
|                     if let Some(positions) = positions { |  | ||||||
|                         let iter = positions.iter().map(|p| (p, 0, p)); |  | ||||||
|                         result.extend(iter); |                         result.extend(iter); | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
| @@ -538,18 +527,34 @@ fn resolve_plane_sweep_candidates( | |||||||
|         Ok(result) |         Ok(result) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     let mut word_positions_cache = HashMap::new(); |     fn word_derivations<'a>( | ||||||
|  |         word: &str, | ||||||
|  |         is_prefix: bool, | ||||||
|  |         max_typo: u8, | ||||||
|  |         words_positions: &'a HashMap<String, RoaringBitmap>, | ||||||
|  |     ) -> impl Iterator<Item = &'a RoaringBitmap> | ||||||
|  |     { | ||||||
|  |         let dfa = build_dfa(word, max_typo, is_prefix); | ||||||
|  |         words_positions.iter().filter_map(move |(document_word, positions)| { | ||||||
|  |             use levenshtein_automata::Distance; | ||||||
|  |             match dfa.eval(document_word) { | ||||||
|  |                 Distance::Exact(_) => Some(positions), | ||||||
|  |                 Distance::AtLeast(_) => None, | ||||||
|  |             } | ||||||
|  |         }) | ||||||
|  |     } | ||||||
|  |  | ||||||
|     let mut resolve_operation_cache = HashMap::new(); |     let mut resolve_operation_cache = HashMap::new(); | ||||||
|     let mut candidates = BTreeMap::new(); |     let mut candidates = BTreeMap::new(); | ||||||
|     for docid in allowed_candidates { |     for docid in allowed_candidates { | ||||||
|         word_positions_cache.clear(); |         let words_positions = ctx.docid_words_positions(docid)?; | ||||||
|         resolve_operation_cache.clear(); |         resolve_operation_cache.clear(); | ||||||
|         let positions =  resolve_operation( |         let positions =  resolve_operation( | ||||||
|             ctx, |             ctx, | ||||||
|             query_tree, |             query_tree, | ||||||
|             docid, |             docid, | ||||||
|             &mut resolve_operation_cache, |             &mut resolve_operation_cache, | ||||||
|             &mut word_positions_cache, |             &words_positions, | ||||||
|             wdcache, |             wdcache, | ||||||
|         )?; |         )?; | ||||||
|         let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); |         let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user