mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 16:06:31 +00:00 
			
		
		
		
	custom fst automatons
This commit is contained in:
		| @@ -18,6 +18,3 @@ opt-level = 3 | |||||||
| opt-level = 3 | opt-level = 3 | ||||||
| [profile.test.build-override] | [profile.test.build-override] | ||||||
| opt-level = 3 | opt-level = 3 | ||||||
|  |  | ||||||
| [patch.crates-io] |  | ||||||
| fst = { git = "https://github.com/MarinPostma/fst.git", rev = "e6c606b7507e8cb5e502d1609f9b909b8690bac5" } |  | ||||||
|   | |||||||
							
								
								
									
										187
									
								
								milli/src/search/fst_utils.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										187
									
								
								milli/src/search/fst_utils.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,187 @@ | |||||||
//! This module is necessary until https://github.com/BurntSushi/fst/pull/137 gets merged.
//! All credits for this code go to BurntSushi.
|  | use fst::Automaton; | ||||||
|  |  | ||||||
|  | pub struct StartsWith<A>(pub A); | ||||||
|  |  | ||||||
|  | /// The `Automaton` state for `StartsWith<A>`. | ||||||
|  | pub struct StartsWithState<A: Automaton>(pub StartsWithStateKind<A>); | ||||||
|  |  | ||||||
|  | impl<A: Automaton> Clone for StartsWithState<A> | ||||||
|  | where | ||||||
|  |     A::State: Clone, | ||||||
|  | { | ||||||
|  |     fn clone(&self) -> Self { | ||||||
|  |         Self(self.0.clone()) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// The inner state of a `StartsWithState<A>`. | ||||||
|  | pub enum StartsWithStateKind<A: Automaton> { | ||||||
|  |     /// Sink state that is reached when the automaton has matched the prefix. | ||||||
|  |     Done, | ||||||
|  |     /// State in which the automaton is while it hasn't matched the prefix. | ||||||
|  |     Running(A::State), | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<A: Automaton> Clone for StartsWithStateKind<A> | ||||||
|  | where | ||||||
|  |     A::State: Clone, | ||||||
|  | { | ||||||
|  |     fn clone(&self) -> Self { | ||||||
|  |         match self { | ||||||
|  |             StartsWithStateKind::Done => StartsWithStateKind::Done, | ||||||
|  |             StartsWithStateKind::Running(inner) => StartsWithStateKind::Running(inner.clone()), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<A: Automaton> Automaton for StartsWith<A> { | ||||||
|  |     type State = StartsWithState<A>; | ||||||
|  |  | ||||||
|  |     fn start(&self) -> StartsWithState<A> { | ||||||
|  |         StartsWithState({ | ||||||
|  |             let inner = self.0.start(); | ||||||
|  |             if self.0.is_match(&inner) { | ||||||
|  |                 StartsWithStateKind::Done | ||||||
|  |             } else { | ||||||
|  |                 StartsWithStateKind::Running(inner) | ||||||
|  |             } | ||||||
|  |         }) | ||||||
|  |     } | ||||||
|  |     fn is_match(&self, state: &StartsWithState<A>) -> bool { | ||||||
|  |         match state.0 { | ||||||
|  |             StartsWithStateKind::Done => true, | ||||||
|  |             StartsWithStateKind::Running(_) => false, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     fn can_match(&self, state: &StartsWithState<A>) -> bool { | ||||||
|  |         match state.0 { | ||||||
|  |             StartsWithStateKind::Done => true, | ||||||
|  |             StartsWithStateKind::Running(ref inner) => self.0.can_match(inner), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     fn will_always_match(&self, state: &StartsWithState<A>) -> bool { | ||||||
|  |         match state.0 { | ||||||
|  |             StartsWithStateKind::Done => true, | ||||||
|  |             StartsWithStateKind::Running(_) => false, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |     fn accept(&self, state: &StartsWithState<A>, byte: u8) -> StartsWithState<A> { | ||||||
|  |         StartsWithState(match state.0 { | ||||||
|  |             StartsWithStateKind::Done => StartsWithStateKind::Done, | ||||||
|  |             StartsWithStateKind::Running(ref inner) => { | ||||||
|  |                 let next_inner = self.0.accept(inner, byte); | ||||||
|  |                 if self.0.is_match(&next_inner) { | ||||||
|  |                     StartsWithStateKind::Done | ||||||
|  |                 } else { | ||||||
|  |                     StartsWithStateKind::Running(next_inner) | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         }) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | /// An automaton that matches when one of its component automata match. | ||||||
|  | #[derive(Clone, Debug)] | ||||||
|  | pub struct Union<A, B>(pub A, pub B); | ||||||
|  |  | ||||||
|  | /// The `Automaton` state for `Union<A, B>`. | ||||||
|  | pub struct UnionState<A: Automaton, B: Automaton>(pub A::State, pub B::State); | ||||||
|  |  | ||||||
|  | impl<A: Automaton, B: Automaton> Clone for UnionState<A, B> | ||||||
|  | where | ||||||
|  |     A::State: Clone, | ||||||
|  |     B::State: Clone, | ||||||
|  | { | ||||||
|  |     fn clone(&self) -> Self { | ||||||
|  |         Self(self.0.clone(), self.1.clone()) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<A: Automaton, B: Automaton> Automaton for Union<A, B> { | ||||||
|  |     type State = UnionState<A, B>; | ||||||
|  |     fn start(&self) -> UnionState<A, B> { | ||||||
|  |         UnionState(self.0.start(), self.1.start()) | ||||||
|  |     } | ||||||
|  |     fn is_match(&self, state: &UnionState<A, B>) -> bool { | ||||||
|  |         self.0.is_match(&state.0) || self.1.is_match(&state.1) | ||||||
|  |     } | ||||||
|  |     fn can_match(&self, state: &UnionState<A, B>) -> bool { | ||||||
|  |         self.0.can_match(&state.0) || self.1.can_match(&state.1) | ||||||
|  |     } | ||||||
|  |     fn will_always_match(&self, state: &UnionState<A, B>) -> bool { | ||||||
|  |         self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1) | ||||||
|  |     } | ||||||
|  |     fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> { | ||||||
|  |         UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | /// An automaton that matches when both of its component automata match. | ||||||
|  | #[derive(Clone, Debug)] | ||||||
|  | pub struct Intersection<A, B>(pub A, pub B); | ||||||
|  |  | ||||||
|  | /// The `Automaton` state for `Intersection<A, B>`. | ||||||
|  | pub struct IntersectionState<A: Automaton, B: Automaton>(pub A::State, pub B::State); | ||||||
|  |  | ||||||
|  | impl<A: Automaton, B: Automaton> Clone for IntersectionState<A, B> | ||||||
|  | where | ||||||
|  |     A::State: Clone, | ||||||
|  |     B::State: Clone, | ||||||
|  | { | ||||||
|  |     fn clone(&self) -> Self { | ||||||
|  |         Self(self.0.clone(), self.1.clone()) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> { | ||||||
|  |     type State = IntersectionState<A, B>; | ||||||
|  |     fn start(&self) -> IntersectionState<A, B> { | ||||||
|  |         IntersectionState(self.0.start(), self.1.start()) | ||||||
|  |     } | ||||||
|  |     fn is_match(&self, state: &IntersectionState<A, B>) -> bool { | ||||||
|  |         self.0.is_match(&state.0) && self.1.is_match(&state.1) | ||||||
|  |     } | ||||||
|  |     fn can_match(&self, state: &IntersectionState<A, B>) -> bool { | ||||||
|  |         self.0.can_match(&state.0) && self.1.can_match(&state.1) | ||||||
|  |     } | ||||||
|  |     fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool { | ||||||
|  |         self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1) | ||||||
|  |     } | ||||||
|  |     fn accept(&self, state: &IntersectionState<A, B>, byte: u8) -> IntersectionState<A, B> { | ||||||
|  |         IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | /// An automaton that matches exactly when the automaton it wraps does not. | ||||||
|  | #[derive(Clone, Debug)] | ||||||
|  | pub struct Complement<A>(pub A); | ||||||
|  |  | ||||||
|  | /// The `Automaton` state for `Complement<A>`. | ||||||
|  | pub struct ComplementState<A: Automaton>(pub A::State); | ||||||
|  |  | ||||||
|  | impl<A: Automaton> Clone for ComplementState<A> | ||||||
|  | where | ||||||
|  |     A::State: Clone, | ||||||
|  | { | ||||||
|  |     fn clone(&self) -> Self { | ||||||
|  |         Self(self.0.clone()) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<A: Automaton> Automaton for Complement<A> { | ||||||
|  |     type State = ComplementState<A>; | ||||||
|  |     fn start(&self) -> ComplementState<A> { | ||||||
|  |         ComplementState(self.0.start()) | ||||||
|  |     } | ||||||
|  |     fn is_match(&self, state: &ComplementState<A>) -> bool { | ||||||
|  |         !self.0.is_match(&state.0) | ||||||
|  |     } | ||||||
|  |     fn can_match(&self, state: &ComplementState<A>) -> bool { | ||||||
|  |         !self.0.will_always_match(&state.0) | ||||||
|  |     } | ||||||
|  |     fn will_always_match(&self, state: &ComplementState<A>) -> bool { | ||||||
|  |         !self.0.can_match(&state.0) | ||||||
|  |     } | ||||||
|  |     fn accept(&self, state: &ComplementState<A>, byte: u8) -> ComplementState<A> { | ||||||
|  |         ComplementState(self.0.accept(&state.0, byte)) | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -16,6 +16,7 @@ use once_cell::sync::Lazy; | |||||||
| use roaring::bitmap::RoaringBitmap; | use roaring::bitmap::RoaringBitmap; | ||||||
|  |  | ||||||
| pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; | pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; | ||||||
|  | use self::fst_utils::{Complement, Intersection, StartsWith, Union}; | ||||||
| pub use self::matching_words::MatchingWords; | pub use self::matching_words::MatchingWords; | ||||||
| use self::query_tree::QueryTreeBuilder; | use self::query_tree::QueryTreeBuilder; | ||||||
| use crate::error::UserError; | use crate::error::UserError; | ||||||
| @@ -30,6 +31,7 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true)); | |||||||
| mod criteria; | mod criteria; | ||||||
| mod distinct; | mod distinct; | ||||||
| mod facet; | mod facet; | ||||||
|  | mod fst_utils; | ||||||
| mod matching_words; | mod matching_words; | ||||||
| mod query_tree; | mod query_tree; | ||||||
|  |  | ||||||
| @@ -70,7 +72,6 @@ impl<'a> Search<'a> { | |||||||
|  |  | ||||||
|     pub fn offset(&mut self, offset: usize) -> &mut Search<'a> { |     pub fn offset(&mut self, offset: usize) -> &mut Search<'a> { | ||||||
|         self.offset = offset; |         self.offset = offset; | ||||||
|  |  | ||||||
|         self |         self | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -301,8 +302,9 @@ pub fn word_derivations<'c>( | |||||||
|             } else { |             } else { | ||||||
|                 if max_typo == 1 { |                 if max_typo == 1 { | ||||||
|                     let dfa = build_dfa(word, 1, is_prefix); |                     let dfa = build_dfa(word, 1, is_prefix); | ||||||
|                     let starts = Str::new(get_first(word)).starts_with(); |                     let starts = StartsWith(Str::new(get_first(word))); | ||||||
|                     let mut stream = fst.search_with_state(starts.intersection(&dfa)).into_stream(); |                     let mut stream = | ||||||
|  |                         fst.search_with_state(Intersection(starts, &dfa)).into_stream(); | ||||||
|  |  | ||||||
|                     while let Some((word, state)) = stream.next() { |                     while let Some((word, state)) = stream.next() { | ||||||
|                         let word = std::str::from_utf8(word)?; |                         let word = std::str::from_utf8(word)?; | ||||||
| @@ -310,11 +312,11 @@ pub fn word_derivations<'c>( | |||||||
|                         derived_words.push((word.to_string(), d.to_u8())); |                         derived_words.push((word.to_string(), d.to_u8())); | ||||||
|                     } |                     } | ||||||
|                 } else { |                 } else { | ||||||
|                     let starts = Str::new(get_first(word)).starts_with(); |                     let starts = StartsWith(Str::new(get_first(word))); | ||||||
|                     let first = build_dfa(word, 1, is_prefix).intersection((&starts).complement()); |                     let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); | ||||||
|                     let second_dfa = build_dfa(word, 2, is_prefix); |                     let second_dfa = build_dfa(word, 2, is_prefix); | ||||||
|                     let second = (&second_dfa).intersection(&starts); |                     let second = Intersection(&second_dfa, &starts); | ||||||
|                     let automaton = first.union(&second); |                     let automaton = Union(first, &second); | ||||||
|  |  | ||||||
|                     let mut stream = fst.search_with_state(automaton).into_stream(); |                     let mut stream = fst.search_with_state(automaton).into_stream(); | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user