Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-26 05:26:27 +00:00)

	chore: Make the project a workspace
@@ -1,35 +0,0 @@
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};

pub struct LevBuilder {
    automatons: [LevenshteinAutomatonBuilder; 3],
}

impl LevBuilder {
    pub fn new() -> Self {
        Self {
            automatons: [
                LevenshteinAutomatonBuilder::new(0, false),
                LevenshteinAutomatonBuilder::new(1, false),
                LevenshteinAutomatonBuilder::new(2, false),
            ],
        }
    }

    pub fn get_automaton(&self, query: &str) -> Levenshtein {
        let dfa = if query.len() <= 4 {
            self.automatons[0].build_prefix_dfa(query)
        } else if query.len() <= 8 {
            self.automatons[1].build_prefix_dfa(query)
        } else {
            self.automatons[2].build_prefix_dfa(query)
        };

        Levenshtein { dfa, query_len: query.len() }
    }
}

#[derive(Clone)]
pub struct Levenshtein {
    pub dfa: DFA,
    pub query_len: usize,
}
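
For context, here is a minimal usage sketch of the removed LevBuilder API (illustrative only, not part of this commit; it assumes LevBuilder is in scope through the crate-root re-export shown in src/lib.rs below):

    // Build the three cached automaton builders once, then derive a
    // prefix DFA per query word. Words of at most 4 bytes tolerate 0 typos,
    // at most 8 bytes tolerate 1 typo, anything longer tolerates 2.
    let builder = LevBuilder::new();
    let lev = builder.get_automaton("hello");
    assert_eq!(lev.query_len, 5);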
							
								
								
									
src/lib.rs (120 lines removed)
@@ -1,120 +0,0 @@
#[macro_use] extern crate serde_derive;
extern crate bincode;
extern crate fst;
extern crate group_by;
extern crate levenshtein_automata;
extern crate serde;

pub mod map;
pub mod rank;
mod levenshtein;

use std::path::Path;
use std::fs;

pub use self::map::{Map, MapBuilder, Values};
pub use self::map::{
    OpBuilder, IndexedValues,
    OpWithStateBuilder, IndexedValuesWithState,
};
pub use self::rank::{RankedStream};
pub use self::levenshtein::LevBuilder;

pub type DocIndexMap = Map<DocIndex>;
pub type DocIndexMapBuilder = MapBuilder<DocIndex>;

pub type DocumentId = u64;

/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
pub struct DocIndex {

    /// The document identifier where the word was found.
    pub document: DocumentId,

    /// The attribute identifier in the document
    /// where the word was found.
    ///
    /// This is a `u8`, therefore a document
    /// cannot have more than `2^8` attributes.
    pub attribute: u8,

    /// The index where the word was found in the attribute.
    ///
    /// Only the first 1000 words are indexed.
    pub attribute_index: u32,
}

/// This structure represents a matching word with information
/// on the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered between themselves.
///
/// The word itself is not important.
// TODO do data oriented programming ? very arrays ?
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct Match {

    /// The word index in the query sentence.
    /// Same as the `attribute_index` but for the query words.
    ///
    /// Used to retrieve the automaton that matches this word.
    pub query_index: u32,

    /// The distance between the word and the query word
    /// (i.e. the Levenshtein distance).
    pub distance: u8,

    /// The attribute in which the word is located
    /// (i.e. Title is 0, Description is 1).
    ///
    /// This is a `u8`, therefore a document
    /// cannot have more than `2^8` attributes.
    pub attribute: u8,

    /// Where this word is located in the attribute string
    /// (i.e. at the start or the end of the attribute).
    ///
    /// The index in the attribute is limited to a maximum of `2^32`
    /// because we only index the first 1000 words of an attribute.
    pub attribute_index: u32,

    /// Whether the matching word is an exact match or a prefix.
    pub is_exact: bool,
}

impl Match {
    pub fn zero() -> Self {
        Match {
            query_index: 0,
            distance: 0,
            attribute: 0,
            attribute_index: 0,
            is_exact: false,
        }
    }

    pub fn max() -> Self {
        Match {
            query_index: u32::max_value(),
            distance: u8::max_value(),
            attribute: u8::max_value(),
            attribute_index: u32::max_value(),
            is_exact: true,
        }
    }
}


pub fn load_map<P, Q>(map: P, values: Q) -> fst::Result<DocIndexMap>
where P: AsRef<Path>, Q: AsRef<Path>,
{
    let fst = fs::read(map)?;
    let values = fs::read(values)?;
    DocIndexMap::from_bytes(fst, &values)
}
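
For context, a short sketch of how the removed load_map helper would be called (illustrative only; the file names are hypothetical):

    // Hypothetical file names: the first holds the fst map bytes, the second
    // the bincode-serialized Values<DocIndex> written by MapBuilder::build.
    let index: DocIndexMap = load_map("map.fst", "values.vecs").unwrap();
    assert!(index.contains_key("search"));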
							
								
								
									
src/map.rs (404 lines removed)
@@ -1,404 +0,0 @@
use bincode;
use fst::{self, Automaton};
use serde::de::DeserializeOwned;
use serde::ser::Serialize;
use std::collections::BTreeMap;
use std::collections::btree_map::Entry;
use std::fs::File;
use std::io::{Write, BufReader};
use std::ops::Range;
use std::path::Path;

#[derive(Debug)]
pub struct Map<T> {
    inner: fst::Map,
    values: Values<T>,
}

impl<T> Map<T> {
    pub unsafe fn from_paths<P, Q>(map: P, values: Q) -> fst::Result<Self>
    where
        T: DeserializeOwned,
        P: AsRef<Path>,
        Q: AsRef<Path>
    {
        let inner = fst::Map::from_path(map)?;

        // TODO handle errors !!!
        let values = File::open(values).unwrap();
        let values = BufReader::new(values);
        let values = bincode::deserialize_from(values).unwrap();

        Ok(Self { inner, values })
    }

    pub fn from_bytes(map: Vec<u8>, values: &[u8]) -> fst::Result<Self>
    where
        T: DeserializeOwned
    {
        let inner = fst::Map::from_bytes(map)?;
        let values = bincode::deserialize(values).unwrap();

        Ok(Self { inner, values })
    }

    pub fn stream(&self) -> Stream<T> {
        Stream {
            inner: self.inner.stream(),
            values: &self.values,
        }
    }

    pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
        self.inner.contains_key(key)
    }

    pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[T]> {
        self.inner.get(key).map(|i| unsafe { self.values.get_unchecked(i as usize) })
    }

    pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<T, A> {
        StreamBuilder {
            inner: self.inner.search(aut),
            values: &self.values,
        }
    }

    pub fn as_map(&self) -> &fst::Map {
        &self.inner
    }

    pub fn values(&self) -> &Values<T> {
        &self.values
    }
}

#[derive(Debug, Serialize, Deserialize)]
pub struct Values<T> {
    ranges: Box<[Range<u64>]>,
    values: Box<[T]>,
}

impl<T> Values<T> {
    fn new(raw: Vec<Vec<T>>) -> Self {
        let cap = raw.len();
        let mut ranges = Vec::with_capacity(cap);
        let cap = raw.iter().map(Vec::len).sum();
        let mut values = Vec::with_capacity(cap);

        for mut v in &raw {
            let len = v.len() as u64;
            let start = ranges.last().map(|&Range { end, .. }| end).unwrap_or(0);

            let range = Range { start, end: start + len };
            ranges.push(range);
        }

        values.extend(raw.into_iter().flat_map(IntoIterator::into_iter));

        let ranges = ranges.into_boxed_slice();
        let values = values.into_boxed_slice();

        Self { ranges, values }
    }

    pub unsafe fn get_unchecked(&self, index: usize) -> &[T] {
        let range = self.ranges.get_unchecked(index);
        let range = Range { start: range.start as usize, end: range.end as usize };
        self.values.get_unchecked(range)
    }
}

#[derive(Debug)]
pub struct MapBuilder<T> {
    map: BTreeMap<String, u64>,
    // This makes many memory indirections but it is only used
    // at index time, not kept for query time.
    values: Vec<Vec<T>>,
}

impl<T> MapBuilder<T> {
    pub fn new() -> Self {
        Self {
            map: BTreeMap::new(),
            values: Vec::new(),
        }
    }

    pub fn insert<S: Into<String>>(&mut self, key: S, value: T) {
        let key = key.into();
        match self.map.entry(key) {
            Entry::Vacant(e) => {
                self.values.push(vec![value]);
                let index = (self.values.len() - 1) as u64;
                e.insert(index);
            },
            Entry::Occupied(e) => {
                let index = *e.get();
                let values = &mut self.values[index as usize];
                values.push(value);
            },
        }
    }

    pub fn build_in_memory(self) -> fst::Result<Map<T>> {
        Ok(Map {
            inner: fst::Map::from_iter(self.map)?,
            values: Values::new(self.values),
        })
    }

    pub fn build<W, X>(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)>
    where
        T: Serialize,
        W: Write,
        X: Write
    {
        let mut builder = fst::MapBuilder::new(map_wrt)?;
        builder.extend_iter(self.map)?;
        let map = builder.into_inner()?;
        let values = Values::new(self.values);

        // TODO handle that error !!!
        bincode::serialize_into(&mut values_wrt, &values).unwrap();

        Ok((map, values_wrt))
    }
}

pub struct OpBuilder<'m, 'v, T: 'v> {
    inner: fst::map::OpBuilder<'m>,
    values: &'v Values<T>,
}

impl<'m, 'v, T: 'v> OpBuilder<'m, 'v, T> {
    pub fn new(values: &'v Values<T>) -> Self {
        OpBuilder {
            inner: fst::map::OpBuilder::new(),
            values: values,
        }
    }

    pub fn add<I, S>(mut self, streamable: I) -> Self
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
    {
        self.push(streamable);
        self
    }

    pub fn push<I, S>(&mut self, streamable: I)
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
    {
        self.inner.push(streamable);
    }

    pub fn union(self) -> Union<'m, 'v, T> {
        Union {
            inner: self.inner.union(),
            outs: Vec::new(),
            values: self.values,
        }
    }
}

pub struct Union<'m, 'v, T: 'v> {
    inner: fst::map::Union<'m>,
    outs: Vec<IndexedValues<'v, T>>,
    values: &'v Values<T>,
}

impl<'a, 'm, 'v, T: 'v + 'a> fst::Streamer<'a> for Union<'m, 'v, T> {
    type Item = (&'a [u8], &'a [IndexedValues<'a, T>]);

    fn next(&'a mut self) -> Option<Self::Item> {
        match self.inner.next() {
            Some((s, ivalues)) => {
                self.outs.clear();
                for ivalue in ivalues {
                    let index = ivalue.index;
                    let values = unsafe { self.values.get_unchecked(ivalue.value as usize) };
                    self.outs.push(IndexedValues { index, values })
                }
                Some((s, &self.outs))
            },
            None => None,
        }
    }
}

#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct IndexedValues<'a, T: 'a> {
    pub index: usize,
    pub values: &'a [T],
}

pub struct OpWithStateBuilder<'m, 'v, T: 'v, U> {
    inner: fst::map::OpWithStateBuilder<'m, U>,
    values: &'v Values<T>,
}

impl<'m, 'v, T: 'v, U: 'static> OpWithStateBuilder<'m, 'v, T, U> {
    pub fn new(values: &'v Values<T>) -> Self {
        Self {
            inner: fst::map::OpWithStateBuilder::new(),
            values: values,
        }
    }

    pub fn add<I, S>(mut self, streamable: I) -> Self
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
    {
        self.push(streamable);
        self
    }

    pub fn push<I, S>(&mut self, streamable: I)
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
    {
        self.inner.push(streamable);
    }

    pub fn union(self) -> UnionWithState<'m, 'v, T, U> {
        UnionWithState {
            inner: self.inner.union(),
            outs: Vec::new(),
            values: self.values,
        }
    }
}

pub struct UnionWithState<'m, 'v, T: 'v, U> {
    inner: fst::map::UnionWithState<'m, U>,
    outs: Vec<IndexedValuesWithState<'v, T, U>>,
    values: &'v Values<T>,
}

impl<'a, 'm, 'v, T: 'v + 'a, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, T, U>
where
    U: Clone,
{
    // TODO prefer returning (&[u8], index, value T, state) one by one
    type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, T, U>]);

    fn next(&'a mut self) -> Option<Self::Item> {
        match self.inner.next() {
            Some((s, ivalues)) => {
                self.outs.clear();
                self.outs.reserve(ivalues.len());
                for ivalue in ivalues {
                    let index = ivalue.index;
                    let values = unsafe { self.values.get_unchecked(ivalue.value as usize) };
                    let state = ivalue.state.clone();
                    self.outs.push(IndexedValuesWithState { index, values, state })
                }
                Some((s, &self.outs))
            },
            None => None,
        }
    }
}

#[derive(Copy, Clone, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub struct IndexedValuesWithState<'a, T: 'a, U> {
    pub index: usize,
    pub values: &'a [T],
    pub state: U,
}

pub struct StreamBuilder<'m, 'v, T: 'v, A> {
    inner: fst::map::StreamBuilder<'m, A>,
    values: &'v Values<T>,
}

impl<'m, 'v, T: 'v, A> StreamBuilder<'m, 'v, T, A> {
    pub fn with_state(self) -> StreamWithStateBuilder<'m, 'v, T, A> {
        StreamWithStateBuilder {
            inner: self.inner.with_state(),
            values: self.values,
        }
    }
}

impl<'m, 'v, 'a, T: 'v + 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, T, A> {
    type Item = <Self::Into as fst::Streamer<'a>>::Item;
    type Into = Stream<'m, 'v, T, A>;

    fn into_stream(self) -> Self::Into {
        Stream {
            inner: self.inner.into_stream(),
            values: self.values,
        }
    }
}

pub struct Stream<'m, 'v, T: 'v, A: Automaton = fst::automaton::AlwaysMatch> {
    inner: fst::map::Stream<'m, A>,
    values: &'v Values<T>,
}

impl<'m, 'v, 'a, T: 'v + 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, 'v, T, A> {
    type Item = (&'a [u8], &'a [T]);

    fn next(&'a mut self) -> Option<Self::Item> {
        // Here we can't just `map` because of some borrow rules
        match self.inner.next() {
            Some((key, i)) => {
                let values = unsafe { self.values.get_unchecked(i as usize) };
                Some((key, values))
            },
            None => None,
        }
    }
}

pub struct StreamWithStateBuilder<'m, 'v, T: 'v, A> {
    inner: fst::map::StreamWithStateBuilder<'m, A>,
    values: &'v Values<T>,
}

impl<'m, 'v, 'a, T: 'v + 'a, A: 'a> fst::IntoStreamer<'a> for StreamWithStateBuilder<'m, 'v, T, A>
where
    A: Automaton,
    A::State: Clone,
{
    type Item = <Self::Into as fst::Streamer<'a>>::Item;
    type Into = StreamWithState<'m, 'v, T, A>;

    fn into_stream(self) -> Self::Into {
        StreamWithState {
            inner: self.inner.into_stream(),
            values: self.values,
        }
    }
}

pub struct StreamWithState<'m, 'v, T: 'v, A: Automaton = fst::automaton::AlwaysMatch> {
    inner: fst::map::StreamWithState<'m, A>,
    values: &'v Values<T>,
}

impl<'m, 'v, 'a, T: 'v + 'a, A: 'a> fst::Streamer<'a> for StreamWithState<'m, 'v, T, A>
where
    A: Automaton,
    A::State: Clone,
{
    type Item = (&'a [u8], &'a [T], A::State);

    fn next(&'a mut self) -> Option<Self::Item> {
        match self.inner.next() {
            Some((key, i, state)) => {
                let values = unsafe { self.values.get_unchecked(i as usize) };
                Some((key, values, state))
            },
            None => None,
        }
    }
}
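
For context, a sketch of how the removed MapBuilder/Map pair fits together (illustrative only; the words and DocIndex values are made up):

    // Index a couple of words, build the map in memory, then query it back.
    let mut builder = DocIndexMapBuilder::new();
    builder.insert("search", DocIndex { document: 0, attribute: 0, attribute_index: 0 });
    builder.insert("engine", DocIndex { document: 0, attribute: 0, attribute_index: 1 });

    let map = builder.build_in_memory().unwrap();
    assert_eq!(map.get("search").map(|values| values.len()), Some(1));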
							
								
								
									
src/rank.rs (275 lines removed)
@@ -1,275 +0,0 @@
use std::cmp::{self, Ordering};
use std::collections::HashMap;
use std::{mem, vec, iter};
use DocIndexMap;
use fst;
use levenshtein::Levenshtein;
use map::{
    OpWithStateBuilder, UnionWithState,
    StreamWithStateBuilder,
    Values,
};
use {Match, DocIndex, DocumentId};
use group_by::{GroupBy, GroupByMut};

const MAX_DISTANCE: u32 = 8;

#[inline]
fn match_query_index(a: &Match, b: &Match) -> bool {
    a.query_index == b.query_index
}

#[derive(Debug, Clone)]
pub struct Document {
    document_id: DocumentId,
    matches: Vec<Match>,
}

impl Document {
    pub fn new(doc: DocumentId, match_: Match) -> Self {
        Self::from_sorted_matches(doc, vec![match_])
    }

    pub fn from_sorted_matches(doc: DocumentId, matches: Vec<Match>) -> Self {
        Self {
            document_id: doc,
            matches: matches,
        }
    }
}

fn sum_of_typos(lhs: &Document, rhs: &Document) -> Ordering {
    let key = |doc: &Document| -> u8 {
        GroupBy::new(&doc.matches, match_query_index).map(|m| m[0].distance).sum()
    };

    key(lhs).cmp(&key(rhs))
}

fn number_of_words(lhs: &Document, rhs: &Document) -> Ordering {
    let key = |doc: &Document| -> usize {
        GroupBy::new(&doc.matches, match_query_index).count()
    };

    key(lhs).cmp(&key(rhs)).reverse()
}

fn index_proximity(lhs: u32, rhs: u32) -> u32 {
    if lhs < rhs {
        cmp::min(rhs - lhs, MAX_DISTANCE)
    } else {
        cmp::min(lhs - rhs, MAX_DISTANCE) + 1
    }
}

fn attribute_proximity(lhs: &Match, rhs: &Match) -> u32 {
    if lhs.attribute != rhs.attribute { return MAX_DISTANCE }
    index_proximity(lhs.attribute_index, rhs.attribute_index)
}

fn words_proximity(lhs: &Document, rhs: &Document) -> Ordering {
    let key = |doc: &Document| -> u32 {
        let mut proximity = 0;
        let mut next_group_index = 0;
        for group in GroupBy::new(&doc.matches, match_query_index) {
            next_group_index += group.len();
            // FIXME distance is wrong if 2 different attributes match
            // FIXME do this in a way that avoids memory cache misses
            if let Some(first_next_group) = doc.matches.get(next_group_index) {
                proximity += attribute_proximity(first_next_group, &group[0]);
            }
        }
        proximity
    };

    key(lhs).cmp(&key(rhs))
}

fn sum_of_words_attribute(lhs: &Document, rhs: &Document) -> Ordering {
    let key = |doc: &Document| -> u8 {
        GroupBy::new(&doc.matches, match_query_index).map(|m| m[0].attribute).sum()
    };

    key(lhs).cmp(&key(rhs))
}

fn sum_of_words_position(lhs: &Document, rhs: &Document) -> Ordering {
    let key = |doc: &Document| -> u32 {
        GroupBy::new(&doc.matches, match_query_index).map(|m| m[0].attribute_index).sum()
    };

    key(lhs).cmp(&key(rhs))
}

fn exact(lhs: &Document, rhs: &Document) -> Ordering {
    let contains_exact = |matches: &[Match]| matches.iter().any(|m| m.is_exact);
    let key = |doc: &Document| -> usize {
        GroupBy::new(&doc.matches, match_query_index).map(contains_exact).filter(|x| *x).count()
    };

    key(lhs).cmp(&key(rhs))
}

pub struct Pool {
    documents: Vec<Document>,
    limit: usize,
}

impl Pool {
    pub fn new(query_size: usize, limit: usize) -> Self {
        Self {
            documents: Vec::new(),
            limit: limit,
        }
    }

    // TODO remove the matches HashMap, not proud of it
    pub fn extend(&mut self, matches: &mut HashMap<DocumentId, Vec<Match>>) {
        for doc in self.documents.iter_mut() {
            if let Some(matches) = matches.remove(&doc.document_id) {
                doc.matches.extend(matches);
                doc.matches.sort_unstable();
            }
        }

        for (id, mut matches) in matches.drain() {
            // note that matches are already sorted, we only sort again as a safety measure
            // TODO remove this useless sort
            matches.sort_unstable();

            let document = Document::from_sorted_matches(id, matches);
            self.documents.push(document);
        }
    }
}

fn invert_sorts<F>(a: &Document, b: &Document, sorts: &[F]) -> bool
where F: Fn(&Document, &Document) -> Ordering,
{
    sorts.iter().rev().all(|sort| sort(a, b) == Ordering::Equal)
}

impl IntoIterator for Pool {
    type Item = Document;
    type IntoIter = vec::IntoIter<Self::Item>;

    fn into_iter(mut self) -> Self::IntoIter {
        let sorts = &[
            sum_of_typos,
            number_of_words,
            words_proximity,
            sum_of_words_attribute,
            sum_of_words_position,
            exact,
        ];

        for (i, sort) in sorts.iter().enumerate() {
            let mut computed = 0;
            for group in GroupByMut::new(&mut self.documents, |a, b| invert_sorts(a, b, &sorts[..i])) {
                // TODO prefer using `sort_unstable_by_key` to allow reusing the key computation
                //      `number of words` needs to be reversed, we can use the `cmp::Reverse` struct to do that
                group.sort_unstable_by(sort);
                computed += group.len();
                if computed >= self.limit { break }
            }
        }

        self.documents.truncate(self.limit);
        self.documents.into_iter()
    }
}

pub enum RankedStream<'m, 'v> {
    Fed {
        inner: UnionWithState<'m, 'v, DocIndex, u32>,
        automatons: Vec<Levenshtein>,
        pool: Pool,
    },
    Pours {
        inner: vec::IntoIter<Document>,
    },
}

impl<'m, 'v> RankedStream<'m, 'v> {
    pub fn new(map: &'m DocIndexMap, values: &'v Values<DocIndex>, automatons: Vec<Levenshtein>, limit: usize) -> Self {
        let mut op = OpWithStateBuilder::new(values);

        for automaton in automatons.iter().map(|l| l.dfa.clone()) {
            let stream = map.as_map().search(automaton).with_state();
            op.push(stream);
        }

        let pool = Pool::new(automatons.len(), limit);

        RankedStream::Fed {
            inner: op.union(),
            automatons: automatons,
            pool: pool,
        }
    }
}

impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> {
    type Item = DocumentId;

    fn next(&'a mut self) -> Option<Self::Item> {
        let mut matches = HashMap::new();

        loop {
            // TODO remove that when NLL are here !
            let mut transfert_pool = None;

            match self {
                RankedStream::Fed { inner, automatons, pool } => {
                    match inner.next() {
                        Some((string, indexed_values)) => {
                            for iv in indexed_values {

                                // TODO extend documents matches by batch of query_index
                                //      that way it will be possible to discard matches that
                                //      have an invalid distance *before* adding them
                                //      to the matches of the documents and, that way, avoid a sort

                                let automaton = &automatons[iv.index];
                                let distance = automaton.dfa.distance(iv.state).to_u8();

                                // TODO remove the Pool system !
                                //      this is an internal Pool rule but
                                //      it is more efficient to test that here
                                // if pool.limitation.is_reached() && distance != 0 { continue }

                                for di in iv.values {
                                    let match_ = Match {
                                        query_index: iv.index as u32,
                                        distance: distance,
                                        attribute: di.attribute,
                                        attribute_index: di.attribute_index,
                                        is_exact: string.len() == automaton.query_len,
                                    };
                                    matches.entry(di.document)
                                            .and_modify(|ms: &mut Vec<_>| ms.push(match_))
                                            .or_insert_with(|| vec![match_]);
                                }
                                pool.extend(&mut matches);
                            }
                        },
                        None => {
                            // TODO remove this when NLL are here !
                            transfert_pool = Some(mem::replace(pool, Pool::new(1, 1)));
                        },
                    }
                },
                RankedStream::Pours { inner } => {
                    return inner.next().map(|d| d.document_id)
                },
            }

            // transform the `RankedStream` into a `Pours`
            if let Some(pool) = transfert_pool {
                *self = RankedStream::Pours {
                    inner: pool.into_iter(),
                }
            }
        }
    }
}
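
Finally, a sketch of how the removed RankedStream was meant to be driven (illustrative only; the query string and the `map` value are hypothetical, and `map.values()` is the accessor shown in src/map.rs):

    use fst::Streamer;

    // One automaton per query word, then stream document ids in ranked order,
    // limited here to the 20 best documents.
    let query = "hello world";
    let lev_builder = LevBuilder::new();
    let automatons: Vec<_> = query.split_whitespace()
        .map(|word| lev_builder.get_automaton(word))
        .collect();

    let mut stream = RankedStream::new(&map, map.values(), automatons, 20);
    while let Some(document_id) = stream.next() {
        println!("matching document: {}", document_id);
    }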