mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	chore: Move index related things to the meilidb-core workspace member
This commit is contained in:
		
							
								
								
									
										91
									
								
								meilidb-core/src/automaton.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										91
									
								
								meilidb-core/src/automaton.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,91 @@ | ||||
| use fst::Automaton; | ||||
| use lazy_static::lazy_static; | ||||
| use levenshtein_automata::{ | ||||
|     LevenshteinAutomatonBuilder as LevBuilder, | ||||
|     DFA, Distance, | ||||
| }; | ||||
|  | ||||
// Shared Levenshtein automaton builders, one per maximum edit distance
// (0, 1 and 2). `lazy_static!` initializes each builder on first access and
// the same instance is reused afterwards.
// NOTE(review): the second argument (`false`) is presumably the builder's
// "transposition costs one" switch — confirm against levenshtein_automata.
lazy_static! {
    static ref LEVDIST0: LevBuilder = LevBuilder::new(0, false);
    static ref LEVDIST1: LevBuilder = LevBuilder::new(1, false);
    static ref LEVDIST2: LevBuilder = LevBuilder::new(2, false);
}
|  | ||||
| pub struct DfaExt { | ||||
|     query_len: usize, | ||||
|     automaton: DFA, | ||||
| } | ||||
|  | ||||
| impl Automaton for DfaExt { | ||||
|     type State = <DFA as Automaton>::State; | ||||
|  | ||||
|     fn start(&self) -> Self::State { | ||||
|         self.automaton.start() | ||||
|     } | ||||
|  | ||||
|     fn is_match(&self, state: &Self::State) -> bool { | ||||
|         self.automaton.is_match(state) | ||||
|     } | ||||
|  | ||||
|     fn can_match(&self, state: &Self::State) -> bool { | ||||
|         self.automaton.can_match(state) | ||||
|     } | ||||
|  | ||||
|     fn will_always_match(&self, state: &Self::State) -> bool { | ||||
|         self.automaton.will_always_match(state) | ||||
|     } | ||||
|  | ||||
|     fn accept(&self, state: &Self::State, byte: u8) -> Self::State { | ||||
|         self.automaton.accept(state, byte) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl AutomatonExt for DfaExt { | ||||
|     fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance { | ||||
|         self.automaton.eval(s) | ||||
|     } | ||||
|  | ||||
|     fn query_len(&self) -> usize { | ||||
|         self.query_len | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Copy, Clone)] | ||||
| enum PrefixSetting { | ||||
|     Prefix, | ||||
|     NoPrefix, | ||||
| } | ||||
|  | ||||
| fn build_dfa_with_setting(query: &str, setting: PrefixSetting) -> DfaExt { | ||||
|     use self::PrefixSetting::{Prefix, NoPrefix}; | ||||
|  | ||||
|     let dfa = match query.len() { | ||||
|         0 ..= 4 => match setting { | ||||
|             Prefix   => LEVDIST0.build_prefix_dfa(query), | ||||
|             NoPrefix => LEVDIST0.build_dfa(query), | ||||
|         }, | ||||
|         5 ..= 8 => match setting { | ||||
|             Prefix   => LEVDIST1.build_prefix_dfa(query), | ||||
|             NoPrefix => LEVDIST1.build_dfa(query), | ||||
|         }, | ||||
|         _ => match setting { | ||||
|             Prefix   => LEVDIST2.build_prefix_dfa(query), | ||||
|             NoPrefix => LEVDIST2.build_dfa(query), | ||||
|         }, | ||||
|     }; | ||||
|  | ||||
|     DfaExt { query_len: query.len(), automaton: dfa } | ||||
| } | ||||
|  | ||||
| pub fn build_prefix_dfa(query: &str) -> DfaExt { | ||||
|     build_dfa_with_setting(query, PrefixSetting::Prefix) | ||||
| } | ||||
|  | ||||
| pub fn build_dfa(query: &str) -> DfaExt { | ||||
|     build_dfa_with_setting(query, PrefixSetting::NoPrefix) | ||||
| } | ||||
|  | ||||
/// An [`Automaton`] that can additionally evaluate a whole input at once and
/// report the length of the query it was built from.
pub trait AutomatonExt: Automaton {
    /// Evaluates the bytes of `s` and returns the resulting [`Distance`].
    fn eval<B: AsRef<[u8]>>(&self, s: B) -> Distance;
    /// Returns the length of the query this automaton was built from.
    fn query_len(&self) -> usize;
}
							
								
								
									
										12
									
								
								meilidb-core/src/criterion/document_id.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								meilidb-core/src/criterion/document_id.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,12 @@ | ||||
| use std::cmp::Ordering; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct DocumentId; | ||||
|  | ||||
| impl Criterion for DocumentId { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         lhs.id.cmp(&rhs.id) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										39
									
								
								meilidb-core/src/criterion/exact.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										39
									
								
								meilidb-core/src/criterion/exact.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,39 @@ | ||||
| use std::cmp::Ordering; | ||||
| use slice_group_by::GroupBy; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| #[inline] | ||||
| fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { | ||||
|     let mut count = 0; | ||||
|     let mut index = 0; | ||||
|  | ||||
|     for group in query_index.linear_group() { | ||||
|         let len = group.len(); | ||||
|         count += is_exact[index..index + len].contains(&true) as usize; | ||||
|         index += len; | ||||
|     } | ||||
|  | ||||
|     count | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct Exact; | ||||
|  | ||||
| impl Criterion for Exact { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             let is_exact = lhs.is_exact(); | ||||
|             number_exact_matches(query_index, is_exact) | ||||
|         }; | ||||
|  | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             let is_exact = rhs.is_exact(); | ||||
|             number_exact_matches(query_index, is_exact) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
| } | ||||
							
								
								
									
										112
									
								
								meilidb-core/src/criterion/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										112
									
								
								meilidb-core/src/criterion/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,112 @@ | ||||
| mod sum_of_typos; | ||||
| mod number_of_words; | ||||
| mod words_proximity; | ||||
| mod sum_of_words_attribute; | ||||
| mod sum_of_words_position; | ||||
| mod exact; | ||||
| // mod sort_by_attr; | ||||
| mod document_id; | ||||
|  | ||||
| use std::cmp::Ordering; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| pub use self::{ | ||||
|     sum_of_typos::SumOfTypos, | ||||
|     number_of_words::NumberOfWords, | ||||
|     words_proximity::WordsProximity, | ||||
|     sum_of_words_attribute::SumOfWordsAttribute, | ||||
|     sum_of_words_position::SumOfWordsPosition, | ||||
|     exact::Exact, | ||||
|     // sort_by_attr::SortByAttr, | ||||
|     document_id::DocumentId, | ||||
| }; | ||||
|  | ||||
| pub trait Criterion: Send + Sync { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering; | ||||
|  | ||||
|     #[inline] | ||||
|     fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { | ||||
|         self.evaluate(lhs, rhs) == Ordering::Equal | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         (**self).evaluate(lhs, rhs) | ||||
|     } | ||||
|  | ||||
|     fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { | ||||
|         (**self).eq(lhs, rhs) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<T: Criterion + ?Sized> Criterion for Box<T> { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         (**self).evaluate(lhs, rhs) | ||||
|     } | ||||
|  | ||||
|     fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool { | ||||
|         (**self).eq(lhs, rhs) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[derive(Default)] | ||||
| pub struct CriteriaBuilder<'a> { | ||||
|     inner: Vec<Box<dyn Criterion + 'a>> | ||||
| } | ||||
|  | ||||
| impl<'a> CriteriaBuilder<'a> | ||||
| { | ||||
|     pub fn new() -> CriteriaBuilder<'a> { | ||||
|         CriteriaBuilder { inner: Vec::new() } | ||||
|     } | ||||
|  | ||||
|     pub fn with_capacity(capacity: usize) -> CriteriaBuilder<'a> { | ||||
|         CriteriaBuilder { inner: Vec::with_capacity(capacity) } | ||||
|     } | ||||
|  | ||||
|     pub fn reserve(&mut self, additional: usize) { | ||||
|         self.inner.reserve(additional) | ||||
|     } | ||||
|  | ||||
|     pub fn add<C: 'a>(mut self, criterion: C) -> CriteriaBuilder<'a> | ||||
|     where C: Criterion, | ||||
|     { | ||||
|         self.push(criterion); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn push<C: 'a>(&mut self, criterion: C) | ||||
|     where C: Criterion, | ||||
|     { | ||||
|         self.inner.push(Box::new(criterion)); | ||||
|     } | ||||
|  | ||||
|     pub fn build(self) -> Criteria<'a> { | ||||
|         Criteria { inner: self.inner } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct Criteria<'a> { | ||||
|     inner: Vec<Box<dyn Criterion + 'a>>, | ||||
| } | ||||
|  | ||||
| impl<'a> Default for Criteria<'a> { | ||||
|     fn default() -> Self { | ||||
|         CriteriaBuilder::with_capacity(7) | ||||
|             .add(SumOfTypos) | ||||
|             .add(NumberOfWords) | ||||
|             .add(WordsProximity) | ||||
|             .add(SumOfWordsAttribute) | ||||
|             .add(SumOfWordsPosition) | ||||
|             .add(Exact) | ||||
|             .add(DocumentId) | ||||
|             .build() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> AsRef<[Box<Criterion + 'a>]> for Criteria<'a> { | ||||
|     fn as_ref(&self) -> &[Box<dyn Criterion + 'a>] { | ||||
|         &self.inner | ||||
|     } | ||||
| } | ||||
							
								
								
									
										27
									
								
								meilidb-core/src/criterion/number_of_words.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								meilidb-core/src/criterion/number_of_words.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | ||||
| use std::cmp::Ordering; | ||||
| use slice_group_by::GroupBy; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| #[inline] | ||||
| fn number_of_query_words(query_index: &[u32]) -> usize { | ||||
|     query_index.linear_group().count() | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct NumberOfWords; | ||||
|  | ||||
| impl Criterion for NumberOfWords { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             number_of_query_words(query_index) | ||||
|         }; | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             number_of_query_words(query_index) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
| } | ||||
							
								
								
									
										122
									
								
								meilidb-core/src/criterion/sort_by_attr.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										122
									
								
								meilidb-core/src/criterion/sort_by_attr.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,122 @@ | ||||
| use std::cmp::Ordering; | ||||
| use std::error::Error; | ||||
| use std::fmt; | ||||
|  | ||||
| use crate::database::schema::{Schema, SchemaAttr}; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::database::RankedMap; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| /// An helper struct that permit to sort documents by | ||||
| /// some of their stored attributes. | ||||
| /// | ||||
| /// # Note | ||||
| /// | ||||
| /// If a document cannot be deserialized it will be considered [`None`][]. | ||||
| /// | ||||
| /// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`, | ||||
| /// so you must check the [`Ord`] of `Option` implementation. | ||||
| /// | ||||
| /// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None | ||||
| /// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord | ||||
| /// | ||||
| /// # Example | ||||
| /// | ||||
| /// ```ignore | ||||
| /// use serde_derive::Deserialize; | ||||
| /// use meilidb::rank::criterion::*; | ||||
| /// | ||||
| /// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?; | ||||
| /// | ||||
| /// let builder = CriteriaBuilder::with_capacity(8) | ||||
| ///        .add(SumOfTypos) | ||||
| ///        .add(NumberOfWords) | ||||
| ///        .add(WordsProximity) | ||||
| ///        .add(SumOfWordsAttribute) | ||||
| ///        .add(SumOfWordsPosition) | ||||
| ///        .add(Exact) | ||||
| ///        .add(custom_ranking) | ||||
| ///        .add(DocumentId); | ||||
| /// | ||||
| /// let criterion = builder.build(); | ||||
| /// | ||||
| /// ``` | ||||
| pub struct SortByAttr<'a> { | ||||
|     ranked_map: &'a RankedMap, | ||||
|     attr: SchemaAttr, | ||||
|     reversed: bool, | ||||
| } | ||||
|  | ||||
| impl<'a> SortByAttr<'a> { | ||||
|     pub fn lower_is_better( | ||||
|         ranked_map: &'a RankedMap, | ||||
|         schema: &Schema, | ||||
|         attr_name: &str, | ||||
|     ) -> Result<SortByAttr<'a>, SortByAttrError> | ||||
|     { | ||||
|         SortByAttr::new(ranked_map, schema, attr_name, false) | ||||
|     } | ||||
|  | ||||
|     pub fn higher_is_better( | ||||
|         ranked_map: &'a RankedMap, | ||||
|         schema: &Schema, | ||||
|         attr_name: &str, | ||||
|     ) -> Result<SortByAttr<'a>, SortByAttrError> | ||||
|     { | ||||
|         SortByAttr::new(ranked_map, schema, attr_name, true) | ||||
|     } | ||||
|  | ||||
|     fn new( | ||||
|         ranked_map: &'a RankedMap, | ||||
|         schema: &Schema, | ||||
|         attr_name: &str, | ||||
|         reversed: bool, | ||||
|     ) -> Result<SortByAttr<'a>, SortByAttrError> | ||||
|     { | ||||
|         let attr = match schema.attribute(attr_name) { | ||||
|             Some(attr) => attr, | ||||
|             None => return Err(SortByAttrError::AttributeNotFound), | ||||
|         }; | ||||
|  | ||||
|         if !schema.props(attr).is_ranked() { | ||||
|             return Err(SortByAttrError::AttributeNotRegisteredForRanking); | ||||
|         } | ||||
|  | ||||
|         Ok(SortByAttr { ranked_map, attr, reversed }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> Criterion for SortByAttr<'a> { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = self.ranked_map.get(&(lhs.id, self.attr)); | ||||
|         let rhs = self.ranked_map.get(&(rhs.id, self.attr)); | ||||
|  | ||||
|         match (lhs, rhs) { | ||||
|             (Some(lhs), Some(rhs)) => { | ||||
|                 let order = lhs.cmp(&rhs); | ||||
|                 if self.reversed { order.reverse() } else { order } | ||||
|             }, | ||||
|             (None,    Some(_)) => Ordering::Greater, | ||||
|             (Some(_), None)    => Ordering::Less, | ||||
|             (None,    None)    => Ordering::Equal, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Reasons why a `SortByAttr` criterion cannot be created.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SortByAttrError {
    /// The requested attribute name is not part of the schema.
    AttributeNotFound,
    /// The attribute exists but is not marked as ranked in the schema.
    AttributeNotRegisteredForRanking,
}

impl fmt::Display for SortByAttrError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let message = match *self {
            SortByAttrError::AttributeNotFound => "attribute not found in the schema",
            SortByAttrError::AttributeNotRegisteredForRanking => {
                "attribute not registered for ranking"
            }
        };

        f.write_str(message)
    }
}

impl Error for SortByAttrError {}
							
								
								
									
										112
									
								
								meilidb-core/src/criterion/sum_of_typos.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										112
									
								
								meilidb-core/src/criterion/sum_of_typos.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,112 @@ | ||||
| use std::cmp::Ordering; | ||||
|  | ||||
| use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
|  | ||||
// Deliberately inexact base-10 logarithm: it maps a typo count `n` to a
// truncated approximation of log10(n + 1).
// Panicking on inputs above 3 is acceptable, the number of typos is never
// bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
    // Indexed by typo count: log(1), log(2), log(3), log(4).
    const TABLE: [f32; 4] = [0.0, 0.30102, 0.47712, 0.60205];

    match TABLE.get(usize::from(n)) {
        Some(&value) => value,
        None => panic!("invalid number"),
    }
}
|  | ||||
| #[inline] | ||||
| fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize { | ||||
|     let mut number_words = 0; | ||||
|     let mut sum_typos = 0.0; | ||||
|     let mut index = 0; | ||||
|  | ||||
|     for group in query_index.linear_group() { | ||||
|         sum_typos += custom_log10(distance[index]); | ||||
|         number_words += 1; | ||||
|         index += group.len(); | ||||
|     } | ||||
|  | ||||
|     (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct SumOfTypos; | ||||
|  | ||||
| impl Criterion for SumOfTypos { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             let distance = lhs.distance(); | ||||
|             sum_matches_typos(query_index, distance) | ||||
|         }; | ||||
|  | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             let distance = rhs.distance(); | ||||
|             sum_matches_typos(query_index, distance) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs).reverse() | ||||
|     } | ||||
| } | ||||
|  | ||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Orders two documents from their raw match data the same way
    /// `SumOfTypos::evaluate` does.
    fn rank(
        query_index0: &[u32],
        distance0: &[u8],
        query_index1: &[u32],
        distance1: &[u8],
    ) -> Ordering {
        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        doc0.cmp(&doc1).reverse()
    }

    // typing: "Geox CEO"
    //
    // doc0: "Geox SpA: CEO and Executive"
    // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
    #[test]
    fn one_typo_reference() {
        let ordering = rank(&[0, 1], &[0, 0], &[0, 1], &[1, 0]);
        assert_eq!(ordering, Ordering::Less);
    }

    // typing: "bouton manchette"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn no_typo() {
        let ordering = rank(&[0, 1], &[0, 0], &[0], &[0]);
        assert_eq!(ordering, Ordering::Less);
    }

    // typing: "bouton manchztte"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn one_typo() {
        let ordering = rank(&[0, 1], &[0, 1], &[0], &[0]);
        assert_eq!(ordering, Ordering::Less);
    }
}
							
								
								
									
										38
									
								
								meilidb-core/src/criterion/sum_of_words_attribute.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								meilidb-core/src/criterion/sum_of_words_attribute.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| use std::cmp::Ordering; | ||||
| use slice_group_by::GroupBy; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| #[inline] | ||||
| fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize { | ||||
|     let mut sum_attributes = 0; | ||||
|     let mut index = 0; | ||||
|  | ||||
|     for group in query_index.linear_group() { | ||||
|         sum_attributes += attribute[index] as usize; | ||||
|         index += group.len(); | ||||
|     } | ||||
|  | ||||
|     sum_attributes | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct SumOfWordsAttribute; | ||||
|  | ||||
| impl Criterion for SumOfWordsAttribute { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             let attribute = lhs.attribute(); | ||||
|             sum_matches_attributes(query_index, attribute) | ||||
|         }; | ||||
|  | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             let attribute = rhs.attribute(); | ||||
|             sum_matches_attributes(query_index, attribute) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										38
									
								
								meilidb-core/src/criterion/sum_of_words_position.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										38
									
								
								meilidb-core/src/criterion/sum_of_words_position.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,38 @@ | ||||
| use std::cmp::Ordering; | ||||
| use slice_group_by::GroupBy; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
|  | ||||
| #[inline] | ||||
| fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize { | ||||
|     let mut sum_word_index = 0; | ||||
|     let mut index = 0; | ||||
|  | ||||
|     for group in query_index.linear_group() { | ||||
|         sum_word_index += word_index[index] as usize; | ||||
|         index += group.len(); | ||||
|     } | ||||
|  | ||||
|     sum_word_index | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct SumOfWordsPosition; | ||||
|  | ||||
| impl Criterion for SumOfWordsPosition { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             let word_index = lhs.word_index(); | ||||
|             sum_matches_attribute_index(query_index, word_index) | ||||
|         }; | ||||
|  | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             let word_index = rhs.word_index(); | ||||
|             sum_matches_attribute_index(query_index, word_index) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										151
									
								
								meilidb-core/src/criterion/words_proximity.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										151
									
								
								meilidb-core/src/criterion/words_proximity.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,151 @@ | ||||
| use std::cmp::{self, Ordering}; | ||||
| use slice_group_by::GroupBy; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::RawDocument; | ||||
|  | ||||
// Cap applied to the word-index gap between two matches; also the proximity
// assigned to two matches found in different attributes.
const MAX_DISTANCE: u16 = 8;
|  | ||||
/// Turns a pair of references into a pair of owned clones.
#[inline]
fn clone_tuple<T: Clone, U: Clone>(pair: (&T, &U)) -> (T, U) {
    let (first, second) = pair;
    (T::clone(first), U::clone(second))
}
|  | ||||
| fn index_proximity(lhs: u16, rhs: u16) -> u16 { | ||||
|     if lhs < rhs { | ||||
|         cmp::min(rhs - lhs, MAX_DISTANCE) | ||||
|     } else { | ||||
|         cmp::min(lhs - rhs, MAX_DISTANCE) + 1 | ||||
|     } | ||||
| } | ||||
|  | ||||
| fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 { | ||||
|     if lattr != rattr { return MAX_DISTANCE } | ||||
|     index_proximity(lwi, rwi) | ||||
| } | ||||
|  | ||||
| fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 { | ||||
|     let mut min_prox = u16::max_value(); | ||||
|  | ||||
|     for a in lattr.iter().zip(lwi) { | ||||
|         for b in rattr.iter().zip(rwi) { | ||||
|             let a = clone_tuple(a); | ||||
|             let b = clone_tuple(b); | ||||
|             min_prox = cmp::min(min_prox, attribute_proximity(a, b)); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     min_prox | ||||
| } | ||||
|  | ||||
| fn matches_proximity( | ||||
|     query_index: &[u32], | ||||
|     distance: &[u8], | ||||
|     attribute: &[u16], | ||||
|     word_index: &[u16], | ||||
| ) -> u16 | ||||
| { | ||||
|     let mut query_index_groups = query_index.linear_group(); | ||||
|     let mut proximity = 0; | ||||
|     let mut index = 0; | ||||
|  | ||||
|     let get_attr_wi = |index: usize, group_len: usize| { | ||||
|         // retrieve the first distance group (with the lowest values) | ||||
|         let len = distance[index..index + group_len].linear_group().next().unwrap().len(); | ||||
|  | ||||
|         let rattr = &attribute[index..index + len]; | ||||
|         let rwi = &word_index[index..index + len]; | ||||
|  | ||||
|         (rattr, rwi) | ||||
|     }; | ||||
|  | ||||
|     let mut last = query_index_groups.next().map(|group| { | ||||
|         let attr_wi = get_attr_wi(index, group.len()); | ||||
|         index += group.len(); | ||||
|         attr_wi | ||||
|     }); | ||||
|  | ||||
|     // iter by windows of size 2 | ||||
|     while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) { | ||||
|         let attr_wi = get_attr_wi(index, rhs.len()); | ||||
|         proximity += min_proximity(lhs, attr_wi); | ||||
|         last = Some(attr_wi); | ||||
|         index += rhs.len(); | ||||
|     } | ||||
|  | ||||
|     proximity | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Copy)] | ||||
| pub struct WordsProximity; | ||||
|  | ||||
| impl Criterion for WordsProximity { | ||||
|     fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering { | ||||
|         let lhs = { | ||||
|             let query_index = lhs.query_index(); | ||||
|             let distance = lhs.distance(); | ||||
|             let attribute = lhs.attribute(); | ||||
|             let word_index = lhs.word_index(); | ||||
|             matches_proximity(query_index, distance, attribute, word_index) | ||||
|         }; | ||||
|  | ||||
|         let rhs = { | ||||
|             let query_index = rhs.query_index(); | ||||
|             let distance = rhs.distance(); | ||||
|             let attribute = rhs.attribute(); | ||||
|             let word_index = rhs.word_index(); | ||||
|             matches_proximity(query_index, distance, attribute, word_index) | ||||
|         }; | ||||
|  | ||||
|         lhs.cmp(&rhs) | ||||
|     } | ||||
| } | ||||
|  | ||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn three_different_attributes() {
        // "soup" "of the" "the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 0 }
        // { id: 2, attr: 1, attr_index: 1 }
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }
        let query_index = &[0, 1, 2, 2, 3];
        let distance = &[0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 2, 3];
        let word_index = &[0, 0, 1, 0, 1];

        let proximity = matches_proximity(query_index, distance, attribute, word_index);

        //   soup -> of = 8
        // + of -> the  = 1
        // + the -> day = 8 (not 1)
        assert_eq!(proximity, 17);
    }

    #[test]
    fn two_different_attributes() {
        // "soup day" "soup of the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 0, attr: 1, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 1 }
        // { id: 2, attr: 1, attr_index: 2 }
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }
        let query_index = &[0, 0, 1, 2, 3, 3];
        let distance = &[0, 0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 1, 0, 1];
        let word_index = &[0, 0, 1, 2, 1, 3];

        let proximity = matches_proximity(query_index, distance, attribute, word_index);

        //   soup -> of = 1
        // + of -> the  = 1
        // + the -> day = 1
        assert_eq!(proximity, 3);
    }
}
							
								
								
									
										61
									
								
								meilidb-core/src/data/doc_ids.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										61
									
								
								meilidb-core/src/data/doc_ids.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,61 @@ | ||||
| use std::slice::from_raw_parts; | ||||
| use std::mem::size_of; | ||||
| use std::error::Error; | ||||
|  | ||||
| use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; | ||||
| use sdset::Set; | ||||
|  | ||||
| use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; | ||||
| use crate::write_to_bytes::WriteToBytes; | ||||
| use crate::data::SharedData; | ||||
| use crate::DocumentId; | ||||
|  | ||||
| use super::into_u8_slice; | ||||
|  | ||||
| /// A set of `DocumentId`s kept as raw bytes inside a `SharedData`. | ||||
| /// | ||||
| /// The bytes are the in-memory representation of the ids (see `DocIds::new`) | ||||
| /// and are reinterpreted, without copying, by the `AsRef<Set<DocumentId>>` impl. | ||||
| #[derive(Default, Clone)] | ||||
| pub struct DocIds(SharedData); | ||||
|  | ||||
| impl DocIds { | ||||
|     /// Builds a `DocIds` by copying the raw bytes of the given set of ids. | ||||
|     pub fn new(ids: &Set<DocumentId>) -> DocIds { | ||||
|         // `into_u8_slice` only reinterprets the ids as bytes, | ||||
|         // `to_vec` is what makes the owned copy. | ||||
|         let raw = unsafe { into_u8_slice(ids.as_slice()) }; | ||||
|         DocIds(SharedData::from_bytes(raw.to_vec())) | ||||
|     } | ||||
|  | ||||
|     /// Returns `true` when no byte (and therefore no id) is stored. | ||||
|     pub fn is_empty(&self) -> bool { | ||||
|         self.as_bytes().is_empty() | ||||
|     } | ||||
|  | ||||
|     /// Gives access to the underlying bytes. | ||||
|     pub fn as_bytes(&self) -> &[u8] { | ||||
|         self.0.as_slice() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl AsRef<Set<DocumentId>> for DocIds { | ||||
|     /// Views the stored bytes as a set of `DocumentId`s without copying them. | ||||
|     fn as_ref(&self) -> &Set<DocumentId> { | ||||
|         let bytes: &[u8] = &self.0; | ||||
|         // Integer division: trailing bytes that do not make up a whole id are ignored. | ||||
|         let count = bytes.len() / size_of::<DocumentId>(); | ||||
|         // SAFETY: assumes the bytes were produced from `DocumentId`s and that the | ||||
|         // pointer is suitably aligned for `DocumentId`; neither is checked here. | ||||
|         // NOTE(review): confirm alignment holds for views with a non-zero offset. | ||||
|         let ids = unsafe { from_raw_parts(bytes.as_ptr() as *const DocumentId, count) }; | ||||
|         // `new_unchecked`: the set invariants are not verified again. | ||||
|         Set::new_unchecked(ids) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl FromSharedDataCursor for DocIds { | ||||
|     // `dyn` is spelled out: the bare `Box<Error>` form is deprecated | ||||
|     // and rejected by newer editions. The type itself is unchanged. | ||||
|     type Error = Box<dyn Error>; | ||||
|  | ||||
|     /// Reads a little-endian `u64` byte length from the cursor, then | ||||
|     /// extracts that many bytes and uses them as the ids storage. | ||||
|     /// | ||||
|     /// # Errors | ||||
|     /// | ||||
|     /// Returns the error reported when the length prefix cannot be read. | ||||
|     fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIds, Self::Error> { | ||||
|         let len = cursor.read_u64::<LittleEndian>()? as usize; | ||||
|         let data = cursor.extract(len); | ||||
|  | ||||
|         Ok(DocIds(data)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl WriteToBytes for DocIds { | ||||
|     /// Appends a little-endian `u64` byte length followed by the raw bytes. | ||||
|     fn write_to_bytes(&self, bytes: &mut Vec<u8>) { | ||||
|         let payload: &[u8] = &self.0; | ||||
|         bytes.write_u64::<LittleEndian>(payload.len() as u64).unwrap(); | ||||
|         bytes.extend_from_slice(payload); | ||||
|     } | ||||
| } | ||||
							
								
								
									
										231
									
								
								meilidb-core/src/data/doc_indexes.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										231
									
								
								meilidb-core/src/data/doc_indexes.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,231 @@ | ||||
| use std::io::{self, Write}; | ||||
| use std::slice::from_raw_parts; | ||||
| use std::mem::size_of; | ||||
| use std::ops::Index; | ||||
|  | ||||
| use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; | ||||
| use sdset::Set; | ||||
|  | ||||
| use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; | ||||
| use crate::write_to_bytes::WriteToBytes; | ||||
| use crate::data::SharedData; | ||||
| use crate::DocIndex; | ||||
|  | ||||
| use super::into_u8_slice; | ||||
|  | ||||
| /// Half-open span `[start, end)` of one set inside the flat `DocIndex` array. | ||||
| /// | ||||
| /// Both bounds count `DocIndex` elements, not bytes (see `DocIndexes::get`). | ||||
| /// `repr(C)` because values of this type are written and read back as raw bytes. | ||||
| #[derive(Debug)] | ||||
| #[repr(C)] | ||||
| struct Range { | ||||
|     start: u64, | ||||
|     end: u64, | ||||
| } | ||||
|  | ||||
| /// Several sets of `DocIndex` stored back to back and addressed by position. | ||||
| #[derive(Clone, Default)] | ||||
| pub struct DocIndexes { | ||||
|     // Raw bytes reinterpreted as a slice of `Range`, one per stored set. | ||||
|     ranges: SharedData, | ||||
|     // Raw bytes reinterpreted as the flat slice of every `DocIndex`. | ||||
|     indexes: SharedData, | ||||
| } | ||||
|  | ||||
| impl DocIndexes { | ||||
|     /// Returns the set stored at position `index`, or `None` when there is | ||||
|     /// no range at that position. | ||||
|     pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> { | ||||
|         let range = self.ranges().get(index)?; | ||||
|         let (start, end) = (range.start as usize, range.end as usize); | ||||
|         // Panics if the stored range does not fit in the indexes slice. | ||||
|         Some(Set::new_unchecked(&self.indexes()[start..end])) | ||||
|     } | ||||
|  | ||||
|     /// Views the `ranges` bytes as a slice of `Range`. | ||||
|     fn ranges(&self) -> &[Range] { | ||||
|         let bytes: &[u8] = &self.ranges; | ||||
|         let count = bytes.len() / size_of::<Range>(); | ||||
|         // SAFETY: assumes the bytes hold `Range` values and are suitably | ||||
|         // aligned for them; this is not checked here. | ||||
|         unsafe { from_raw_parts(bytes.as_ptr() as *const Range, count) } | ||||
|     } | ||||
|  | ||||
|     /// Views the `indexes` bytes as a slice of `DocIndex`. | ||||
|     fn indexes(&self) -> &[DocIndex] { | ||||
|         let bytes: &[u8] = &self.indexes; | ||||
|         let count = bytes.len() / size_of::<DocIndex>(); | ||||
|         // SAFETY: assumes the bytes hold `DocIndex` values and are suitably | ||||
|         // aligned for them; this is not checked here. | ||||
|         unsafe { from_raw_parts(bytes.as_ptr() as *const DocIndex, count) } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Index<usize> for DocIndexes { | ||||
|     type Output = [DocIndex]; | ||||
|  | ||||
|     /// Same lookup as `get`, but panics when `index` has no range. | ||||
|     fn index(&self, index: usize) -> &Self::Output { | ||||
|         if let Some(indexes) = self.get(index) { | ||||
|             return indexes; | ||||
|         } | ||||
|  | ||||
|         panic!("index {} out of range for a maximum of {} ranges", index, self.ranges().len()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl FromSharedDataCursor for DocIndexes { | ||||
|     type Error = io::Error; | ||||
|  | ||||
|     /// Reads two length-prefixed sections, in order: the ranges, then the indexes. | ||||
|     /// Each length is a little-endian `u64` counting bytes. | ||||
|     fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<DocIndexes, Self::Error> { | ||||
|         let ranges_len = cursor.read_u64::<LittleEndian>()? as usize; | ||||
|         let ranges = cursor.extract(ranges_len); | ||||
|  | ||||
|         let indexes_len = cursor.read_u64::<LittleEndian>()? as usize; | ||||
|         let indexes = cursor.extract(indexes_len); | ||||
|  | ||||
|         Ok(DocIndexes { ranges, indexes }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl WriteToBytes for DocIndexes { | ||||
|     /// Appends the ranges then the indexes, each one prefixed by its | ||||
|     /// byte length as a little-endian `u64`. | ||||
|     fn write_to_bytes(&self, bytes: &mut Vec<u8>) { | ||||
|         for section in [&self.ranges, &self.indexes].iter() { | ||||
|             let payload = section.as_slice(); | ||||
|             // The result of the length write is deliberately discarded. | ||||
|             let _ = bytes.write_u64::<LittleEndian>(payload.len() as u64); | ||||
|             bytes.extend_from_slice(payload); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Accumulates sets of `DocIndex` in memory and serializes them into `wtr` | ||||
| /// when `into_inner` (or `finish`) is called. | ||||
| pub struct DocIndexesBuilder<W> { | ||||
|     // One range per inserted set, in insertion order. | ||||
|     ranges: Vec<Range>, | ||||
|     // Every inserted `DocIndex`, all sets back to back. | ||||
|     indexes: Vec<DocIndex>, | ||||
|     // Destination of the serialized form. | ||||
|     wtr: W, | ||||
| } | ||||
|  | ||||
| impl DocIndexesBuilder<Vec<u8>> { | ||||
|     /// Creates an empty builder that serializes into a fresh in-memory `Vec<u8>`. | ||||
|     pub fn memory() -> Self { | ||||
|         DocIndexesBuilder::new(Vec::new()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<W: Write> DocIndexesBuilder<W> { | ||||
|     /// Creates an empty builder that will serialize into `wtr`. | ||||
|     pub fn new(wtr: W) -> Self { | ||||
|         DocIndexesBuilder { ranges: Vec::new(), indexes: Vec::new(), wtr } | ||||
|     } | ||||
|  | ||||
|     /// Appends one set of indexes. Its range starts where the previously | ||||
|     /// inserted one ends (or at zero for the first set). | ||||
|     pub fn insert(&mut self, indexes: &Set<DocIndex>) { | ||||
|         let start = self.ranges.last().map_or(0, |previous| previous.end); | ||||
|         let end = start + indexes.len() as u64; | ||||
|         self.ranges.push(Range { start, end }); | ||||
|  | ||||
|         self.indexes.extend_from_slice(indexes); | ||||
|     } | ||||
|  | ||||
|     /// Serializes everything into the writer and drops it. | ||||
|     pub fn finish(self) -> io::Result<()> { | ||||
|         self.into_inner()?; | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     /// Serializes everything into the writer and hands the writer back. | ||||
|     /// | ||||
|     /// The layout is: ranges byte length (little-endian `u64`), ranges bytes, | ||||
|     /// indexes byte length (little-endian `u64`), indexes bytes. | ||||
|     pub fn into_inner(mut self) -> io::Result<W> { | ||||
|         // The in-memory representation of the `Range`s is written as is. | ||||
|         let ranges_bytes = unsafe { into_u8_slice(&self.ranges) }; | ||||
|         self.wtr.write_u64::<LittleEndian>(ranges_bytes.len() as u64)?; | ||||
|         self.wtr.write_all(ranges_bytes)?; | ||||
|  | ||||
|         // Same for the `DocIndex`es. | ||||
|         let indexes_bytes = unsafe { into_u8_slice(&self.indexes) }; | ||||
|         self.wtr.write_u64::<LittleEndian>(indexes_bytes.len() as u64)?; | ||||
|         self.wtr.write_all(indexes_bytes)?; | ||||
|  | ||||
|         Ok(self.wtr) | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use std::error::Error; | ||||
|     use crate::DocumentId; | ||||
|     use super::*; | ||||
|  | ||||
|     /// The three `DocIndex` fixtures shared by the tests below. | ||||
|     fn sample_indexes() -> (DocIndex, DocIndex, DocIndex) { | ||||
|         let a = DocIndex { | ||||
|             document_id: DocumentId(0), | ||||
|             attribute: 3, | ||||
|             word_index: 11, | ||||
|             char_index: 30, | ||||
|             char_length: 4, | ||||
|         }; | ||||
|         let b = DocIndex { | ||||
|             document_id: DocumentId(1), | ||||
|             attribute: 4, | ||||
|             word_index: 21, | ||||
|             char_index: 35, | ||||
|             char_length: 6, | ||||
|         }; | ||||
|         let c = DocIndex { | ||||
|             document_id: DocumentId(2), | ||||
|             attribute: 8, | ||||
|             word_index: 2, | ||||
|             char_index: 89, | ||||
|             char_length: 6, | ||||
|         }; | ||||
|  | ||||
|         (a, b, c) | ||||
|     } | ||||
|  | ||||
|     /// Serializes the sets `[a]`, `[a, b, c]` and `[a, c]` with an in-memory builder. | ||||
|     fn sample_bytes(a: DocIndex, b: DocIndex, c: DocIndex) -> Result<Vec<u8>, Box<dyn Error>> { | ||||
|         let mut builder = DocIndexesBuilder::memory(); | ||||
|  | ||||
|         builder.insert(Set::new(&[a])?); | ||||
|         builder.insert(Set::new(&[a, b, c])?); | ||||
|         builder.insert(Set::new(&[a, c])?); | ||||
|  | ||||
|         Ok(builder.into_inner()?) | ||||
|     } | ||||
|  | ||||
|     /// What the builder wrote can be read back set by set. | ||||
|     #[test] | ||||
|     fn builder_serialize_deserialize() -> Result<(), Box<dyn Error>> { | ||||
|         let (a, b, c) = sample_indexes(); | ||||
|  | ||||
|         let bytes = sample_bytes(a, b, c)?; | ||||
|         let docs = DocIndexes::from_bytes(bytes)?; | ||||
|  | ||||
|         assert_eq!(docs.get(0), Some(Set::new(&[a])?)); | ||||
|         assert_eq!(docs.get(1), Some(Set::new(&[a, b, c])?)); | ||||
|         assert_eq!(docs.get(2), Some(Set::new(&[a, c])?)); | ||||
|         assert_eq!(docs.get(3), None); | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     /// `write_to_bytes` reproduces exactly the bytes the builder wrote. | ||||
|     #[test] | ||||
|     fn serialize_deserialize() -> Result<(), Box<dyn Error>> { | ||||
|         let (a, b, c) = sample_indexes(); | ||||
|  | ||||
|         let builder_bytes = sample_bytes(a, b, c)?; | ||||
|         let docs = DocIndexes::from_bytes(builder_bytes.clone())?; | ||||
|  | ||||
|         let mut bytes = Vec::new(); | ||||
|         docs.write_to_bytes(&mut bytes); | ||||
|  | ||||
|         assert_eq!(builder_bytes, bytes); | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										16
									
								
								meilidb-core/src/data/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								meilidb-core/src/data/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,16 @@ | ||||
| mod doc_ids; | ||||
| mod doc_indexes; | ||||
| mod shared_data; | ||||
|  | ||||
| use std::slice::from_raw_parts; | ||||
| use std::mem::size_of; | ||||
|  | ||||
| pub use self::doc_ids::DocIds; | ||||
| pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder}; | ||||
| pub use self::shared_data::SharedData; | ||||
|  | ||||
| /// Reinterprets a slice of `T` as the bytes that back it, without copying. | ||||
| /// | ||||
| /// # Safety | ||||
| /// | ||||
| /// Every byte of every element is exposed as a `u8`. | ||||
| /// NOTE(review): callers presumably only pass padding-free types; confirm. | ||||
| unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] { | ||||
|     let byte_len = size_of::<T>() * slice.len(); | ||||
|     from_raw_parts(slice.as_ptr() as *const u8, byte_len) | ||||
| } | ||||
							
								
								
									
										48
									
								
								meilidb-core/src/data/shared_data.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								meilidb-core/src/data/shared_data.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,48 @@ | ||||
| use std::sync::Arc; | ||||
| use std::ops::Deref; | ||||
|  | ||||
| /// A window of `len` bytes, starting at `offset`, into a reference-counted | ||||
| /// byte buffer. Cloning shares the buffer instead of copying it. | ||||
| #[derive(Default, Clone)] | ||||
| pub struct SharedData { | ||||
|     /// The whole underlying buffer. | ||||
|     pub bytes: Arc<Vec<u8>>, | ||||
|     /// Position in `bytes` of the first byte of the window. | ||||
|     pub offset: usize, | ||||
|     /// Number of bytes in the window. | ||||
|     pub len: usize, | ||||
| } | ||||
|  | ||||
| impl SharedData { | ||||
|     /// Takes ownership of `vec` and exposes all of its bytes. | ||||
|     pub fn from_bytes(vec: Vec<u8>) -> SharedData { | ||||
|         let len = vec.len(); | ||||
|         SharedData::new(Arc::new(vec), 0, len) | ||||
|     } | ||||
|  | ||||
|     /// Creates a window of `len` bytes starting at `offset` inside `bytes`. | ||||
|     /// | ||||
|     /// The window is not validated here: `as_slice` panics later if it | ||||
|     /// does not fit inside `bytes`. | ||||
|     pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData { | ||||
|         SharedData { bytes, offset, len } | ||||
|     } | ||||
|  | ||||
|     /// Returns the bytes of the window. | ||||
|     /// | ||||
|     /// # Panics | ||||
|     /// | ||||
|     /// Panics if the window does not fit inside the underlying buffer. | ||||
|     pub fn as_slice(&self) -> &[u8] { | ||||
|         &self.bytes[self.offset..self.offset + self.len] | ||||
|     } | ||||
|  | ||||
|     /// Returns a sub-window of `len` bytes starting `offset` bytes after the | ||||
|     /// start of this one. The underlying buffer is shared, not copied. | ||||
|     /// | ||||
|     /// # Panics | ||||
|     /// | ||||
|     /// Panics if `offset + len` exceeds the length of this window, | ||||
|     /// including when that sum overflows `usize`. | ||||
|     pub fn range(&self, offset: usize, len: usize) -> SharedData { | ||||
|         // `checked_add` instead of `+`: a plain addition wraps around in release | ||||
|         // builds, which would let an out-of-window range pass the check. | ||||
|         let fits = offset.checked_add(len).map_or(false, |end| end <= self.len); | ||||
|         assert!(fits, "range of {} bytes at offset {} does not fit in {} bytes", len, offset, self.len); | ||||
|  | ||||
|         SharedData { | ||||
|             bytes: Arc::clone(&self.bytes), | ||||
|             offset: self.offset + offset, | ||||
|             len, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Deref for SharedData { | ||||
|     type Target = [u8]; | ||||
|  | ||||
|     /// Dereferences to the bytes of the window, exactly like `as_slice`. | ||||
|     fn deref(&self) -> &[u8] { | ||||
|         self.as_slice() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl AsRef<[u8]> for SharedData { | ||||
|     /// Borrows the bytes of the window. | ||||
|     fn as_ref(&self) -> &[u8] { | ||||
|         // Goes through `Deref`, which itself returns `as_slice()`. | ||||
|         self.deref() | ||||
|     } | ||||
| } | ||||
							
								
								
									
										104
									
								
								meilidb-core/src/distinct_map.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										104
									
								
								meilidb-core/src/distinct_map.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,104 @@ | ||||
| use std::hash::Hash; | ||||
|  | ||||
| use hashbrown::HashMap; | ||||
|  | ||||
| /// Counts how many times each key has been accepted, with a per-key limit. | ||||
| /// | ||||
| /// Registrations go through a `BufferedDistinctMap`, which commits | ||||
| /// its pending counts here. | ||||
| pub struct DistinctMap<K> { | ||||
|     // Number of committed registrations for each key. | ||||
|     inner: HashMap<K, usize>, | ||||
|     // Maximum number of registrations accepted for one key. | ||||
|     limit: usize, | ||||
|     // Total number of committed registrations, keyless ones included. | ||||
|     len: usize, | ||||
| } | ||||
|  | ||||
| impl<K: Hash + Eq> DistinctMap<K> { | ||||
|     /// Creates an empty map that accepts each key at most `limit` times. | ||||
|     pub fn new(limit: usize) -> Self { | ||||
|         Self { inner: HashMap::new(), limit, len: 0 } | ||||
|     } | ||||
|  | ||||
|     /// Total number of registrations committed so far. | ||||
|     pub fn len(&self) -> usize { | ||||
|         self.len | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Staging area in front of a `DistinctMap`: registrations are counted here | ||||
| /// and only reach the wrapped map when `transfert_to_internal` is called. | ||||
| pub struct BufferedDistinctMap<'a, K> { | ||||
|     // The map that receives the counts on transfer. | ||||
|     internal: &'a mut DistinctMap<K>, | ||||
|     // Pending (not yet transferred) registrations for each key. | ||||
|     inner: HashMap<K, usize>, | ||||
|     // Total number of pending registrations, keyless ones included. | ||||
|     len: usize, | ||||
| } | ||||
|  | ||||
| impl<'a, K: Hash + Eq> BufferedDistinctMap<'a, K> { | ||||
|     /// Wraps `internal` with an empty buffer. | ||||
|     pub fn new(internal: &'a mut DistinctMap<K>) -> BufferedDistinctMap<'a, K> { | ||||
|         BufferedDistinctMap { internal, inner: HashMap::new(), len: 0 } | ||||
|     } | ||||
|  | ||||
|     /// Tries to register one more occurrence of `key`. | ||||
|     /// | ||||
|     /// Returns `true` and counts it when the committed plus pending count for | ||||
|     /// this key is still below the limit, `false` otherwise. | ||||
|     pub fn register(&mut self, key: K) -> bool { | ||||
|         let committed = self.internal.inner.get(&key).map_or(0, |seen| *seen); | ||||
|         // A pending entry is created even when the key ends up rejected. | ||||
|         let pending = self.inner.entry(key).or_insert(0); | ||||
|  | ||||
|         if committed + *pending >= self.internal.limit { | ||||
|             return false; | ||||
|         } | ||||
|  | ||||
|         *pending += 1; | ||||
|         self.len += 1; | ||||
|         true | ||||
|     } | ||||
|  | ||||
|     /// Counts one registration that is not tied to any key; always accepted. | ||||
|     pub fn register_without_key(&mut self) -> bool { | ||||
|         self.len += 1; | ||||
|         true | ||||
|     } | ||||
|  | ||||
|     /// Moves every pending count into the wrapped map and empties the buffer. | ||||
|     pub fn transfert_to_internal(&mut self) { | ||||
|         for (key, count) in self.inner.drain() { | ||||
|             *self.internal.inner.entry(key).or_insert(0) += count; | ||||
|         } | ||||
|  | ||||
|         self.internal.len += self.len; | ||||
|         self.len = 0; | ||||
|     } | ||||
|  | ||||
|     /// Number of registrations, committed and pending together. | ||||
|     pub fn len(&self) -> usize { | ||||
|         self.internal.len() + self.len | ||||
|     } | ||||
| } | ||||
|  | ||||
| #[cfg(test)] | ||||
| mod tests { | ||||
|     use super::*; | ||||
|  | ||||
|     #[test] | ||||
|     fn easy_distinct_map() { | ||||
|         // With a limit of 2, only the first two occurrences of a key count: | ||||
|         // 1 (x2) + 2, 3, 4, 5 (x1 each) + 6 (x2) = 8. | ||||
|         let mut map = DistinctMap::new(2); | ||||
|         let mut buffered = BufferedDistinctMap::new(&mut map); | ||||
|  | ||||
|         for key in &[1, 1, 1, 2, 3, 4, 5, 6, 6, 6, 6, 6] { | ||||
|             buffered.register(key); | ||||
|         } | ||||
|         buffered.transfert_to_internal(); | ||||
|         assert_eq!(map.len(), 8); | ||||
|  | ||||
|         // Same limit, this time checking the answer of every registration. | ||||
|         let mut map = DistinctMap::new(2); | ||||
|         let mut buffered = BufferedDistinctMap::new(&mut map); | ||||
|         assert!(buffered.register(1)); | ||||
|         assert!(buffered.register(1)); | ||||
|         assert!(!buffered.register(1)); | ||||
|         assert!(!buffered.register(1)); | ||||
|  | ||||
|         assert!(buffered.register(2)); | ||||
|         assert!(buffered.register(3)); | ||||
|         assert!(buffered.register(2)); | ||||
|         assert!(!buffered.register(2)); | ||||
|  | ||||
|         buffered.transfert_to_internal(); | ||||
|         assert_eq!(map.len(), 5); | ||||
|     } | ||||
| } | ||||
							
								
								
									
										175
									
								
								meilidb-core/src/index.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										175
									
								
								meilidb-core/src/index.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,175 @@ | ||||
| use std::error::Error; | ||||
|  | ||||
| use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; | ||||
| use fst::{map, Map, IntoStreamer, Streamer}; | ||||
| use fst::raw::Fst; | ||||
| use sdset::duo::{Union, DifferenceByKey}; | ||||
| use sdset::{Set, SetOperation}; | ||||
|  | ||||
| use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; | ||||
| use crate::write_to_bytes::WriteToBytes; | ||||
| use crate::data::{DocIndexes, DocIndexesBuilder}; | ||||
| use crate::{DocumentId, DocIndex}; | ||||
|  | ||||
| /// An fst map from keys to sets of `DocIndex`. | ||||
| #[derive(Default)] | ||||
| pub struct Index { | ||||
|     /// Maps each key to the position of its set inside `indexes`. | ||||
|     pub map: Map, | ||||
|     /// The sets of `DocIndex`, addressed by the values stored in `map`. | ||||
|     pub indexes: DocIndexes, | ||||
| } | ||||
|  | ||||
| impl Index { | ||||
|     /// Builds a new index without the `DocIndex`es that belong to `documents`. | ||||
|     /// Keys left with no `DocIndex` at all are dropped. | ||||
|     pub fn remove_documents(&self, documents: &Set<DocumentId>) -> Index { | ||||
|         let mut builder = IndexBuilder::new(); | ||||
|         // Scratch buffer reused for every key. | ||||
|         let mut kept = Vec::new(); | ||||
|         let mut stream = self.into_stream(); | ||||
|  | ||||
|         while let Some((key, indexes)) = stream.next() { | ||||
|             kept.clear(); | ||||
|  | ||||
|             // Both sides are compared by document id. | ||||
|             DifferenceByKey::new(indexes, documents, |x| x.document_id, |x| *x) | ||||
|                 .extend_vec(&mut kept); | ||||
|  | ||||
|             if kept.is_empty() { | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             builder.insert(key, Set::new_unchecked(&kept)).unwrap(); | ||||
|         } | ||||
|  | ||||
|         builder.build() | ||||
|     } | ||||
|  | ||||
|     /// Builds a new index holding the keys of both `self` and `other`. | ||||
|     /// A key present on both sides gets the union of its two sets. | ||||
|     pub fn union(&self, other: &Index) -> Index { | ||||
|         let mut builder = IndexBuilder::new(); | ||||
|         let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union(); | ||||
|  | ||||
|         // Scratch buffer reused for every key. | ||||
|         let mut merged = Vec::new(); | ||||
|         while let Some((key, ivalues)) = stream.next() { | ||||
|             merged.clear(); | ||||
|  | ||||
|             // `index` is 0 for `self` (added first) and non-zero for `other`; | ||||
|             // `value` is the position of the set in that side's `indexes`. | ||||
|             match ivalues { | ||||
|                 [single] => { | ||||
|                     let side = if single.index == 0 { &self.indexes } else { &other.indexes }; | ||||
|                     merged.extend_from_slice(&side[single.value as usize]) | ||||
|                 }, | ||||
|                 [first, second] => { | ||||
|                     let side = if first.index == 0 { &self.indexes } else { &other.indexes }; | ||||
|                     let first = Set::new_unchecked(&side[first.value as usize]); | ||||
|  | ||||
|                     let side = if second.index == 0 { &self.indexes } else { &other.indexes }; | ||||
|                     let second = Set::new_unchecked(&side[second.value as usize]); | ||||
|  | ||||
|                     Union::new(first, second).extend_vec(&mut merged); | ||||
|                 }, | ||||
|                 _ => continue, | ||||
|             } | ||||
|  | ||||
|             if merged.is_empty() { | ||||
|                 continue; | ||||
|             } | ||||
|  | ||||
|             builder.insert(key, Set::new_unchecked(&merged)).unwrap(); | ||||
|         } | ||||
|  | ||||
|         builder.build() | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl FromSharedDataCursor for Index { | ||||
|     // `dyn` is spelled out: the bare `Box<Error>` form is deprecated | ||||
|     // and rejected by newer editions. The type itself is unchanged. | ||||
|     type Error = Box<dyn Error>; | ||||
|  | ||||
|     /// Reads a length-prefixed fst map (little-endian `u64` byte length), | ||||
|     /// then the `DocIndexes` that follow it. | ||||
|     /// | ||||
|     /// # Errors | ||||
|     /// | ||||
|     /// Fails when a length prefix cannot be read, when the fst cannot be | ||||
|     /// built from the extracted bytes, or when reading the `DocIndexes` fails. | ||||
|     fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Index, Self::Error> { | ||||
|         let len = cursor.read_u64::<LittleEndian>()? as usize; | ||||
|         let data = cursor.extract(len); | ||||
|  | ||||
|         // The fst is built over the shared buffer: the `Arc` and the window are handed over. | ||||
|         let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?; | ||||
|         let map = Map::from(fst); | ||||
|  | ||||
|         let indexes = DocIndexes::from_shared_data_cursor(cursor)?; | ||||
|  | ||||
|         Ok(Index { map, indexes }) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl WriteToBytes for Index { | ||||
|     /// Appends the fst bytes prefixed by their length (little-endian `u64`), | ||||
|     /// followed by the serialized `DocIndexes`. | ||||
|     fn write_to_bytes(&self, bytes: &mut Vec<u8>) { | ||||
|         let fst_bytes = self.map.as_fst().as_bytes(); | ||||
|         // The result of the length write is deliberately discarded. | ||||
|         let _ = bytes.write_u64::<LittleEndian>(fst_bytes.len() as u64); | ||||
|         bytes.extend_from_slice(fst_bytes); | ||||
|  | ||||
|         self.indexes.write_to_bytes(bytes); | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'m, 'a> IntoStreamer<'a> for &'m Index { | ||||
|     type Item = (&'a [u8], &'a Set<DocIndex>); | ||||
|     type Into = Stream<'m>; | ||||
|  | ||||
|     /// Streams every key of the index together with its set of `DocIndex`. | ||||
|     fn into_stream(self) -> Self::Into { | ||||
|         let map_stream = self.map.into_stream(); | ||||
|         let indexes = &self.indexes; | ||||
|         Stream { map_stream, indexes } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Stream over the entries of an `Index`, yielding each key with its set of `DocIndex`. | ||||
| pub struct Stream<'m> { | ||||
|     // Stream over the keys and values of the fst map. | ||||
|     map_stream: map::Stream<'m>, | ||||
|     // Where the values of the map are resolved into sets. | ||||
|     indexes: &'m DocIndexes, | ||||
| } | ||||
|  | ||||
| impl<'m, 'a> Streamer<'a> for Stream<'m> { | ||||
|     type Item = (&'a [u8], &'a Set<DocIndex>); | ||||
|  | ||||
|     /// Advances the map stream and resolves the value it yields into the | ||||
|     /// matching set of `DocIndex`. Returns `None` once the map is exhausted. | ||||
|     fn next(&'a mut self) -> Option<Self::Item> { | ||||
|         let (input, index) = self.map_stream.next()?; | ||||
|         // Panics if the map refers to a position that `indexes` does not have. | ||||
|         let indexes = Set::new_unchecked(&self.indexes[index as usize]); | ||||
|         Some((input, indexes)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Builds an `Index` in memory, one key at a time. | ||||
| pub struct IndexBuilder { | ||||
|     // Builder of the fst map from keys to set positions. | ||||
|     map: fst::MapBuilder<Vec<u8>>, | ||||
|     // Builder of the sets of `DocIndex`. | ||||
|     indexes: DocIndexesBuilder<Vec<u8>>, | ||||
|     // Position the next inserted set will get; incremented on every insert. | ||||
|     value: u64, | ||||
| } | ||||
|  | ||||
| impl IndexBuilder { | ||||
|     /// Creates an empty builder backed by in-memory buffers. | ||||
|     pub fn new() -> Self { | ||||
|         Self { | ||||
|             map: fst::MapBuilder::memory(), | ||||
|             indexes: DocIndexesBuilder::memory(), | ||||
|             value: 0, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// If a key is inserted that is less than or equal to any previous key added, | ||||
|     /// then an error is returned. Similarly, if there was a problem writing | ||||
|     /// to the underlying writer, an error is returned. | ||||
|     // FIXME what if one write doesn't work but the other do ? | ||||
|     pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> fst::Result<()> | ||||
|     where K: AsRef<[u8]>, | ||||
|     { | ||||
|         // The key is mapped to the position its set is about to take. | ||||
|         let position = self.value; | ||||
|         self.map.insert(key, position)?; | ||||
|         self.indexes.insert(indexes); | ||||
|         self.value = position + 1; | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     /// Finishes both builders and assembles the `Index`. | ||||
|     /// | ||||
|     /// Panics if finishing or re-reading either part fails. | ||||
|     pub fn build(self) -> Index { | ||||
|         let map_bytes = self.map.into_inner().unwrap(); | ||||
|         let indexes_bytes = self.indexes.into_inner().unwrap(); | ||||
|  | ||||
|         let map = Map::from_bytes(map_bytes).unwrap(); | ||||
|         let indexes = DocIndexes::from_bytes(indexes_bytes).unwrap(); | ||||
|  | ||||
|         Index { map, indexes } | ||||
|     } | ||||
| } | ||||
							
								
								
									
										297
									
								
								meilidb-core/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										297
									
								
								meilidb-core/src/lib.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,297 @@ | ||||
| pub mod criterion; | ||||
| pub mod data; | ||||
| mod index; | ||||
| mod automaton; | ||||
| mod query_builder; | ||||
| mod distinct_map; | ||||
|  | ||||
| pub mod shared_data_cursor; | ||||
| pub mod write_to_bytes; | ||||
|  | ||||
| use std::sync::Arc; | ||||
| use serde_derive::{Serialize, Deserialize}; | ||||
|  | ||||
| use slice_group_by::GroupBy; | ||||
| use rayon::slice::ParallelSliceMut; | ||||
|  | ||||
| pub use self::index::{Index, IndexBuilder}; | ||||
| pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder}; | ||||
|  | ||||
| /// Represents an internally generated unique document identifier. | ||||
| /// | ||||
| /// It is used to tell the database which document you want to deserialize. | ||||
| /// Helpful for custom ranking. | ||||
| #[derive(Serialize, Deserialize)] | ||||
| #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] | ||||
| pub struct DocumentId(pub u64); | ||||
|  | ||||
| /// This structure represents the position of a word | ||||
| /// in a document and its attributes. | ||||
| /// | ||||
| /// This is stored in the map, generated at index time, | ||||
| /// extracted and interpreted at search time. | ||||
| /// | ||||
| /// The layout is `repr(C)` and the field order is part of it: slices of this | ||||
| /// type are written and read back as raw bytes by the `data` module. | ||||
| #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||
| #[repr(C)] | ||||
| pub struct DocIndex { | ||||
|     /// The document identifier where the word was found. | ||||
|     pub document_id: DocumentId, | ||||
|  | ||||
|     /// The attribute in the document where the word was found | ||||
|     /// along with the index in it. | ||||
|     pub attribute: u16, | ||||
|     pub word_index: u16, | ||||
|  | ||||
|     /// The position in bytes where the word was found | ||||
|     /// along with the length of it. | ||||
|     /// | ||||
|     /// It gives the area of the original word in the indexed text | ||||
|     /// without needing to run the tokenizer again. | ||||
|     pub char_index: u16, | ||||
|     pub char_length: u16, | ||||
| } | ||||
|  | ||||
| /// This structure represents a matching word with information | ||||
| /// on the location of the word in the document. | ||||
| /// | ||||
| /// The order of the fields is important because it defines | ||||
| /// the way these structures are ordered between themselves | ||||
| /// (the derived `Ord` compares fields in declaration order). | ||||
| /// | ||||
| /// The word in itself is not important. | ||||
| // TODO do data oriented programming ? very arrays ? | ||||
| #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||
| pub struct Match { | ||||
|     /// The word index in the query sentence. | ||||
|     /// Same as the `attribute_index` but for the query words. | ||||
|     /// NOTE(review): no field is named `attribute_index` here; this | ||||
|     /// presumably refers to `word_index`. Confirm. | ||||
|     /// | ||||
|     /// Used to retrieve the automaton that matches this word. | ||||
|     pub query_index: u32, | ||||
|  | ||||
|     /// The distance the word has with the query word | ||||
|     /// (i.e. the Levenshtein distance). | ||||
|     pub distance: u8, | ||||
|  | ||||
|     /// The attribute in the document where the word was found | ||||
|     /// along with the index in it. | ||||
|     pub attribute: u16, | ||||
|     pub word_index: u16, | ||||
|  | ||||
|     /// Whether the word that matches is an exact match or a prefix. | ||||
|     pub is_exact: bool, | ||||
|  | ||||
|     /// The position in bytes where the word was found | ||||
|     /// along with the length of it. | ||||
|     /// | ||||
|     /// It gives the area of the original word in the indexed text | ||||
|     /// without needing to run the tokenizer again. | ||||
|     pub char_index: u16, | ||||
|     pub char_length: u16, | ||||
| } | ||||
|  | ||||
| impl Match { | ||||
|     /// The smallest possible `Match`: every field at its minimum value. | ||||
|     pub fn zero() -> Self { | ||||
|         Self { | ||||
|             query_index: 0, | ||||
|             distance: 0, | ||||
|             attribute: 0, | ||||
|             word_index: 0, | ||||
|             is_exact: false, | ||||
|             char_index: 0, | ||||
|             char_length: 0, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// The largest possible `Match`: every field at its maximum value. | ||||
|     pub fn max() -> Self { | ||||
|         let max_u16 = u16::max_value(); | ||||
|  | ||||
|         Self { | ||||
|             query_index: u32::max_value(), | ||||
|             distance: u8::max_value(), | ||||
|             attribute: max_u16, | ||||
|             word_index: max_u16, | ||||
|             is_exact: true, | ||||
|             char_index: max_u16, | ||||
|             char_length: max_u16, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// A document together with its matches, one `Match` value per match. | ||||
| #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] | ||||
| pub struct Document { | ||||
|     /// The identifier of the document. | ||||
|     pub id: DocumentId, | ||||
|     /// The matches found for this document. | ||||
|     pub matches: Vec<Match>, | ||||
| } | ||||
|  | ||||
| impl Document { | ||||
|     /// Rebuilds one `Match` per position from the per-field columns of `raw`. | ||||
|     fn from_raw(raw: &RawDocument) -> Document { | ||||
|         let len = raw.matches.range.len(); | ||||
|  | ||||
|         let query_index = raw.query_index(); | ||||
|         let distance = raw.distance(); | ||||
|         let attribute = raw.attribute(); | ||||
|         let word_index = raw.word_index(); | ||||
|         let is_exact = raw.is_exact(); | ||||
|         let char_index = raw.char_index(); | ||||
|         let char_length = raw.char_length(); | ||||
|  | ||||
|         // The i-th match takes the i-th value of every column. | ||||
|         let matches = (0..len) | ||||
|             .map(|i| Match { | ||||
|                 query_index: query_index[i], | ||||
|                 distance: distance[i], | ||||
|                 attribute: attribute[i], | ||||
|                 word_index: word_index[i], | ||||
|                 is_exact: is_exact[i], | ||||
|                 char_index: char_index[i], | ||||
|                 char_length: char_length[i], | ||||
|             }) | ||||
|             .collect(); | ||||
|  | ||||
|         Document { id: raw.id, matches } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// A document whose matches are a range into shared, per-field arrays | ||||
| /// instead of an owned list of `Match`. | ||||
| #[derive(Clone)] | ||||
| pub struct RawDocument { | ||||
|     /// The identifier of the document. | ||||
|     pub id: DocumentId, | ||||
|     /// The shared matches along with the range that belongs to this document. | ||||
|     pub matches: SharedMatches, | ||||
| } | ||||
|  | ||||
| impl RawDocument { | ||||
|     /// Ties the document `id` to its `range` inside the shared `matches`. | ||||
|     fn new(id: DocumentId, range: Range, matches: Arc<Matches>) -> RawDocument { | ||||
|         let matches = SharedMatches { range, matches }; | ||||
|         RawDocument { id, matches } | ||||
|     } | ||||
|  | ||||
|     // Every accessor below returns the part of one column that belongs to | ||||
|     // this document. Bounds are not checked. | ||||
|     // | ||||
|     // SAFETY (same argument for all of them, as stated by the original author): | ||||
|     // construction/modifications can only be done in this module, so the range | ||||
|     // is expected to fit every column. NOTE(review): confirm that the fields of | ||||
|     // `SharedMatches` cannot be changed from outside this module. | ||||
|  | ||||
|     /// The `query_index` of every match of this document. | ||||
|     pub fn query_index(&self) -> &[u32] { | ||||
|         let span = self.matches.range; | ||||
|         unsafe { self.matches.matches.query_index.get_unchecked(span.start..span.end) } | ||||
|     } | ||||
|  | ||||
|     /// The `distance` of every match of this document. | ||||
|     pub fn distance(&self) -> &[u8] { | ||||
|         let span = self.matches.range; | ||||
|         unsafe { self.matches.matches.distance.get_unchecked(span.start..span.end) } | ||||
|     } | ||||
|  | ||||
|     /// The `attribute` of every match of this document. | ||||
|     pub fn attribute(&self) -> &[u16] { | ||||
|         let span = self.matches.range; | ||||
|         unsafe { self.matches.matches.attribute.get_unchecked(span.start..span.end) } | ||||
|     } | ||||
|  | ||||
|     /// The `word_index` of every match of this document. | ||||
|     pub fn word_index(&self) -> &[u16] { | ||||
|         let span = self.matches.range; | ||||
|         unsafe { self.matches.matches.word_index.get_unchecked(span.start..span.end) } | ||||
|     } | ||||
|  | ||||
|     /// The `is_exact` flag of every match of this document. | ||||
|     pub fn is_exact(&self) -> &[bool] { | ||||
|         let span = self.matches.range; | ||||
|         unsafe { self.matches.matches.is_exact.get_unchecked(span.start..span.end) } | ||||
|     } | ||||
|  | ||||
|     /// The `char_index` of every match of this document. | ||||
|     pub fn char_index(&self) -> &[u16] { | ||||
|         let span = self.matches.range; | ||||
|         unsafe { self.matches.matches.char_index.get_unchecked(span.start..span.end) } | ||||
|     } | ||||
|  | ||||
|     /// The `char_length` of every match of this document. | ||||
|     pub fn char_length(&self) -> &[u16] { | ||||
|         let span = self.matches.range; | ||||
|         unsafe { self.matches.matches.char_length.get_unchecked(span.start..span.end) } | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn raw_documents_from_matches(mut matches: Vec<(DocumentId, Match)>) -> Vec<RawDocument> { | ||||
|     let mut docs_ranges = Vec::<(DocumentId, Range)>::new(); | ||||
|     let mut matches2 = Matches::with_capacity(matches.len()); | ||||
|  | ||||
|     matches.par_sort_unstable(); | ||||
|  | ||||
|     for group in matches.linear_group_by(|(a, _), (b, _)| a == b) { | ||||
|         let id = group[0].0; | ||||
|         let start = docs_ranges.last().map(|(_, r)| r.end).unwrap_or(0); | ||||
|         let end = start + group.len(); | ||||
|         docs_ranges.push((id, Range { start, end })); | ||||
|  | ||||
|         matches2.extend_from_slice(group); | ||||
|     } | ||||
|  | ||||
|     let matches = Arc::new(matches2); | ||||
|     docs_ranges.into_iter().map(|(i, r)| RawDocument::new(i, r, matches.clone())).collect() | ||||
| } | ||||
|  | ||||
/// Half-open row interval `start..end` inside the shared match columns.
#[derive(Debug, Copy, Clone)]
struct Range {
    start: usize,
    end: usize,
}

impl Range {
    /// Number of rows covered by the interval.
    fn len(self) -> usize {
        let Range { start, end } = self;
        end - start
    }
}
|  | ||||
| #[derive(Clone)] | ||||
| pub struct SharedMatches { | ||||
|     range: Range, | ||||
|     matches: Arc<Matches>, | ||||
| } | ||||
|  | ||||
| #[derive(Clone)] | ||||
| struct Matches { | ||||
|     query_index: Vec<u32>, | ||||
|     distance: Vec<u8>, | ||||
|     attribute: Vec<u16>, | ||||
|     word_index: Vec<u16>, | ||||
|     is_exact: Vec<bool>, | ||||
|     char_index: Vec<u16>, | ||||
|     char_length: Vec<u16>, | ||||
| } | ||||
|  | ||||
| impl Matches { | ||||
|     fn with_capacity(cap: usize) -> Matches { | ||||
|         Matches { | ||||
|             query_index: Vec::with_capacity(cap), | ||||
|             distance: Vec::with_capacity(cap), | ||||
|             attribute: Vec::with_capacity(cap), | ||||
|             word_index: Vec::with_capacity(cap), | ||||
|             is_exact: Vec::with_capacity(cap), | ||||
|             char_index: Vec::with_capacity(cap), | ||||
|             char_length: Vec::with_capacity(cap), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     fn extend_from_slice(&mut self, matches: &[(DocumentId, Match)]) { | ||||
|         for (_, match_) in matches { | ||||
|             self.query_index.push(match_.query_index); | ||||
|             self.distance.push(match_.distance); | ||||
|             self.attribute.push(match_.attribute); | ||||
|             self.word_index.push(match_.word_index); | ||||
|             self.is_exact.push(match_.is_exact); | ||||
|             self.char_index.push(match_.char_index); | ||||
|             self.char_length.push(match_.char_length); | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
|  | ||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::mem::size_of;

    /// Guards against accidental growth of `DocIndex`: its in-memory size
    /// is pinned to 24 bytes.
    #[test]
    fn docindex_mem_size() {
        let actual = size_of::<DocIndex>();
        assert_eq!(actual, 24);
    }
}
							
								
								
									
										357
									
								
								meilidb-core/src/query_builder.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										357
									
								
								meilidb-core/src/query_builder.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,357 @@ | ||||
| use std::{cmp, mem}; | ||||
| use std::ops::Range; | ||||
| use std::time::Instant; | ||||
| use std::hash::Hash; | ||||
| use std::rc::Rc; | ||||
|  | ||||
| use rayon::slice::ParallelSliceMut; | ||||
| use slice_group_by::{GroupByMut, LinearStrGroupBy}; | ||||
| use hashbrown::{HashMap, HashSet}; | ||||
| use fst::Streamer; | ||||
| use log::info; | ||||
|  | ||||
| use crate::automaton::{self, DfaExt, AutomatonExt}; | ||||
| use crate::distinct_map::{DistinctMap, BufferedDistinctMap}; | ||||
| use crate::criterion::Criteria; | ||||
| use crate::{raw_documents_from_matches, RawDocument, Document}; | ||||
| use crate::{Index, Match, DocumentId}; | ||||
|  | ||||
| // query splitting must move out of this crate | ||||
/// Returns `true` when `c` belongs to one of the CJK-related Unicode ranges
/// listed below.
pub fn is_cjk(c: char) -> bool {
    match c {
        '\u{2e80}'..='\u{2eff}'     // CJK Radicals Supplement
        | '\u{2f00}'..='\u{2fdf}'   // Kangxi Radicals
        | '\u{3040}'..='\u{309f}'   // Hiragana
        | '\u{30a0}'..='\u{30ff}'   // Katakana
        | '\u{3100}'..='\u{312f}'   // Bopomofo
        | '\u{3200}'..='\u{32ff}'   // Enclosed CJK Letters and Months
        | '\u{3400}'..='\u{4dbf}'   // CJK Unified Ideographs Extension A
        | '\u{4e00}'..='\u{9fff}'   // CJK Unified Ideographs
        | '\u{f900}'..='\u{faff}'   // CJK Compatibility Ideographs
        => true,
        _ => false,
    }
}
|  | ||||
| #[derive(Debug, PartialEq, Eq)] | ||||
| enum CharCategory { | ||||
|     Space, | ||||
|     Cjk, | ||||
|     Other, | ||||
| } | ||||
|  | ||||
| fn classify_char(c: char) -> CharCategory { | ||||
|     if c.is_whitespace() { CharCategory::Space } | ||||
|     else if is_cjk(c) { CharCategory::Cjk } | ||||
|     else { CharCategory::Other } | ||||
| } | ||||
|  | ||||
/// Returns `true` when `s` contains no whitespace character at all
/// (an empty string therefore counts as a word).
fn is_word(s: &&str) -> bool {
    s.chars().all(|c| !c.is_whitespace())
}
|  | ||||
| fn same_group_category(a: char, b: char) -> bool { | ||||
|     let ca = classify_char(a); | ||||
|     let cb = classify_char(b); | ||||
|     if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb } | ||||
| } | ||||
|  | ||||
| fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> { | ||||
|     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace); | ||||
|     let mut groups = LinearStrGroupBy::new(query, same_group_category) | ||||
|                         .filter(is_word) | ||||
|                         .map(str::to_lowercase) | ||||
|                         .peekable(); | ||||
|  | ||||
|     let mut automatons = Vec::new(); | ||||
|     while let Some(word) = groups.next() { | ||||
|         let has_following_word = groups.peek().is_some(); | ||||
|         let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) { | ||||
|             automaton::build_dfa(&word) | ||||
|         } else { | ||||
|             automaton::build_prefix_dfa(&word) | ||||
|         }; | ||||
|         automatons.push(lev); | ||||
|     } | ||||
|  | ||||
|     automatons | ||||
| } | ||||
|  | ||||
| pub type FilterFunc = fn(DocumentId) -> bool; | ||||
|  | ||||
| pub struct QueryBuilder<'i, 'c, FI> { | ||||
|     index: &'i Index, | ||||
|     criteria: Criteria<'c>, | ||||
|     searchable_attrs: Option<HashSet<u16>>, | ||||
|     filter: Option<FI>, | ||||
| } | ||||
|  | ||||
| impl<'i, 'c> QueryBuilder<'i, 'c, FilterFunc> { | ||||
|     pub fn new(index: &'i Index) -> Self { | ||||
|         QueryBuilder::with_criteria(index, Criteria::default()) | ||||
|     } | ||||
|  | ||||
|     pub fn with_criteria(index: &'i Index, criteria: Criteria<'c>) -> Self { | ||||
|         QueryBuilder { index, criteria, searchable_attrs: None, filter: None } | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI> | ||||
| { | ||||
|     pub fn with_filter<F>(self, function: F) -> QueryBuilder<'i, 'c, F> | ||||
|     where F: Fn(DocumentId) -> bool, | ||||
|     { | ||||
|         QueryBuilder { | ||||
|             index: self.index, | ||||
|             criteria: self.criteria, | ||||
|             searchable_attrs: self.searchable_attrs, | ||||
|             filter: Some(function) | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'i, 'c, FI, F> | ||||
|     where F: Fn(DocumentId) -> Option<K>, | ||||
|           K: Hash + Eq, | ||||
|     { | ||||
|         DistinctQueryBuilder { | ||||
|             inner: self, | ||||
|             function: function, | ||||
|             size: size | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn add_searchable_attribute(&mut self, attribute: u16) { | ||||
|         let attributes = self.searchable_attrs.get_or_insert_with(HashSet::new); | ||||
|         attributes.insert(attribute); | ||||
|     } | ||||
|  | ||||
|     fn query_all(&self, query: &str) -> Vec<RawDocument> { | ||||
|         let automatons = split_whitespace_automatons(query); | ||||
|  | ||||
|         let mut stream = { | ||||
|             let mut op_builder = fst::map::OpBuilder::new(); | ||||
|             for automaton in &automatons { | ||||
|                 let stream = self.index.map.search(automaton); | ||||
|                 op_builder.push(stream); | ||||
|             } | ||||
|             op_builder.union() | ||||
|         }; | ||||
|  | ||||
|         let mut matches = Vec::new(); | ||||
|  | ||||
|         while let Some((input, indexed_values)) = stream.next() { | ||||
|             for iv in indexed_values { | ||||
|                 let automaton = &automatons[iv.index]; | ||||
|                 let distance = automaton.eval(input).to_u8(); | ||||
|                 let is_exact = distance == 0 && input.len() == automaton.query_len(); | ||||
|  | ||||
|                 let doc_indexes = &self.index.indexes; | ||||
|                 let doc_indexes = &doc_indexes[iv.value as usize]; | ||||
|  | ||||
|                 for di in doc_indexes { | ||||
|                     if self.searchable_attrs.as_ref().map_or(true, |r| r.contains(&di.attribute)) { | ||||
|                         let match_ = Match { | ||||
|                             query_index: iv.index as u32, | ||||
|                             distance: distance, | ||||
|                             attribute: di.attribute, | ||||
|                             word_index: di.word_index, | ||||
|                             is_exact: is_exact, | ||||
|                             char_index: di.char_index, | ||||
|                             char_length: di.char_length, | ||||
|                         }; | ||||
|                         matches.push((di.document_id, match_)); | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         let total_matches = matches.len(); | ||||
|         let raw_documents = raw_documents_from_matches(matches); | ||||
|  | ||||
|         info!("{} total documents to classify", raw_documents.len()); | ||||
|         info!("{} total matches to classify", total_matches); | ||||
|  | ||||
|         raw_documents | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI> | ||||
| where FI: Fn(DocumentId) -> bool, | ||||
| { | ||||
|     pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> { | ||||
|         // We delegate the filter work to the distinct query builder, | ||||
|         // specifying a distinct rule that has no effect. | ||||
|         if self.filter.is_some() { | ||||
|             let builder = self.with_distinct(|_| None as Option<()>, 1); | ||||
|             return builder.query(query, range); | ||||
|         } | ||||
|  | ||||
|         let start = Instant::now(); | ||||
|         let mut documents = self.query_all(query); | ||||
|         info!("query_all took {:.2?}", start.elapsed()); | ||||
|  | ||||
|         let mut groups = vec![documents.as_mut_slice()]; | ||||
|  | ||||
|         'criteria: for (ci, criterion) in self.criteria.as_ref().iter().enumerate() { | ||||
|             let tmp_groups = mem::replace(&mut groups, Vec::new()); | ||||
|             let mut documents_seen = 0; | ||||
|  | ||||
|             for group in tmp_groups { | ||||
|                 info!("criterion {}, documents group of size {}", ci, group.len()); | ||||
|  | ||||
|                 // if this group does not overlap with the requested range, | ||||
|                 // push it without sorting and splitting it | ||||
|                 if documents_seen + group.len() < range.start { | ||||
|                     documents_seen += group.len(); | ||||
|                     groups.push(group); | ||||
|                     continue; | ||||
|                 } | ||||
|  | ||||
|                 let start = Instant::now(); | ||||
|                 group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); | ||||
|                 info!("criterion {} sort took {:.2?}", ci, start.elapsed()); | ||||
|  | ||||
|                 for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { | ||||
|                     documents_seen += group.len(); | ||||
|                     groups.push(group); | ||||
|  | ||||
|                     // we have sort enough documents if the last document sorted is after | ||||
|                     // the end of the requested range, we can continue to the next criterion | ||||
|                     if documents_seen >= range.end { continue 'criteria } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         let offset = cmp::min(documents.len(), range.start); | ||||
|         let iter = documents.into_iter().skip(offset).take(range.len()); | ||||
|         iter.map(|d| Document::from_raw(&d)).collect() | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub struct DistinctQueryBuilder<'i, 'c, FI, FD> { | ||||
|     inner: QueryBuilder<'i, 'c, FI>, | ||||
|     function: FD, | ||||
|     size: usize, | ||||
| } | ||||
|  | ||||
| impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD> | ||||
| { | ||||
|     pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'i, 'c, F, FD> | ||||
|     where F: Fn(DocumentId) -> bool, | ||||
|     { | ||||
|         DistinctQueryBuilder { | ||||
|             inner: self.inner.with_filter(function), | ||||
|             function: self.function, | ||||
|             size: self.size | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn add_searchable_attribute(&mut self, attribute: u16) { | ||||
|         self.inner.add_searchable_attribute(attribute); | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'i, 'c, FI, FD, K> DistinctQueryBuilder<'i, 'c, FI, FD> | ||||
| where FI: Fn(DocumentId) -> bool, | ||||
|       FD: Fn(DocumentId) -> Option<K>, | ||||
|       K: Hash + Eq, | ||||
| { | ||||
|     pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> { | ||||
|         let start = Instant::now(); | ||||
|         let mut documents = self.inner.query_all(query); | ||||
|         info!("query_all took {:.2?}", start.elapsed()); | ||||
|  | ||||
|         let mut groups = vec![documents.as_mut_slice()]; | ||||
|         let mut key_cache = HashMap::new(); | ||||
|  | ||||
|         let mut filter_map = HashMap::new(); | ||||
|         // these two variables informs on the current distinct map and | ||||
|         // on the raw offset of the start of the group where the | ||||
|         // range.start bound is located according to the distinct function | ||||
|         let mut distinct_map = DistinctMap::new(self.size); | ||||
|         let mut distinct_raw_offset = 0; | ||||
|  | ||||
|         'criteria: for (ci, criterion) in self.inner.criteria.as_ref().iter().enumerate() { | ||||
|             let tmp_groups = mem::replace(&mut groups, Vec::new()); | ||||
|             let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map); | ||||
|             let mut documents_seen = 0; | ||||
|  | ||||
|             for group in tmp_groups { | ||||
|                 info!("criterion {}, documents group of size {}", ci, group.len()); | ||||
|  | ||||
|                 // if this group does not overlap with the requested range, | ||||
|                 // push it without sorting and splitting it | ||||
|                 if documents_seen + group.len() < distinct_raw_offset { | ||||
|                     documents_seen += group.len(); | ||||
|                     groups.push(group); | ||||
|                     continue; | ||||
|                 } | ||||
|  | ||||
|                 let start = Instant::now(); | ||||
|                 group.par_sort_unstable_by(|a, b| criterion.evaluate(a, b)); | ||||
|                 info!("criterion {} sort took {:.2?}", ci, start.elapsed()); | ||||
|  | ||||
|                 for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) { | ||||
|                     // we must compute the real distinguished len of this sub-group | ||||
|                     for document in group.iter() { | ||||
|                         let filter_accepted = match &self.inner.filter { | ||||
|                             Some(filter) => { | ||||
|                                 let entry = filter_map.entry(document.id); | ||||
|                                 *entry.or_insert_with(|| (filter)(document.id)) | ||||
|                             }, | ||||
|                             None => true, | ||||
|                         }; | ||||
|  | ||||
|                         if filter_accepted { | ||||
|                             let entry = key_cache.entry(document.id); | ||||
|                             let key = entry.or_insert_with(|| (self.function)(document.id).map(Rc::new)); | ||||
|  | ||||
|                             match key.clone() { | ||||
|                                 Some(key) => buf_distinct.register(key), | ||||
|                                 None => buf_distinct.register_without_key(), | ||||
|                             }; | ||||
|                         } | ||||
|  | ||||
|                         // the requested range end is reached: stop computing distinct | ||||
|                         if buf_distinct.len() >= range.end { break } | ||||
|                     } | ||||
|  | ||||
|                     documents_seen += group.len(); | ||||
|                     groups.push(group); | ||||
|  | ||||
|                     // if this sub-group does not overlap with the requested range | ||||
|                     // we must update the distinct map and its start index | ||||
|                     if buf_distinct.len() < range.start { | ||||
|                         buf_distinct.transfert_to_internal(); | ||||
|                         distinct_raw_offset = documents_seen; | ||||
|                     } | ||||
|  | ||||
|                     // we have sort enough documents if the last document sorted is after | ||||
|                     // the end of the requested range, we can continue to the next criterion | ||||
|                     if buf_distinct.len() >= range.end { continue 'criteria } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         let mut out_documents = Vec::with_capacity(range.len()); | ||||
|         let mut seen = BufferedDistinctMap::new(&mut distinct_map); | ||||
|  | ||||
|         for document in documents.into_iter().skip(distinct_raw_offset) { | ||||
|             let filter_accepted = match &self.inner.filter { | ||||
|                 Some(_) => filter_map.remove(&document.id).expect("BUG: filtered not found"), | ||||
|                 None => true, | ||||
|             }; | ||||
|  | ||||
|             if filter_accepted { | ||||
|                 let key = key_cache.remove(&document.id).expect("BUG: cached key not found"); | ||||
|                 let distinct_accepted = match key { | ||||
|                     Some(key) => seen.register(key), | ||||
|                     None => seen.register_without_key(), | ||||
|                 }; | ||||
|  | ||||
|                 if distinct_accepted && seen.len() > range.start { | ||||
|                     out_documents.push(Document::from_raw(&document)); | ||||
|                     if out_documents.len() == range.len() { break } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         out_documents | ||||
|     } | ||||
| } | ||||
							
								
								
									
										56
									
								
								meilidb-core/src/shared_data_cursor.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										56
									
								
								meilidb-core/src/shared_data_cursor.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,56 @@ | ||||
| use std::io::{self, Read, Cursor, BufRead}; | ||||
| use std::sync::Arc; | ||||
| use crate::data::SharedData; | ||||
|  | ||||
| pub struct SharedDataCursor(Cursor<SharedData>); | ||||
|  | ||||
| impl SharedDataCursor { | ||||
|     pub fn from_bytes(bytes: Vec<u8>) -> SharedDataCursor { | ||||
|         let len = bytes.len(); | ||||
|         let bytes = Arc::new(bytes); | ||||
|  | ||||
|         SharedDataCursor::from_shared_bytes(bytes, 0, len) | ||||
|     } | ||||
|  | ||||
|     pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedDataCursor { | ||||
|         let data = SharedData::new(bytes, offset, len); | ||||
|         let cursor = Cursor::new(data); | ||||
|  | ||||
|         SharedDataCursor(cursor) | ||||
|     } | ||||
|  | ||||
|     pub fn extract(&mut self, amt: usize) -> SharedData { | ||||
|         let offset = self.0.position() as usize; | ||||
|         let extracted = self.0.get_ref().range(offset, amt); | ||||
|         self.0.consume(amt); | ||||
|  | ||||
|         extracted | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl Read for SharedDataCursor { | ||||
|     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { | ||||
|         self.0.read(buf) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl BufRead for SharedDataCursor { | ||||
|     fn fill_buf(&mut self) -> io::Result<&[u8]> { | ||||
|         self.0.fill_buf() | ||||
|     } | ||||
|  | ||||
|     fn consume(&mut self, amt: usize) { | ||||
|         self.0.consume(amt) | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub trait FromSharedDataCursor: Sized { | ||||
|     type Error; | ||||
|  | ||||
|     fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error>; | ||||
|  | ||||
|     fn from_bytes(bytes: Vec<u8>) -> Result<Self, Self::Error> { | ||||
|         let mut cursor = SharedDataCursor::from_bytes(bytes); | ||||
|         Self::from_shared_data_cursor(&mut cursor) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										9
									
								
								meilidb-core/src/write_to_bytes.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								meilidb-core/src/write_to_bytes.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,9 @@ | ||||
/// Types that can serialize themselves by appending to a byte buffer.
pub trait WriteToBytes {
    /// Appends the serialized form of `self` to `bytes`.
    fn write_to_bytes(&self, bytes: &mut Vec<u8>);

    /// Serializes `self` into a freshly allocated buffer.
    fn into_bytes(&self) -> Vec<u8> {
        let mut buffer = Vec::new();
        self.write_to_bytes(&mut buffer);
        buffer
    }
}
		Reference in New Issue
	
	Block a user