mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	Merge #3866
3866: Update charabia v0.8.0 r=dureuill a=ManyTheFish
# Pull Request
Update Charabia:
- enhance Japanese segmentation
- enhance Latin Tokenization
  - words containing `_` are now properly segmented into several words
  - brackets `{([])}` are no longer considered context separators, so words separated by brackets are now considered close together for the proximity ranking rule
- fixes #3815
- fixes #3778
- fixes [product#151](https://github.com/meilisearch/product/discussions/151)
> Important note: float numbers are now segmented around the `.`, so `3.22` is segmented as [`3`, `.`, `22`]. However, the middle dot isn't considered a hard separator, which means that searching for `3.22` finds documents containing `3.22`.
Co-authored-by: ManyTheFish <many@meilisearch.com>
			
			
This commit is contained in:
		| @@ -256,7 +256,8 @@ pub(crate) mod tests { | ||||
|         let temp_index = temp_index_with_documents(); | ||||
|         let rtxn = temp_index.read_txn().unwrap(); | ||||
|         let mut ctx = SearchContext::new(&temp_index, &rtxn); | ||||
|         let tokenizer = TokenizerBuilder::new().build(); | ||||
|         let mut builder = TokenizerBuilder::default(); | ||||
|         let tokenizer = builder.build(); | ||||
|         let tokens = tokenizer.tokenize("split this world"); | ||||
|         let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); | ||||
|         let matching_words = MatchingWords::new(ctx, query_terms); | ||||
|   | ||||
| @@ -12,16 +12,16 @@ const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>"; | ||||
| const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>"; | ||||
|  | ||||
| /// Structure used to build a Matcher allowing to customize formating tags. | ||||
| pub struct MatcherBuilder<'a, A> { | ||||
| pub struct MatcherBuilder<'m> { | ||||
|     matching_words: MatchingWords, | ||||
|     tokenizer: Tokenizer<'a, 'a, A>, | ||||
|     tokenizer: Tokenizer<'m>, | ||||
|     crop_marker: Option<String>, | ||||
|     highlight_prefix: Option<String>, | ||||
|     highlight_suffix: Option<String>, | ||||
| } | ||||
|  | ||||
| impl<'a, A> MatcherBuilder<'a, A> { | ||||
|     pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { | ||||
| impl<'m> MatcherBuilder<'m> { | ||||
|     pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self { | ||||
|         Self { | ||||
|             matching_words, | ||||
|             tokenizer, | ||||
| @@ -46,7 +46,7 @@ impl<'a, A> MatcherBuilder<'a, A> { | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { | ||||
|     pub fn build<'t>(&'m self, text: &'t str) -> Matcher<'t, 'm> { | ||||
|         let crop_marker = match &self.crop_marker { | ||||
|             Some(marker) => marker.as_str(), | ||||
|             None => DEFAULT_CROP_MARKER, | ||||
| @@ -103,17 +103,17 @@ pub struct MatchBounds { | ||||
|  | ||||
| /// Structure used to analize a string, compute words that match, | ||||
| /// and format the source string, returning a highlighted and cropped sub-string. | ||||
| pub struct Matcher<'t, 'm, A> { | ||||
| pub struct Matcher<'t, 'm> { | ||||
|     text: &'t str, | ||||
|     matching_words: &'m MatchingWords, | ||||
|     tokenizer: &'m Tokenizer<'m, 'm, A>, | ||||
|     tokenizer: &'m Tokenizer<'m>, | ||||
|     crop_marker: &'m str, | ||||
|     highlight_prefix: &'m str, | ||||
|     highlight_suffix: &'m str, | ||||
|     matches: Option<(Vec<Token<'t>>, Vec<Match>)>, | ||||
| } | ||||
|  | ||||
| impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { | ||||
| impl<'t> Matcher<'t, '_> { | ||||
|     /// Iterates over tokens and save any of them that matches the query. | ||||
|     fn compute_matches(&mut self) -> &mut Self { | ||||
|         /// some words are counted as matches only if they are close together and in the good order, | ||||
| @@ -503,7 +503,7 @@ mod tests { | ||||
|     use crate::index::tests::TempIndex; | ||||
|     use crate::{execute_search, SearchContext}; | ||||
|  | ||||
|     impl<'a> MatcherBuilder<'a, &[u8]> { | ||||
|     impl<'a> MatcherBuilder<'a> { | ||||
|         fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self { | ||||
|             let mut ctx = SearchContext::new(index, rtxn); | ||||
|             let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search( | ||||
| @@ -530,7 +530,7 @@ mod tests { | ||||
|                 None => MatchingWords::default(), | ||||
|             }; | ||||
|  | ||||
|             MatcherBuilder::new(matching_words, TokenizerBuilder::new().build()) | ||||
|             MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer()) | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -690,7 +690,7 @@ mod tests { | ||||
|         // should crop the phrase instead of croping around the match. | ||||
|         insta::assert_snapshot!( | ||||
|             matcher.format(format_options), | ||||
|             @"… Split The World is a book written by Emily Henry…" | ||||
|             @"…Split The World is a book written by Emily Henry…" | ||||
|         ); | ||||
|  | ||||
|         // Text containing some matches. | ||||
|   | ||||
| @@ -7,7 +7,7 @@ use crate::{Result, SearchContext, MAX_WORD_LENGTH}; | ||||
| /// Convert the tokenised search query into a list of located query terms. | ||||
| pub fn located_query_terms_from_tokens( | ||||
|     ctx: &mut SearchContext, | ||||
|     query: NormalizedTokenIter<&[u8]>, | ||||
|     query: NormalizedTokenIter, | ||||
|     words_limit: Option<usize>, | ||||
| ) -> Result<Vec<LocatedQueryTerm>> { | ||||
|     let nbr_typos = number_of_typos_allowed(ctx)?; | ||||
| @@ -303,7 +303,8 @@ mod tests { | ||||
|  | ||||
|     #[test] | ||||
|     fn start_with_hard_separator() -> Result<()> { | ||||
|         let tokenizer = TokenizerBuilder::new().build(); | ||||
|         let mut builder = TokenizerBuilder::default(); | ||||
|         let tokenizer = builder.build(); | ||||
|         let tokens = tokenizer.tokenize("."); | ||||
|         let index = temp_index_with_documents(); | ||||
|         let rtxn = index.read_txn()?; | ||||
|   | ||||
| @@ -128,10 +128,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | ||||
|         .map(|reader| (documents_ids, reader, script_language_docids)) | ||||
| } | ||||
|  | ||||
| fn extract_tokens_from_document<T: AsRef<[u8]>>( | ||||
| fn extract_tokens_from_document( | ||||
|     obkv: &KvReader<FieldId>, | ||||
|     searchable_fields: &Option<HashSet<FieldId>>, | ||||
|     tokenizer: &Tokenizer<T>, | ||||
|     tokenizer: &Tokenizer, | ||||
|     max_positions_per_attributes: u32, | ||||
|     buffers: &mut Buffers, | ||||
|     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>, | ||||
|   | ||||
| @@ -1,18 +1,21 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| 0                [1, ] | ||||
| 1                [2, ] | ||||
| 10.0             [1, ] | ||||
| 10               [1, ] | ||||
| 12               [0, ] | ||||
| 1344             [3, ] | ||||
| 2                [0, ] | ||||
| 23               [5, ] | ||||
| 25.99            [2, ] | ||||
| 3.5              [0, ] | ||||
| 25               [2, ] | ||||
| 3                [0, ] | ||||
| 35               [5, ] | ||||
| 4                [4, ] | ||||
| 42               [0, 5, ] | ||||
| 456              [1, ] | ||||
| 5                [0, ] | ||||
| 99               [2, ] | ||||
| adams            [5, ] | ||||
| adventure        [1, ] | ||||
| alice            [2, ] | ||||
| @@ -29,7 +32,7 @@ galaxy           [5, ] | ||||
| guide            [5, ] | ||||
| half             [4, ] | ||||
| harry            [4, ] | ||||
| hitchhiker'      [5, ] | ||||
| hitchhiker       [5, ] | ||||
| hobbit           [3, ] | ||||
| in               [2, ] | ||||
| j                [3, 4, ] | ||||
|   | ||||
| @@ -1,19 +1,22 @@ | ||||
| --- | ||||
| source: milli/src/update/index_documents/mod.rs | ||||
| --- | ||||
| 0                [1, 7, ] | ||||
| 1                [2, ] | ||||
| 10.0             [1, 7, ] | ||||
| 10               [1, 7, ] | ||||
| 12               [0, 8, ] | ||||
| 1344             [3, ] | ||||
| 1813             [8, ] | ||||
| 2                [0, 8, ] | ||||
| 23               [5, ] | ||||
| 25.99            [2, ] | ||||
| 3.5              [0, 8, ] | ||||
| 25               [2, ] | ||||
| 3                [0, 8, ] | ||||
| 35               [5, ] | ||||
| 4                [4, 6, ] | ||||
| 42               [0, 5, 8, ] | ||||
| 456              [1, 7, ] | ||||
| 5                [0, 8, ] | ||||
| 99               [2, ] | ||||
| adams            [5, ] | ||||
| adventure        [1, 7, ] | ||||
| alice            [2, ] | ||||
| @@ -31,7 +34,7 @@ galaxy           [5, ] | ||||
| guide            [5, ] | ||||
| half             [4, 6, ] | ||||
| harry            [4, 6, ] | ||||
| hitchhiker'      [5, ] | ||||
| hitchhiker       [5, ] | ||||
| hobbit           [3, ] | ||||
| in               [2, ] | ||||
| j                [3, 4, 6, 8, ] | ||||
|   | ||||
| @@ -1,7 +1,7 @@ | ||||
| use std::collections::{BTreeSet, HashMap, HashSet}; | ||||
| use std::result::Result as StdResult; | ||||
|  | ||||
| use charabia::{Tokenizer, TokenizerBuilder}; | ||||
| use charabia::{Normalize, Tokenizer, TokenizerBuilder}; | ||||
| use deserr::{DeserializeError, Deserr}; | ||||
| use itertools::Itertools; | ||||
| use serde::{Deserialize, Deserializer, Serialize, Serializer}; | ||||
| @@ -423,6 +423,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|         match self.stop_words { | ||||
|             Setting::Set(ref stop_words) => { | ||||
|                 let current = self.index.stop_words(self.wtxn)?; | ||||
|  | ||||
|                 // Apply an unlossy normalization on stop_words | ||||
|                 let stop_words = stop_words | ||||
|                     .iter() | ||||
|                     .map(|w| w.as_str().normalize(&Default::default()).into_owned()); | ||||
|  | ||||
|                 // since we can't compare a BTreeSet with an FST we are going to convert the | ||||
|                 // BTreeSet to an FST and then compare bytes per bytes the two FSTs. | ||||
|                 let fst = fst::Set::from_iter(stop_words)?; | ||||
| @@ -446,7 +452,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|     fn update_synonyms(&mut self) -> Result<bool> { | ||||
|         match self.synonyms { | ||||
|             Setting::Set(ref synonyms) => { | ||||
|                 fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> { | ||||
|                 fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> { | ||||
|                     tokenizer | ||||
|                         .tokenize(text) | ||||
|                         .filter_map(|token| { | ||||
| @@ -647,7 +653,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|     fn update_exact_words(&mut self) -> Result<()> { | ||||
|         match self.exact_words { | ||||
|             Setting::Set(ref mut words) => { | ||||
|                 fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String { | ||||
|                 fn normalize(tokenizer: &Tokenizer, text: &str) -> String { | ||||
|                     tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect() | ||||
|                 } | ||||
|  | ||||
|   | ||||
| @@ -1,27 +1,28 @@ | ||||
| --- | ||||
| source: milli/src/update/delete_documents.rs | ||||
| --- | ||||
| 1_36             [3, ] | ||||
| 1_37             [4, ] | ||||
| 1_38             [5, ] | ||||
| 1_39             [6, ] | ||||
| 1_40             [7, ] | ||||
| 1_41             [8, ] | ||||
| 1_42             [9, ] | ||||
| 1_43             [10, ] | ||||
| 1_44             [11, ] | ||||
| 1_45             [12, ] | ||||
| 1_46             [13, ] | ||||
| 1_47             [14, ] | ||||
| 1_5              [1, ] | ||||
| 1_52             [15, ] | ||||
| 1_57             [16, ] | ||||
| 1_58             [17, ] | ||||
| 1_68             [18, ] | ||||
| 1_69             [19, ] | ||||
| 1_7              [2, ] | ||||
| 1_71             [21, ] | ||||
| 2.2              [21, ] | ||||
| 1                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ] | ||||
| 2                [21, ] | ||||
| 36               [3, ] | ||||
| 37               [4, ] | ||||
| 38               [5, ] | ||||
| 39               [6, ] | ||||
| 40               [7, ] | ||||
| 41               [8, ] | ||||
| 42               [9, ] | ||||
| 43               [10, ] | ||||
| 44               [11, ] | ||||
| 45               [12, ] | ||||
| 46               [13, ] | ||||
| 47               [14, ] | ||||
| 5                [1, ] | ||||
| 52               [15, ] | ||||
| 57               [16, ] | ||||
| 58               [17, ] | ||||
| 68               [18, ] | ||||
| 69               [19, ] | ||||
| 7                [2, ] | ||||
| 71               [21, ] | ||||
| abstract         [2, 6, 10, 13, 14, 15, 16, 17, ] | ||||
| aquarium         [5, ] | ||||
| art              [4, 5, 8, 9, 10, 12, 17, ] | ||||
|   | ||||
| @@ -1,4 +1,25 @@ | ||||
| --- | ||||
| source: milli/src/update/delete_documents.rs | ||||
| --- | ||||
| 1  1                36               [3, ] | ||||
| 1  1                37               [4, ] | ||||
| 1  1                38               [5, ] | ||||
| 1  1                39               [6, ] | ||||
| 1  1                40               [7, ] | ||||
| 1  1                41               [8, ] | ||||
| 1  1                42               [9, ] | ||||
| 1  1                43               [10, ] | ||||
| 1  1                44               [11, ] | ||||
| 1  1                45               [12, ] | ||||
| 1  1                46               [13, ] | ||||
| 1  1                47               [14, ] | ||||
| 1  1                5                [1, ] | ||||
| 1  1                52               [15, ] | ||||
| 1  1                57               [16, ] | ||||
| 1  1                58               [17, ] | ||||
| 1  1                68               [18, ] | ||||
| 1  1                69               [19, ] | ||||
| 1  1                7                [2, ] | ||||
| 1  1                71               [21, ] | ||||
| 1  2                2                [21, ] | ||||
|  | ||||
|   | ||||
| @@ -1,31 +1,31 @@ | ||||
| --- | ||||
| source: milli/src/update/delete_documents.rs | ||||
| --- | ||||
| 1.2              [20, 22, ] | ||||
| 1_36             [3, ] | ||||
| 1_37             [4, ] | ||||
| 1_38             [5, ] | ||||
| 1_39             [6, ] | ||||
| 1_4              [0, ] | ||||
| 1_40             [7, ] | ||||
| 1_41             [8, ] | ||||
| 1_42             [9, ] | ||||
| 1_43             [10, ] | ||||
| 1_44             [11, ] | ||||
| 1_45             [12, ] | ||||
| 1_46             [13, ] | ||||
| 1_47             [14, ] | ||||
| 1_5              [1, ] | ||||
| 1_52             [15, ] | ||||
| 1_57             [16, ] | ||||
| 1_58             [17, ] | ||||
| 1_68             [18, ] | ||||
| 1_69             [19, ] | ||||
| 1_7              [2, ] | ||||
| 1_70             [20, ] | ||||
| 1_71             [21, ] | ||||
| 1_72             [22, ] | ||||
| 2.2              [21, ] | ||||
| 1                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ] | ||||
| 2                [20, 21, 22, ] | ||||
| 36               [3, ] | ||||
| 37               [4, ] | ||||
| 38               [5, ] | ||||
| 39               [6, ] | ||||
| 4                [0, ] | ||||
| 40               [7, ] | ||||
| 41               [8, ] | ||||
| 42               [9, ] | ||||
| 43               [10, ] | ||||
| 44               [11, ] | ||||
| 45               [12, ] | ||||
| 46               [13, ] | ||||
| 47               [14, ] | ||||
| 5                [1, ] | ||||
| 52               [15, ] | ||||
| 57               [16, ] | ||||
| 58               [17, ] | ||||
| 68               [18, ] | ||||
| 69               [19, ] | ||||
| 7                [2, ] | ||||
| 70               [20, ] | ||||
| 71               [21, ] | ||||
| 72               [22, ] | ||||
| abstract         [2, 6, 10, 13, 14, 15, 16, 17, ] | ||||
| aquarium         [5, ] | ||||
| art              [4, 5, 8, 9, 10, 12, 17, ] | ||||
|   | ||||
| @@ -1,4 +1,29 @@ | ||||
| --- | ||||
| source: milli/src/update/delete_documents.rs | ||||
| --- | ||||
| 1  1                2                [20, 22, ] | ||||
| 1  1                36               [3, ] | ||||
| 1  1                37               [4, ] | ||||
| 1  1                38               [5, ] | ||||
| 1  1                39               [6, ] | ||||
| 1  1                4                [0, ] | ||||
| 1  1                40               [7, ] | ||||
| 1  1                41               [8, ] | ||||
| 1  1                42               [9, ] | ||||
| 1  1                43               [10, ] | ||||
| 1  1                44               [11, ] | ||||
| 1  1                45               [12, ] | ||||
| 1  1                46               [13, ] | ||||
| 1  1                47               [14, ] | ||||
| 1  1                5                [1, ] | ||||
| 1  1                52               [15, ] | ||||
| 1  1                57               [16, ] | ||||
| 1  1                58               [17, ] | ||||
| 1  1                68               [18, ] | ||||
| 1  1                69               [19, ] | ||||
| 1  1                7                [2, ] | ||||
| 1  1                70               [20, ] | ||||
| 1  1                71               [21, ] | ||||
| 1  1                72               [22, ] | ||||
| 1  2                2                [21, ] | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user