mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 13:06:27 +00:00 
			
		
		
		
	Update Charabia
This commit is contained in:
		| @@ -256,7 +256,8 @@ pub(crate) mod tests { | ||||
|         let temp_index = temp_index_with_documents(); | ||||
|         let rtxn = temp_index.read_txn().unwrap(); | ||||
|         let mut ctx = SearchContext::new(&temp_index, &rtxn); | ||||
|         let tokenizer = TokenizerBuilder::new().build(); | ||||
|         let mut builder = TokenizerBuilder::default(); | ||||
|         let tokenizer = builder.build(); | ||||
|         let tokens = tokenizer.tokenize("split this world"); | ||||
|         let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); | ||||
|         let matching_words = MatchingWords::new(ctx, query_terms); | ||||
|   | ||||
| @@ -12,16 +12,16 @@ const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>"; | ||||
| const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>"; | ||||
|  | ||||
| /// Structure used to build a Matcher allowing to customize formatting tags. | ||||
| pub struct MatcherBuilder<'a, A> { | ||||
| pub struct MatcherBuilder<'m> { | ||||
|     matching_words: MatchingWords, | ||||
|     tokenizer: Tokenizer<'a, 'a, A>, | ||||
|     tokenizer: Tokenizer<'m>, | ||||
|     crop_marker: Option<String>, | ||||
|     highlight_prefix: Option<String>, | ||||
|     highlight_suffix: Option<String>, | ||||
| } | ||||
|  | ||||
| impl<'a, A> MatcherBuilder<'a, A> { | ||||
|     pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { | ||||
| impl<'m> MatcherBuilder<'m> { | ||||
|     pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self { | ||||
|         Self { | ||||
|             matching_words, | ||||
|             tokenizer, | ||||
| @@ -46,7 +46,7 @@ impl<'a, A> MatcherBuilder<'a, A> { | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { | ||||
|     pub fn build<'t>(&'m self, text: &'t str) -> Matcher<'t, 'm> { | ||||
|         let crop_marker = match &self.crop_marker { | ||||
|             Some(marker) => marker.as_str(), | ||||
|             None => DEFAULT_CROP_MARKER, | ||||
| @@ -103,17 +103,17 @@ pub struct MatchBounds { | ||||
|  | ||||
| /// Structure used to analyze a string, compute words that match, | ||||
| /// and format the source string, returning a highlighted and cropped sub-string. | ||||
| pub struct Matcher<'t, 'm, A> { | ||||
| pub struct Matcher<'t, 'm> { | ||||
|     text: &'t str, | ||||
|     matching_words: &'m MatchingWords, | ||||
|     tokenizer: &'m Tokenizer<'m, 'm, A>, | ||||
|     tokenizer: &'m Tokenizer<'m>, | ||||
|     crop_marker: &'m str, | ||||
|     highlight_prefix: &'m str, | ||||
|     highlight_suffix: &'m str, | ||||
|     matches: Option<(Vec<Token<'t>>, Vec<Match>)>, | ||||
| } | ||||
|  | ||||
| impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { | ||||
| impl<'t> Matcher<'t, '_> { | ||||
|     /// Iterates over tokens and save any of them that matches the query. | ||||
|     fn compute_matches(&mut self) -> &mut Self { | ||||
|         /// some words are counted as matches only if they are close together and in the good order, | ||||
| @@ -503,7 +503,7 @@ mod tests { | ||||
|     use crate::index::tests::TempIndex; | ||||
|     use crate::{execute_search, SearchContext}; | ||||
|  | ||||
|     impl<'a> MatcherBuilder<'a, &[u8]> { | ||||
|     impl<'a> MatcherBuilder<'a> { | ||||
|         fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self { | ||||
|             let mut ctx = SearchContext::new(index, rtxn); | ||||
|             let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search( | ||||
| @@ -530,7 +530,7 @@ mod tests { | ||||
|                 None => MatchingWords::default(), | ||||
|             }; | ||||
|  | ||||
|             MatcherBuilder::new(matching_words, TokenizerBuilder::new().build()) | ||||
|             MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer()) | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -690,7 +690,7 @@ mod tests { | ||||
|         // should crop the phrase instead of cropping around the match. | ||||
|         insta::assert_snapshot!( | ||||
|             matcher.format(format_options), | ||||
|             @"… Split The World is a book written by Emily Henry…" | ||||
|             @"…Split The World is a book written by Emily Henry…" | ||||
|         ); | ||||
|  | ||||
|         // Text containing some matches. | ||||
|   | ||||
| @@ -7,7 +7,7 @@ use crate::{Result, SearchContext, MAX_WORD_LENGTH}; | ||||
| /// Convert the tokenised search query into a list of located query terms. | ||||
| pub fn located_query_terms_from_tokens( | ||||
|     ctx: &mut SearchContext, | ||||
|     query: NormalizedTokenIter<&[u8]>, | ||||
|     query: NormalizedTokenIter, | ||||
|     words_limit: Option<usize>, | ||||
| ) -> Result<Vec<LocatedQueryTerm>> { | ||||
|     let nbr_typos = number_of_typos_allowed(ctx)?; | ||||
| @@ -303,7 +303,8 @@ mod tests { | ||||
|  | ||||
|     #[test] | ||||
|     fn start_with_hard_separator() -> Result<()> { | ||||
|         let tokenizer = TokenizerBuilder::new().build(); | ||||
|         let mut builder = TokenizerBuilder::default(); | ||||
|         let tokenizer = builder.build(); | ||||
|         let tokens = tokenizer.tokenize("."); | ||||
|         let index = temp_index_with_documents(); | ||||
|         let rtxn = index.read_txn()?; | ||||
|   | ||||
| @@ -113,7 +113,7 @@ fn test_ignore_stop_words() { | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 9, | ||||
|                     rank: 7, | ||||
|                     max_rank: 11, | ||||
|                 }, | ||||
|             ), | ||||
| @@ -166,7 +166,7 @@ fn test_ignore_stop_words() { | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 9, | ||||
|                     rank: 7, | ||||
|                     max_rank: 11, | ||||
|                 }, | ||||
|             ), | ||||
| @@ -219,7 +219,7 @@ fn test_ignore_stop_words() { | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 9, | ||||
|                     rank: 7, | ||||
|                     max_rank: 11, | ||||
|                 }, | ||||
|             ), | ||||
| @@ -259,7 +259,7 @@ fn test_ignore_stop_words() { | ||||
|             ), | ||||
|             Proximity( | ||||
|                 Rank { | ||||
|                     rank: 7, | ||||
|                     rank: 1, | ||||
|                     max_rank: 8, | ||||
|                 }, | ||||
|             ), | ||||
| @@ -271,7 +271,7 @@ fn test_ignore_stop_words() { | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 17, | ||||
|                     rank: 15, | ||||
|                     max_rank: 21, | ||||
|                 }, | ||||
|             ), | ||||
| @@ -411,7 +411,7 @@ fn test_stop_words_in_phrase() { | ||||
|             ), | ||||
|             Proximity( | ||||
|                 Rank { | ||||
|                     rank: 6, | ||||
|                     rank: 1, | ||||
|                     max_rank: 8, | ||||
|                 }, | ||||
|             ), | ||||
| @@ -423,7 +423,7 @@ fn test_stop_words_in_phrase() { | ||||
|             ), | ||||
|             Position( | ||||
|                 Rank { | ||||
|                     rank: 29, | ||||
|                     rank: 27, | ||||
|                     max_rank: 31, | ||||
|                 }, | ||||
|             ), | ||||
|   | ||||
| @@ -128,10 +128,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | ||||
|         .map(|reader| (documents_ids, reader, script_language_docids)) | ||||
| } | ||||
|  | ||||
| fn extract_tokens_from_document<T: AsRef<[u8]>>( | ||||
| fn extract_tokens_from_document( | ||||
|     obkv: &KvReader<FieldId>, | ||||
|     searchable_fields: &Option<HashSet<FieldId>>, | ||||
|     tokenizer: &Tokenizer<T>, | ||||
|     tokenizer: &Tokenizer, | ||||
|     max_positions_per_attributes: u32, | ||||
|     buffers: &mut Buffers, | ||||
|     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>, | ||||
|   | ||||
| @@ -1,7 +1,7 @@ | ||||
| use std::collections::{BTreeSet, HashMap, HashSet}; | ||||
| use std::result::Result as StdResult; | ||||
|  | ||||
| use charabia::{Tokenizer, TokenizerBuilder}; | ||||
| use charabia::{Normalize, Tokenizer, TokenizerBuilder}; | ||||
| use deserr::{DeserializeError, Deserr}; | ||||
| use itertools::Itertools; | ||||
| use serde::{Deserialize, Deserializer, Serialize, Serializer}; | ||||
| @@ -413,6 +413,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|         match self.stop_words { | ||||
|             Setting::Set(ref stop_words) => { | ||||
|                 let current = self.index.stop_words(self.wtxn)?; | ||||
|  | ||||
|                 // Apply a non-lossy normalization on stop_words | ||||
|                 let stop_words = stop_words | ||||
|                     .iter() | ||||
|                     .map(|w| w.as_str().normalize(&Default::default()).into_owned()); | ||||
|  | ||||
|                 // since we can't compare a BTreeSet with an FST we are going to convert the | ||||
|                 // BTreeSet to an FST and then compare bytes per bytes the two FSTs. | ||||
|                 let fst = fst::Set::from_iter(stop_words)?; | ||||
| @@ -436,7 +442,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|     fn update_synonyms(&mut self) -> Result<bool> { | ||||
|         match self.synonyms { | ||||
|             Setting::Set(ref synonyms) => { | ||||
|                 fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> { | ||||
|                 fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> { | ||||
|                     tokenizer | ||||
|                         .tokenize(text) | ||||
|                         .filter_map(|token| { | ||||
| @@ -637,7 +643,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | ||||
|     fn update_exact_words(&mut self) -> Result<()> { | ||||
|         match self.exact_words { | ||||
|             Setting::Set(ref mut words) => { | ||||
|                 fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String { | ||||
|                 fn normalize(tokenizer: &Tokenizer, text: &str) -> String { | ||||
|                     tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect() | ||||
|                 } | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user