mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Merge #736
736: Update charabia r=curquiza a=ManyTheFish Update Charabia to the latest version. > We are now romanizing Chinese characters into Pinyin. > Note that we keep the accents because they are in fact never typed directly by the end user; moreover, changing an accent leads to a different Chinese character, and I don't have sufficient knowledge to forecast the impact of removing accents in this context. Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
		| @@ -14,14 +14,14 @@ const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>"; | ||||
| /// Structure used to build a Matcher allowing to customize formating tags. | ||||
| pub struct MatcherBuilder<'a, A> { | ||||
|     matching_words: MatchingWords, | ||||
|     tokenizer: Tokenizer<'a, A>, | ||||
|     tokenizer: Tokenizer<'a, 'a, A>, | ||||
|     crop_marker: Option<String>, | ||||
|     highlight_prefix: Option<String>, | ||||
|     highlight_suffix: Option<String>, | ||||
| } | ||||
|  | ||||
| impl<'a, A> MatcherBuilder<'a, A> { | ||||
|     pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, A>) -> Self { | ||||
|     pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { | ||||
|         Self { | ||||
|             matching_words, | ||||
|             tokenizer, | ||||
| @@ -106,7 +106,7 @@ pub struct MatchBounds { | ||||
| pub struct Matcher<'t, 'm, A> { | ||||
|     text: &'t str, | ||||
|     matching_words: &'m MatchingWords, | ||||
|     tokenizer: &'m Tokenizer<'m, A>, | ||||
|     tokenizer: &'m Tokenizer<'m, 'm, A>, | ||||
|     crop_marker: &'m str, | ||||
|     highlight_prefix: &'m str, | ||||
|     highlight_suffix: &'m str, | ||||
|   | ||||
| @@ -6,7 +6,7 @@ use std::hash::Hash; | ||||
| use std::rc::Rc; | ||||
| use std::{fmt, mem}; | ||||
|  | ||||
| use charabia::classifier::ClassifiedTokenIter; | ||||
| use charabia::normalizer::NormalizedTokenIter; | ||||
| use charabia::{SeparatorKind, TokenKind}; | ||||
| use roaring::RoaringBitmap; | ||||
| use slice_group_by::GroupBy; | ||||
| @@ -270,7 +270,7 @@ impl<'a> QueryTreeBuilder<'a> { | ||||
|     ///   (the criterion `typo` will be ignored) | ||||
|     pub fn build<A: AsRef<[u8]>>( | ||||
|         &self, | ||||
|         query: ClassifiedTokenIter<A>, | ||||
|         query: NormalizedTokenIter<A>, | ||||
|     ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> { | ||||
|         let primitive_query = create_primitive_query(query, self.words_limit); | ||||
|         if !primitive_query.is_empty() { | ||||
| @@ -778,7 +778,7 @@ impl PrimitiveQueryPart { | ||||
| /// Create primitive query from tokenized query string, | ||||
| /// the primitive query is an intermediate state to build the query tree. | ||||
| fn create_primitive_query<A>( | ||||
|     query: ClassifiedTokenIter<A>, | ||||
|     query: NormalizedTokenIter<A>, | ||||
|     words_limit: Option<usize>, | ||||
| ) -> PrimitiveQuery | ||||
| where | ||||
| @@ -892,7 +892,7 @@ mod test { | ||||
|             terms_matching_strategy: TermsMatchingStrategy, | ||||
|             authorize_typos: bool, | ||||
|             words_limit: Option<usize>, | ||||
|             query: ClassifiedTokenIter<A>, | ||||
|             query: NormalizedTokenIter<A>, | ||||
|         ) -> Result<Option<(Operation, PrimitiveQuery)>> { | ||||
|             let primitive_query = create_primitive_query(query, words_limit); | ||||
|             if !primitive_query.is_empty() { | ||||
|   | ||||
| @@ -1575,11 +1575,11 @@ mod tests { | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         // Only the first document should match. | ||||
|         let count = index.word_docids.get(&rtxn, "化妆包").unwrap().unwrap().len(); | ||||
|         let count = index.word_docids.get(&rtxn, "huàzhuāngbāo").unwrap().unwrap().len(); | ||||
|         assert_eq!(count, 1); | ||||
|  | ||||
|         // Only the second document should match. | ||||
|         let count = index.word_docids.get(&rtxn, "包").unwrap().unwrap().len(); | ||||
|         let count = index.word_docids.get(&rtxn, "bāo").unwrap().unwrap().len(); | ||||
|         assert_eq!(count, 1); | ||||
|  | ||||
|         let mut search = crate::Search::new(&rtxn, &index); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user