Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-10-31 16:06:31 +00:00

	Separate calc_byte_length function
@@ -8,6 +8,7 @@ use std::cmp::{max, min};
 
 use charabia::{Language, SeparatorKind, Token, Tokenizer};
 use either::Either;
+use itertools::Itertools;
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch};
 use r#match::{Match, MatchPosition};
@@ -229,12 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 .iter()
                 .map(|m| MatchBounds {
                     start: tokens[m.get_first_token_pos()].byte_start,
-                    length: (m.get_first_token_pos()..m.get_last_token_pos() + 1)
-                        .map(|i| tokens[i].clone())
-                        .flat_map(|token| token.char_map.clone().unwrap_or(vec![(1, 1); token.char_end - token.char_start] /* Some token doesn't have a char map, here we treat them as single byte chars. */))
-                        .map(|(original, _)| original as usize)
-                        .take(m.char_count)
-                        .sum(),
+                    length: self.calc_byte_length(&tokens, m),
                     indices: if array_indices.is_empty() {
                         None
                     } else {
@@ -245,6 +241,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
+    fn calc_byte_length(&self, tokens: &Vec<Token<'t>>, m: &Match) -> usize {
+        (m.get_first_token_pos()..=m.get_last_token_pos())
+            .flat_map(|i| match &tokens[i].char_map {
+                Some(char_map) => {
+                    char_map.iter().map(|(original, _)| *original as usize).collect_vec()
+                }
+                None => tokens[i].lemma().chars().map(|c| c.len_utf8()).collect_vec(),
+            })
+            .take(m.char_count)
+            .sum()
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
         let (
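For context, here is a minimal standalone sketch of the byte-length computation this commit extracts into calc_byte_length. The simplified Token struct and the free-function signature are hypothetical stand-ins for charabia's real token type: char_map, when present, records the original byte width of each normalized character, and the new fallback counts each lemma character's UTF-8 width instead of assuming one byte per character as the removed `(1, 1)` fallback did.

// Simplified stand-in for charabia's token: `char_map`, when present, maps
// each normalized character to (original byte width, normalized byte width).
struct Token {
    lemma: String,
    char_map: Option<Vec<(u8, u8)>>,
}

// Sum the original byte length of the first `char_count` characters spanned
// by `tokens`: prefer the recorded char_map widths, otherwise fall back to
// each character's UTF-8 width in the normalized lemma.
fn calc_byte_length(tokens: &[Token], char_count: usize) -> usize {
    tokens
        .iter()
        .flat_map(|token| match &token.char_map {
            Some(char_map) => {
                char_map.iter().map(|(original, _)| *original as usize).collect::<Vec<_>>()
            }
            None => token.lemma.chars().map(|c| c.len_utf8()).collect::<Vec<_>>(),
        })
        .take(char_count)
        .sum()
}

fn main() {
    // "héllo" has five chars but six bytes ('é' is two bytes in UTF-8);
    // the old one-byte-per-char fallback would have reported 5 here.
    let tokens = vec![Token { lemma: "héllo".to_string(), char_map: None }];
    assert_eq!(calc_byte_length(&tokens, 5), 6);
}

Note that collect is what unifies the two match arms into one concrete type so flat_map accepts the closure; itertools' collect_vec in the diff plays the same role as collect::<Vec<_>>() here, which is why the commit adds the `use itertools::Itertools;` import.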