	Separate calc_byte_length function
@@ -8,6 +8,7 @@ use std::cmp::{max, min};
 use charabia::{Language, SeparatorKind, Token, Tokenizer};
 use either::Either;
+use itertools::Itertools;
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch};
 use r#match::{Match, MatchPosition};
@@ -229,12 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 .iter()
                 .map(|m| MatchBounds {
                     start: tokens[m.get_first_token_pos()].byte_start,
-                    length: (m.get_first_token_pos()..m.get_last_token_pos() + 1)
-                        .map(|i| tokens[i].clone())
-                        .flat_map(|token| token.char_map.clone().unwrap_or(vec![(1, 1); token.char_end - token.char_start] /* Some token doesn't have a char map, here we treat them as single byte chars. */))
-                        .map(|(original, _)| original as usize)
-                        .take(m.char_count)
-                        .sum(),
+                    length: self.calc_byte_length(&tokens, m),
                     indices: if array_indices.is_empty() {
                         None
                     } else {
@@ -245,6 +241,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
+    fn calc_byte_length(&self, tokens: &Vec<Token<'t>>, m: &Match) -> usize {
+        (m.get_first_token_pos()..=m.get_last_token_pos())
+            .flat_map(|i| match &tokens[i].char_map {
+                Some(char_map) => {
+                    char_map.iter().map(|(original, _)| *original as usize).collect_vec()
+                }
+                None => tokens[i].lemma().chars().map(|c| c.len_utf8()).collect_vec(),
+            })
+            .take(m.char_count)
+            .sum()
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
         let (
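Beyond extracting the helper, the refactor also changes the fallback for tokens without a char_map: the old inline code counted every such character as one byte, while calc_byte_length uses the actual UTF-8 width of each lemma character. Below is a minimal, self-contained sketch of that logic; the simplified Token struct and the standalone calc_byte_length signature are hypothetical stand-ins for the real charabia/milli types, under the assumption that each char_map entry is an (original_byte_len, normalized_byte_len) pair for one character.

// Simplified stand-in for charabia's Token (assumption: char_map records
// (original_byte_len, normalized_byte_len) for each original character).
struct Token {
    lemma: String,
    char_map: Option<Vec<(u8, u8)>>,
}

// Sums the byte length of the first `char_count` characters across `tokens`,
// mirroring the extracted helper: use char_map when present, otherwise fall
// back to the UTF-8 width of each lemma character (plain collect() replaces
// itertools' collect_vec() to keep the sketch dependency-free).
fn calc_byte_length(tokens: &[Token], char_count: usize) -> usize {
    tokens
        .iter()
        .flat_map(|token| match &token.char_map {
            Some(map) => map.iter().map(|(original, _)| *original as usize).collect::<Vec<_>>(),
            None => token.lemma.chars().map(|c| c.len_utf8()).collect::<Vec<_>>(),
        })
        .take(char_count)
        .sum()
}

fn main() {
    let tokens = vec![
        Token { lemma: "café".into(), char_map: None }, // 'é' is 2 bytes in UTF-8
        Token { lemma: "ab".into(), char_map: Some(vec![(1, 1), (1, 1)]) },
    ];
    // First 5 characters: c(1) + a(1) + f(1) + é(2) + a(1) = 6 bytes.
    // The pre-refactor fallback would have counted 'é' as 1 byte (5 total).
    assert_eq!(calc_byte_length(&tokens, 5), 6);
}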