	Separate calc_byte_length function
@@ -8,6 +8,7 @@ use std::cmp::{max, min};
 use charabia::{Language, SeparatorKind, Token, Tokenizer};
 use either::Either;
+use itertools::Itertools;
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch};
 use r#match::{Match, MatchPosition};
@@ -229,12 +230,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 .iter()
                 .map(|m| MatchBounds {
                     start: tokens[m.get_first_token_pos()].byte_start,
-                    length: (m.get_first_token_pos()..m.get_last_token_pos() + 1)
-                        .map(|i| tokens[i].clone())
-                        .flat_map(|token| token.char_map.clone().unwrap_or(vec![(1, 1); token.char_end - token.char_start] /* Some token doesn't have a char map, here we treat them as single byte chars. */))
-                        .map(|(original, _)| original as usize)
-                        .take(m.char_count)
-                        .sum(),
+                    length: self.calc_byte_length(&tokens, m),
                     indices: if array_indices.is_empty() {
                         None
                     } else {
@@ -245,6 +241,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
+    fn calc_byte_length(&self, tokens: &Vec<Token<'t>>, m: &Match) -> usize {
+        (m.get_first_token_pos()..=m.get_last_token_pos())
+            .flat_map(|i| match &tokens[i].char_map {
+                Some(char_map) => {
+                    char_map.iter().map(|(original, _)| *original as usize).collect_vec()
+                }
+                None => tokens[i].lemma().chars().map(|c| c.len_utf8()).collect_vec(),
+            })
+            .take(m.char_count)
+            .sum()
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
         let (
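Beyond extracting the helper, the refactor also changes the fallback for tokens without a char_map: the old inline code counted every such character as one byte, while calc_byte_length uses the actual UTF-8 width of each lemma character. Below is a minimal, self-contained sketch of that logic; the simplified Token struct and the standalone calc_byte_length signature are hypothetical stand-ins for the real charabia/milli types, under the assumption that each char_map entry is an (original_byte_len, normalized_byte_len) pair for one character.

// Simplified stand-in for charabia's Token (assumption: char_map records
// (original_byte_len, normalized_byte_len) for each original character).
struct Token {
    lemma: String,
    char_map: Option<Vec<(u8, u8)>>,
}

// Sums the byte length of the first `char_count` characters across `tokens`,
// mirroring the extracted helper: use char_map when present, otherwise fall
// back to the UTF-8 width of each lemma character (plain collect() replaces
// itertools' collect_vec() to keep the sketch dependency-free).
fn calc_byte_length(tokens: &[Token], char_count: usize) -> usize {
    tokens
        .iter()
        .flat_map(|token| match &token.char_map {
            Some(map) => map.iter().map(|(original, _)| *original as usize).collect::<Vec<_>>(),
            None => token.lemma.chars().map(|c| c.len_utf8()).collect::<Vec<_>>(),
        })
        .take(char_count)
        .sum()
}

fn main() {
    let tokens = vec![
        Token { lemma: "café".into(), char_map: None }, // 'é' is 2 bytes in UTF-8
        Token { lemma: "ab".into(), char_map: Some(vec![(1, 1), (1, 1)]) },
    ];
    // First 5 characters: c(1) + a(1) + f(1) + é(2) + a(1) = 6 bytes.
    // The pre-refactor fallback would have counted 'é' as 1 byte (5 total).
    assert_eq!(calc_byte_length(&tokens, 5), 6);
}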