Update Charabia

2025-10-27 14:06:27 +00:00 · 2023-06-28 18:52:32 +02:00
parent 9deeec88e0
commit 84845de9ef
9 changed files with 150 additions and 140 deletions
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@@ -256,7 +256,8 @@ pub(crate) mod tests {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
-        let tokenizer = TokenizerBuilder::new().build();
+        let mut builder = TokenizerBuilder::default();
+        let tokenizer = builder.build();
        let tokens = tokenizer.tokenize("split this world");
        let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
        let matching_words = MatchingWords::new(ctx, query_terms);
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -12,16 +12,16 @@ const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
 const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";

 /// Structure used to build a Matcher allowing to customize formating tags.
-pub struct MatcherBuilder<'a, A> {
+pub struct MatcherBuilder<'m> {
    matching_words: MatchingWords,
-    tokenizer: Tokenizer<'a, 'a, A>,
+    tokenizer: Tokenizer<'m>,
    crop_marker: Option<String>,
    highlight_prefix: Option<String>,
    highlight_suffix: Option<String>,
 }

-impl<'a, A> MatcherBuilder<'a, A> {
-    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
+impl<'m> MatcherBuilder<'m> {
+    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self {
        Self {
            matching_words,
            tokenizer,
@@ -46,7 +46,7 @@ impl<'a, A> MatcherBuilder<'a, A> {
        self
    }

-    pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> {
+    pub fn build<'t>(&'m self, text: &'t str) -> Matcher<'t, 'm> {
        let crop_marker = match &self.crop_marker {
            Some(marker) => marker.as_str(),
            None => DEFAULT_CROP_MARKER,
@@ -103,17 +103,17 @@ pub struct MatchBounds {

 /// Structure used to analize a string, compute words that match,
 /// and format the source string, returning a highlighted and cropped sub-string.
-pub struct Matcher<'t, 'm, A> {
+pub struct Matcher<'t, 'm> {
    text: &'t str,
    matching_words: &'m MatchingWords,
-    tokenizer: &'m Tokenizer<'m, 'm, A>,
+    tokenizer: &'m Tokenizer<'m>,
    crop_marker: &'m str,
    highlight_prefix: &'m str,
    highlight_suffix: &'m str,
    matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
 }

-impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
+impl<'t> Matcher<'t, '_> {
    /// Iterates over tokens and save any of them that matches the query.
    fn compute_matches(&mut self) -> &mut Self {
        /// some words are counted as matches only if they are close together and in the good order,
@@ -503,7 +503,7 @@ mod tests {
    use crate::index::tests::TempIndex;
    use crate::{execute_search, SearchContext};

-    impl<'a> MatcherBuilder<'a, &[u8]> {
+    impl<'a> MatcherBuilder<'a> {
        fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self {
            let mut ctx = SearchContext::new(index, rtxn);
            let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
@@ -530,7 +530,7 @@ mod tests {
                None => MatchingWords::default(),
            };

-            MatcherBuilder::new(matching_words, TokenizerBuilder::new().build())
+            MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer())
        }
    }

@@ -690,7 +690,7 @@ mod tests {
        // should crop the phrase instead of croping around the match.
        insta::assert_snapshot!(
            matcher.format(format_options),
-            @"… Split The World is a book written by Emily Henry…"
+            @"…Split The World is a book written by Emily Henry…"
        );

        // Text containing some matches.
--- a/milli/src/search/new/query_term/parse_query.rs
+++ b/milli/src/search/new/query_term/parse_query.rs
@@ -7,7 +7,7 @@ use crate::{Result, SearchContext, MAX_WORD_LENGTH};
 /// Convert the tokenised search query into a list of located query terms.
 pub fn located_query_terms_from_tokens(
    ctx: &mut SearchContext,
-    query: NormalizedTokenIter<&[u8]>,
+    query: NormalizedTokenIter,
    words_limit: Option<usize>,
 ) -> Result<Vec<LocatedQueryTerm>> {
    let nbr_typos = number_of_typos_allowed(ctx)?;
@@ -303,7 +303,8 @@ mod tests {

    #[test]
    fn start_with_hard_separator() -> Result<()> {
-        let tokenizer = TokenizerBuilder::new().build();
+        let mut builder = TokenizerBuilder::default();
+        let tokenizer = builder.build();
        let tokens = tokenizer.tokenize(".");
        let index = temp_index_with_documents();
        let rtxn = index.read_txn()?;
--- a/milli/src/search/new/tests/stop_words.rs
+++ b/milli/src/search/new/tests/stop_words.rs
@@ -113,7 +113,7 @@ fn test_ignore_stop_words() {
            ),
            Position(
                Rank {
-                    rank: 9,
+                    rank: 7,
                    max_rank: 11,
                },
            ),
@@ -166,7 +166,7 @@ fn test_ignore_stop_words() {
            ),
            Position(
                Rank {
-                    rank: 9,
+                    rank: 7,
                    max_rank: 11,
                },
            ),
@@ -219,7 +219,7 @@ fn test_ignore_stop_words() {
            ),
            Position(
                Rank {
-                    rank: 9,
+                    rank: 7,
                    max_rank: 11,
                },
            ),
@@ -259,7 +259,7 @@ fn test_ignore_stop_words() {
            ),
            Proximity(
                Rank {
-                    rank: 7,
+                    rank: 1,
                    max_rank: 8,
                },
            ),
@@ -271,7 +271,7 @@ fn test_ignore_stop_words() {
            ),
            Position(
                Rank {
-                    rank: 17,
+                    rank: 15,
                    max_rank: 21,
                },
            ),
@@ -411,7 +411,7 @@ fn test_stop_words_in_phrase() {
            ),
            Proximity(
                Rank {
-                    rank: 6,
+                    rank: 1,
                    max_rank: 8,
                },
            ),
@@ -423,7 +423,7 @@ fn test_stop_words_in_phrase() {
            ),
            Position(
                Rank {
-                    rank: 29,
+                    rank: 27,
                    max_rank: 31,
                },
            ),