Merge branch 'search-refactor-typo-attributes' into search-refactor

2025-11-09 12:26:30 +00:00 · 2023-04-12 16:47:31 +02:00
parent 1f813a6f3b 38b7b31beb
commit 644e136aee
10 changed files with 254 additions and 95 deletions
--- a/milli/src/search/new/query_term/compute_derivations.rs
+++ b/milli/src/search/new/query_term/compute_derivations.rs
@@ -1,17 +1,17 @@
-use fst::automaton::Str;
-use fst::{Automaton, IntoStreamer, Streamer};
-use heed::types::DecodeIgnore;
-use heed::BytesDecode;
 use std::borrow::Cow;
 use std::collections::BTreeSet;
 use std::ops::ControlFlow;

+use fst::automaton::Str;
+use fst::{Automaton, IntoStreamer, Streamer};
+use heed::types::DecodeIgnore;
+
 use super::*;
 use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
 use crate::search::new::query_term::TwoTypoTerm;
 use crate::search::new::{limits, SearchContext};
 use crate::search::{build_dfa, get_first};
-use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH};
+use crate::{Result, MAX_WORD_LENGTH};

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum NumberOfTypos {
@@ -177,6 +177,7 @@ pub fn partially_initialized_term_from_word(
    word: &str,
    max_typo: u8,
    is_prefix: bool,
+    is_ngram: bool,
 ) -> Result<QueryTerm> {
    let word_interned = ctx.word_interner.insert(word.to_owned());

@@ -197,12 +198,19 @@ pub fn partially_initialized_term_from_word(
    let fst = ctx.index.words_fst(ctx.txn)?;

    let use_prefix_db = is_prefix
-        && ctx
+        && (ctx
            .index
            .word_prefix_docids
            .remap_data_type::<DecodeIgnore>()
            .get(ctx.txn, word)?
-            .is_some();
+            .is_some()
+            || (!is_ngram
+                && ctx
+                    .index
+                    .exact_word_prefix_docids
+                    .remap_data_type::<DecodeIgnore>()
+                    .get(ctx.txn, word)?
+                    .is_some()));
    let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };

    let mut zero_typo = None;
@@ -385,9 +393,7 @@ fn split_best_frequency(
        let left = ctx.word_interner.insert(left.to_owned());
        let right = ctx.word_interner.insert(right.to_owned());

-        if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? {
-            let frequency =
-                CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?;
+        if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
            if best.map_or(true, |(old, _, _)| frequency > old) {
                best = Some((frequency, left, right));
            }
--- a/milli/src/search/new/query_term/mod.rs
+++ b/milli/src/search/new/query_term/mod.rs
@@ -3,18 +3,18 @@ mod ntypo_subset;
 mod parse_query;
 mod phrase;

-use super::interner::{DedupInterner, Interned};
-use super::{limits, SearchContext};
-use crate::Result;
 use std::collections::BTreeSet;
 use std::ops::RangeInclusive;

+use compute_derivations::partially_initialized_term_from_word;
 use either::Either;
 pub use ntypo_subset::NTypoTermSubset;
 pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
 pub use phrase::Phrase;

-use compute_derivations::partially_initialized_term_from_word;
+use super::interner::{DedupInterner, Interned};
+use super::{limits, SearchContext, Word};
+use crate::Result;

 /// A set of word derivations attached to a location in the search query.
 #[derive(Clone, PartialEq, Eq, Hash)]
@@ -159,12 +159,12 @@ impl QueryTermSubset {
        self.two_typo_subset.intersect(&other.two_typo_subset);
    }

-    pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> {
+    pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Word> {
        let original = ctx.term_interner.get(self.original);
        let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
            return None
        };
-        match &self.zero_typo_subset {
+        let word = match &self.zero_typo_subset {
            NTypoTermSubset::All => Some(use_prefix_db),
            NTypoTermSubset::Subset { words, phrases: _ } => {
                // TODO: use a subset of prefix words instead
@@ -175,12 +175,19 @@ impl QueryTermSubset {
                }
            }
            NTypoTermSubset::Nothing => None,
-        }
+        };
+        word.map(|word| {
+            if original.ngram_words.is_some() {
+                Word::Derived(word)
+            } else {
+                Word::Original(word)
+            }
+        })
    }
    pub fn all_single_words_except_prefix_db(
        &self,
        ctx: &mut SearchContext,
-    ) -> Result<BTreeSet<Interned<String>>> {
+    ) -> Result<BTreeSet<Word>> {
        let mut result = BTreeSet::default();
        // TODO: a compute_partially function
        if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
@@ -197,8 +204,20 @@ impl QueryTermSubset {
                    synonyms: _,
                    use_prefix_db: _,
                } = &original.zero_typo;
-                result.extend(zero_typo.iter().copied());
-                result.extend(prefix_of.iter().copied());
+                result.extend(zero_typo.iter().copied().map(|w| {
+                    if original.ngram_words.is_some() {
+                        Word::Derived(w)
+                    } else {
+                        Word::Original(w)
+                    }
+                }));
+                result.extend(prefix_of.iter().copied().map(|w| {
+                    if original.ngram_words.is_some() {
+                        Word::Derived(w)
+                    } else {
+                        Word::Original(w)
+                    }
+                }));
            }
            NTypoTermSubset::Subset { words, phrases: _ } => {
                let ZeroTypoTerm {
@@ -210,10 +229,20 @@ impl QueryTermSubset {
                } = &original.zero_typo;
                if let Some(zero_typo) = zero_typo {
                    if words.contains(zero_typo) {
-                        result.insert(*zero_typo);
+                        if original.ngram_words.is_some() {
+                            result.insert(Word::Derived(*zero_typo));
+                        } else {
+                            result.insert(Word::Original(*zero_typo));
+                        }
                    }
                }
-                result.extend(prefix_of.intersection(words).copied());
+                result.extend(prefix_of.intersection(words).copied().map(|w| {
+                    if original.ngram_words.is_some() {
+                        Word::Derived(w)
+                    } else {
+                        Word::Original(w)
+                    }
+                }));
            }
            NTypoTermSubset::Nothing => {}
        }
@@ -223,13 +252,13 @@ impl QueryTermSubset {
                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
                    panic!()
                };
-                result.extend(one_typo.iter().copied())
+                result.extend(one_typo.iter().copied().map(Word::Derived))
            }
            NTypoTermSubset::Subset { words, phrases: _ } => {
                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
                    panic!()
                };
-                result.extend(one_typo.intersection(words));
+                result.extend(one_typo.intersection(words).copied().map(Word::Derived));
            }
            NTypoTermSubset::Nothing => {}
        };
@@ -239,13 +268,13 @@ impl QueryTermSubset {
                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
                    panic!()
                };
-                result.extend(two_typos.iter().copied());
+                result.extend(two_typos.iter().copied().map(Word::Derived));
            }
            NTypoTermSubset::Subset { words, phrases: _ } => {
                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
                    panic!()
                };
-                result.extend(two_typos.intersection(words));
+                result.extend(two_typos.intersection(words).copied().map(Word::Derived));
            }
            NTypoTermSubset::Nothing => {}
        };
--- a/milli/src/search/new/query_term/parse_query.rs
+++ b/milli/src/search/new/query_term/parse_query.rs
@@ -1,8 +1,8 @@
-use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind};
-
-use crate::{Result, SearchContext, MAX_WORD_LENGTH};
+use charabia::normalizer::NormalizedTokenIter;
+use charabia::{SeparatorKind, TokenKind};

 use super::*;
+use crate::{Result, SearchContext, MAX_WORD_LENGTH};

 /// Convert the tokenised search query into a list of located query terms.
 // TODO: checking if the positions are correct for phrases, separators, ngrams
@@ -51,6 +51,7 @@ pub fn located_query_terms_from_string(
                                word,
                                nbr_typos(word),
                                false,
+                                false,
                            )?;
                            let located_term = LocatedQueryTerm {
                                value: ctx.term_interner.push(term),
@@ -62,8 +63,13 @@ pub fn located_query_terms_from_string(
                    }
                } else {
                    let word = token.lemma();
-                    let term =
-                        partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?;
+                    let term = partially_initialized_term_from_word(
+                        ctx,
+                        word,
+                        nbr_typos(word),
+                        true,
+                        false,
+                    )?;
                    let located_term = LocatedQueryTerm {
                        value: ctx.term_interner.push(term),
                        positions: position..=position,
@@ -195,7 +201,8 @@ pub fn make_ngram(
    let max_nbr_typos =
        number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);

-    let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?;
+    let mut term =
+        partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?;

    // Now add the synonyms
    let index_synonyms = ctx.index.synonyms(ctx.txn)?;