mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-08-01 19:30:01 +00:00
Merge branch 'search-refactor-typo-attributes' into search-refactor
This commit is contained in:
@ -1,17 +1,17 @@
|
||||
use fst::automaton::Str;
|
||||
use fst::{Automaton, IntoStreamer, Streamer};
|
||||
use heed::types::DecodeIgnore;
|
||||
use heed::BytesDecode;
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::ControlFlow;
|
||||
|
||||
use fst::automaton::Str;
|
||||
use fst::{Automaton, IntoStreamer, Streamer};
|
||||
use heed::types::DecodeIgnore;
|
||||
|
||||
use super::*;
|
||||
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||
use crate::search::new::query_term::TwoTypoTerm;
|
||||
use crate::search::new::{limits, SearchContext};
|
||||
use crate::search::{build_dfa, get_first};
|
||||
use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH};
|
||||
use crate::{Result, MAX_WORD_LENGTH};
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub enum NumberOfTypos {
|
||||
@ -177,6 +177,7 @@ pub fn partially_initialized_term_from_word(
|
||||
word: &str,
|
||||
max_typo: u8,
|
||||
is_prefix: bool,
|
||||
is_ngram: bool,
|
||||
) -> Result<QueryTerm> {
|
||||
let word_interned = ctx.word_interner.insert(word.to_owned());
|
||||
|
||||
@ -197,12 +198,19 @@ pub fn partially_initialized_term_from_word(
|
||||
let fst = ctx.index.words_fst(ctx.txn)?;
|
||||
|
||||
let use_prefix_db = is_prefix
|
||||
&& ctx
|
||||
&& (ctx
|
||||
.index
|
||||
.word_prefix_docids
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(ctx.txn, word)?
|
||||
.is_some();
|
||||
.is_some()
|
||||
|| (!is_ngram
|
||||
&& ctx
|
||||
.index
|
||||
.exact_word_prefix_docids
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(ctx.txn, word)?
|
||||
.is_some()));
|
||||
let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
|
||||
|
||||
let mut zero_typo = None;
|
||||
@ -385,9 +393,7 @@ fn split_best_frequency(
|
||||
let left = ctx.word_interner.insert(left.to_owned());
|
||||
let right = ctx.word_interner.insert(right.to_owned());
|
||||
|
||||
if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? {
|
||||
let frequency =
|
||||
CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?;
|
||||
if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
|
||||
if best.map_or(true, |(old, _, _)| frequency > old) {
|
||||
best = Some((frequency, left, right));
|
||||
}
|
||||
|
@ -3,18 +3,18 @@ mod ntypo_subset;
|
||||
mod parse_query;
|
||||
mod phrase;
|
||||
|
||||
use super::interner::{DedupInterner, Interned};
|
||||
use super::{limits, SearchContext};
|
||||
use crate::Result;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use compute_derivations::partially_initialized_term_from_word;
|
||||
use either::Either;
|
||||
pub use ntypo_subset::NTypoTermSubset;
|
||||
pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
|
||||
pub use phrase::Phrase;
|
||||
|
||||
use compute_derivations::partially_initialized_term_from_word;
|
||||
use super::interner::{DedupInterner, Interned};
|
||||
use super::{limits, SearchContext, Word};
|
||||
use crate::Result;
|
||||
|
||||
/// A set of word derivations attached to a location in the search query.
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
@ -159,12 +159,12 @@ impl QueryTermSubset {
|
||||
self.two_typo_subset.intersect(&other.two_typo_subset);
|
||||
}
|
||||
|
||||
pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> {
|
||||
pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Word> {
|
||||
let original = ctx.term_interner.get(self.original);
|
||||
let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
|
||||
return None
|
||||
};
|
||||
match &self.zero_typo_subset {
|
||||
let word = match &self.zero_typo_subset {
|
||||
NTypoTermSubset::All => Some(use_prefix_db),
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
// TODO: use a subset of prefix words instead
|
||||
@ -175,12 +175,19 @@ impl QueryTermSubset {
|
||||
}
|
||||
}
|
||||
NTypoTermSubset::Nothing => None,
|
||||
}
|
||||
};
|
||||
word.map(|word| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(word)
|
||||
} else {
|
||||
Word::Original(word)
|
||||
}
|
||||
})
|
||||
}
|
||||
pub fn all_single_words_except_prefix_db(
|
||||
&self,
|
||||
ctx: &mut SearchContext,
|
||||
) -> Result<BTreeSet<Interned<String>>> {
|
||||
) -> Result<BTreeSet<Word>> {
|
||||
let mut result = BTreeSet::default();
|
||||
// TODO: a compute_partially funtion
|
||||
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
|
||||
@ -197,8 +204,20 @@ impl QueryTermSubset {
|
||||
synonyms: _,
|
||||
use_prefix_db: _,
|
||||
} = &original.zero_typo;
|
||||
result.extend(zero_typo.iter().copied());
|
||||
result.extend(prefix_of.iter().copied());
|
||||
result.extend(zero_typo.iter().copied().map(|w| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(w)
|
||||
} else {
|
||||
Word::Original(w)
|
||||
}
|
||||
}));
|
||||
result.extend(prefix_of.iter().copied().map(|w| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(w)
|
||||
} else {
|
||||
Word::Original(w)
|
||||
}
|
||||
}));
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let ZeroTypoTerm {
|
||||
@ -210,10 +229,20 @@ impl QueryTermSubset {
|
||||
} = &original.zero_typo;
|
||||
if let Some(zero_typo) = zero_typo {
|
||||
if words.contains(zero_typo) {
|
||||
result.insert(*zero_typo);
|
||||
if original.ngram_words.is_some() {
|
||||
result.insert(Word::Derived(*zero_typo));
|
||||
} else {
|
||||
result.insert(Word::Original(*zero_typo));
|
||||
}
|
||||
}
|
||||
}
|
||||
result.extend(prefix_of.intersection(words).copied());
|
||||
result.extend(prefix_of.intersection(words).copied().map(|w| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(w)
|
||||
} else {
|
||||
Word::Original(w)
|
||||
}
|
||||
}));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
}
|
||||
@ -223,13 +252,13 @@ impl QueryTermSubset {
|
||||
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(one_typo.iter().copied())
|
||||
result.extend(one_typo.iter().copied().map(Word::Derived))
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(one_typo.intersection(words));
|
||||
result.extend(one_typo.intersection(words).copied().map(Word::Derived));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
};
|
||||
@ -239,13 +268,13 @@ impl QueryTermSubset {
|
||||
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(two_typos.iter().copied());
|
||||
result.extend(two_typos.iter().copied().map(Word::Derived));
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(two_typos.intersection(words));
|
||||
result.extend(two_typos.intersection(words).copied().map(Word::Derived));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
};
|
||||
|
@ -1,8 +1,8 @@
|
||||
use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind};
|
||||
|
||||
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
||||
use charabia::normalizer::NormalizedTokenIter;
|
||||
use charabia::{SeparatorKind, TokenKind};
|
||||
|
||||
use super::*;
|
||||
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
||||
|
||||
/// Convert the tokenised search query into a list of located query terms.
|
||||
// TODO: checking if the positions are correct for phrases, separators, ngrams
|
||||
@ -51,6 +51,7 @@ pub fn located_query_terms_from_string(
|
||||
word,
|
||||
nbr_typos(word),
|
||||
false,
|
||||
false,
|
||||
)?;
|
||||
let located_term = LocatedQueryTerm {
|
||||
value: ctx.term_interner.push(term),
|
||||
@ -62,8 +63,13 @@ pub fn located_query_terms_from_string(
|
||||
}
|
||||
} else {
|
||||
let word = token.lemma();
|
||||
let term =
|
||||
partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?;
|
||||
let term = partially_initialized_term_from_word(
|
||||
ctx,
|
||||
word,
|
||||
nbr_typos(word),
|
||||
true,
|
||||
false,
|
||||
)?;
|
||||
let located_term = LocatedQueryTerm {
|
||||
value: ctx.term_interner.push(term),
|
||||
positions: position..=position,
|
||||
@ -195,7 +201,8 @@ pub fn make_ngram(
|
||||
let max_nbr_typos =
|
||||
number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
|
||||
|
||||
let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?;
|
||||
let mut term =
|
||||
partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?;
|
||||
|
||||
// Now add the synonyms
|
||||
let index_synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||
|
Reference in New Issue
Block a user