mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-08-02 03:40:00 +00:00
Split query_term module into multiple submodules
This commit is contained in:
331
milli/src/search/new/query_term/mod.rs
Normal file
331
milli/src/search/new/query_term/mod.rs
Normal file
@ -0,0 +1,331 @@
|
||||
mod compute_derivations;
|
||||
mod ntypo_subset;
|
||||
mod parse_query;
|
||||
mod phrase;
|
||||
|
||||
use super::interner::{DedupInterner, Interned};
|
||||
use super::{limits, SearchContext};
|
||||
use crate::Result;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
pub use ntypo_subset::NTypoTermSubset;
|
||||
pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
|
||||
pub use phrase::Phrase;
|
||||
|
||||
use compute_derivations::partially_initialized_term_from_word;
|
||||
|
||||
/**
|
||||
A set of word derivations attached to a location in the search query.
|
||||
|
||||
*/
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct LocatedQueryTermSubset {
|
||||
pub term_subset: QueryTermSubset,
|
||||
pub positions: RangeInclusive<u16>,
|
||||
pub term_ids: RangeInclusive<u8>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct QueryTermSubset {
|
||||
original: Interned<QueryTerm>,
|
||||
zero_typo_subset: NTypoTermSubset,
|
||||
one_typo_subset: NTypoTermSubset,
|
||||
two_typo_subset: NTypoTermSubset,
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct QueryTerm {
|
||||
original: Interned<String>,
|
||||
ngram_words: Option<Vec<Interned<String>>>,
|
||||
max_nbr_typos: u8,
|
||||
is_prefix: bool,
|
||||
zero_typo: ZeroTypoTerm,
|
||||
// May not be computed yet
|
||||
one_typo: Lazy<OneTypoTerm>,
|
||||
// May not be computed yet
|
||||
two_typo: Lazy<TwoTypoTerm>,
|
||||
}
|
||||
|
||||
// SubTerms will be in a dedup interner
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
struct ZeroTypoTerm {
|
||||
/// The original phrase, if any
|
||||
phrase: Option<Interned<Phrase>>,
|
||||
/// A single word equivalent to the original term, with zero typos
|
||||
zero_typo: Option<Interned<String>>,
|
||||
/// All the words that contain the original word as prefix
|
||||
prefix_of: BTreeSet<Interned<String>>,
|
||||
/// All the synonyms of the original word or phrase
|
||||
synonyms: BTreeSet<Interned<Phrase>>,
|
||||
/// A prefix in the prefix databases matching the original word
|
||||
use_prefix_db: Option<Interned<String>>,
|
||||
}
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
struct OneTypoTerm {
|
||||
/// The original word split into multiple consecutive words
|
||||
split_words: Option<Interned<Phrase>>,
|
||||
/// Words that are 1 typo away from the original word
|
||||
one_typo: BTreeSet<Interned<String>>,
|
||||
}
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
struct TwoTypoTerm {
|
||||
/// Words that are 2 typos away from the original word
|
||||
two_typos: BTreeSet<Interned<String>>,
|
||||
}
|
||||
|
||||
/// A value that may not have been computed yet.
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Lazy<T> {
    /// The value has not been computed.
    Uninit,
    /// The value has been computed and is stored inline.
    Init(T),
}

impl<T> Lazy<T> {
    /// Returns `true` if the value is [`Lazy::Init`].
    pub fn is_init(&self) -> bool {
        matches!(self, Lazy::Init(_))
    }

    /// Returns `true` if the value is [`Lazy::Uninit`].
    pub fn is_uninit(&self) -> bool {
        matches!(self, Lazy::Uninit)
    }
}
|
||||
|
||||
impl QueryTermSubset {
|
||||
pub fn empty(for_term: Interned<QueryTerm>) -> Self {
|
||||
Self {
|
||||
original: for_term,
|
||||
zero_typo_subset: NTypoTermSubset::Nothing,
|
||||
one_typo_subset: NTypoTermSubset::Nothing,
|
||||
two_typo_subset: NTypoTermSubset::Nothing,
|
||||
}
|
||||
}
|
||||
pub fn full(for_term: Interned<QueryTerm>) -> Self {
|
||||
Self {
|
||||
original: for_term,
|
||||
zero_typo_subset: NTypoTermSubset::All,
|
||||
one_typo_subset: NTypoTermSubset::All,
|
||||
two_typo_subset: NTypoTermSubset::All,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn union(&mut self, other: &Self) {
|
||||
assert!(self.original == other.original);
|
||||
self.zero_typo_subset.union(&other.zero_typo_subset);
|
||||
self.one_typo_subset.union(&other.one_typo_subset);
|
||||
self.two_typo_subset.union(&other.two_typo_subset);
|
||||
}
|
||||
pub fn intersect(&mut self, other: &Self) {
|
||||
assert!(self.original == other.original);
|
||||
self.zero_typo_subset.intersect(&other.zero_typo_subset);
|
||||
self.one_typo_subset.intersect(&other.one_typo_subset);
|
||||
self.two_typo_subset.intersect(&other.two_typo_subset);
|
||||
}
|
||||
|
||||
pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> {
|
||||
let original = ctx.term_interner.get(self.original);
|
||||
let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
|
||||
return None
|
||||
};
|
||||
match &self.zero_typo_subset {
|
||||
NTypoTermSubset::All => Some(use_prefix_db),
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
// TODO: use a subset of prefix words instead
|
||||
if words.contains(&use_prefix_db) {
|
||||
Some(use_prefix_db)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
NTypoTermSubset::Nothing => None,
|
||||
}
|
||||
}
|
||||
pub fn all_single_words_except_prefix_db(
|
||||
&self,
|
||||
ctx: &mut SearchContext,
|
||||
) -> Result<BTreeSet<Interned<String>>> {
|
||||
let mut result = BTreeSet::default();
|
||||
// TODO: a compute_partially funtion
|
||||
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
|
||||
self.original.compute_fully_if_needed(ctx)?;
|
||||
}
|
||||
|
||||
let original = ctx.term_interner.get_mut(self.original);
|
||||
if !self.zero_typo_subset.is_empty() {
|
||||
let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
|
||||
&original.zero_typo;
|
||||
result.extend(zero_typo.iter().copied());
|
||||
result.extend(prefix_of.iter().copied());
|
||||
};
|
||||
|
||||
match &self.one_typo_subset {
|
||||
NTypoTermSubset::All => {
|
||||
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(one_typo.iter().copied())
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(one_typo.intersection(words));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
};
|
||||
|
||||
match &self.two_typo_subset {
|
||||
NTypoTermSubset::All => {
|
||||
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(two_typos.iter().copied());
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(two_typos.intersection(words));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
};
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
|
||||
let mut result = BTreeSet::default();
|
||||
|
||||
if !self.one_typo_subset.is_empty() {
|
||||
// TODO: compute less than fully if possible
|
||||
self.original.compute_fully_if_needed(ctx)?;
|
||||
}
|
||||
let original = ctx.term_interner.get_mut(self.original);
|
||||
|
||||
let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
|
||||
&original.zero_typo;
|
||||
result.extend(phrase.iter().copied());
|
||||
result.extend(synonyms.iter().copied());
|
||||
|
||||
if !self.one_typo_subset.is_empty() {
|
||||
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
|
||||
panic!();
|
||||
};
|
||||
result.extend(split_words.iter().copied());
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn original_phrase(&self, ctx: &SearchContext) -> Option<Interned<Phrase>> {
|
||||
let t = ctx.term_interner.get(self.original);
|
||||
if let Some(p) = t.zero_typo.phrase {
|
||||
if self.zero_typo_subset.contains_phrase(p) {
|
||||
return Some(p);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
|
||||
let t = ctx.term_interner.get(self.original);
|
||||
match t.max_nbr_typos {
|
||||
0 => 0,
|
||||
1 => {
|
||||
if self.one_typo_subset.is_empty() {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
}
|
||||
2 => {
|
||||
if self.two_typo_subset.is_empty() {
|
||||
if self.one_typo_subset.is_empty() {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
} else {
|
||||
2
|
||||
}
|
||||
}
|
||||
_ => panic!(),
|
||||
}
|
||||
}
|
||||
pub fn clear_zero_typo_subset(&mut self) {
|
||||
self.zero_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
pub fn clear_one_typo_subset(&mut self) {
|
||||
self.one_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
pub fn clear_two_typo_subset(&mut self) {
|
||||
self.two_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
pub fn description(&self, ctx: &SearchContext) -> String {
|
||||
let t = ctx.term_interner.get(self.original);
|
||||
ctx.word_interner.get(t.original).to_owned()
|
||||
}
|
||||
}
|
||||
|
||||
impl ZeroTypoTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self;
|
||||
phrase.is_none()
|
||||
&& zero_typo.is_none()
|
||||
&& prefix_of.is_empty()
|
||||
&& synonyms.is_empty()
|
||||
&& use_prefix_db.is_none()
|
||||
}
|
||||
}
|
||||
impl OneTypoTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let OneTypoTerm { split_words, one_typo } = self;
|
||||
one_typo.is_empty() && split_words.is_none()
|
||||
}
|
||||
}
|
||||
impl TwoTypoTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let TwoTypoTerm { two_typos } = self;
|
||||
two_typos.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let Lazy::Init(one_typo) = &self.one_typo else {
|
||||
return false;
|
||||
};
|
||||
let Lazy::Init(two_typo) = &self.two_typo else {
|
||||
return false;
|
||||
};
|
||||
|
||||
self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl Interned<QueryTerm> {
|
||||
/// Return the original word from the given query term
|
||||
fn original_single_word(self, ctx: &SearchContext) -> Option<Interned<String>> {
|
||||
let self_ = ctx.term_interner.get(self);
|
||||
if self_.ngram_words.is_some() {
|
||||
None
|
||||
} else {
|
||||
Some(self_.original)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A query term coupled with its position in the user's search query.
|
||||
#[derive(Clone)]
|
||||
pub struct LocatedQueryTerm {
|
||||
pub value: Interned<QueryTerm>,
|
||||
pub positions: RangeInclusive<u16>,
|
||||
}
|
||||
|
||||
impl LocatedQueryTerm {
|
||||
/// Return `true` iff the term is empty
|
||||
pub fn is_empty(&self, interner: &DedupInterner<QueryTerm>) -> bool {
|
||||
interner.get(self.value).is_empty()
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user