mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-11-03 01:16:27 +00:00
Merge branch 'main' into tmp-release-v1.11.0
This commit is contained in:
366
crates/milli/src/search/new/bucket_sort.rs
Normal file
366
crates/milli/src/search/new/bucket_sort.rs
Normal file
@@ -0,0 +1,366 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::logger::SearchLogger;
|
||||
use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait};
|
||||
use super::SearchContext;
|
||||
use crate::score_details::{ScoreDetails, ScoringStrategy};
|
||||
use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput};
|
||||
use crate::{Result, TimeBudget};
|
||||
|
||||
pub struct BucketSortOutput {
|
||||
pub docids: Vec<u32>,
|
||||
pub scores: Vec<Vec<ScoreDetails>>,
|
||||
pub all_candidates: RoaringBitmap,
|
||||
|
||||
pub degraded: bool,
|
||||
}
|
||||
|
||||
// TODO: would probably be good to regroup some of these inside of a struct?
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::bucket_sort")]
|
||||
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
|
||||
query: &Q,
|
||||
distinct: Option<&str>,
|
||||
universe: &RoaringBitmap,
|
||||
from: usize,
|
||||
length: usize,
|
||||
scoring_strategy: ScoringStrategy,
|
||||
logger: &mut dyn SearchLogger<Q>,
|
||||
time_budget: TimeBudget,
|
||||
ranking_score_threshold: Option<f64>,
|
||||
) -> Result<BucketSortOutput> {
|
||||
logger.initial_query(query);
|
||||
logger.ranking_rules(&ranking_rules);
|
||||
logger.initial_universe(universe);
|
||||
|
||||
let distinct_field = match distinct {
|
||||
Some(distinct) => Some(distinct),
|
||||
None => ctx.index.distinct_field(ctx.txn)?,
|
||||
};
|
||||
|
||||
let distinct_fid = if let Some(field) = distinct_field {
|
||||
ctx.index.fields_ids_map(ctx.txn)?.id(field)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
if universe.len() < from as u64 {
|
||||
return Ok(BucketSortOutput {
|
||||
docids: vec![],
|
||||
scores: vec![],
|
||||
all_candidates: universe.clone(),
|
||||
degraded: false,
|
||||
});
|
||||
}
|
||||
if ranking_rules.is_empty() {
|
||||
if let Some(distinct_fid) = distinct_fid {
|
||||
let mut excluded = RoaringBitmap::new();
|
||||
let mut results = vec![];
|
||||
for docid in universe.iter() {
|
||||
if results.len() >= from + length {
|
||||
break;
|
||||
}
|
||||
if excluded.contains(docid) {
|
||||
continue;
|
||||
}
|
||||
|
||||
distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
|
||||
results.push(docid);
|
||||
}
|
||||
|
||||
let mut all_candidates = universe - excluded;
|
||||
all_candidates.extend(results.iter().copied());
|
||||
// drain the results of the skipped elements
|
||||
// this **must** be done **after** writing the entire results in `all_candidates` to ensure
|
||||
// e.g. estimatedTotalHits is correct.
|
||||
if results.len() >= from {
|
||||
results.drain(..from);
|
||||
} else {
|
||||
results.clear();
|
||||
}
|
||||
|
||||
return Ok(BucketSortOutput {
|
||||
scores: vec![Default::default(); results.len()],
|
||||
docids: results,
|
||||
all_candidates,
|
||||
degraded: false,
|
||||
});
|
||||
} else {
|
||||
let docids: Vec<u32> = universe.iter().skip(from).take(length).collect();
|
||||
return Ok(BucketSortOutput {
|
||||
scores: vec![Default::default(); docids.len()],
|
||||
docids,
|
||||
all_candidates: universe.clone(),
|
||||
degraded: false,
|
||||
});
|
||||
};
|
||||
}
|
||||
|
||||
let ranking_rules_len = ranking_rules.len();
|
||||
|
||||
logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe);
|
||||
|
||||
ranking_rules[0].start_iteration(ctx, logger, universe, query)?;
|
||||
|
||||
let mut ranking_rule_scores: Vec<ScoreDetails> = vec![];
|
||||
|
||||
let mut ranking_rule_universes: Vec<RoaringBitmap> =
|
||||
vec![RoaringBitmap::default(); ranking_rules_len];
|
||||
ranking_rule_universes[0].clone_from(universe);
|
||||
let mut cur_ranking_rule_index = 0;
|
||||
|
||||
/// Finish iterating over the current ranking rule, yielding
|
||||
/// control to the parent (or finishing the search if not possible).
|
||||
/// Update the universes accordingly and inform the logger.
|
||||
macro_rules! back {
|
||||
() => {
|
||||
// FIXME: temporarily disabled assert: see <https://github.com/meilisearch/meilisearch/pull/4013>
|
||||
// assert!(
|
||||
// ranking_rule_universes[cur_ranking_rule_index].is_empty(),
|
||||
// "The ranking rule {} did not sort its bucket exhaustively",
|
||||
// ranking_rules[cur_ranking_rule_index].id()
|
||||
// );
|
||||
logger.end_iteration_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||
&ranking_rule_universes[cur_ranking_rule_index],
|
||||
);
|
||||
ranking_rule_universes[cur_ranking_rule_index].clear();
|
||||
ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
|
||||
if cur_ranking_rule_index == 0 {
|
||||
break;
|
||||
} else {
|
||||
cur_ranking_rule_index -= 1;
|
||||
}
|
||||
if ranking_rule_scores.len() > cur_ranking_rule_index {
|
||||
ranking_rule_scores.pop();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
let mut all_candidates = universe.clone();
|
||||
let mut valid_docids = vec![];
|
||||
let mut valid_scores = vec![];
|
||||
let mut cur_offset = 0usize;
|
||||
|
||||
macro_rules! maybe_add_to_results {
|
||||
($candidates:expr) => {
|
||||
maybe_add_to_results(
|
||||
ctx,
|
||||
from,
|
||||
length,
|
||||
logger,
|
||||
&mut valid_docids,
|
||||
&mut valid_scores,
|
||||
&mut all_candidates,
|
||||
&mut ranking_rule_universes,
|
||||
&mut ranking_rules,
|
||||
cur_ranking_rule_index,
|
||||
&mut cur_offset,
|
||||
distinct_fid,
|
||||
&ranking_rule_scores,
|
||||
$candidates,
|
||||
)?;
|
||||
};
|
||||
}
|
||||
|
||||
while valid_docids.len() < length {
|
||||
if time_budget.exceeded() {
|
||||
loop {
|
||||
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
|
||||
ranking_rule_scores.push(ScoreDetails::Skipped);
|
||||
|
||||
// remove candidates from the universe without adding them to result if their score is below the threshold
|
||||
if let Some(ranking_score_threshold) = ranking_score_threshold {
|
||||
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
|
||||
if current_score < ranking_score_threshold {
|
||||
all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
|
||||
back!();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
maybe_add_to_results!(bucket);
|
||||
|
||||
ranking_rule_scores.pop();
|
||||
|
||||
if cur_ranking_rule_index == 0 {
|
||||
break;
|
||||
}
|
||||
|
||||
back!();
|
||||
}
|
||||
|
||||
return Ok(BucketSortOutput {
|
||||
scores: valid_scores,
|
||||
docids: valid_docids,
|
||||
all_candidates,
|
||||
degraded: true,
|
||||
});
|
||||
}
|
||||
|
||||
// The universe for this bucket is zero, so we don't need to sort
|
||||
// anything, just go back to the parent ranking rule.
|
||||
if ranking_rule_universes[cur_ranking_rule_index].is_empty()
|
||||
|| (scoring_strategy == ScoringStrategy::Skip
|
||||
&& ranking_rule_universes[cur_ranking_rule_index].len() == 1)
|
||||
{
|
||||
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
|
||||
maybe_add_to_results!(bucket);
|
||||
back!();
|
||||
continue;
|
||||
}
|
||||
|
||||
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(
|
||||
ctx,
|
||||
logger,
|
||||
&ranking_rule_universes[cur_ranking_rule_index],
|
||||
)?
|
||||
else {
|
||||
back!();
|
||||
continue;
|
||||
};
|
||||
|
||||
ranking_rule_scores.push(next_bucket.score);
|
||||
|
||||
logger.next_bucket_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||
&ranking_rule_universes[cur_ranking_rule_index],
|
||||
&next_bucket.candidates,
|
||||
);
|
||||
|
||||
debug_assert!(
|
||||
ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)
|
||||
);
|
||||
|
||||
// remove candidates from the universe without adding them to result if their score is below the threshold
|
||||
if let Some(ranking_score_threshold) = ranking_score_threshold {
|
||||
let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
|
||||
if current_score < ranking_score_threshold {
|
||||
all_candidates -=
|
||||
next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
|
||||
back!();
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
|
||||
|
||||
if cur_ranking_rule_index == ranking_rules_len - 1
|
||||
|| (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1)
|
||||
|| cur_offset + (next_bucket.candidates.len() as usize) < from
|
||||
{
|
||||
maybe_add_to_results!(next_bucket.candidates);
|
||||
ranking_rule_scores.pop();
|
||||
continue;
|
||||
}
|
||||
|
||||
cur_ranking_rule_index += 1;
|
||||
ranking_rule_universes[cur_ranking_rule_index].clone_from(&next_bucket.candidates);
|
||||
logger.start_iteration_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||
&next_bucket.query,
|
||||
&ranking_rule_universes[cur_ranking_rule_index],
|
||||
);
|
||||
ranking_rules[cur_ranking_rule_index].start_iteration(
|
||||
ctx,
|
||||
logger,
|
||||
&next_bucket.candidates,
|
||||
&next_bucket.query,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(BucketSortOutput {
|
||||
docids: valid_docids,
|
||||
scores: valid_scores,
|
||||
all_candidates,
|
||||
degraded: false,
|
||||
})
|
||||
}
|
||||
|
||||
/// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
|
||||
/// into account and inform the logger.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
from: usize,
|
||||
length: usize,
|
||||
logger: &mut dyn SearchLogger<Q>,
|
||||
|
||||
valid_docids: &mut Vec<u32>,
|
||||
valid_scores: &mut Vec<Vec<ScoreDetails>>,
|
||||
all_candidates: &mut RoaringBitmap,
|
||||
|
||||
ranking_rule_universes: &mut [RoaringBitmap],
|
||||
ranking_rules: &mut [BoxRankingRule<'ctx, Q>],
|
||||
|
||||
cur_ranking_rule_index: usize,
|
||||
|
||||
cur_offset: &mut usize,
|
||||
|
||||
distinct_fid: Option<u16>,
|
||||
ranking_rule_scores: &[ScoreDetails],
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
// First apply the distinct rule on the candidates, reducing the universes if necessary
|
||||
let candidates = if let Some(distinct_fid) = distinct_fid {
|
||||
let DistinctOutput { remaining, excluded } =
|
||||
apply_distinct_rule(ctx, distinct_fid, &candidates)?;
|
||||
for universe in ranking_rule_universes.iter_mut() {
|
||||
*universe -= &excluded;
|
||||
*all_candidates -= &excluded;
|
||||
}
|
||||
remaining
|
||||
} else {
|
||||
candidates.clone()
|
||||
};
|
||||
*all_candidates |= &candidates;
|
||||
|
||||
// if the candidates are empty, there is nothing to do;
|
||||
if candidates.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// if we still haven't reached the first document to return
|
||||
if *cur_offset < from {
|
||||
// and if no document from this bucket can be returned
|
||||
if *cur_offset + (candidates.len() as usize) < from {
|
||||
// then just skip the bucket
|
||||
logger.skip_bucket_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||
&candidates,
|
||||
);
|
||||
} else {
|
||||
// otherwise, skip some of the documents and add some of the rest, in order of ids
|
||||
let candidates_vec = candidates.iter().collect::<Vec<_>>();
|
||||
let (skipped_candidates, candidates) = candidates_vec.split_at(from - *cur_offset);
|
||||
|
||||
logger.skip_bucket_ranking_rule(
|
||||
cur_ranking_rule_index,
|
||||
ranking_rules[cur_ranking_rule_index].as_ref(),
|
||||
&skipped_candidates.iter().collect(),
|
||||
);
|
||||
let candidates =
|
||||
candidates.iter().take(length - valid_docids.len()).copied().collect::<Vec<_>>();
|
||||
logger.add_to_results(&candidates);
|
||||
valid_docids.extend_from_slice(&candidates);
|
||||
valid_scores
|
||||
.extend(std::iter::repeat(ranking_rule_scores.to_owned()).take(candidates.len()));
|
||||
}
|
||||
} else {
|
||||
// if we have passed the offset already, add some of the documents (up to the limit)
|
||||
let candidates = candidates.iter().take(length - valid_docids.len()).collect::<Vec<u32>>();
|
||||
logger.add_to_results(&candidates);
|
||||
valid_docids.extend_from_slice(&candidates);
|
||||
valid_scores
|
||||
.extend(std::iter::repeat(ranking_rule_scores.to_owned()).take(candidates.len()));
|
||||
}
|
||||
|
||||
*cur_offset += candidates.len() as usize;
|
||||
Ok(())
|
||||
}
|
||||
716
crates/milli/src/search/new/db_cache.rs
Normal file
716
crates/milli/src/search/new/db_cache.rs
Normal file
@@ -0,0 +1,716 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::hash::Hash;
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesEncode, Database, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::interner::Interned;
|
||||
use super::Word;
|
||||
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
|
||||
use crate::{
|
||||
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
|
||||
};
|
||||
|
||||
/// A cache storing pointers to values in the LMDB databases.
|
||||
///
|
||||
/// Used for performance reasons only. By using this cache, we avoid performing a
|
||||
/// database lookup and instead get a direct reference to the value using a fast
|
||||
/// local HashMap lookup.
|
||||
#[derive(Default)]
|
||||
pub struct DatabaseCache<'ctx> {
|
||||
pub word_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_prefix_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>,
|
||||
pub prefix_word_pair_proximity_docids:
|
||||
FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
||||
pub exact_word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_prefix_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
||||
pub exact_word_prefix_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
|
||||
|
||||
pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
|
||||
pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_positions: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
pub word_prefix_positions: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
|
||||
pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
|
||||
pub word_fids: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
|
||||
}
|
||||
impl<'ctx> DatabaseCache<'ctx> {
|
||||
fn get_value<'v, K1, KC>(
|
||||
txn: &'ctx RoTxn<'_>,
|
||||
cache_key: K1,
|
||||
db_key: &'v KC::EItem,
|
||||
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
db: Database<KC, Bytes>,
|
||||
) -> Result<Option<RoaringBitmap>>
|
||||
where
|
||||
K1: Copy + Eq + Hash,
|
||||
KC: BytesEncode<'v>,
|
||||
{
|
||||
if let Entry::Vacant(entry) = cache.entry(cache_key) {
|
||||
let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
|
||||
entry.insert(bitmap_ptr);
|
||||
}
|
||||
|
||||
let bitmap_bytes = match cache.get(&cache_key).unwrap() {
|
||||
Some(Cow::Borrowed(bytes)) => bytes,
|
||||
Some(Cow::Owned(bytes)) => bytes.as_slice(),
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
match (bitmap_bytes, universe) {
|
||||
(bytes, Some(universe)) => {
|
||||
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
|
||||
.map(Some)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
(bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
|
||||
.map(Some)
|
||||
.map_err(heed::Error::Decoding)
|
||||
.map_err(Into::into),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_value_length<'v, K1, KC>(
|
||||
txn: &'ctx RoTxn<'_>,
|
||||
cache_key: K1,
|
||||
db_key: &'v KC::EItem,
|
||||
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
|
||||
db: Database<KC, Bytes>,
|
||||
) -> Result<Option<u64>>
|
||||
where
|
||||
K1: Copy + Eq + Hash,
|
||||
KC: BytesEncode<'v>,
|
||||
{
|
||||
if let Entry::Vacant(entry) = cache.entry(cache_key) {
|
||||
let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
|
||||
entry.insert(bitmap_ptr);
|
||||
}
|
||||
|
||||
let bitmap_bytes = match cache.get(&cache_key).unwrap() {
|
||||
Some(Cow::Borrowed(bytes)) => bytes,
|
||||
Some(Cow::Owned(bytes)) => bytes.as_slice(),
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
CboRoaringBitmapLenCodec::bytes_decode_owned(bitmap_bytes)
|
||||
.map(Some)
|
||||
.map_err(heed::Error::Decoding)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
fn get_value_from_keys<'v, K1, KC>(
|
||||
txn: &'ctx RoTxn<'_>,
|
||||
cache_key: K1,
|
||||
db_keys: &'v [KC::EItem],
|
||||
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
|
||||
db: Database<KC, Bytes>,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
merger: MergeFn,
|
||||
) -> Result<Option<RoaringBitmap>>
|
||||
where
|
||||
K1: Copy + Eq + Hash,
|
||||
KC: BytesEncode<'v>,
|
||||
KC::EItem: Sized,
|
||||
{
|
||||
if let Entry::Vacant(entry) = cache.entry(cache_key) {
|
||||
let bitmap_ptr: Option<Cow<'ctx, [u8]>> = match db_keys {
|
||||
[] => None,
|
||||
[key] => db.get(txn, key)?.map(Cow::Borrowed),
|
||||
keys => {
|
||||
let bitmaps = keys
|
||||
.iter()
|
||||
.filter_map(|key| db.get(txn, key).transpose())
|
||||
.map(|v| v.map(Cow::Borrowed))
|
||||
.collect::<std::result::Result<Vec<Cow<'_, [u8]>>, _>>()?;
|
||||
|
||||
if bitmaps.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(merger(&[], &bitmaps[..])?)
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
entry.insert(bitmap_ptr);
|
||||
}
|
||||
|
||||
let bitmap_bytes = match cache.get(&cache_key).unwrap() {
|
||||
Some(Cow::Borrowed(bytes)) => bytes,
|
||||
Some(Cow::Owned(bytes)) => bytes.as_slice(),
|
||||
None => return Ok(None),
|
||||
};
|
||||
|
||||
match (bitmap_bytes, universe) {
|
||||
(bytes, Some(universe)) => {
|
||||
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
|
||||
.map(Some)
|
||||
.map_err(Into::into)
|
||||
}
|
||||
(bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
|
||||
.map(Some)
|
||||
.map_err(heed::Error::Decoding)
|
||||
.map_err(Into::into),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ctx> SearchContext<'ctx> {
|
||||
/// Return the words FST of the index, reading it from the index on the first
/// call and from the cache afterwards.
pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
    match &self.db_cache.words_fst {
        Some(cached) => Ok(cached.clone()),
        None => {
            let loaded = self.index.words_fst(self.txn)?;
            self.db_cache.words_fst = Some(loaded.clone());
            Ok(loaded)
        }
    }
}
|
||||
|
||||
pub fn word_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
word: Word,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
match word {
|
||||
Word::Original(word) => {
|
||||
let exact = self.get_db_exact_word_docids(universe, word)?;
|
||||
let tolerant = self.get_db_word_docids(universe, word)?;
|
||||
Ok(match (exact, tolerant) {
|
||||
(None, None) => None,
|
||||
(None, Some(tolerant)) => Some(tolerant),
|
||||
(Some(exact), None) => Some(exact),
|
||||
(Some(exact), Some(tolerant)) => {
|
||||
let mut both = exact;
|
||||
both |= tolerant;
|
||||
Some(both)
|
||||
}
|
||||
})
|
||||
}
|
||||
Word::Derived(word) => self.get_db_word_docids(universe, word),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve or insert the given value in the `word_docids` database.
|
||||
fn get_db_word_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
word: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
match &self.restricted_fids {
|
||||
Some(restricted_fids) => {
|
||||
let interned = self.word_interner.get(word).as_str();
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _>(
|
||||
self.txn,
|
||||
word,
|
||||
&keys[..],
|
||||
&mut self.db_cache.word_docids,
|
||||
self.index.word_fid_docids.remap_data_type::<Bytes>(),
|
||||
universe,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _>(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
&mut self.db_cache.word_docids,
|
||||
universe,
|
||||
self.index.word_docids.remap_data_type::<Bytes>(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_db_exact_word_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
word: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
match &self.restricted_fids {
|
||||
Some(restricted_fids) => {
|
||||
let interned = self.word_interner.get(word).as_str();
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _>(
|
||||
self.txn,
|
||||
word,
|
||||
&keys[..],
|
||||
&mut self.db_cache.exact_word_docids,
|
||||
self.index.word_fid_docids.remap_data_type::<Bytes>(),
|
||||
universe,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _>(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
&mut self.db_cache.exact_word_docids,
|
||||
universe,
|
||||
self.index.exact_word_docids.remap_data_type::<Bytes>(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn word_prefix_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
prefix: Word,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
match prefix {
|
||||
Word::Original(prefix) => {
|
||||
let exact = self.get_db_exact_word_prefix_docids(universe, prefix)?;
|
||||
let tolerant = self.get_db_word_prefix_docids(universe, prefix)?;
|
||||
Ok(match (exact, tolerant) {
|
||||
(None, None) => None,
|
||||
(None, Some(tolerant)) => Some(tolerant),
|
||||
(Some(exact), None) => Some(exact),
|
||||
(Some(exact), Some(tolerant)) => {
|
||||
let mut both = exact;
|
||||
both |= tolerant;
|
||||
Some(both)
|
||||
}
|
||||
})
|
||||
}
|
||||
Word::Derived(prefix) => self.get_db_word_prefix_docids(universe, prefix),
|
||||
}
|
||||
}
|
||||
|
||||
/// Retrieve or insert the given value in the `word_prefix_docids` database.
|
||||
fn get_db_word_prefix_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
prefix: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
match &self.restricted_fids {
|
||||
Some(restricted_fids) => {
|
||||
let interned = self.word_interner.get(prefix).as_str();
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _>(
|
||||
self.txn,
|
||||
prefix,
|
||||
&keys[..],
|
||||
&mut self.db_cache.word_prefix_docids,
|
||||
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
|
||||
universe,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _>(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
&mut self.db_cache.word_prefix_docids,
|
||||
universe,
|
||||
self.index.word_prefix_docids.remap_data_type::<Bytes>(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn get_db_exact_word_prefix_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
prefix: Interned<String>,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
match &self.restricted_fids {
|
||||
Some(restricted_fids) => {
|
||||
let interned = self.word_interner.get(prefix).as_str();
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _>(
|
||||
self.txn,
|
||||
prefix,
|
||||
&keys[..],
|
||||
&mut self.db_cache.exact_word_prefix_docids,
|
||||
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
|
||||
universe,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _>(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
&mut self.db_cache.exact_word_prefix_docids,
|
||||
universe,
|
||||
self.index.exact_word_prefix_docids.remap_data_type::<Bytes>(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_db_word_pair_proximity_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
word1: Interned<String>,
|
||||
word2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
|
||||
ProximityPrecision::ByAttribute => {
|
||||
// Force proximity to 0 because:
|
||||
// in ByAttribute, there are only 2 possible distances:
|
||||
// 1. words in same attribute: in that the DB contains (0, word1, word2)
|
||||
// 2. words in different attributes: no DB entry for these two words.
|
||||
let proximity = 0;
|
||||
let docids = if let Some(docids) =
|
||||
self.db_cache.word_pair_proximity_docids.get(&(proximity, word1, word2))
|
||||
{
|
||||
docids
|
||||
.as_ref()
|
||||
.map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d))
|
||||
.transpose()
|
||||
.map_err(heed::Error::Decoding)?
|
||||
} else {
|
||||
// Compute the distance at the attribute level and store it in the cache.
|
||||
let fids = self.index.searchable_fields_ids(self.txn)?;
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for fid in fids {
|
||||
// for each field, intersect left word bitmap and right word bitmap,
|
||||
// then merge the result in a global bitmap before storing it in the cache.
|
||||
let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?;
|
||||
let word2_docids = self.get_db_word_fid_docids(universe, word2, fid)?;
|
||||
if let (Some(word1_docids), Some(word2_docids)) =
|
||||
(word1_docids, word2_docids)
|
||||
{
|
||||
docids |= word1_docids & word2_docids;
|
||||
}
|
||||
}
|
||||
let encoded = CboRoaringBitmapCodec::bytes_encode(&docids)
|
||||
.map(Cow::into_owned)
|
||||
.map(Cow::Owned)
|
||||
.map(Some)
|
||||
.map_err(heed::Error::Decoding)?;
|
||||
self.db_cache
|
||||
.word_pair_proximity_docids
|
||||
.insert((proximity, word1, word2), encoded);
|
||||
Some(docids)
|
||||
};
|
||||
|
||||
Ok(docids)
|
||||
}
|
||||
ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _>(
|
||||
self.txn,
|
||||
(proximity, word1, word2),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(word1).as_str(),
|
||||
self.word_interner.get(word2).as_str(),
|
||||
),
|
||||
&mut self.db_cache.word_pair_proximity_docids,
|
||||
universe,
|
||||
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_db_word_pair_proximity_docids_len(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
word1: Interned<String>,
|
||||
word2: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<u64>> {
|
||||
match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
|
||||
ProximityPrecision::ByAttribute => Ok(self
|
||||
.get_db_word_pair_proximity_docids(universe, word1, word2, proximity)?
|
||||
.map(|d| d.len())),
|
||||
ProximityPrecision::ByWord => DatabaseCache::get_value_length::<_, _>(
|
||||
self.txn,
|
||||
(proximity, word1, word2),
|
||||
&(
|
||||
proximity,
|
||||
self.word_interner.get(word1).as_str(),
|
||||
self.word_interner.get(word2).as_str(),
|
||||
),
|
||||
&mut self.db_cache.word_pair_proximity_docids,
|
||||
self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_pair_proximity_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
word1: Interned<String>,
|
||||
prefix2: Interned<String>,
|
||||
mut proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
let proximity_precision = self.index.proximity_precision(self.txn)?.unwrap_or_default();
|
||||
if proximity_precision == ProximityPrecision::ByAttribute {
|
||||
// Force proximity to 0 because:
|
||||
// in ByAttribute, there are only 2 possible distances:
|
||||
// 1. words in same attribute: in that the DB contains (0, word1, word2)
|
||||
// 2. words in different attributes: no DB entry for these two words.
|
||||
proximity = 0;
|
||||
}
|
||||
|
||||
let docids = if let Some(docids) =
|
||||
self.db_cache.word_prefix_pair_proximity_docids.get(&(proximity, word1, prefix2))
|
||||
{
|
||||
docids.clone()
|
||||
} else {
|
||||
let prefix_docids = match proximity_precision {
|
||||
ProximityPrecision::ByAttribute => {
|
||||
// Compute the distance at the attribute level and store it in the cache.
|
||||
let fids = self.index.searchable_fields_ids(self.txn)?;
|
||||
let mut prefix_docids = RoaringBitmap::new();
|
||||
// for each field, intersect left word bitmap and right word bitmap,
|
||||
// then merge the result in a global bitmap before storing it in the cache.
|
||||
for fid in fids {
|
||||
let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?;
|
||||
let prefix2_docids =
|
||||
self.get_db_word_prefix_fid_docids(universe, prefix2, fid)?;
|
||||
if let (Some(word1_docids), Some(prefix2_docids)) =
|
||||
(word1_docids, prefix2_docids)
|
||||
{
|
||||
prefix_docids |= word1_docids & prefix2_docids;
|
||||
}
|
||||
}
|
||||
prefix_docids
|
||||
}
|
||||
ProximityPrecision::ByWord => {
|
||||
// compute docids using prefix iter and store the result in the cache.
|
||||
let key = U8StrStrCodec::bytes_encode(&(
|
||||
proximity,
|
||||
self.word_interner.get(word1).as_str(),
|
||||
self.word_interner.get(prefix2).as_str(),
|
||||
))
|
||||
.unwrap()
|
||||
.into_owned();
|
||||
let mut prefix_docids = RoaringBitmap::new();
|
||||
let remap_key_type = self
|
||||
.index
|
||||
.word_pair_proximity_docids
|
||||
.remap_key_type::<Bytes>()
|
||||
.prefix_iter(self.txn, &key)?;
|
||||
for result in remap_key_type {
|
||||
let (_, docids) = result?;
|
||||
|
||||
prefix_docids |= docids;
|
||||
}
|
||||
prefix_docids
|
||||
}
|
||||
};
|
||||
self.db_cache
|
||||
.word_prefix_pair_proximity_docids
|
||||
.insert((proximity, word1, prefix2), Some(prefix_docids.clone()));
|
||||
Some(prefix_docids)
|
||||
};
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
pub fn get_db_prefix_word_pair_proximity_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
left_prefix: Interned<String>,
|
||||
right: Interned<String>,
|
||||
proximity: u8,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
// only accept exact matches on reverted positions
|
||||
self.get_db_word_pair_proximity_docids(universe, left_prefix, right, proximity)
|
||||
}
|
||||
|
||||
pub fn get_db_word_fid_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
word: Interned<String>,
|
||||
fid: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
// if the requested fid isn't in the restricted list, return None.
|
||||
if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
DatabaseCache::get_value::<_, _>(
|
||||
self.txn,
|
||||
(word, fid),
|
||||
&(self.word_interner.get(word).as_str(), fid),
|
||||
&mut self.db_cache.word_fid_docids,
|
||||
universe,
|
||||
self.index.word_fid_docids.remap_data_type::<Bytes>(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_fid_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
word_prefix: Interned<String>,
|
||||
fid: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
// if the requested fid isn't in the restricted list, return None.
|
||||
if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
DatabaseCache::get_value::<_, _>(
|
||||
self.txn,
|
||||
(word_prefix, fid),
|
||||
&(self.word_interner.get(word_prefix).as_str(), fid),
|
||||
&mut self.db_cache.word_prefix_fid_docids,
|
||||
universe,
|
||||
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn get_db_word_fids(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
|
||||
let fids = match self.db_cache.word_fids.entry(word) {
|
||||
Entry::Occupied(fids) => fids.get().clone(),
|
||||
Entry::Vacant(entry) => {
|
||||
let mut key = self.word_interner.get(word).as_bytes().to_owned();
|
||||
key.push(0);
|
||||
let mut fids = vec![];
|
||||
let remap_key_type = self
|
||||
.index
|
||||
.word_fid_docids
|
||||
.remap_types::<Bytes, Bytes>()
|
||||
.prefix_iter(self.txn, &key)?
|
||||
.remap_key_type::<StrBEU16Codec>();
|
||||
for result in remap_key_type {
|
||||
let ((_, fid), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache.word_fid_docids.insert((word, fid), Some(Cow::Borrowed(value)));
|
||||
fids.push(fid);
|
||||
}
|
||||
entry.insert(fids.clone());
|
||||
fids
|
||||
}
|
||||
};
|
||||
Ok(fids)
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_fids(&mut self, word_prefix: Interned<String>) -> Result<Vec<u16>> {
|
||||
let fids = match self.db_cache.word_prefix_fids.entry(word_prefix) {
|
||||
Entry::Occupied(fids) => fids.get().clone(),
|
||||
Entry::Vacant(entry) => {
|
||||
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
|
||||
key.push(0);
|
||||
let mut fids = vec![];
|
||||
let remap_key_type = self
|
||||
.index
|
||||
.word_prefix_fid_docids
|
||||
.remap_types::<Bytes, Bytes>()
|
||||
.prefix_iter(self.txn, &key)?
|
||||
.remap_key_type::<StrBEU16Codec>();
|
||||
for result in remap_key_type {
|
||||
let ((_, fid), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache
|
||||
.word_prefix_fid_docids
|
||||
.insert((word_prefix, fid), Some(Cow::Borrowed(value)));
|
||||
fids.push(fid);
|
||||
}
|
||||
entry.insert(fids.clone());
|
||||
fids
|
||||
}
|
||||
};
|
||||
Ok(fids)
|
||||
}
|
||||
|
||||
pub fn get_db_word_position_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
word: Interned<String>,
|
||||
position: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _>(
|
||||
self.txn,
|
||||
(word, position),
|
||||
&(self.word_interner.get(word).as_str(), position),
|
||||
&mut self.db_cache.word_position_docids,
|
||||
universe,
|
||||
self.index.word_position_docids.remap_data_type::<Bytes>(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_position_docids(
|
||||
&mut self,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
word_prefix: Interned<String>,
|
||||
position: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _>(
|
||||
self.txn,
|
||||
(word_prefix, position),
|
||||
&(self.word_interner.get(word_prefix).as_str(), position),
|
||||
&mut self.db_cache.word_prefix_position_docids,
|
||||
universe,
|
||||
self.index.word_prefix_position_docids.remap_data_type::<Bytes>(),
|
||||
)
|
||||
}
|
||||
|
||||
pub fn get_db_word_positions(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
|
||||
let positions = match self.db_cache.word_positions.entry(word) {
|
||||
Entry::Occupied(positions) => positions.get().clone(),
|
||||
Entry::Vacant(entry) => {
|
||||
let mut key = self.word_interner.get(word).as_bytes().to_owned();
|
||||
key.push(0);
|
||||
let mut positions = vec![];
|
||||
let remap_key_type = self
|
||||
.index
|
||||
.word_position_docids
|
||||
.remap_types::<Bytes, Bytes>()
|
||||
.prefix_iter(self.txn, &key)?
|
||||
.remap_key_type::<StrBEU16Codec>();
|
||||
for result in remap_key_type {
|
||||
let ((_, position), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache
|
||||
.word_position_docids
|
||||
.insert((word, position), Some(Cow::Borrowed(value)));
|
||||
positions.push(position);
|
||||
}
|
||||
entry.insert(positions.clone());
|
||||
positions
|
||||
}
|
||||
};
|
||||
Ok(positions)
|
||||
}
|
||||
|
||||
pub fn get_db_word_prefix_positions(
|
||||
&mut self,
|
||||
word_prefix: Interned<String>,
|
||||
) -> Result<Vec<u16>> {
|
||||
let positions = match self.db_cache.word_prefix_positions.entry(word_prefix) {
|
||||
Entry::Occupied(positions) => positions.get().clone(),
|
||||
Entry::Vacant(entry) => {
|
||||
let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
|
||||
key.push(0);
|
||||
let mut positions = vec![];
|
||||
let remap_key_type = self
|
||||
.index
|
||||
.word_prefix_position_docids
|
||||
.remap_types::<Bytes, Bytes>()
|
||||
.prefix_iter(self.txn, &key)?
|
||||
.remap_key_type::<StrBEU16Codec>();
|
||||
for result in remap_key_type {
|
||||
let ((_, position), value) = result?;
|
||||
// filling other caches to avoid searching for them again
|
||||
self.db_cache
|
||||
.word_prefix_position_docids
|
||||
.insert((word_prefix, position), Some(Cow::Borrowed(value)));
|
||||
positions.push(position);
|
||||
}
|
||||
entry.insert(positions.clone());
|
||||
positions
|
||||
}
|
||||
};
|
||||
Ok(positions)
|
||||
}
|
||||
}
|
||||
123
crates/milli/src/search/new/distinct.rs
Normal file
123
crates/milli/src/search/new/distinct.rs
Normal file
@@ -0,0 +1,123 @@
|
||||
use heed::types::{Bytes, Str, Unit};
|
||||
use heed::{Database, RoPrefix, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
const FID_SIZE: usize = 2;
|
||||
const DOCID_SIZE: usize = 4;
|
||||
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec,
|
||||
};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::{Index, Result, SearchContext};
|
||||
|
||||
/// Result of applying the distinct rule to a set of candidates
/// (see `apply_distinct_rule`).
pub struct DistinctOutput {
    /// The candidates that were kept.
    pub remaining: RoaringBitmap,
    /// The documents holding a value of the distinct field that was seen on a kept candidate.
    pub excluded: RoaringBitmap,
}
|
||||
|
||||
/// Return a [`DistinctOutput`] containing:
|
||||
/// - `remaining`: a set of docids built such that exactly one element from `candidates`
|
||||
/// is kept for each distinct value inside the given field. If the field does not exist, it
|
||||
/// is considered unique.
|
||||
/// - `excluded`: the set of document ids that contain a value for the given field that occurs
|
||||
/// in the given candidates.
|
||||
pub fn apply_distinct_rule(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
field_id: u16,
|
||||
candidates: &RoaringBitmap,
|
||||
) -> Result<DistinctOutput> {
|
||||
let mut excluded = RoaringBitmap::new();
|
||||
let mut remaining = RoaringBitmap::new();
|
||||
for docid in candidates {
|
||||
if excluded.contains(docid) {
|
||||
continue;
|
||||
}
|
||||
distinct_single_docid(ctx.index, ctx.txn, field_id, docid, &mut excluded)?;
|
||||
remaining.push(docid);
|
||||
}
|
||||
Ok(DistinctOutput { remaining, excluded })
|
||||
}
|
||||
|
||||
/// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id.
|
||||
pub fn distinct_single_docid(
|
||||
index: &Index,
|
||||
txn: &RoTxn<'_>,
|
||||
field_id: u16,
|
||||
docid: u32,
|
||||
excluded: &mut RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
for item in facet_string_values(docid, field_id, index, txn)? {
|
||||
let ((_, _, facet_value), _) = item?;
|
||||
if let Some(facet_docids) = facet_value_docids(
|
||||
index.facet_id_string_docids.remap_types(),
|
||||
txn,
|
||||
field_id,
|
||||
facet_value,
|
||||
)? {
|
||||
*excluded |= facet_docids;
|
||||
}
|
||||
}
|
||||
for item in facet_number_values(docid, field_id, index, txn)? {
|
||||
let ((_, _, facet_value), _) = item?;
|
||||
if let Some(facet_docids) =
|
||||
facet_value_docids(index.facet_id_f64_docids.remap_types(), txn, field_id, facet_value)?
|
||||
{
|
||||
*excluded |= facet_docids;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Return all the docids containing the given value in the given field
|
||||
fn facet_value_docids(
|
||||
database: Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
txn: &RoTxn<'_>,
|
||||
field_id: u16,
|
||||
facet_value: &[u8],
|
||||
) -> heed::Result<Option<RoaringBitmap>> {
|
||||
database
|
||||
.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })
|
||||
.map(|opt| opt.map(|v| v.bitmap))
|
||||
}
|
||||
|
||||
/// Return an iterator over each number value in the given field of the given document.
|
||||
fn facet_number_values<'a>(
|
||||
docid: u32,
|
||||
field_id: u16,
|
||||
index: &Index,
|
||||
txn: &'a RoTxn<'a>,
|
||||
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<BytesRefCodec>, Unit>> {
|
||||
let key = facet_values_prefix_key(field_id, docid);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_f64s
|
||||
.remap_key_type::<Bytes>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_key_type();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
/// Return an iterator over each string value in the given field of the given document.
|
||||
pub fn facet_string_values<'a>(
|
||||
docid: u32,
|
||||
field_id: u16,
|
||||
index: &Index,
|
||||
txn: &'a RoTxn<'a>,
|
||||
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<BytesRefCodec>, Str>> {
|
||||
let key = facet_values_prefix_key(field_id, docid);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_types();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
|
||||
concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
|
||||
}
|
||||
299
crates/milli/src/search/new/exact_attribute.rs
Normal file
299
crates/milli/src/search/new/exact_attribute.rs
Normal file
@@ -0,0 +1,299 @@
|
||||
use heed::types::Bytes;
|
||||
use roaring::{MultiOps, RoaringBitmap};
|
||||
|
||||
use super::query_graph::QueryGraph;
|
||||
use super::ranking_rules::{RankingRule, RankingRuleOutput};
|
||||
use crate::score_details::{self, ScoreDetails};
|
||||
use crate::search::new::query_graph::QueryNodeData;
|
||||
use crate::search::new::query_term::ExactTerm;
|
||||
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
|
||||
|
||||
/// A ranking rule that produces 3 disjoint buckets:
|
||||
///
|
||||
/// 1. Documents from the universe whose value is exactly the query.
|
||||
/// 2. Documents from the universe not in (1) whose value starts with the query.
|
||||
/// 3. Documents from the universe not in (1) or (2).
|
||||
pub struct ExactAttribute {
|
||||
state: State,
|
||||
}
|
||||
|
||||
impl ExactAttribute {
|
||||
pub fn new() -> Self {
|
||||
Self { state: Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
|
||||
fn id(&self) -> String {
|
||||
"exact_attribute".to_owned()
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::exact_attribute")]
|
||||
fn start_iteration(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
universe: &roaring::RoaringBitmap,
|
||||
query: &QueryGraph,
|
||||
) -> Result<()> {
|
||||
self.state = State::start_iteration(ctx, universe, query)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::exact_attribute")]
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
_ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
universe: &roaring::RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
||||
let state = std::mem::take(&mut self.state);
|
||||
let (state, output) = State::next(state, universe);
|
||||
self.state = state;
|
||||
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::exact_attribute")]
|
||||
fn end_iteration(
|
||||
&mut self,
|
||||
_ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) {
|
||||
self.state = Default::default();
|
||||
}
|
||||
}
|
||||
|
||||
/// Inner state of the ranking rule.
///
/// `State::next` moves through the variants in declaration order:
/// `ExactAttribute` -> `AttributeStarts` -> `Empty`, and `Empty` then repeats.
#[derive(Default)]
enum State {
    /// State between two iterations; `next` produces no bucket from it.
    #[default]
    Uninitialized,
    /// The next call to `next` will output the documents in the universe that have an attribute that is the exact query
    ExactAttribute(QueryGraph, Vec<FieldCandidates>),
    /// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query,
    /// but isn't the exact query.
    AttributeStarts(QueryGraph, Vec<FieldCandidates>),
    /// The next calls to `next` will output the input universe.
    Empty(QueryGraph),
}
|
||||
|
||||
/// The candidates sorted by attributes
///
/// Each of the bitmaps in a single `FieldCandidates` struct applies to the same field.
struct FieldCandidates {
    /// The candidates that start with all the words of the query in the field
    start_with_exact: RoaringBitmap,
    /// The candidates that have the same number of words as the query in the field
    exact_word_count: RoaringBitmap,
}
|
||||
|
||||
impl State {
|
||||
fn start_iteration(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
universe: &RoaringBitmap,
|
||||
query_graph: &QueryGraph,
|
||||
) -> Result<Self> {
|
||||
struct ExactTermInfo {
|
||||
exact_term: ExactTerm,
|
||||
start_position: u16,
|
||||
start_term_id: u8,
|
||||
position_count: usize,
|
||||
}
|
||||
|
||||
let mut exact_terms: Vec<ExactTermInfo> =
|
||||
Vec::with_capacity(query_graph.nodes.len() as usize);
|
||||
for (_, node) in query_graph.nodes.iter() {
|
||||
match &node.data {
|
||||
QueryNodeData::Term(term) => {
|
||||
let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
|
||||
exact_term
|
||||
} else {
|
||||
continue;
|
||||
};
|
||||
exact_terms.push(ExactTermInfo {
|
||||
exact_term,
|
||||
start_position: *term.positions.start(),
|
||||
start_term_id: *term.term_ids.start(),
|
||||
position_count: term.positions.len(),
|
||||
});
|
||||
}
|
||||
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
|
||||
}
|
||||
}
|
||||
|
||||
exact_terms.sort_by_key(|x| x.start_term_id);
|
||||
exact_terms.dedup_by_key(|x| x.start_term_id);
|
||||
let count_all_positions = exact_terms.iter().fold(0, |acc, x| acc + x.position_count);
|
||||
|
||||
// bail if there is a "hole" (missing word) in remaining query graph
|
||||
if let Some(e) = exact_terms.first() {
|
||||
if e.start_term_id != 0 {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
} else {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
let mut previous_id = 0;
|
||||
for e in exact_terms.iter() {
|
||||
if e.start_term_id < previous_id || e.start_term_id - previous_id > 1 {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
} else {
|
||||
previous_id = e.start_term_id;
|
||||
}
|
||||
}
|
||||
|
||||
// sample query: "sunflower are pretty"
|
||||
// sunflower at pos 0 in attr A
|
||||
// are at pos 1 in attr B
|
||||
// pretty at pos 2 in attr C
|
||||
// We want to eliminate such document
|
||||
|
||||
// first check that for each term, there exists some attribute that has this term at the correct position
|
||||
//"word-position-docids";
|
||||
let mut candidates = universe.clone();
|
||||
let words_positions: Vec<(Vec<_>, _)> = exact_terms
|
||||
.iter()
|
||||
.map(|e| (e.exact_term.interned_words(ctx).collect(), e.start_position))
|
||||
.collect();
|
||||
for (words, position) in &words_positions {
|
||||
if candidates.is_empty() {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
|
||||
'words: for (offset, word) in words.iter().enumerate() {
|
||||
let offset = offset as u16;
|
||||
let word = if let Some(word) = word {
|
||||
word
|
||||
} else {
|
||||
continue 'words;
|
||||
};
|
||||
// Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
|
||||
// longer phrases we'll be losing on precision here.
|
||||
let bucketed_position = crate::bucketed_position(position + offset);
|
||||
let word_position_docids = ctx
|
||||
.get_db_word_position_docids(Some(universe), *word, bucketed_position)?
|
||||
.unwrap_or_default();
|
||||
candidates &= word_position_docids;
|
||||
if candidates.is_empty() {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let candidates = candidates;
|
||||
|
||||
if candidates.is_empty() {
|
||||
return Ok(State::Empty(query_graph.clone()));
|
||||
}
|
||||
|
||||
let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?;
|
||||
|
||||
let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
|
||||
// then check that there exists at least one attribute that has all of the terms
|
||||
for fid in searchable_fields_ids {
|
||||
let intersection = MultiOps::intersection(
|
||||
words_positions
|
||||
.iter()
|
||||
.flat_map(|(words, ..)| words.iter())
|
||||
// ignore stop words words in phrases
|
||||
.flatten()
|
||||
.map(|word| -> Result<_> {
|
||||
Ok(ctx
|
||||
.get_db_word_fid_docids(Some(&candidates), *word, fid)?
|
||||
.unwrap_or_default())
|
||||
}),
|
||||
)?;
|
||||
if !intersection.is_empty() {
|
||||
// Although not really worth it in terms of performance,
|
||||
// if would be good to put this in cache for the sake of consistency
|
||||
let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
|
||||
let bitmap_bytes = ctx
|
||||
.index
|
||||
.field_id_word_count_docids
|
||||
.remap_data_type::<Bytes>()
|
||||
.get(ctx.txn, &(fid, count_all_positions as u8))?;
|
||||
|
||||
match bitmap_bytes {
|
||||
Some(bytes) => {
|
||||
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)?
|
||||
}
|
||||
None => RoaringBitmap::default(),
|
||||
}
|
||||
} else {
|
||||
RoaringBitmap::default()
|
||||
};
|
||||
candidates_per_attribute.push(FieldCandidates {
|
||||
start_with_exact: intersection,
|
||||
exact_word_count: candidates_with_exact_word_count,
|
||||
});
|
||||
}
|
||||
}
|
||||
// note we could have "false positives" where there both exist different attributes that collectively
|
||||
// have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order.
|
||||
|
||||
Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute))
|
||||
}
|
||||
|
||||
fn next(
|
||||
state: State,
|
||||
universe: &RoaringBitmap,
|
||||
) -> (State, Option<RankingRuleOutput<QueryGraph>>) {
|
||||
let (state, output) = match state {
|
||||
State::Uninitialized => (state, None),
|
||||
State::ExactAttribute(query_graph, candidates_per_attribute) => {
|
||||
// TODO it can be much faster to do the intersections before the unions...
|
||||
// or maybe the candidates_per_attribute are not containing anything outside universe
|
||||
let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
|
||||
|FieldCandidates { start_with_exact, exact_word_count }| {
|
||||
start_with_exact & exact_word_count
|
||||
},
|
||||
));
|
||||
candidates &= universe;
|
||||
(
|
||||
State::AttributeStarts(query_graph.clone(), candidates_per_attribute),
|
||||
Some(RankingRuleOutput {
|
||||
query: query_graph,
|
||||
candidates,
|
||||
score: ScoreDetails::ExactAttribute(
|
||||
score_details::ExactAttribute::ExactMatch,
|
||||
),
|
||||
}),
|
||||
)
|
||||
}
|
||||
State::AttributeStarts(query_graph, candidates_per_attribute) => {
|
||||
// TODO it can be much faster to do the intersections before the unions...
|
||||
// or maybe the candidates_per_attribute are not containing anything outside universe
|
||||
let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
|
||||
|FieldCandidates { mut start_with_exact, exact_word_count }| {
|
||||
start_with_exact -= exact_word_count;
|
||||
start_with_exact
|
||||
},
|
||||
));
|
||||
candidates &= universe;
|
||||
(
|
||||
State::Empty(query_graph.clone()),
|
||||
Some(RankingRuleOutput {
|
||||
query: query_graph,
|
||||
candidates,
|
||||
score: ScoreDetails::ExactAttribute(
|
||||
score_details::ExactAttribute::MatchesStart,
|
||||
),
|
||||
}),
|
||||
)
|
||||
}
|
||||
State::Empty(query_graph) => (
|
||||
State::Empty(query_graph.clone()),
|
||||
Some(RankingRuleOutput {
|
||||
query: query_graph,
|
||||
candidates: universe.clone(),
|
||||
score: ScoreDetails::ExactAttribute(
|
||||
score_details::ExactAttribute::NoExactMatch,
|
||||
),
|
||||
}),
|
||||
),
|
||||
};
|
||||
(state, output)
|
||||
}
|
||||
}
|
||||
309
crates/milli/src/search/new/geo_sort.rs
Normal file
309
crates/milli/src/search/new/geo_sort.rs
Normal file
@@ -0,0 +1,309 @@
|
||||
use std::collections::VecDeque;
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use heed::types::{Bytes, Unit};
|
||||
use heed::{RoPrefix, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
use rstar::RTree;
|
||||
|
||||
use super::facet_string_values;
|
||||
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
|
||||
use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec};
|
||||
use crate::score_details::{self, ScoreDetails};
|
||||
use crate::{
|
||||
distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext,
|
||||
SearchLogger,
|
||||
};
|
||||
|
||||
const FID_SIZE: usize = 2;
|
||||
const DOCID_SIZE: usize = 4;
|
||||
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
|
||||
concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
|
||||
}
|
||||
|
||||
/// Return an iterator over each number value in the given field of the given document.
|
||||
fn facet_number_values<'a>(
|
||||
docid: u32,
|
||||
field_id: u16,
|
||||
index: &Index,
|
||||
txn: &'a RoTxn<'a>,
|
||||
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<OrderedF64Codec>, Unit>> {
|
||||
let key = facet_values_prefix_key(field_id, docid);
|
||||
|
||||
let iter = index
|
||||
.field_id_docid_facet_f64s
|
||||
.remap_key_type::<Bytes>()
|
||||
.prefix_iter(txn, &key)?
|
||||
.remap_key_type();
|
||||
|
||||
Ok(iter)
|
||||
}
|
||||
|
||||
/// Define the strategy used by the geo sort.
/// The parameter represents the cache size, and, in the case of the Dynamic strategy,
/// the point where we move from using the iterative strategy to the rtree.
#[derive(Debug, Clone, Copy)]
pub enum Strategy {
    AlwaysIterative(usize),
    AlwaysRtree(usize),
    Dynamic(usize),
}

impl Default for Strategy {
    /// The default strategy is `Dynamic` with a threshold/cache size of 1000.
    fn default() -> Self {
        Self::Dynamic(1000)
    }
}

impl Strategy {
    /// Tell whether the rtree must be used for the given number of `candidates`.
    ///
    /// `Dynamic` switches to the rtree once `candidates` reaches its parameter.
    pub fn use_rtree(&self, candidates: usize) -> bool {
        match *self {
            Self::AlwaysIterative(_) => false,
            Self::AlwaysRtree(_) => true,
            Self::Dynamic(threshold) => threshold <= candidates,
        }
    }

    /// Return the cache size, i.e. the parameter carried by the variant.
    pub fn cache_size(&self) -> usize {
        let (Self::AlwaysIterative(size) | Self::AlwaysRtree(size) | Self::Dynamic(size)) = *self;
        size
    }
}
|
||||
|
||||
pub struct GeoSort<Q: RankingRuleQueryTrait> {
|
||||
query: Option<Q>,
|
||||
|
||||
strategy: Strategy,
|
||||
ascending: bool,
|
||||
point: [f64; 2],
|
||||
field_ids: Option<[u16; 2]>,
|
||||
rtree: Option<RTree<GeoPoint>>,
|
||||
|
||||
cached_sorted_docids: VecDeque<(u32, [f64; 2])>,
|
||||
geo_candidates: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
|
||||
pub fn new(
|
||||
strategy: Strategy,
|
||||
geo_faceted_docids: RoaringBitmap,
|
||||
point: [f64; 2],
|
||||
ascending: bool,
|
||||
) -> Result<Self> {
|
||||
Ok(Self {
|
||||
query: None,
|
||||
strategy,
|
||||
ascending,
|
||||
point,
|
||||
geo_candidates: geo_faceted_docids,
|
||||
field_ids: None,
|
||||
rtree: None,
|
||||
cached_sorted_docids: VecDeque::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Refill the internal buffer of cached docids based on the strategy.
|
||||
/// Drop the rtree if we don't need it anymore.
|
||||
fn fill_buffer(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'_>,
|
||||
geo_candidates: &RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng");
|
||||
debug_assert!(self.cached_sorted_docids.is_empty());
|
||||
|
||||
// lazily initialize the rtree if needed by the strategy, and cache it in `self.rtree`
|
||||
let rtree = if self.strategy.use_rtree(geo_candidates.len() as usize) {
|
||||
if let Some(rtree) = self.rtree.as_ref() {
|
||||
// get rtree from cache
|
||||
Some(rtree)
|
||||
} else {
|
||||
let rtree = ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree");
|
||||
// insert rtree in cache and returns it.
|
||||
// Can't use `get_or_insert_with` because getting the rtree from the DB is a fallible operation.
|
||||
Some(&*self.rtree.insert(rtree))
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let cache_size = self.strategy.cache_size();
|
||||
if let Some(rtree) = rtree {
|
||||
if self.ascending {
|
||||
let point = lat_lng_to_xyz(&self.point);
|
||||
for point in rtree.nearest_neighbor_iter(&point) {
|
||||
if geo_candidates.contains(point.data.0) {
|
||||
self.cached_sorted_docids.push_back(point.data);
|
||||
if self.cached_sorted_docids.len() >= cache_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// in the case of the desc geo sort we look for the closest point to the opposite of the queried point
|
||||
// and we insert the points in reverse order they get reversed when emptying the cache later on
|
||||
let point = lat_lng_to_xyz(&opposite_of(self.point));
|
||||
for point in rtree.nearest_neighbor_iter(&point) {
|
||||
if geo_candidates.contains(point.data.0) {
|
||||
self.cached_sorted_docids.push_front(point.data);
|
||||
if self.cached_sorted_docids.len() >= cache_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// the iterative version
|
||||
let [lat, lng] = self.field_ids.unwrap();
|
||||
|
||||
let mut documents = geo_candidates
|
||||
.iter()
|
||||
.map(|id| -> Result<_> { Ok((id, geo_value(id, lat, lng, ctx.index, ctx.txn)?)) })
|
||||
.collect::<Result<Vec<(u32, [f64; 2])>>>()?;
|
||||
// computing the distance between two points is expensive thus we cache the result
|
||||
documents
|
||||
.sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize);
|
||||
self.cached_sorted_docids.extend(documents);
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the lat and long values from a single document.
|
||||
///
|
||||
/// If it is not able to find it in the facet number index it will extract it
|
||||
/// from the facet string index and parse it as f64 (as the geo extraction behaves).
|
||||
fn geo_value(
|
||||
docid: u32,
|
||||
field_lat: u16,
|
||||
field_lng: u16,
|
||||
index: &Index,
|
||||
rtxn: &RoTxn<'_>,
|
||||
) -> Result<[f64; 2]> {
|
||||
let extract_geo = |geo_field: u16| -> Result<f64> {
|
||||
match facet_number_values(docid, geo_field, index, rtxn)?.next() {
|
||||
Some(Ok(((_, _, geo), ()))) => Ok(geo),
|
||||
Some(Err(e)) => Err(e.into()),
|
||||
None => match facet_string_values(docid, geo_field, index, rtxn)?.next() {
|
||||
Some(Ok((_, geo))) => {
|
||||
Ok(geo.parse::<f64>().expect("cannot parse geo field as f64"))
|
||||
}
|
||||
Some(Err(e)) => Err(e.into()),
|
||||
None => panic!("A geo faceted document doesn't contain any lat or lng"),
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
let lat = extract_geo(field_lat)?;
|
||||
let lng = extract_geo(field_lng)?;
|
||||
|
||||
Ok([lat, lng])
|
||||
}
|
||||
|
||||
impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
|
||||
fn id(&self) -> String {
|
||||
"geo_sort".to_owned()
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")]
|
||||
fn start_iteration(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<Q>,
|
||||
universe: &RoaringBitmap,
|
||||
query: &Q,
|
||||
) -> Result<()> {
|
||||
assert!(self.query.is_none());
|
||||
|
||||
self.query = Some(query.clone());
|
||||
|
||||
let geo_candidates = &self.geo_candidates & universe;
|
||||
|
||||
if geo_candidates.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let fid_map = ctx.index.fields_ids_map(ctx.txn)?;
|
||||
let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");
|
||||
let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");
|
||||
self.field_ids = Some([lat, lng]);
|
||||
self.fill_buffer(ctx, &geo_candidates)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")]
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
logger: &mut dyn SearchLogger<Q>,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<Q>>> {
|
||||
let query = self.query.as_ref().unwrap().clone();
|
||||
|
||||
let geo_candidates = &self.geo_candidates & universe;
|
||||
|
||||
if geo_candidates.is_empty() {
|
||||
return Ok(Some(RankingRuleOutput {
|
||||
query,
|
||||
candidates: universe.clone(),
|
||||
score: ScoreDetails::GeoSort(score_details::GeoSort {
|
||||
target_point: self.point,
|
||||
ascending: self.ascending,
|
||||
value: None,
|
||||
}),
|
||||
}));
|
||||
}
|
||||
|
||||
let ascending = self.ascending;
|
||||
let next = |cache: &mut VecDeque<_>| {
|
||||
if ascending {
|
||||
cache.pop_front()
|
||||
} else {
|
||||
cache.pop_back()
|
||||
}
|
||||
};
|
||||
while let Some((id, point)) = next(&mut self.cached_sorted_docids) {
|
||||
if geo_candidates.contains(id) {
|
||||
return Ok(Some(RankingRuleOutput {
|
||||
query,
|
||||
candidates: RoaringBitmap::from_iter([id]),
|
||||
score: ScoreDetails::GeoSort(score_details::GeoSort {
|
||||
target_point: self.point,
|
||||
ascending: self.ascending,
|
||||
value: Some(point),
|
||||
}),
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
// if we got out of this loop it means we've exhausted our cache.
|
||||
// we need to refill it and run the function again.
|
||||
self.fill_buffer(ctx, &geo_candidates)?;
|
||||
self.next_bucket(ctx, logger, universe)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")]
|
||||
fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
|
||||
// we do not reset the rtree here, it could be used in a next iteration
|
||||
self.query = None;
|
||||
self.cached_sorted_docids.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute the antipodal coordinate of `coord`.
///
/// The first component is negated and the second one is shifted by 180: downwards when
/// it is strictly positive, upwards otherwise.
fn opposite_of(coord: [f64; 2]) -> [f64; 2] {
    let [first, second] = coord;
    // in the case of x,0 we want to return x,180
    let shift = if second > 0. { -180. } else { 180. };

    [first * -1., second + shift]
}
|
||||
431
crates/milli/src/search/new/graph_based_ranking_rule.rs
Normal file
431
crates/milli/src/search/new/graph_based_ranking_rule.rs
Normal file
@@ -0,0 +1,431 @@
|
||||
/*! Implementation of a generic graph-based ranking rule.
|
||||
|
||||
A graph-based ranking rule is a ranking rule that works by representing
|
||||
its possible operations and their relevancy cost as a directed acyclic multi-graph
|
||||
built on top of the query graph. It then computes its buckets by finding the
|
||||
cheapest paths from the start node to the end node and computing the document ids
|
||||
that satisfy those paths.
|
||||
|
||||
For example, the proximity ranking rule builds a graph where the edges between two
|
||||
nodes represent a condition that the term of the source node is in a certain proximity
|
||||
to the term of the destination node. With the query "pretty house by" where the term
|
||||
"pretty" has three possible proximities to the term "house" and "house" has two
|
||||
proximities to "by", the graph will look like this:
|
||||
|
||||
```txt
|
||||
┌───────┐ ┌───────┐─────1────▶┌───────┐──1──▶┌─────┐ ┌───────┐
|
||||
│ START │──0─▶│pretty │─────2────▶│ house │ │ by │─0─▶│ END │
|
||||
└───────┘ └───────┘─────3────▶└───────┘──2──▶└─────┘ └───────┘
|
||||
```
|
||||
The proximity ranking rule's first bucket will be determined by the union of all
|
||||
the shortest paths from START to END, which in this case is:
|
||||
```txt
|
||||
START --0-> pretty --1--> house --1--> by --0--> end
|
||||
```
|
||||
The path's corresponding document ids are found by taking the intersection of the
|
||||
document ids of each edge. That is, we find the documents where both `pretty` is
|
||||
1-close to `house` AND `house` is 1-close to `by`.
|
||||
|
||||
For the second bucket, we get the union of the second-cheapest paths, which are:
|
||||
```txt
|
||||
START --0-> pretty --1--> house --2--> by --0--> end
|
||||
START --0-> pretty --2--> house --1--> by --0--> end
|
||||
```
|
||||
That is, we find the documents where either:
|
||||
- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
|
||||
- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
|
||||
*/
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::ControlFlow;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::interner::{Interned, MappedInterner};
|
||||
use super::logger::SearchLogger;
|
||||
use super::query_graph::QueryNode;
|
||||
use super::ranking_rule_graph::{
|
||||
ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph,
|
||||
RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, WordsGraph,
|
||||
};
|
||||
use super::small_bitmap::SmallBitmap;
|
||||
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
|
||||
use crate::score_details::Rank;
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::ranking_rule_graph::PathVisitor;
|
||||
use crate::{Result, TermsMatchingStrategy};
|
||||
|
||||
pub type Words = GraphBasedRankingRule<WordsGraph>;
|
||||
impl GraphBasedRankingRule<WordsGraph> {
|
||||
pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
|
||||
Self::new_with_id("words".to_owned(), Some(terms_matching_strategy))
|
||||
}
|
||||
}
|
||||
pub type Proximity = GraphBasedRankingRule<ProximityGraph>;
|
||||
impl GraphBasedRankingRule<ProximityGraph> {
|
||||
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
|
||||
Self::new_with_id("proximity".to_owned(), terms_matching_strategy)
|
||||
}
|
||||
}
|
||||
pub type Fid = GraphBasedRankingRule<FidGraph>;
|
||||
impl GraphBasedRankingRule<FidGraph> {
|
||||
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
|
||||
Self::new_with_id("fid".to_owned(), terms_matching_strategy)
|
||||
}
|
||||
}
|
||||
pub type Position = GraphBasedRankingRule<PositionGraph>;
|
||||
impl GraphBasedRankingRule<PositionGraph> {
|
||||
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
|
||||
Self::new_with_id("position".to_owned(), terms_matching_strategy)
|
||||
}
|
||||
}
|
||||
pub type Typo = GraphBasedRankingRule<TypoGraph>;
|
||||
impl GraphBasedRankingRule<TypoGraph> {
|
||||
pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
|
||||
Self::new_with_id("typo".to_owned(), terms_matching_strategy)
|
||||
}
|
||||
}
|
||||
pub type Exactness = GraphBasedRankingRule<ExactnessGraph>;
|
||||
impl GraphBasedRankingRule<ExactnessGraph> {
|
||||
pub fn new() -> Self {
|
||||
Self::new_with_id("exactness".to_owned(), None)
|
||||
}
|
||||
}
|
||||
|
||||
/// A generic graph-based ranking rule
|
||||
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
|
||||
id: String,
|
||||
terms_matching_strategy: Option<TermsMatchingStrategy>,
|
||||
// When the ranking rule is not iterating over its buckets,
|
||||
// its state is `None`.
|
||||
state: Option<GraphBasedRankingRuleState<G>>,
|
||||
}
|
||||
impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
|
||||
/// Creates the ranking rule with the given identifier
|
||||
pub fn new_with_id(id: String, terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
|
||||
Self { id, terms_matching_strategy, state: None }
|
||||
}
|
||||
}
|
||||
|
||||
/// The internal state of a graph-based ranking rule during iteration
|
||||
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
|
||||
/// The current graph
|
||||
graph: RankingRuleGraph<G>,
|
||||
/// Cache to retrieve the docids associated with each edge
|
||||
conditions_cache: ConditionDocIdsCache<G>,
|
||||
/// Cache used to optimistically discard paths that resolve to no documents.
|
||||
dead_ends_cache: DeadEndsCache<G::Condition>,
|
||||
/// A structure giving the list of possible costs from each node to the end node
|
||||
all_costs: MappedInterner<QueryNode, Vec<u64>>,
|
||||
/// An index in the first element of `all_distances`, giving the cost of the next bucket
|
||||
cur_cost: u64,
|
||||
/// One above the highest possible cost for this rule
|
||||
next_max_cost: u64,
|
||||
}
|
||||
|
||||
impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule<G> {
|
||||
fn id(&self) -> String {
|
||||
self.id.clone()
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::graph_based")]
|
||||
fn start_iteration(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
_universe: &RoaringBitmap,
|
||||
query_graph: &QueryGraph,
|
||||
) -> Result<()> {
|
||||
// the `next_max_cost` is the successor integer to the maximum cost of the paths in the graph.
|
||||
//
|
||||
// When there is a matching strategy, it also factors the additional costs of:
|
||||
// 1. The words that are matched in phrases
|
||||
// 2. Skipping words (by adding them to the paths with a cost)
|
||||
let mut next_max_cost = 1;
|
||||
let removal_cost = if let Some(terms_matching_strategy) = self.terms_matching_strategy {
|
||||
// add the cost of the phrase to the next_max_cost
|
||||
next_max_cost += query_graph
|
||||
.words_in_phrases_count(ctx)
|
||||
// remove 1 from the words in phrases count, because when there is a phrase we can now have a document
|
||||
// where only the phrase is matching, and none of the non-phrase words.
|
||||
// With the `1` that `next_max_cost` is initialized with, this gets counted twice.
|
||||
.saturating_sub(1) as u64;
|
||||
match terms_matching_strategy {
|
||||
TermsMatchingStrategy::Last => {
|
||||
let removal_order =
|
||||
query_graph.removal_order_for_terms_matching_strategy_last(ctx);
|
||||
let mut forbidden_nodes =
|
||||
SmallBitmap::for_interned_values_in(&query_graph.nodes);
|
||||
let mut costs = query_graph.nodes.map(|_| None);
|
||||
// FIXME: this works because only words uses termsmatchingstrategy at the moment.
|
||||
for ns in removal_order {
|
||||
for n in ns.iter() {
|
||||
*costs.get_mut(n) = Some((1, forbidden_nodes.clone()));
|
||||
}
|
||||
forbidden_nodes.union(&ns);
|
||||
}
|
||||
costs
|
||||
}
|
||||
TermsMatchingStrategy::Frequency => {
|
||||
let removal_order =
|
||||
query_graph.removal_order_for_terms_matching_strategy_frequency(ctx)?;
|
||||
let mut forbidden_nodes =
|
||||
SmallBitmap::for_interned_values_in(&query_graph.nodes);
|
||||
let mut costs = query_graph.nodes.map(|_| None);
|
||||
// FIXME: this works because only words uses termsmatchingstrategy at the moment.
|
||||
for ns in removal_order {
|
||||
for n in ns.iter() {
|
||||
*costs.get_mut(n) = Some((1, forbidden_nodes.clone()));
|
||||
}
|
||||
forbidden_nodes.union(&ns);
|
||||
}
|
||||
costs
|
||||
}
|
||||
TermsMatchingStrategy::All => query_graph.nodes.map(|_| None),
|
||||
}
|
||||
} else {
|
||||
query_graph.nodes.map(|_| None)
|
||||
};
|
||||
|
||||
let graph = RankingRuleGraph::build(ctx, query_graph.clone(), removal_cost)?;
|
||||
let condition_docids_cache = ConditionDocIdsCache::default();
|
||||
let dead_ends_cache = DeadEndsCache::new(&graph.conditions_interner);
|
||||
|
||||
// Then pre-compute the cost of all paths from each node to the end node
|
||||
let all_costs = graph.find_all_costs_to_end();
|
||||
|
||||
next_max_cost +=
|
||||
all_costs.get(graph.query_graph.root_node).iter().copied().max().unwrap_or(0);
|
||||
|
||||
let state = GraphBasedRankingRuleState {
|
||||
graph,
|
||||
conditions_cache: condition_docids_cache,
|
||||
dead_ends_cache,
|
||||
all_costs,
|
||||
cur_cost: 0,
|
||||
next_max_cost,
|
||||
};
|
||||
|
||||
self.state = Some(state);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::graph_based")]
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
|
||||
// Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
|
||||
// should never happen
|
||||
let mut state = self.state.take().unwrap();
|
||||
|
||||
let all_costs = state.all_costs.get(state.graph.query_graph.root_node);
|
||||
// Retrieve the cost of the paths to compute
|
||||
let Some(&cost) = all_costs.iter().find(|c| **c >= state.cur_cost) else {
|
||||
self.state = None;
|
||||
return Ok(None);
|
||||
};
|
||||
state.cur_cost = cost + 1;
|
||||
|
||||
let mut bucket = RoaringBitmap::new();
|
||||
|
||||
let GraphBasedRankingRuleState {
|
||||
graph,
|
||||
conditions_cache: condition_docids_cache,
|
||||
dead_ends_cache,
|
||||
all_costs,
|
||||
cur_cost: _,
|
||||
next_max_cost,
|
||||
} = &mut state;
|
||||
|
||||
let rank = *next_max_cost - cost;
|
||||
let score = G::rank_to_score(Rank { rank: rank as u32, max_rank: *next_max_cost as u32 });
|
||||
|
||||
let mut universe = universe.clone();
|
||||
|
||||
let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner);
|
||||
let mut good_paths = vec![];
|
||||
let mut considered_paths = vec![];
|
||||
|
||||
// For each path of the given cost, we will compute its associated
|
||||
// document ids.
|
||||
// In case the path does not resolve to any document id, we try to figure out why
|
||||
// and update the `dead_ends_cache` accordingly.
|
||||
// Updating the dead_ends_cache helps speed up the execution of `visit_paths_of_cost` and reduces
|
||||
// the number of future candidate paths given by that same function.
|
||||
|
||||
let mut subpaths_docids: Vec<(Interned<G::Condition>, RoaringBitmap)> = vec![];
|
||||
|
||||
let mut nodes_with_removed_outgoing_conditions = BTreeSet::new();
|
||||
let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache);
|
||||
|
||||
visitor.visit_paths(&mut |path, graph, dead_ends_cache| {
|
||||
considered_paths.push(path.to_vec());
|
||||
// If the universe is empty, stop exploring the graph, since no docids will ever be found anymore.
|
||||
if universe.is_empty() {
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
// `visit_paths` performs a depth-first search, so the previously visited path
|
||||
// is likely to share a prefix with the current one.
|
||||
// We stored the previous path and the docids associated to each of its prefixes in `subpaths_docids`.
|
||||
// We take advantage of this to avoid computing the docids associated with the common prefix between
|
||||
// the old and current path.
|
||||
let idx_of_first_different_condition = {
|
||||
let mut idx = 0;
|
||||
for (&last_c, cur_c) in path.iter().zip(subpaths_docids.iter().map(|x| x.0)) {
|
||||
if last_c == cur_c {
|
||||
idx += 1;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
subpaths_docids.truncate(idx);
|
||||
idx
|
||||
};
|
||||
// Then for the remaining of the path, we continue computing docids.
|
||||
for latest_condition in path[idx_of_first_different_condition..].iter().copied() {
|
||||
let success = visit_path_condition(
|
||||
ctx,
|
||||
graph,
|
||||
&universe,
|
||||
dead_ends_cache,
|
||||
condition_docids_cache,
|
||||
&mut subpaths_docids,
|
||||
&mut nodes_with_removed_outgoing_conditions,
|
||||
latest_condition,
|
||||
)?;
|
||||
if !success {
|
||||
return Ok(ControlFlow::Continue(()));
|
||||
}
|
||||
}
|
||||
assert!(subpaths_docids.iter().map(|x| x.0).eq(path.iter().copied()));
|
||||
|
||||
let path_docids =
|
||||
subpaths_docids.pop().map(|x| x.1).unwrap_or_else(|| universe.clone());
|
||||
assert!(!path_docids.is_empty());
|
||||
|
||||
// Accumulate the path for logging purposes only
|
||||
good_paths.push(path.to_vec());
|
||||
for &condition in path {
|
||||
used_conditions.insert(condition);
|
||||
}
|
||||
bucket |= &path_docids;
|
||||
// Reduce the size of the universe so that we can more optimistically discard candidate paths
|
||||
universe -= &path_docids;
|
||||
for (_, docids) in subpaths_docids.iter_mut() {
|
||||
*docids -= &path_docids;
|
||||
}
|
||||
|
||||
if universe.is_empty() {
|
||||
Ok(ControlFlow::Break(()))
|
||||
} else {
|
||||
Ok(ControlFlow::Continue(()))
|
||||
}
|
||||
})?;
|
||||
logger.log_internal_state(graph);
|
||||
logger.log_internal_state(&good_paths);
|
||||
|
||||
// We modify the next query graph so that it only contains the subgraph
|
||||
// that was used to compute this bucket
|
||||
|
||||
let paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>> = good_paths
|
||||
.into_iter()
|
||||
.map(|path| {
|
||||
path.into_iter()
|
||||
.map(|condition| {
|
||||
let (a, b) =
|
||||
condition_docids_cache.get_subsets_used_by_condition(condition);
|
||||
(a.clone(), b.clone())
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let next_query_graph = QueryGraph::build_from_paths(paths);
|
||||
|
||||
#[allow(clippy::comparison_chain)]
|
||||
if nodes_with_removed_outgoing_conditions.len() == 1 {
|
||||
graph.update_all_costs_before_node(
|
||||
*nodes_with_removed_outgoing_conditions.first().unwrap(),
|
||||
all_costs,
|
||||
);
|
||||
} else if nodes_with_removed_outgoing_conditions.len() > 1 {
|
||||
*all_costs = graph.find_all_costs_to_end();
|
||||
}
|
||||
|
||||
self.state = Some(state);
|
||||
|
||||
Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket, score }))
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::graph_based")]
|
||||
fn end_iteration(
|
||||
&mut self,
|
||||
_ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) {
|
||||
self.state = None;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns false if the intersection between the condition
|
||||
/// docids and the previous path docids is empty.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn visit_path_condition<G: RankingRuleGraphTrait>(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
graph: &mut RankingRuleGraph<G>,
|
||||
universe: &RoaringBitmap,
|
||||
dead_ends_cache: &mut DeadEndsCache<G::Condition>,
|
||||
condition_docids_cache: &mut ConditionDocIdsCache<G>,
|
||||
subpath: &mut Vec<(Interned<G::Condition>, RoaringBitmap)>,
|
||||
nodes_with_removed_outgoing_conditions: &mut BTreeSet<Interned<QueryNode>>,
|
||||
latest_condition: Interned<G::Condition>,
|
||||
) -> Result<bool> {
|
||||
let condition_docids = &condition_docids_cache
|
||||
.get_computed_condition(ctx, latest_condition, graph, universe)?
|
||||
.docids;
|
||||
if condition_docids.is_empty() {
|
||||
// 1. Store in the cache that this edge is empty for this universe
|
||||
dead_ends_cache.forbid_condition(latest_condition);
|
||||
// 2. remove all the edges with this condition from the ranking rule graph
|
||||
let source_nodes = graph.remove_edges_with_condition(latest_condition);
|
||||
nodes_with_removed_outgoing_conditions.extend(source_nodes);
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let latest_path_docids = if let Some((_, prev_docids)) = subpath.last() {
|
||||
prev_docids & condition_docids
|
||||
} else {
|
||||
condition_docids.clone()
|
||||
};
|
||||
if !latest_path_docids.is_empty() {
|
||||
subpath.push((latest_condition, latest_path_docids));
|
||||
return Ok(true);
|
||||
}
|
||||
// If the (sub)path is empty, we try to figure out why and update the caches accordingly.
|
||||
|
||||
// First, we know that this path is empty, and thus any path
|
||||
// that is a superset of it will also be empty.
|
||||
dead_ends_cache.forbid_condition_after_prefix(subpath.iter().map(|x| x.0), latest_condition);
|
||||
|
||||
if subpath.len() <= 1 {
|
||||
return Ok(false);
|
||||
}
|
||||
let mut subprefix = vec![];
|
||||
// Deadend if the intersection between this edge and any
|
||||
// previous prefix is disjoint with the universe
|
||||
// We already know that the intersection with the last one
|
||||
// is empty,
|
||||
for (past_condition, sp_docids) in subpath[..subpath.len() - 1].iter() {
|
||||
subprefix.push(*past_condition);
|
||||
if condition_docids.is_disjoint(sp_docids) {
|
||||
dead_ends_cache
|
||||
.forbid_condition_after_prefix(subprefix.iter().copied(), latest_condition);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(false)
|
||||
}
|
||||
259
crates/milli/src/search/new/interner.rs
Normal file
259
crates/milli/src/search/new/interner.rs
Normal file
@@ -0,0 +1,259 @@
|
||||
use std::fmt;
|
||||
use std::hash::Hash;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
|
||||
use super::small_bitmap::SmallBitmap;
|
||||
|
||||
/// A typed index into an interner ([`FixedSizeInterner`], [`DedupInterner`], or [`MappedInterner`]).
pub struct Interned<T> {
    idx: u16,
    _phantom: PhantomData<T>,
}
impl<T> Interned<T> {
    /// Builds an interned value by hand from its raw index within the interner.
    pub fn from_raw(idx: u16) -> Self {
        Interned { _phantom: PhantomData, idx }
    }
    /// Returns the raw index held by the interned value.
    pub fn into_raw(self) -> u16 {
        let Interned { idx, .. } = self;
        idx
    }
}
|
||||
|
||||
/// A [`DedupInterner`] is used to store a unique copy of a value of type `T`. This value
|
||||
/// is then identified by a lightweight index of type [`Interned<T>`], which can
|
||||
/// be copied, compared, and hashed efficiently. An immutable reference to the original value
|
||||
/// can be retrieved using `self.get(interned)`. A set of values within the interner can be
|
||||
/// efficiently managed using [`SmallBitmap<T>`](super::small_bitmap::SmallBitmap).
|
||||
///
|
||||
/// A dedup-interner can contain a maximum of `u16::MAX` values.
|
||||
#[derive(Clone)]
|
||||
pub struct DedupInterner<T> {
|
||||
stable_store: Vec<T>,
|
||||
lookup: FxHashMap<T, Interned<T>>,
|
||||
}
|
||||
impl<T> Default for DedupInterner<T> {
|
||||
fn default() -> Self {
|
||||
Self { stable_store: Default::default(), lookup: Default::default() }
|
||||
}
|
||||
}
|
||||
impl<T> DedupInterner<T> {
|
||||
/// Convert the dedup-interner into a fixed-size interner, such that new
|
||||
/// elements cannot be added to it anymore.
|
||||
pub fn freeze(self) -> FixedSizeInterner<T> {
|
||||
FixedSizeInterner { stable_store: self.stable_store }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> DedupInterner<T>
|
||||
where
|
||||
T: Clone + Eq + Hash,
|
||||
{
|
||||
/// Insert the given value into the dedup-interner, and return
|
||||
/// its index.
|
||||
pub fn insert(&mut self, s: T) -> Interned<T> {
|
||||
if let Some(interned) = self.lookup.get(&s) {
|
||||
*interned
|
||||
} else {
|
||||
assert!(self.stable_store.len() < u16::MAX as usize);
|
||||
self.stable_store.push(s.clone());
|
||||
let interned = Interned::from_raw(self.stable_store.len() as u16 - 1);
|
||||
self.lookup.insert(s, interned);
|
||||
interned
|
||||
}
|
||||
}
|
||||
/// Get a reference to the interned value.
|
||||
pub fn get(&self, interned: Interned<T>) -> &T {
|
||||
&self.stable_store[interned.idx as usize]
|
||||
}
|
||||
}
|
||||
|
||||
/// A store of fixed length holding values of type `T`, each of them
/// identified by an index of type [`Interned<T>`].
#[derive(Clone)]
pub struct FixedSizeInterner<T> {
    /// The values; an [`Interned<T>`] is a position in this vector.
    stable_store: Vec<T>,
}
|
||||
impl<T: Clone> FixedSizeInterner<T> {
|
||||
/// Create a fixed-size interner of the given length containing
|
||||
/// clones of the given value.
|
||||
pub fn new(length: u16, value: T) -> Self {
|
||||
Self { stable_store: vec![value; length as usize] }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> FixedSizeInterner<T> {
|
||||
pub fn from_vec(store: Vec<T>) -> Self {
|
||||
Self { stable_store: store }
|
||||
}
|
||||
pub fn all_interned_values(&self) -> SmallBitmap<T> {
|
||||
let mut b = SmallBitmap::for_interned_values_in(self);
|
||||
for i in self.indexes() {
|
||||
b.insert(i);
|
||||
}
|
||||
b
|
||||
}
|
||||
pub fn get(&self, interned: Interned<T>) -> &T {
|
||||
&self.stable_store[interned.idx as usize]
|
||||
}
|
||||
pub fn get_mut(&mut self, interned: Interned<T>) -> &mut T {
|
||||
&mut self.stable_store[interned.idx as usize]
|
||||
}
|
||||
|
||||
pub fn len(&self) -> u16 {
|
||||
self.stable_store.len() as u16
|
||||
}
|
||||
pub fn map_move<U>(self, map_f: impl Fn(T) -> U) -> FixedSizeInterner<U> {
|
||||
FixedSizeInterner { stable_store: self.stable_store.into_iter().map(map_f).collect() }
|
||||
}
|
||||
pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<T, U> {
|
||||
MappedInterner {
|
||||
stable_store: self.stable_store.iter().map(map_f).collect(),
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
pub fn map_indexes<U>(&self, map_f: impl Fn(Interned<T>) -> U) -> MappedInterner<T, U> {
|
||||
MappedInterner { stable_store: self.indexes().map(map_f).collect(), _phantom: PhantomData }
|
||||
}
|
||||
pub fn indexes(&self) -> impl Iterator<Item = Interned<T>> {
|
||||
(0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16))
|
||||
}
|
||||
pub fn iter(&self) -> impl Iterator<Item = (Interned<T>, &T)> {
|
||||
self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
|
||||
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
}
|
||||
|
||||
/// A growable store for values of type `T`, where each value is identified
/// by an index of type [`Interned<T>`].
///
/// Unlike [`DedupInterner`], pushed values are not deduplicated. Once complete,
/// the store can be turned into a [`FixedSizeInterner`] with `freeze`.
#[derive(Clone)]
pub struct Interner<T> {
    /// The values, in insertion order; an [`Interned<T>`] is a position in this vector.
    stable_store: Vec<T>,
}
|
||||
impl<T> Default for Interner<T> {
|
||||
fn default() -> Self {
|
||||
Self { stable_store: vec![] }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Interner<T> {
|
||||
pub fn from_vec(v: Vec<T>) -> Self {
|
||||
Self { stable_store: v }
|
||||
}
|
||||
pub fn get(&self, interned: Interned<T>) -> &T {
|
||||
&self.stable_store[interned.idx as usize]
|
||||
}
|
||||
pub fn get_mut(&mut self, interned: Interned<T>) -> &mut T {
|
||||
&mut self.stable_store[interned.idx as usize]
|
||||
}
|
||||
pub fn push(&mut self, value: T) -> Interned<T> {
|
||||
assert!(self.stable_store.len() < u16::MAX as usize);
|
||||
self.stable_store.push(value);
|
||||
Interned::from_raw(self.stable_store.len() as u16 - 1)
|
||||
}
|
||||
pub fn len(&self) -> u16 {
|
||||
self.stable_store.len() as u16
|
||||
}
|
||||
pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<T, U> {
|
||||
MappedInterner {
|
||||
stable_store: self.stable_store.iter().map(map_f).collect(),
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
pub fn map_indexes<U>(&self, map_f: impl Fn(Interned<T>) -> U) -> MappedInterner<T, U> {
|
||||
MappedInterner { stable_store: self.indexes().map(map_f).collect(), _phantom: PhantomData }
|
||||
}
|
||||
pub fn indexes(&self) -> impl Iterator<Item = Interned<T>> {
|
||||
(0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16))
|
||||
}
|
||||
pub fn iter(&self) -> impl Iterator<Item = (Interned<T>, &T)> {
|
||||
self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
|
||||
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
pub fn freeze(self) -> FixedSizeInterner<T> {
|
||||
FixedSizeInterner { stable_store: self.stable_store }
|
||||
}
|
||||
}
|
||||
|
||||
/// A store of values of type `T` in which each value is attached to a value
/// of type `From` living in another interner. A mapped interner is created with
/// the `map` method of [`FixedSizeInterner`] or [`MappedInterner`].
///
/// Values in this interner are indexed with [`Interned<From>`].
#[derive(Clone)]
pub struct MappedInterner<From, T> {
    /// The values, stored at the position of the `From` value they are attached to.
    stable_store: Vec<T>,
    _phantom: PhantomData<From>,
}
|
||||
|
||||
impl<From, T> MappedInterner<From, T> {
|
||||
pub fn get(&self, interned: Interned<From>) -> &T {
|
||||
&self.stable_store[interned.idx as usize]
|
||||
}
|
||||
pub fn get_mut(&mut self, interned: Interned<From>) -> &mut T {
|
||||
&mut self.stable_store[interned.idx as usize]
|
||||
}
|
||||
pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<From, U> {
|
||||
MappedInterner {
|
||||
stable_store: self.stable_store.iter().map(map_f).collect(),
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
pub fn iter(&self) -> impl Iterator<Item = (Interned<From>, &T)> {
|
||||
self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<From>, &mut T)> {
|
||||
self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
|
||||
}
|
||||
}
|
||||
// Interned<T> boilerplate implementations
|
||||
|
||||
impl<T> Hash for Interned<T> {
|
||||
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
||||
self.idx.hash(state);
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Ord for Interned<T> {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
self.idx.cmp(&other.idx)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> PartialOrd for Interned<T> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Eq for Interned<T> {}
|
||||
|
||||
impl<T> PartialEq for Interned<T> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.idx == other.idx
|
||||
}
|
||||
}
|
||||
impl<T> Clone for Interned<T> {
|
||||
fn clone(&self) -> Self {
|
||||
*self
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> Copy for Interned<T> {}
|
||||
|
||||
impl<T> fmt::Display for Interned<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::Display::fmt(&self.idx, f)
|
||||
}
|
||||
}
|
||||
impl<T> fmt::Debug for Interned<T> {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
fmt::Debug::fmt(&self.idx, f)
|
||||
}
|
||||
}
|
||||
17
crates/milli/src/search/new/limits.rs
Normal file
17
crates/milli/src/search/new/limits.rs
Normal file
@@ -0,0 +1,17 @@
|
||||
/// Upper bound on the number of tokens considered in a single search.
pub const MAX_TOKEN_COUNT: usize = 1_000;

/// Upper bound on the number of prefixes that can be derived from a single word.
pub const MAX_PREFIX_COUNT: usize = 1_000;
/// Upper bound on the number of words at a distance of one from a single word
/// that can be derived from it.
pub const MAX_ONE_TYPO_COUNT: usize = 150;
/// Upper bound on the number of words at a distance of two from a single word
/// that can be derived from it.
pub const MAX_TWO_TYPOS_COUNT: usize = 50;

/// Upper bound on the number of synonym phrases that can be derived from a single word.
pub const MAX_SYNONYM_PHRASE_COUNT: usize = 50;

/// Upper bound on the number of words, summed over all the synonym phrases,
/// that can be derived from a single word.
///
/// This limit is meant to gracefully handle the case where a word would have very long phrases as synonyms.
pub const MAX_SYNONYM_WORD_COUNT: usize = 100;
|
||||
81
crates/milli/src/search/new/logger/mod.rs
Normal file
81
crates/milli/src/search/new/logger/mod.rs
Normal file
@@ -0,0 +1,81 @@
|
||||
// #[cfg(test)]
|
||||
pub mod visual;
|
||||
|
||||
use std::any::Any;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::ranking_rules::BoxRankingRule;
|
||||
use super::{RankingRule, RankingRuleQueryTrait};
|
||||
|
||||
/// Trait for structure logging the execution of a search query.
|
||||
pub trait SearchLogger<Q: RankingRuleQueryTrait> {
|
||||
/// Logs the initial query
|
||||
fn initial_query(&mut self, _query: &Q);
|
||||
|
||||
/// Logs the value of the initial set of all candidates
|
||||
fn initial_universe(&mut self, _universe: &RoaringBitmap);
|
||||
|
||||
/// Logs the query that was used to compute the set of all candidates
|
||||
fn query_for_initial_universe(&mut self, _query: &Q);
|
||||
|
||||
/// Logs the ranking rules used to perform the search query
|
||||
fn ranking_rules(&mut self, _rr: &[BoxRankingRule<'_, Q>]);
|
||||
|
||||
/// Logs the start of a ranking rule's iteration.
|
||||
fn start_iteration_ranking_rule(
|
||||
&mut self,
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'_, Q>,
|
||||
_query: &Q,
|
||||
_universe: &RoaringBitmap,
|
||||
) {
|
||||
}
|
||||
/// Logs the end of the computation of a ranking rule bucket
|
||||
fn next_bucket_ranking_rule(
|
||||
&mut self,
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'_, Q>,
|
||||
_universe: &RoaringBitmap,
|
||||
_candidates: &RoaringBitmap,
|
||||
) {
|
||||
}
|
||||
/// Logs the skipping of a ranking rule bucket
|
||||
fn skip_bucket_ranking_rule(
|
||||
&mut self,
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'_, Q>,
|
||||
_candidates: &RoaringBitmap,
|
||||
) {
|
||||
}
|
||||
/// Logs the end of a ranking rule's iteration.
|
||||
fn end_iteration_ranking_rule(
|
||||
&mut self,
|
||||
_ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'_, Q>,
|
||||
_universe: &RoaringBitmap,
|
||||
) {
|
||||
}
|
||||
/// Logs the addition of document ids to the final results
|
||||
fn add_to_results(&mut self, _docids: &[u32]);
|
||||
|
||||
/// Logs an internal state in the search algorithms
|
||||
fn log_internal_state(&mut self, _rr: &dyn Any);
|
||||
}
|
||||
|
||||
/// A dummy [`SearchLogger`] which does nothing.
|
||||
pub struct DefaultSearchLogger;
|
||||
|
||||
impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
|
||||
fn initial_query(&mut self, _query: &Q) {}
|
||||
|
||||
fn initial_universe(&mut self, _universe: &RoaringBitmap) {}
|
||||
|
||||
fn query_for_initial_universe(&mut self, _query: &Q) {}
|
||||
|
||||
fn ranking_rules(&mut self, _rr: &[BoxRankingRule<'_, Q>]) {}
|
||||
|
||||
fn add_to_results(&mut self, _docids: &[u32]) {}
|
||||
|
||||
fn log_internal_state(&mut self, _rr: &dyn Any) {}
|
||||
}
|
||||
554
crates/milli/src/search/new/logger/visual.rs
Normal file
554
crates/milli/src/search/new/logger/visual.rs
Normal file
@@ -0,0 +1,554 @@
|
||||
use std::any::Any;
|
||||
use std::fs::File;
|
||||
use std::io::{BufWriter, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::Instant;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::search::new::interner::Interned;
|
||||
use crate::search::new::query_graph::QueryNodeData;
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::ranking_rule_graph::{
|
||||
Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition,
|
||||
ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph,
|
||||
WordsCondition, WordsGraph,
|
||||
};
|
||||
use crate::search::new::ranking_rules::BoxRankingRule;
|
||||
use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger};
|
||||
use crate::Result;
|
||||
|
||||
pub enum SearchEvents {
|
||||
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
|
||||
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
|
||||
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
|
||||
RankingRuleEndIteration { ranking_rule_idx: usize },
|
||||
ExtendResults { new: Vec<u32> },
|
||||
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
|
||||
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
|
||||
TypoGraph { graph: RankingRuleGraph<TypoGraph> },
|
||||
TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> },
|
||||
WordsGraph { graph: RankingRuleGraph<WordsGraph> },
|
||||
WordsPaths { paths: Vec<Vec<Interned<WordsCondition>>> },
|
||||
FidGraph { graph: RankingRuleGraph<FidGraph> },
|
||||
FidPaths { paths: Vec<Vec<Interned<FidCondition>>> },
|
||||
PositionGraph { graph: RankingRuleGraph<PositionGraph> },
|
||||
PositionPaths { paths: Vec<Vec<Interned<PositionCondition>>> },
|
||||
}
|
||||
|
||||
/// The kind of ranking rule that is currently iterating, derived from the rule's id.
enum Location {
    /// The rule whose id is `"words"`.
    Words,
    /// The rule whose id is `"typo"`.
    Typo,
    /// The rule whose id is `"proximity"`.
    Proximity,
    /// The rule whose id is `"fid"`.
    Fid,
    /// The rule whose id is `"position"`.
    Position,
    /// Any rule with a different id; no internal state is recorded for it.
    Other,
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct VisualSearchLogger {
|
||||
initial_query: Option<QueryGraph>,
|
||||
initial_query_time: Option<Instant>,
|
||||
query_for_universe: Option<QueryGraph>,
|
||||
initial_universe: Option<RoaringBitmap>,
|
||||
ranking_rules_ids: Option<Vec<String>>,
|
||||
events: Vec<SearchEvents>,
|
||||
location: Vec<Location>,
|
||||
}
|
||||
|
||||
impl SearchLogger<QueryGraph> for VisualSearchLogger {
|
||||
fn initial_query(&mut self, query: &QueryGraph) {
|
||||
self.initial_query = Some(query.clone());
|
||||
self.initial_query_time = Some(Instant::now());
|
||||
}
|
||||
|
||||
fn query_for_initial_universe(&mut self, query: &QueryGraph) {
|
||||
self.query_for_universe = Some(query.clone());
|
||||
}
|
||||
|
||||
fn initial_universe(&mut self, universe: &RoaringBitmap) {
|
||||
self.initial_universe = Some(universe.clone());
|
||||
}
|
||||
fn ranking_rules(&mut self, rr: &[BoxRankingRule<'_, QueryGraph>]) {
|
||||
self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect());
|
||||
}
|
||||
|
||||
fn start_iteration_ranking_rule(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
ranking_rule: &dyn RankingRule<'_, QueryGraph>,
|
||||
_query: &QueryGraph,
|
||||
universe: &RoaringBitmap,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleStartIteration {
|
||||
ranking_rule_idx,
|
||||
universe_len: universe.len(),
|
||||
});
|
||||
self.location.push(match ranking_rule.id().as_str() {
|
||||
"words" => Location::Words,
|
||||
"typo" => Location::Typo,
|
||||
"proximity" => Location::Proximity,
|
||||
"fid" => Location::Fid,
|
||||
"position" => Location::Position,
|
||||
_ => Location::Other,
|
||||
});
|
||||
}
|
||||
|
||||
fn next_bucket_ranking_rule(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'_, QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
bucket: &RoaringBitmap,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleNextBucket {
|
||||
ranking_rule_idx,
|
||||
universe_len: universe.len(),
|
||||
bucket_len: bucket.len(),
|
||||
});
|
||||
}
|
||||
fn skip_bucket_ranking_rule(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'_, QueryGraph>,
|
||||
bucket: &RoaringBitmap,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleSkipBucket {
|
||||
ranking_rule_idx,
|
||||
bucket_len: bucket.len(),
|
||||
})
|
||||
}
|
||||
|
||||
fn end_iteration_ranking_rule(
|
||||
&mut self,
|
||||
ranking_rule_idx: usize,
|
||||
_ranking_rule: &dyn RankingRule<'_, QueryGraph>,
|
||||
_universe: &RoaringBitmap,
|
||||
) {
|
||||
self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx });
|
||||
self.location.pop();
|
||||
}
|
||||
fn add_to_results(&mut self, docids: &[u32]) {
|
||||
self.events.push(SearchEvents::ExtendResults { new: docids.to_vec() });
|
||||
}
|
||||
|
||||
/// Logs the internal state of the ranking rule
|
||||
fn log_internal_state(&mut self, state: &dyn Any) {
|
||||
let Some(location) = self.location.last() else { return };
|
||||
match location {
|
||||
Location::Words => {
|
||||
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<WordsGraph>>() {
|
||||
self.events.push(SearchEvents::WordsGraph { graph: graph.clone() });
|
||||
}
|
||||
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<WordsCondition>>>>() {
|
||||
self.events.push(SearchEvents::WordsPaths { paths: paths.clone() });
|
||||
}
|
||||
}
|
||||
Location::Typo => {
|
||||
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<TypoGraph>>() {
|
||||
self.events.push(SearchEvents::TypoGraph { graph: graph.clone() });
|
||||
}
|
||||
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<TypoCondition>>>>() {
|
||||
self.events.push(SearchEvents::TypoPaths { paths: paths.clone() });
|
||||
}
|
||||
}
|
||||
Location::Proximity => {
|
||||
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<ProximityGraph>>() {
|
||||
self.events.push(SearchEvents::ProximityGraph { graph: graph.clone() });
|
||||
}
|
||||
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<ProximityCondition>>>>()
|
||||
{
|
||||
self.events.push(SearchEvents::ProximityPaths { paths: paths.clone() });
|
||||
}
|
||||
}
|
||||
Location::Fid => {
|
||||
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<FidGraph>>() {
|
||||
self.events.push(SearchEvents::FidGraph { graph: graph.clone() });
|
||||
}
|
||||
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<FidCondition>>>>() {
|
||||
self.events.push(SearchEvents::FidPaths { paths: paths.clone() });
|
||||
}
|
||||
}
|
||||
Location::Position => {
|
||||
if let Some(graph) = state.downcast_ref::<RankingRuleGraph<PositionGraph>>() {
|
||||
self.events.push(SearchEvents::PositionGraph { graph: graph.clone() });
|
||||
}
|
||||
if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<PositionCondition>>>>() {
|
||||
self.events.push(SearchEvents::PositionPaths { paths: paths.clone() });
|
||||
}
|
||||
}
|
||||
Location::Other => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VisualSearchLogger {
|
||||
pub fn finish<'ctx>(self, ctx: &'ctx mut SearchContext<'ctx>, folder: &Path) -> Result<()> {
|
||||
let mut f = DetailedLoggerFinish::new(ctx, folder)?;
|
||||
f.finish(self)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct DetailedLoggerFinish<'ctx> {
|
||||
ctx: &'ctx mut SearchContext<'ctx>,
|
||||
/// The folder where all the files should be printed
|
||||
folder_path: PathBuf,
|
||||
/// The main file visualising the search request
|
||||
index_file: BufWriter<File>,
|
||||
/// A vector of counters where each counter at index i represents the number of times
|
||||
/// that the ranking rule at idx i-1 was called since its last call to `start_iteration`.
|
||||
/// This is used to uniquely identify a point in the sequence diagram.
|
||||
rr_action_counter: Vec<usize>,
|
||||
/// The file storing information about the internal state of the latest active ranking rule
|
||||
file_for_internal_state: Option<BufWriter<File>>,
|
||||
}
|
||||
|
||||
impl<'ctx> DetailedLoggerFinish<'ctx> {
|
||||
fn cur_file(&mut self) -> &mut BufWriter<File> {
|
||||
if let Some(file) = self.file_for_internal_state.as_mut() {
|
||||
file
|
||||
} else {
|
||||
&mut self.index_file
|
||||
}
|
||||
}
|
||||
fn pop_rr_action(&mut self) {
|
||||
self.file_for_internal_state = None;
|
||||
self.rr_action_counter.pop();
|
||||
}
|
||||
fn push_new_rr_action(&mut self) {
|
||||
self.file_for_internal_state = None;
|
||||
self.rr_action_counter.push(0);
|
||||
}
|
||||
fn increment_cur_rr_action(&mut self) {
|
||||
self.file_for_internal_state = None;
|
||||
if let Some(c) = self.rr_action_counter.last_mut() {
|
||||
*c += 1;
|
||||
}
|
||||
}
|
||||
fn id_of_timestamp(&self) -> String {
|
||||
let mut s = String::new();
|
||||
for t in self.rr_action_counter.iter() {
|
||||
s.push_str(&format!("{t}_"));
|
||||
}
|
||||
s
|
||||
}
|
||||
fn id_of_extend_results(&self) -> String {
|
||||
let mut s = String::new();
|
||||
s.push_str("results.\"");
|
||||
s.push_str(&self.id_of_timestamp());
|
||||
s.push('"');
|
||||
s
|
||||
}
|
||||
fn id_of_last_rr_action(&self) -> String {
|
||||
let mut s = String::new();
|
||||
let rr_id = if self.rr_action_counter.is_empty() {
|
||||
"start.\"".to_owned()
|
||||
} else {
|
||||
format!("{}.\"", self.rr_action_counter.len() - 1)
|
||||
};
|
||||
s.push_str(&rr_id);
|
||||
s.push_str(&self.id_of_timestamp());
|
||||
s.push('"');
|
||||
s
|
||||
}
|
||||
fn make_new_file_for_internal_state_if_needed(&mut self) -> Result<()> {
|
||||
if self.file_for_internal_state.is_some() {
|
||||
return Ok(());
|
||||
}
|
||||
let timestamp = self.id_of_timestamp();
|
||||
let id = self.id_of_last_rr_action();
|
||||
let new_file_path = self.folder_path.join(format!("{timestamp}.d2"));
|
||||
self.file_for_internal_state = Some(BufWriter::new(File::create(new_file_path)?));
|
||||
|
||||
writeln!(
|
||||
&mut self.index_file,
|
||||
"{id} {{
|
||||
link: \"{timestamp}.d2.svg\"
|
||||
}}"
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
fn new(ctx: &'ctx mut SearchContext<'ctx>, folder_path: &Path) -> Result<Self> {
|
||||
let index_path = folder_path.join("index.d2");
|
||||
let index_file = BufWriter::new(File::create(index_path)?);
|
||||
|
||||
Ok(Self {
|
||||
ctx,
|
||||
folder_path: folder_path.to_owned(),
|
||||
index_file,
|
||||
rr_action_counter: vec![],
|
||||
file_for_internal_state: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn finish(&mut self, logger: VisualSearchLogger) -> Result<()> {
|
||||
writeln!(&mut self.index_file, "direction: right")?;
|
||||
if let Some(qg) = logger.initial_query {
|
||||
writeln!(&mut self.index_file, "Initial Query Graph: {{")?;
|
||||
self.write_query_graph(&qg)?;
|
||||
writeln!(&mut self.index_file, "}}")?;
|
||||
}
|
||||
if let Some(qg) = logger.query_for_universe {
|
||||
writeln!(&mut self.index_file, "Query Graph Used To Compute Universe: {{")?;
|
||||
self.write_query_graph(&qg)?;
|
||||
writeln!(&mut self.index_file, "}}")?;
|
||||
}
|
||||
let Some(ranking_rules_ids) = logger.ranking_rules_ids else { return Ok(()) };
|
||||
writeln!(&mut self.index_file, "Control Flow Between Ranking Rules: {{")?;
|
||||
writeln!(&mut self.index_file, "shape: sequence_diagram")?;
|
||||
writeln!(&mut self.index_file, "start")?;
|
||||
for (idx, rr_id) in ranking_rules_ids.iter().enumerate() {
|
||||
writeln!(&mut self.index_file, "{idx}: {rr_id}")?;
|
||||
}
|
||||
writeln!(&mut self.index_file, "results")?;
|
||||
for event in logger.events {
|
||||
self.write_event(event)?;
|
||||
}
|
||||
writeln!(&mut self.index_file, "}}")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_event(&mut self, e: SearchEvents) -> Result<()> {
|
||||
match e {
|
||||
SearchEvents::RankingRuleStartIteration { ranking_rule_idx, universe_len } => {
|
||||
assert!(ranking_rule_idx == self.rr_action_counter.len());
|
||||
self.write_start_iteration(universe_len)?;
|
||||
}
|
||||
SearchEvents::RankingRuleNextBucket { ranking_rule_idx, universe_len, bucket_len } => {
|
||||
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
||||
self.write_next_bucket(bucket_len, universe_len)?;
|
||||
}
|
||||
SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, bucket_len } => {
|
||||
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
||||
self.write_skip_bucket(bucket_len)?;
|
||||
}
|
||||
SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => {
|
||||
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
|
||||
self.write_end_iteration()?;
|
||||
}
|
||||
SearchEvents::ExtendResults { new } => {
|
||||
self.write_extend_results(new)?;
|
||||
}
|
||||
SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?,
|
||||
SearchEvents::ProximityPaths { paths } => {
|
||||
self.write_rr_graph_paths::<ProximityGraph>(paths)?;
|
||||
}
|
||||
SearchEvents::TypoGraph { graph } => self.write_rr_graph(&graph)?,
|
||||
SearchEvents::TypoPaths { paths } => {
|
||||
self.write_rr_graph_paths::<TypoGraph>(paths)?;
|
||||
}
|
||||
SearchEvents::WordsGraph { graph } => self.write_rr_graph(&graph)?,
|
||||
SearchEvents::WordsPaths { paths } => {
|
||||
self.write_rr_graph_paths::<WordsGraph>(paths)?;
|
||||
}
|
||||
SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?,
|
||||
SearchEvents::FidPaths { paths } => {
|
||||
self.write_rr_graph_paths::<FidGraph>(paths)?;
|
||||
}
|
||||
SearchEvents::PositionGraph { graph } => self.write_rr_graph(&graph)?,
|
||||
SearchEvents::PositionPaths { paths } => {
|
||||
self.write_rr_graph_paths::<PositionGraph>(paths)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn write_query_graph(&mut self, qg: &QueryGraph) -> Result<()> {
|
||||
writeln!(self.cur_file(), "direction: right")?;
|
||||
for (node_id, node) in qg.nodes.iter() {
|
||||
if matches!(node.data, QueryNodeData::Deleted) {
|
||||
continue;
|
||||
}
|
||||
self.write_query_node(node_id, node)?;
|
||||
|
||||
for edge in node.successors.iter() {
|
||||
writeln!(self.cur_file(), "{node_id} -> {edge};\n").unwrap();
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_start_iteration(&mut self, _universe_len: u64) -> Result<()> {
|
||||
let parent_action_id = self.id_of_last_rr_action();
|
||||
self.push_new_rr_action();
|
||||
let self_action_id = self.id_of_last_rr_action();
|
||||
writeln!(&mut self.index_file, "{parent_action_id} -> {self_action_id} : start iteration")?;
|
||||
writeln!(
|
||||
&mut self.index_file,
|
||||
"{self_action_id} {{
|
||||
style {{
|
||||
fill: \"#D8A7B1\"
|
||||
}}
|
||||
}}"
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn write_next_bucket(&mut self, bucket_len: u64, universe_len: u64) -> Result<()> {
|
||||
let cur_action_id = self.id_of_last_rr_action();
|
||||
self.increment_cur_rr_action();
|
||||
let next_action_id = self.id_of_last_rr_action();
|
||||
writeln!(
|
||||
&mut self.index_file,
|
||||
"{cur_action_id} -> {next_action_id} : next bucket {bucket_len}/{universe_len}"
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn write_skip_bucket(&mut self, bucket_len: u64) -> Result<()> {
|
||||
let cur_action_id = self.id_of_last_rr_action();
|
||||
self.increment_cur_rr_action();
|
||||
let next_action_id = self.id_of_last_rr_action();
|
||||
writeln!(
|
||||
&mut self.index_file,
|
||||
"{cur_action_id} -> {next_action_id} : skip bucket ({bucket_len})"
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn write_end_iteration(&mut self) -> Result<()> {
|
||||
let cur_action_id = self.id_of_last_rr_action();
|
||||
self.pop_rr_action();
|
||||
let parent_action_id = self.id_of_last_rr_action();
|
||||
|
||||
writeln!(&mut self.index_file, "{cur_action_id} -> {parent_action_id} : end iteration",)?;
|
||||
Ok(())
|
||||
}
|
||||
fn write_extend_results(&mut self, new: Vec<u32>) -> Result<()> {
|
||||
if new.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let cur_action_id = self.id_of_last_rr_action();
|
||||
let results_id = self.id_of_extend_results();
|
||||
let docids = new.iter().collect::<Vec<_>>();
|
||||
let len = new.len();
|
||||
|
||||
writeln!(
|
||||
&mut self.index_file,
|
||||
"{cur_action_id} -> {results_id} : \"add {len}\"
|
||||
{results_id} {{
|
||||
tooltip: \"{docids:?}\"
|
||||
style {{
|
||||
fill: \"#B6E2D3\"
|
||||
}}
|
||||
}}
|
||||
"
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_query_node(&mut self, node_idx: Interned<QueryNode>, node: &QueryNode) -> Result<()> {
|
||||
let Self {
|
||||
ctx, index_file, file_for_internal_state: active_ranking_rule_state_file, ..
|
||||
} = self;
|
||||
let file = if let Some(file) = active_ranking_rule_state_file.as_mut() {
|
||||
file
|
||||
} else {
|
||||
index_file
|
||||
};
|
||||
match &node.data {
|
||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||
term_subset,
|
||||
positions: _,
|
||||
term_ids: _,
|
||||
}) => {
|
||||
writeln!(
|
||||
file,
|
||||
"{node_idx} : \"{}\" {{
|
||||
shape: class
|
||||
max_nbr_typo: {}",
|
||||
term_subset.description(ctx),
|
||||
term_subset.max_typo_cost(ctx)
|
||||
)?;
|
||||
|
||||
for w in term_subset.all_single_words_except_prefix_db(ctx)? {
|
||||
let w = ctx.word_interner.get(w.interned());
|
||||
writeln!(file, "{w}: word")?;
|
||||
}
|
||||
for p in term_subset.all_phrases(ctx)? {
|
||||
writeln!(file, "{}: phrase", p.description(ctx))?;
|
||||
}
|
||||
if let Some(w) = term_subset.use_prefix_db(ctx) {
|
||||
let w = ctx.word_interner.get(w.interned());
|
||||
writeln!(file, "{w}: prefix db")?;
|
||||
}
|
||||
|
||||
writeln!(file, "}}")?;
|
||||
}
|
||||
QueryNodeData::Deleted => panic!(),
|
||||
QueryNodeData::Start => {
|
||||
writeln!(file, "{node_idx} : START")?;
|
||||
}
|
||||
QueryNodeData::End => {
|
||||
writeln!(file, "{node_idx} : END")?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn write_rr_graph<R: RankingRuleGraphTrait>(
|
||||
&mut self,
|
||||
graph: &RankingRuleGraph<R>,
|
||||
) -> Result<()> {
|
||||
self.make_new_file_for_internal_state_if_needed()?;
|
||||
|
||||
writeln!(self.cur_file(), "direction: right")?;
|
||||
|
||||
writeln!(self.cur_file(), "Graph {{")?;
|
||||
for (node_idx, node) in graph.query_graph.nodes.iter() {
|
||||
if matches!(&node.data, QueryNodeData::Deleted) {
|
||||
continue;
|
||||
}
|
||||
self.write_query_node(node_idx, node)?;
|
||||
}
|
||||
for (_edge_id, edge) in graph.edges_store.iter() {
|
||||
let Some(edge) = edge else { continue };
|
||||
let Edge { source_node, dest_node, condition: details, cost, nodes_to_skip: _ } = edge;
|
||||
|
||||
match &details {
|
||||
None => {
|
||||
writeln!(
|
||||
self.cur_file(),
|
||||
"{source_node} -> {dest_node} : \"always cost {cost}\"",
|
||||
)?;
|
||||
}
|
||||
Some(condition) => {
|
||||
writeln!(
|
||||
self.cur_file(),
|
||||
"{source_node} -> {dest_node} : \"{condition} cost {cost}\"",
|
||||
cost = edge.cost,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
writeln!(self.cur_file(), "}}")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_rr_graph_paths<R: RankingRuleGraphTrait>(
|
||||
&mut self,
|
||||
paths: Vec<Vec<Interned<R::Condition>>>,
|
||||
) -> Result<()> {
|
||||
self.make_new_file_for_internal_state_if_needed()?;
|
||||
let file = if let Some(file) = self.file_for_internal_state.as_mut() {
|
||||
file
|
||||
} else {
|
||||
&mut self.index_file
|
||||
};
|
||||
writeln!(file, "Path {{")?;
|
||||
for (path_idx, condition_indexes) in paths.iter().enumerate() {
|
||||
writeln!(file, "{path_idx} {{")?;
|
||||
for condition in condition_indexes.iter() {
|
||||
writeln!(file, "{condition}")?;
|
||||
}
|
||||
for couple_edges in condition_indexes.windows(2) {
|
||||
let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() };
|
||||
writeln!(file, "{src_edge_idx} -> {dest_edge_idx}")?;
|
||||
}
|
||||
writeln!(file, "}}")?;
|
||||
}
|
||||
writeln!(file, "}}")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
330
crates/milli/src/search/new/matches/matching_words.rs
Normal file
330
crates/milli/src/search/new/matches/matching_words.rs
Normal file
@@ -0,0 +1,330 @@
|
||||
use std::cmp::Reverse;
|
||||
use std::fmt;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use charabia::Token;
|
||||
|
||||
use super::super::interner::Interned;
|
||||
use super::super::query_term::LocatedQueryTerm;
|
||||
use super::super::{DedupInterner, Phrase};
|
||||
use crate::SearchContext;
|
||||
|
||||
pub struct LocatedMatchingPhrase {
|
||||
pub value: Interned<Phrase>,
|
||||
pub positions: RangeInclusive<WordId>,
|
||||
}
|
||||
|
||||
pub struct LocatedMatchingWords {
|
||||
pub value: Vec<Interned<String>>,
|
||||
pub positions: RangeInclusive<WordId>,
|
||||
pub is_prefix: bool,
|
||||
pub original_char_count: usize,
|
||||
}
|
||||
|
||||
/// Structure created from a query tree
|
||||
/// referencing words that match the given query tree.
|
||||
#[derive(Default)]
|
||||
pub struct MatchingWords {
|
||||
word_interner: DedupInterner<String>,
|
||||
phrase_interner: DedupInterner<Phrase>,
|
||||
phrases: Vec<LocatedMatchingPhrase>,
|
||||
words: Vec<LocatedMatchingWords>,
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
pub fn new(ctx: SearchContext<'_>, located_terms: Vec<LocatedQueryTerm>) -> Self {
|
||||
let mut phrases = Vec::new();
|
||||
let mut words = Vec::new();
|
||||
|
||||
// Extract and centralize the different phrases and words to match stored in a QueryTerm
|
||||
// and wrap them in dedicated structures.
|
||||
for located_term in located_terms {
|
||||
let term = ctx.term_interner.get(located_term.value);
|
||||
let (matching_words, matching_phrases) = term.all_computed_derivations();
|
||||
|
||||
for matching_phrase in matching_phrases {
|
||||
phrases.push(LocatedMatchingPhrase {
|
||||
value: matching_phrase,
|
||||
positions: located_term.positions.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
words.push(LocatedMatchingWords {
|
||||
value: matching_words,
|
||||
positions: located_term.positions.clone(),
|
||||
is_prefix: term.is_prefix(),
|
||||
original_char_count: term.original_word(&ctx).chars().count(),
|
||||
});
|
||||
}
|
||||
|
||||
// Sort word to put prefixes at the bottom prioritizing the exact matches.
|
||||
words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
|
||||
|
||||
Self {
|
||||
phrases,
|
||||
words,
|
||||
word_interner: ctx.word_interner,
|
||||
phrase_interner: ctx.phrase_interner,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator over terms that match or partially match the given token.
|
||||
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
|
||||
MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
|
||||
}
|
||||
|
||||
/// Try to match the token with one of the located_words.
|
||||
fn match_unique_words<'a>(&'a self, token: &Token<'_>) -> Option<MatchType<'a>> {
|
||||
for located_words in &self.words {
|
||||
for word in &located_words.value {
|
||||
let word = self.word_interner.get(*word);
|
||||
// if the word is a prefix we match using starts_with.
|
||||
if located_words.is_prefix && token.lemma().starts_with(word) {
|
||||
let Some((char_index, c)) =
|
||||
word.char_indices().take(located_words.original_char_count).last()
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
let prefix_length = char_index + c.len_utf8();
|
||||
let char_len = token.original_lengths(prefix_length).0;
|
||||
let ids = &located_words.positions;
|
||||
return Some(MatchType::Full { char_len, ids });
|
||||
// else we exact match the token.
|
||||
} else if token.lemma() == word {
|
||||
let char_len = token.char_end - token.char_start;
|
||||
let ids = &located_words.positions;
|
||||
return Some(MatchType::Full { char_len, ids });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator over terms that match the given token,
|
||||
/// This allow to lazily evaluate matches.
|
||||
pub struct MatchesIter<'a, 'b> {
|
||||
matching_words: &'a MatchingWords,
|
||||
phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
|
||||
token: &'b Token<'b>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for MatchesIter<'a, '_> {
|
||||
type Item = MatchType<'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.phrases.next() {
|
||||
// Try to match all the phrases first.
|
||||
Some(located_phrase) => {
|
||||
let phrase = self.matching_words.phrase_interner.get(located_phrase.value);
|
||||
|
||||
// create a PartialMatch struct to make it compute the first match
|
||||
// instead of duplicating the code.
|
||||
let ids = &located_phrase.positions;
|
||||
// collect the references of words from the interner.
|
||||
let words = phrase
|
||||
.words
|
||||
.iter()
|
||||
.map(|word| {
|
||||
word.map(|word| self.matching_words.word_interner.get(word).as_str())
|
||||
})
|
||||
.collect();
|
||||
let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
|
||||
|
||||
partial.match_token(self.token).or_else(|| self.next())
|
||||
}
|
||||
// If no phrases matches, try to match uiques words.
|
||||
None => self.matching_words.match_unique_words(self.token),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Id of a matching term corresponding to a word written by the end user.
pub type WordId = u16;
|
||||
|
||||
/// A given token can partially match a query word for several reasons:
|
||||
/// - split words
|
||||
/// - multi-word synonyms
|
||||
/// In these cases we need to match consecutively several tokens to consider that the match is full.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum MatchType<'a> {
|
||||
Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
|
||||
Partial(PartialMatch<'a>),
|
||||
}
|
||||
|
||||
/// Structure helper to match several tokens in a row in order to complete a partial match.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PartialMatch<'a> {
|
||||
matching_words: Vec<Option<&'a str>>,
|
||||
ids: &'a RangeInclusive<WordId>,
|
||||
char_len: usize,
|
||||
}
|
||||
|
||||
impl<'a> PartialMatch<'a> {
|
||||
/// Returns:
|
||||
/// - None if the given token breaks the partial match
|
||||
/// - Partial if the given token matches the partial match but doesn't complete it
|
||||
/// - Full if the given token completes the partial match
|
||||
pub fn match_token(self, token: &Token<'_>) -> Option<MatchType<'a>> {
|
||||
let Self { mut matching_words, ids, .. } = self;
|
||||
|
||||
let is_matching = match matching_words.first()? {
|
||||
Some(word) => &token.lemma() == word,
|
||||
// a None value in the phrase corresponds to a stop word,
|
||||
// the walue is considered a match if the current token is categorized as a stop word.
|
||||
None => token.is_stopword(),
|
||||
};
|
||||
|
||||
let char_len = token.char_end - token.char_start;
|
||||
// if there are remaining words to match in the phrase and the current token is matching,
|
||||
// return a new Partial match allowing the highlighter to continue.
|
||||
if is_matching && matching_words.len() > 1 {
|
||||
matching_words.remove(0);
|
||||
Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
|
||||
// if there is no remaining word to match in the phrase and the current token is matching,
|
||||
// return a Full match.
|
||||
} else if is_matching {
|
||||
Some(MatchType::Full { char_len, ids })
|
||||
// if the current token doesn't match, return None to break the match sequence.
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn char_len(&self) -> usize {
|
||||
self.char_len
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for MatchingWords {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
|
||||
|
||||
let phrases: Vec<_> = phrases
|
||||
.iter()
|
||||
.map(|p| {
|
||||
(
|
||||
phrase_interner
|
||||
.get(p.value)
|
||||
.words
|
||||
.iter()
|
||||
.map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" "),
|
||||
p.positions.clone(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let words: Vec<_> = words
|
||||
.iter()
|
||||
.flat_map(|w| {
|
||||
w.value
|
||||
.iter()
|
||||
.map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.collect();
|
||||
|
||||
f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
pub(crate) mod tests {
    use std::borrow::Cow;

    use charabia::{TokenKind, TokenizerBuilder};

    use super::super::super::located_query_terms_from_tokens;
    use super::*;
    use crate::index::tests::TempIndex;
    use crate::search::new::query_term::ExtractedTokens;

    /// Creates a temporary index holding three small documents.
    pub(crate) fn temp_index_with_documents() -> TempIndex {
        let temp_index = TempIndex::new();
        temp_index
            .add_documents(documents!([
                { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
                { "id": 2, "name": "Westfália" },
                { "id": 3, "name": "Ŵôřlḑôle" },
            ]))
            .unwrap();
        temp_index
    }

    /// Builds a word token starting at offset 0 whose lemma is `lemma`.
    fn word_token(lemma: &str) -> Token<'_> {
        Token {
            kind: TokenKind::Word,
            lemma: Cow::Borrowed(lemma),
            char_end: lemma.chars().count(),
            byte_end: lemma.len(),
            ..Default::default()
        }
    }

    #[test]
    fn matching_words() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
        let mut builder = TokenizerBuilder::default();
        let tokenizer = builder.build();
        let tokens = tokenizer.tokenize("split this world");
        let ExtractedTokens { query_terms, .. } =
            located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
        let matching_words = MatchingWords::new(ctx, query_terms);

        // Exact match on the first query word.
        assert_eq!(
            matching_words.match_token(&word_token("split")).next(),
            Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
        );
        // A word that is not part of the query.
        assert_eq!(matching_words.match_token(&word_token("nyc")).next(), None);
        // Exact match on the last query word.
        assert_eq!(
            matching_words.match_token(&word_token("world")).next(),
            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
        );
        // The last query word matches as a prefix; only its 5 characters are reported.
        assert_eq!(
            matching_words.match_token(&word_token("worlded")).next(),
            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
        );
        // A query word that is not the last one does not match as a prefix.
        assert_eq!(matching_words.match_token(&word_token("thisnew")).next(), None);
    }
}
|
||||
906
crates/milli/src/search/new/matches/mod.rs
Normal file
906
crates/milli/src/search/new/matches/mod.rs
Normal file
@@ -0,0 +1,906 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use charabia::{Language, SeparatorKind, Token, Tokenizer};
|
||||
pub use matching_words::MatchingWords;
|
||||
use matching_words::{MatchType, PartialMatch, WordId};
|
||||
use serde::Serialize;
|
||||
|
||||
pub mod matching_words;
|
||||
|
||||
/// Marker inserted where the text was cut, used when no custom crop marker is configured.
const DEFAULT_CROP_MARKER: &str = "…";
/// Tag inserted before a highlighted match, used when no custom prefix is configured.
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
/// Tag inserted after a highlighted match, used when no custom suffix is configured.
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
|
||||
|
||||
/// Structure used to build a Matcher allowing to customize formating tags.
|
||||
pub struct MatcherBuilder<'m> {
|
||||
matching_words: MatchingWords,
|
||||
tokenizer: Tokenizer<'m>,
|
||||
crop_marker: Option<String>,
|
||||
highlight_prefix: Option<String>,
|
||||
highlight_suffix: Option<String>,
|
||||
}
|
||||
|
||||
impl<'m> MatcherBuilder<'m> {
|
||||
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self {
|
||||
Self {
|
||||
matching_words,
|
||||
tokenizer,
|
||||
crop_marker: None,
|
||||
highlight_prefix: None,
|
||||
highlight_suffix: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn crop_marker(&mut self, marker: String) -> &Self {
|
||||
self.crop_marker = Some(marker);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn highlight_prefix(&mut self, prefix: String) -> &Self {
|
||||
self.highlight_prefix = Some(prefix);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn highlight_suffix(&mut self, suffix: String) -> &Self {
|
||||
self.highlight_suffix = Some(suffix);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build<'t, 'lang>(
|
||||
&self,
|
||||
text: &'t str,
|
||||
locales: Option<&'lang [Language]>,
|
||||
) -> Matcher<'t, 'm, '_, 'lang> {
|
||||
let crop_marker = match &self.crop_marker {
|
||||
Some(marker) => marker.as_str(),
|
||||
None => DEFAULT_CROP_MARKER,
|
||||
};
|
||||
|
||||
let highlight_prefix = match &self.highlight_prefix {
|
||||
Some(marker) => marker.as_str(),
|
||||
None => DEFAULT_HIGHLIGHT_PREFIX,
|
||||
};
|
||||
let highlight_suffix = match &self.highlight_suffix {
|
||||
Some(marker) => marker.as_str(),
|
||||
None => DEFAULT_HIGHLIGHT_SUFFIX,
|
||||
};
|
||||
Matcher {
|
||||
text,
|
||||
matching_words: &self.matching_words,
|
||||
tokenizer: &self.tokenizer,
|
||||
crop_marker,
|
||||
highlight_prefix,
|
||||
highlight_suffix,
|
||||
matches: None,
|
||||
locales,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Formatting requested for a text: highlighting of the matches and/or cropping.
#[derive(Copy, Clone, Default, Debug)]
pub struct FormatOptions {
    /// Whether the matches must be wrapped in highlight tags.
    pub highlight: bool,
    /// Crop size, counted in words; `None` means no cropping was requested.
    pub crop: Option<usize>,
}

impl FormatOptions {
    /// Combines two sets of options: highlighting is enabled if either side enables it,
    /// and the crop size of `self` takes precedence over the one of `other`.
    pub fn merge(self, other: Self) -> Self {
        let highlight = self.highlight || other.highlight;
        let crop = self.crop.or(other.crop);
        Self { highlight, crop }
    }

    /// Returns `true` when highlighting or cropping is requested.
    pub fn should_format(&self) -> bool {
        self.crop.is_some() || self.highlight
    }
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Match {
|
||||
match_len: usize,
|
||||
// ids of the query words that matches.
|
||||
ids: Vec<WordId>,
|
||||
// position of the word in the whole text.
|
||||
word_position: usize,
|
||||
// position of the token in the whole text.
|
||||
token_position: usize,
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
|
||||
pub struct MatchBounds {
|
||||
pub start: usize,
|
||||
pub length: usize,
|
||||
}
|
||||
|
||||
/// Structure used to analyze a string, compute words that match,
|
||||
/// and format the source string, returning a highlighted and cropped sub-string.
|
||||
pub struct Matcher<'t, 'tokenizer, 'b, 'lang> {
|
||||
text: &'t str,
|
||||
matching_words: &'b MatchingWords,
|
||||
tokenizer: &'b Tokenizer<'tokenizer>,
|
||||
locales: Option<&'lang [Language]>,
|
||||
crop_marker: &'b str,
|
||||
highlight_prefix: &'b str,
|
||||
highlight_suffix: &'b str,
|
||||
matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
|
||||
}
|
||||
|
||||
impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
||||
/// Iterates over tokens and save any of them that matches the query.
|
||||
fn compute_matches(&mut self) -> &mut Self {
|
||||
/// some words are counted as matches only if they are close together and in the good order,
|
||||
/// compute_partial_match peek into next words to validate if the match is complete.
|
||||
fn compute_partial_match<'a>(
|
||||
mut partial: PartialMatch<'a>,
|
||||
token_position: usize,
|
||||
word_position: usize,
|
||||
words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
|
||||
matches: &mut Vec<Match>,
|
||||
) -> bool {
|
||||
let mut potential_matches = vec![(token_position, word_position, partial.char_len())];
|
||||
|
||||
for (token_position, word_position, word) in words_positions {
|
||||
partial = match partial.match_token(word) {
|
||||
// token matches the partial match, but the match is not full,
|
||||
// we temporarily save the current token then we try to match the next one.
|
||||
Some(MatchType::Partial(partial)) => {
|
||||
potential_matches.push((token_position, word_position, partial.char_len()));
|
||||
partial
|
||||
}
|
||||
// partial match is now full, we keep this matches and we advance positions
|
||||
Some(MatchType::Full { char_len, ids }) => {
|
||||
let ids: Vec<_> = ids.clone().collect();
|
||||
// save previously matched tokens as matches.
|
||||
let iter = potential_matches.into_iter().map(
|
||||
|(token_position, word_position, match_len)| Match {
|
||||
match_len,
|
||||
ids: ids.clone(),
|
||||
word_position,
|
||||
token_position,
|
||||
},
|
||||
);
|
||||
matches.extend(iter);
|
||||
|
||||
// save the token that closes the partial match as a match.
|
||||
matches.push(Match {
|
||||
match_len: char_len,
|
||||
ids,
|
||||
word_position,
|
||||
token_position,
|
||||
});
|
||||
|
||||
// the match is complete, we return true.
|
||||
return true;
|
||||
}
|
||||
// no match, continue to next match.
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
|
||||
// the match is not complete, we return false.
|
||||
false
|
||||
}
|
||||
|
||||
let tokens: Vec<_> =
|
||||
self.tokenizer.tokenize_with_allow_list(self.text, self.locales).collect();
|
||||
let mut matches = Vec::new();
|
||||
|
||||
let mut words_positions = tokens
|
||||
.iter()
|
||||
.scan((0, 0), |(token_position, word_position), token| {
|
||||
let current_token_position = *token_position;
|
||||
let current_word_position = *word_position;
|
||||
*token_position += 1;
|
||||
if !token.is_separator() {
|
||||
*word_position += 1;
|
||||
}
|
||||
|
||||
Some((current_token_position, current_word_position, token))
|
||||
})
|
||||
.filter(|(_, _, token)| !token.is_separator());
|
||||
|
||||
while let Some((token_position, word_position, word)) = words_positions.next() {
|
||||
for match_type in self.matching_words.match_token(word) {
|
||||
match match_type {
|
||||
// we match, we save the current token as a match,
|
||||
// then we continue the rest of the tokens.
|
||||
MatchType::Full { char_len, ids } => {
|
||||
let ids: Vec<_> = ids.clone().collect();
|
||||
matches.push(Match {
|
||||
match_len: char_len,
|
||||
ids,
|
||||
word_position,
|
||||
token_position,
|
||||
});
|
||||
break;
|
||||
}
|
||||
// we match partially, iterate over next tokens to check if we can complete the match.
|
||||
MatchType::Partial(partial) => {
|
||||
// if match is completed, we break the matching loop over the current token,
|
||||
// then we continue the rest of the tokens.
|
||||
let mut wp = words_positions.clone();
|
||||
if compute_partial_match(
|
||||
partial,
|
||||
token_position,
|
||||
word_position,
|
||||
&mut wp,
|
||||
&mut matches,
|
||||
) {
|
||||
words_positions = wp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.matches = Some((tokens, matches));
|
||||
self
|
||||
}
|
||||
|
||||
/// Returns boundaries of the words that match the query.
|
||||
pub fn matches(&mut self) -> Vec<MatchBounds> {
|
||||
match &self.matches {
|
||||
None => self.compute_matches().matches(),
|
||||
Some((tokens, matches)) => matches
|
||||
.iter()
|
||||
.map(|m| MatchBounds {
|
||||
start: tokens[m.token_position].byte_start,
|
||||
length: m.match_len,
|
||||
})
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the bounds in byte index of the crop window.
|
||||
fn crop_bounds(
|
||||
&self,
|
||||
tokens: &[Token<'_>],
|
||||
matches: &[Match],
|
||||
crop_size: usize,
|
||||
) -> (usize, usize) {
|
||||
// if there is no match, we start from the beginning of the string by default.
|
||||
let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
|
||||
let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
|
||||
let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
|
||||
let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
|
||||
|
||||
// matches needs to be counted in the crop len.
|
||||
let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
|
||||
|
||||
// create the initial state of the crop window: 2 iterators starting from the matches positions,
|
||||
// a reverse iterator starting from the first match token position and going towards the beginning of the text,
|
||||
let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
|
||||
// an iterator starting from the last match token position and going towards the end of the text.
|
||||
let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
|
||||
|
||||
// grows the crop window peeking in both directions
|
||||
// until the window contains the good number of words:
|
||||
while remaining_words > 0 {
|
||||
let before_token = before_tokens.peek().map(|t| t.separator_kind());
|
||||
let after_token = after_tokens.peek().map(|t| t.separator_kind());
|
||||
|
||||
match (before_token, after_token) {
|
||||
// we can expand both sides.
|
||||
(Some(before_token), Some(after_token)) => {
|
||||
match (before_token, after_token) {
|
||||
// if they are both separators and are the same kind then advance both,
|
||||
// or expand in the soft separator separator side.
|
||||
(Some(before_token_kind), Some(after_token_kind)) => {
|
||||
if before_token_kind == after_token_kind {
|
||||
before_tokens.next();
|
||||
|
||||
// this avoid having an ending separator before crop marker.
|
||||
if remaining_words > 1 {
|
||||
after_tokens.next();
|
||||
}
|
||||
} else if before_token_kind == SeparatorKind::Hard {
|
||||
after_tokens.next();
|
||||
} else {
|
||||
before_tokens.next();
|
||||
}
|
||||
}
|
||||
// if one of the tokens is a word, we expend in the side of the word.
|
||||
// left is a word, advance left.
|
||||
(None, Some(_)) => {
|
||||
before_tokens.next();
|
||||
remaining_words -= 1;
|
||||
}
|
||||
// right is a word, advance right.
|
||||
(Some(_), None) => {
|
||||
after_tokens.next();
|
||||
remaining_words -= 1;
|
||||
}
|
||||
// both are words, advance left then right if remaining_word > 0.
|
||||
(None, None) => {
|
||||
before_tokens.next();
|
||||
remaining_words -= 1;
|
||||
|
||||
if remaining_words > 0 {
|
||||
after_tokens.next();
|
||||
remaining_words -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// the end of the text is reached, advance left.
|
||||
(Some(before_token), None) => {
|
||||
before_tokens.next();
|
||||
if before_token.is_none() {
|
||||
remaining_words -= 1;
|
||||
}
|
||||
}
|
||||
// the start of the text is reached, advance right.
|
||||
(None, Some(after_token)) => {
|
||||
after_tokens.next();
|
||||
if after_token.is_none() {
|
||||
remaining_words -= 1;
|
||||
}
|
||||
}
|
||||
// no more token to add.
|
||||
(None, None) => break,
|
||||
}
|
||||
}
|
||||
|
||||
// finally, keep the byte index of each bound of the crop window.
|
||||
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
|
||||
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
|
||||
|
||||
(crop_byte_start, crop_byte_end)
|
||||
}
|
||||
|
||||
/// Compute the score of a match interval:
|
||||
/// 1) count unique matches
|
||||
/// 2) calculate distance between matches
|
||||
/// 3) count ordered matches
|
||||
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
|
||||
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
|
||||
let mut order_score = 0;
|
||||
let mut distance_score = 0;
|
||||
|
||||
let mut iter = matches.iter().peekable();
|
||||
while let Some(m) = iter.next() {
|
||||
if let Some(next_match) = iter.peek() {
|
||||
// if matches are ordered
|
||||
if next_match.ids.iter().min() > m.ids.iter().min() {
|
||||
order_score += 1;
|
||||
}
|
||||
|
||||
// compute distance between matches
|
||||
distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
|
||||
}
|
||||
|
||||
ids.extend(m.ids.iter());
|
||||
}
|
||||
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
let uniq_score = ids.len() as i16;
|
||||
|
||||
// rank by unique match count, then by distance between matches, then by ordered match count.
|
||||
(uniq_score, distance_score, order_score)
|
||||
}
|
||||
|
||||
/// Returns the matches interval where the score computed by match_interval_score is the best.
|
||||
fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
|
||||
// we compute the matches interval if we have at least 2 matches.
|
||||
if matches.len() > 1 {
|
||||
// positions of the first and the last match of the best matches interval in `matches`.
|
||||
let mut best_interval = (0, 0);
|
||||
let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
|
||||
// current interval positions.
|
||||
let mut interval_first = 0;
|
||||
let mut interval_last = 0;
|
||||
for (index, next_match) in matches.iter().enumerate().skip(1) {
|
||||
// if next match would make interval gross more than crop_size,
|
||||
// we compare the current interval with the best one,
|
||||
// then we increase `interval_first` until next match can be added.
|
||||
if next_match.word_position - matches[interval_first].word_position >= crop_size {
|
||||
let interval_score =
|
||||
self.match_interval_score(&matches[interval_first..=interval_last]);
|
||||
|
||||
// keep interval if it's the best
|
||||
if interval_score > best_interval_score {
|
||||
best_interval = (interval_first, interval_last);
|
||||
best_interval_score = interval_score;
|
||||
}
|
||||
|
||||
// advance start of the interval while interval is longer than crop_size.
|
||||
while next_match.word_position - matches[interval_first].word_position
|
||||
>= crop_size
|
||||
{
|
||||
interval_first += 1;
|
||||
}
|
||||
}
|
||||
interval_last = index;
|
||||
}
|
||||
|
||||
// compute the last interval score and compare it to the best one.
|
||||
let interval_score =
|
||||
self.match_interval_score(&matches[interval_first..=interval_last]);
|
||||
if interval_score > best_interval_score {
|
||||
best_interval = (interval_first, interval_last);
|
||||
}
|
||||
|
||||
&matches[best_interval.0..=best_interval.1]
|
||||
} else {
|
||||
matches
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the formatted version of the original text.
|
||||
pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> {
|
||||
if !format_options.highlight && format_options.crop.is_none() {
|
||||
// compute matches is not needed if no highlight nor crop is requested.
|
||||
Cow::Borrowed(self.text)
|
||||
} else {
|
||||
match &self.matches {
|
||||
Some((tokens, matches)) => {
|
||||
// If the text has to be cropped,
|
||||
// crop around the best interval.
|
||||
let (byte_start, byte_end) = match format_options.crop {
|
||||
Some(crop_size) if crop_size > 0 => {
|
||||
let matches = self.find_best_match_interval(matches, crop_size);
|
||||
self.crop_bounds(tokens, matches, crop_size)
|
||||
}
|
||||
_ => (0, self.text.len()),
|
||||
};
|
||||
|
||||
let mut formatted = Vec::new();
|
||||
|
||||
// push crop marker if it's not the start of the text.
|
||||
if byte_start > 0 && !self.crop_marker.is_empty() {
|
||||
formatted.push(self.crop_marker);
|
||||
}
|
||||
|
||||
let mut byte_index = byte_start;
|
||||
|
||||
if format_options.highlight {
|
||||
// insert highlight markers around matches.
|
||||
for m in matches {
|
||||
let token = &tokens[m.token_position];
|
||||
|
||||
// skip matches out of the crop window.
|
||||
if token.byte_start < byte_start || token.byte_end > byte_end {
|
||||
continue;
|
||||
}
|
||||
|
||||
if byte_index < token.byte_start {
|
||||
formatted.push(&self.text[byte_index..token.byte_start]);
|
||||
}
|
||||
|
||||
let highlight_byte_index = self.text[token.byte_start..]
|
||||
.char_indices()
|
||||
.enumerate()
|
||||
.find(|(i, _)| *i == m.match_len)
|
||||
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
|
||||
formatted.push(self.highlight_prefix);
|
||||
formatted.push(&self.text[token.byte_start..highlight_byte_index]);
|
||||
formatted.push(self.highlight_suffix);
|
||||
// if it's a prefix highlight, we put the end of the word after the highlight marker.
|
||||
if highlight_byte_index < token.byte_end {
|
||||
formatted.push(&self.text[highlight_byte_index..token.byte_end]);
|
||||
}
|
||||
|
||||
byte_index = token.byte_end;
|
||||
}
|
||||
}
|
||||
|
||||
// push the rest of the text between last match and the end of crop.
|
||||
if byte_index < byte_end {
|
||||
formatted.push(&self.text[byte_index..byte_end]);
|
||||
}
|
||||
|
||||
// push crop marker if it's not the end of the text.
|
||||
if byte_end < self.text.len() && !self.crop_marker.is_empty() {
|
||||
formatted.push(self.crop_marker);
|
||||
}
|
||||
|
||||
if formatted.len() == 1 {
|
||||
// avoid concatenating if there is already 1 slice.
|
||||
Cow::Borrowed(&self.text[byte_start..byte_end])
|
||||
} else {
|
||||
Cow::Owned(formatted.concat())
|
||||
}
|
||||
}
|
||||
None => self.compute_matches().format(format_options),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use charabia::TokenizerBuilder;
|
||||
use matching_words::tests::temp_index_with_documents;
|
||||
|
||||
use super::*;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::{execute_search, filtered_universe, SearchContext, TimeBudget};
|
||||
|
||||
impl<'a> MatcherBuilder<'a> {
|
||||
fn new_test(rtxn: &'a heed::RoTxn<'a>, index: &'a TempIndex, query: &str) -> Self {
|
||||
let mut ctx = SearchContext::new(index, rtxn).unwrap();
|
||||
let universe = filtered_universe(ctx.index, ctx.txn, &None).unwrap();
|
||||
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
|
||||
&mut ctx,
|
||||
Some(query),
|
||||
crate::TermsMatchingStrategy::default(),
|
||||
crate::score_details::ScoringStrategy::Skip,
|
||||
false,
|
||||
universe,
|
||||
&None,
|
||||
&None,
|
||||
crate::search::new::GeoSortStrategy::default(),
|
||||
0,
|
||||
100,
|
||||
Some(10),
|
||||
&mut crate::DefaultSearchLogger,
|
||||
&mut crate::DefaultSearchLogger,
|
||||
TimeBudget::max(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// consume context and located_query_terms to build MatchingWords.
|
||||
let matching_words = match located_query_terms {
|
||||
Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
|
||||
None => MatchingWords::default(),
|
||||
};
|
||||
|
||||
MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn format_identity() {
    let temp_index = temp_index_with_documents();
    let rtxn = temp_index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");

    let options = FormatOptions { highlight: false, crop: None };

    let texts = [
        // Text without any match.
        "A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
        // Text containing all matches.
        "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
        // Text containing some matches.
        "Natalie risk her future to build a world with the boy she loves.",
    ];

    // without crop nor highlight, the complete text must be returned untouched.
    for text in texts.iter().copied() {
        let mut matcher = builder.build(text, None);
        assert_eq!(&matcher.format(options), &text);
    }
}
|
||||
|
||||
#[test]
fn format_highlight() {
    let temp_index = temp_index_with_documents();
    let rtxn = temp_index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");

    let options = FormatOptions { highlight: true, crop: None };

    // empty text.
    assert_eq!(&builder.build("", None).format(options), "");

    // text containing only separators.
    assert_eq!(&builder.build(":-)", None).format(options), ":-)");

    // Text without any match: the complete text is returned, because there is nothing to highlight.
    let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
    assert_eq!(&builder.build(text, None).format(options), &text);

    // Text containing all matches: the complete text is returned with highlighted matches.
    let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
    );

    // Text containing some matches: the complete text is returned with highlighted matches.
    let text = "Natalie risk her future to build a world with the boy she loves.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
    );
}
|
||||
|
||||
#[test]
fn highlight_unicode() {
    let temp_index = temp_index_with_documents();
    let rtxn = temp_index.read_txn().unwrap();
    let options = FormatOptions { highlight: true, crop: None };

    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "world");

    // Text containing prefix match: only the matching prefix is highlighted.
    insta::assert_snapshot!(
        builder.build("Ŵôřlḑôle", None).format(options),
        @"<em>Ŵôřlḑ</em>ôle"
    );

    // Text containing unicode match: the whole word is highlighted.
    insta::assert_snapshot!(
        builder.build("Ŵôřlḑ", None).format(options),
        @"<em>Ŵôřlḑ</em>"
    );

    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "westfali");

    // Text containing unicode match: only the matching prefix is highlighted.
    insta::assert_snapshot!(
        builder.build("Westfália", None).format(options),
        @"<em>Westfáli</em>a"
    );
}
|
||||
|
||||
#[test]
fn format_crop() {
    let temp_index = temp_index_with_documents();
    let rtxn = temp_index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");

    let options = FormatOptions { highlight: false, crop: Some(10) };

    // empty text.
    insta::assert_snapshot!(
        builder.build("", None).format(options),
        @""
    );

    // text containing only separators.
    insta::assert_snapshot!(
        builder.build(":-)", None).format(options),
        @":-)"
    );

    // Text without any match:
    // the 10 first words are returned with a marker at the end.
    let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"A quick brown fox can not jump 32 feet, right…"
    );

    // Text without any match starting by a separator:
    // the 10 first words are returned with a marker at the end.
    let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"(A quick brown fox can not jump 32 feet, right…"
    );

    // Test phrase propagation:
    // the crop starts on the phrase instead of being centered around the match.
    let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…Split The World is a book written by Emily Henry…"
    );

    // Text containing some matches:
    // 10 words around the matches are returned, with a marker on both sides.
    let text = "Natalie risk her future to build a world with the boy she loves.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…future to build a world with the boy she loves…"
    );

    // Text containing all matches:
    // the 10 last words are returned with a marker at the start.
    let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…she loves. Emily Henry: The Love That Split The World."
    );

    // Text containing a match unordered and a match ordered:
    // the 10 last words are returned with a marker at the start.
    let text = "The world split void void void void void void void void void split the world void void";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…void void void void void split the world void void"
    );

    // Text containing matches with different density:
    // the 10 last words are returned with a marker at the start.
    let text = "split void the void void world void void void void void void void void void void split the world void void";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…void void void void void split the world void void"
    );

    // Text containing matches with same word:
    // the 10 last words are returned with a marker at the start.
    let text = "split split split split split split void void void void void void void void void void split the world void void";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…void void void void void split the world void void"
    );
}
|
||||
|
||||
#[test]
fn format_highlight_crop() {
    let temp_index = temp_index_with_documents();
    let rtxn = temp_index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");

    let options = FormatOptions { highlight: true, crop: Some(10) };

    // empty text.
    insta::assert_snapshot!(
        builder.build("", None).format(options),
        @""
    );

    // text containing only separators.
    insta::assert_snapshot!(
        builder.build(":-)", None).format(options),
        @":-)"
    );

    // Text without any match:
    // the 10 first words are returned with a marker at the end.
    let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"A quick brown fox can not jump 32 feet, right…"
    );

    // Text containing some matches:
    // 10 words around the highlighted matches are returned, with a marker on both sides.
    let text = "Natalie risk her future to build a world with the boy she loves.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…future to build a <em>world</em> with <em>the</em> boy she loves…"
    );

    // Text containing all matches:
    // the 10 last words are returned with a marker at the start and highlighted matches.
    let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
    );

    // Text containing a match unordered and a match ordered:
    // the 10 last words are returned with a marker at the start.
    let text = "The world split void void void void void void void void void split the world void void";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
    );
}
|
||||
|
||||
#[test]
fn format_highlight_crop_phrase_query() {
    //! testing: https://github.com/meilisearch/meilisearch/issues/3975
    let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
    let options = FormatOptions { highlight: true, crop: Some(10) };

    let temp_index = TempIndex::new();
    temp_index
        .add_documents(documents!([
            { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
        ]))
        .unwrap();
    let rtxn = temp_index.read_txn().unwrap();

    // a phrase query: 10 words are returned with a marker at the start as well as the end,
    // and both words of the phrase are highlighted.
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…had the power to split <em>the</em> <em>world</em> between those who…"
    );

    // a word followed by a phrase: "those" and the phrase "and those" are highlighted.
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
    );
}
|
||||
|
||||
#[test]
fn smaller_crop_size() {
    // Covers https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
    let index = temp_index_with_documents();
    let txn = index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&txn, &index, "split the world");
    let text = "void void split the world void void.";

    // Crop size 2 is smaller than the 3-word query: only part of the match is kept.
    {
        let format_options = FormatOptions { highlight: false, crop: Some(2) };
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…split the…"
        );
    }

    // Crop size 1: still smaller than the query, a single word is kept.
    {
        let format_options = FormatOptions { highlight: false, crop: Some(1) };
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…split…"
        );
    }

    // Crop size 0: cropping is ignored and the whole text is returned.
    {
        let format_options = FormatOptions { highlight: false, crop: Some(0) };
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"void void split the world void void."
        );
    }
}
|
||||
|
||||
#[test]
fn partial_matches() {
    let index = temp_index_with_documents();
    let txn = index.read_txn().unwrap();

    // The query mixes plain words with phrases made of pieces of those same words.
    let query = "the \"t he\" door \"do or\"";
    let mut builder = MatcherBuilder::new_test(&txn, &index, query);
    // Use `_` as both highlight markers so the snapshot stays easy to read.
    builder.highlight_prefix("_".to_string());
    builder.highlight_suffix("_".to_string());

    // Highlight only, no cropping.
    let format_options = FormatOptions { highlight: true, crop: None };

    let text = "the do or die can't be he do and or isn't he";
    let mut matcher = builder.build(text, None);
    insta::assert_snapshot!(
        matcher.format(format_options),
        @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
    );
}
|
||||
}
|
||||
883
crates/milli/src/search/new/mod.rs
Normal file
883
crates/milli/src/search/new/mod.rs
Normal file
@@ -0,0 +1,883 @@
|
||||
mod bucket_sort;
|
||||
mod db_cache;
|
||||
mod distinct;
|
||||
mod geo_sort;
|
||||
mod graph_based_ranking_rule;
|
||||
mod interner;
|
||||
mod limits;
|
||||
mod logger;
|
||||
pub mod matches;
|
||||
mod query_graph;
|
||||
mod query_term;
|
||||
mod ranking_rule_graph;
|
||||
mod ranking_rules;
|
||||
mod resolve_query_graph;
|
||||
mod small_bitmap;
|
||||
|
||||
mod exact_attribute;
|
||||
mod sort;
|
||||
mod vector_sort;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
use bucket_sort::{bucket_sort, BucketSortOutput};
|
||||
use charabia::{Language, TokenizerBuilder};
|
||||
use db_cache::DatabaseCache;
|
||||
use exact_attribute::ExactAttribute;
|
||||
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
|
||||
use heed::RoTxn;
|
||||
use interner::{DedupInterner, Interner};
|
||||
pub use logger::visual::VisualSearchLogger;
|
||||
pub use logger::{DefaultSearchLogger, SearchLogger};
|
||||
use query_graph::{QueryGraph, QueryNode};
|
||||
use query_term::{
|
||||
located_query_terms_from_tokens, ExtractedTokens, LocatedQueryTerm, Phrase, QueryTerm,
|
||||
};
|
||||
use ranking_rules::{
|
||||
BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait,
|
||||
};
|
||||
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
|
||||
use roaring::RoaringBitmap;
|
||||
use sort::Sort;
|
||||
|
||||
use self::distinct::facet_string_values;
|
||||
use self::geo_sort::GeoSort;
|
||||
pub use self::geo_sort::Strategy as GeoSortStrategy;
|
||||
use self::graph_based_ranking_rule::Words;
|
||||
use self::interner::Interned;
|
||||
use self::vector_sort::VectorSort;
|
||||
use crate::localized_attributes_rules::LocalizedFieldIds;
|
||||
use crate::score_details::{ScoreDetails, ScoringStrategy};
|
||||
use crate::search::new::distinct::apply_distinct_rule;
|
||||
use crate::vector::Embedder;
|
||||
use crate::{
|
||||
AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget,
|
||||
UserError, Weight,
|
||||
};
|
||||
|
||||
/// A structure used throughout the execution of a search query.
|
||||
pub struct SearchContext<'ctx> {
|
||||
pub index: &'ctx Index,
|
||||
pub txn: &'ctx RoTxn<'ctx>,
|
||||
pub db_cache: DatabaseCache<'ctx>,
|
||||
pub word_interner: DedupInterner<String>,
|
||||
pub phrase_interner: DedupInterner<Phrase>,
|
||||
pub term_interner: Interner<QueryTerm>,
|
||||
pub phrase_docids: PhraseDocIdsCache,
|
||||
pub restricted_fids: Option<RestrictedFids>,
|
||||
}
|
||||
|
||||
impl<'ctx> SearchContext<'ctx> {
|
||||
pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Result<Self> {
|
||||
let searchable_fids = index.searchable_fields_and_weights(txn)?;
|
||||
let exact_attributes_ids = index.exact_attributes_ids(txn)?;
|
||||
|
||||
let mut exact = Vec::new();
|
||||
let mut tolerant = Vec::new();
|
||||
for (_name, fid, weight) in searchable_fids {
|
||||
if exact_attributes_ids.contains(&fid) {
|
||||
exact.push((fid, weight));
|
||||
} else {
|
||||
tolerant.push((fid, weight));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Self {
|
||||
index,
|
||||
txn,
|
||||
db_cache: <_>::default(),
|
||||
word_interner: <_>::default(),
|
||||
phrase_interner: <_>::default(),
|
||||
term_interner: <_>::default(),
|
||||
phrase_docids: <_>::default(),
|
||||
restricted_fids: None,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn attributes_to_search_on(
|
||||
&mut self,
|
||||
attributes_to_search_on: &'ctx [String],
|
||||
) -> Result<()> {
|
||||
let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?;
|
||||
let searchable_fields_weights = self.index.searchable_fields_and_weights(self.txn)?;
|
||||
let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?;
|
||||
|
||||
let mut wildcard = false;
|
||||
|
||||
let mut restricted_fids = RestrictedFids::default();
|
||||
for field_name in attributes_to_search_on {
|
||||
if field_name == "*" {
|
||||
wildcard = true;
|
||||
// we cannot early exit as we want to returns error in case of unknown fields
|
||||
continue;
|
||||
}
|
||||
let searchable_weight =
|
||||
searchable_fields_weights.iter().find(|(name, _, _)| name == field_name);
|
||||
let (fid, weight) = match searchable_weight {
|
||||
// The Field id exist and the field is searchable
|
||||
Some((_name, fid, weight)) => (*fid, *weight),
|
||||
// The field is not searchable but the user didn't define any searchable attributes
|
||||
None if user_defined_searchable.is_none() => continue,
|
||||
// The field is not searchable => User error
|
||||
None => {
|
||||
let (valid_fields, hidden_fields) = self.index.remove_hidden_fields(
|
||||
self.txn,
|
||||
searchable_fields_weights.iter().map(|(name, _, _)| name),
|
||||
)?;
|
||||
|
||||
let field = field_name.to_string();
|
||||
return Err(UserError::InvalidSearchableAttribute {
|
||||
field,
|
||||
valid_fields,
|
||||
hidden_fields,
|
||||
}
|
||||
.into());
|
||||
}
|
||||
};
|
||||
|
||||
if exact_attributes_ids.contains(&fid) {
|
||||
restricted_fids.exact.push((fid, weight));
|
||||
} else {
|
||||
restricted_fids.tolerant.push((fid, weight));
|
||||
};
|
||||
}
|
||||
|
||||
if wildcard {
|
||||
self.restricted_fids = None;
|
||||
} else {
|
||||
self.restricted_fids = Some(restricted_fids);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, PartialEq, PartialOrd, Ord, Eq)]
|
||||
pub enum Word {
|
||||
Original(Interned<String>),
|
||||
Derived(Interned<String>),
|
||||
}
|
||||
|
||||
impl Word {
|
||||
pub fn interned(&self) -> Interned<String> {
|
||||
match self {
|
||||
Word::Original(word) => *word,
|
||||
Word::Derived(word) => *word,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct RestrictedFids {
|
||||
pub tolerant: Vec<(FieldId, Weight)>,
|
||||
pub exact: Vec<(FieldId, Weight)>,
|
||||
}
|
||||
|
||||
impl RestrictedFids {
|
||||
pub fn contains(&self, fid: &FieldId) -> bool {
|
||||
self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid)
|
||||
}
|
||||
}
|
||||
|
||||
/// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it.
|
||||
fn resolve_maximally_reduced_query_graph(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
universe: &RoaringBitmap,
|
||||
query_graph: &QueryGraph,
|
||||
matching_strategy: TermsMatchingStrategy,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut graph = query_graph.clone();
|
||||
|
||||
let nodes_to_remove = match matching_strategy {
|
||||
TermsMatchingStrategy::Last => query_graph
|
||||
.removal_order_for_terms_matching_strategy_last(ctx)
|
||||
.iter()
|
||||
.flat_map(|x| x.iter())
|
||||
.collect(),
|
||||
TermsMatchingStrategy::Frequency => query_graph
|
||||
.removal_order_for_terms_matching_strategy_frequency(ctx)?
|
||||
.iter()
|
||||
.flat_map(|x| x.iter())
|
||||
.collect(),
|
||||
TermsMatchingStrategy::All => vec![],
|
||||
};
|
||||
graph.remove_nodes_keep_edges(&nodes_to_remove);
|
||||
|
||||
logger.query_for_initial_universe(&graph);
|
||||
let docids = compute_query_graph_docids(ctx, &graph, universe)?;
|
||||
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::universe")]
|
||||
fn resolve_universe(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
initial_universe: &RoaringBitmap,
|
||||
query_graph: &QueryGraph,
|
||||
matching_strategy: TermsMatchingStrategy,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) -> Result<RoaringBitmap> {
|
||||
resolve_maximally_reduced_query_graph(
|
||||
ctx,
|
||||
initial_universe,
|
||||
query_graph,
|
||||
matching_strategy,
|
||||
logger,
|
||||
)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
|
||||
fn resolve_negative_words(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
negative_words: &[Word],
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut negative_bitmap = RoaringBitmap::new();
|
||||
for &word in negative_words {
|
||||
if let Some(bitmap) = ctx.word_docids(universe, word)? {
|
||||
negative_bitmap |= bitmap;
|
||||
}
|
||||
}
|
||||
Ok(negative_bitmap)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
|
||||
fn resolve_negative_phrases(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
negative_phrases: &[LocatedQueryTerm],
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut negative_bitmap = RoaringBitmap::new();
|
||||
for term in negative_phrases {
|
||||
let query_term = ctx.term_interner.get(term.value);
|
||||
if let Some(phrase) = query_term.original_phrase() {
|
||||
negative_bitmap |= ctx.get_phrase_docids(phrase)?;
|
||||
}
|
||||
}
|
||||
Ok(negative_bitmap)
|
||||
}
|
||||
|
||||
/// Return the list of initialised ranking rules to be used for a placeholder search.
|
||||
fn get_ranking_rules_for_placeholder_search<'ctx>(
|
||||
ctx: &SearchContext<'ctx>,
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
|
||||
let mut sort = false;
|
||||
let mut sorted_fields = HashSet::new();
|
||||
let mut geo_sorted = false;
|
||||
let mut ranking_rules: Vec<BoxRankingRule<'ctx, PlaceholderQuery>> = vec![];
|
||||
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
|
||||
for rr in settings_ranking_rules {
|
||||
match rr {
|
||||
// These rules need a query to have an effect; ignore them in placeholder search
|
||||
crate::Criterion::Words
|
||||
| crate::Criterion::Typo
|
||||
| crate::Criterion::Attribute
|
||||
| crate::Criterion::Proximity
|
||||
| crate::Criterion::Exactness => continue,
|
||||
crate::Criterion::Sort => {
|
||||
if sort {
|
||||
continue;
|
||||
}
|
||||
resolve_sort_criteria(
|
||||
sort_criteria,
|
||||
ctx,
|
||||
&mut ranking_rules,
|
||||
&mut sorted_fields,
|
||||
&mut geo_sorted,
|
||||
geo_strategy,
|
||||
)?;
|
||||
sort = true;
|
||||
}
|
||||
crate::Criterion::Asc(field_name) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
|
||||
}
|
||||
crate::Criterion::Desc(field_name) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ranking_rules)
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn get_ranking_rules_for_vector<'ctx>(
|
||||
ctx: &SearchContext<'ctx>,
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
limit_plus_offset: usize,
|
||||
target: &[f32],
|
||||
embedder_name: &str,
|
||||
embedder: &Embedder,
|
||||
quantized: bool,
|
||||
) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
|
||||
// query graph search
|
||||
|
||||
let mut sort = false;
|
||||
let mut sorted_fields = HashSet::new();
|
||||
let mut geo_sorted = false;
|
||||
|
||||
let mut vector = false;
|
||||
let mut ranking_rules: Vec<BoxRankingRule<'ctx, PlaceholderQuery>> = vec![];
|
||||
|
||||
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
|
||||
for rr in settings_ranking_rules {
|
||||
match rr {
|
||||
crate::Criterion::Words
|
||||
| crate::Criterion::Typo
|
||||
| crate::Criterion::Proximity
|
||||
| crate::Criterion::Attribute
|
||||
| crate::Criterion::Exactness => {
|
||||
if !vector {
|
||||
let vector_candidates = ctx.index.documents_ids(ctx.txn)?;
|
||||
let vector_sort = VectorSort::new(
|
||||
ctx,
|
||||
target.to_vec(),
|
||||
vector_candidates,
|
||||
limit_plus_offset,
|
||||
embedder_name,
|
||||
embedder,
|
||||
quantized,
|
||||
)?;
|
||||
ranking_rules.push(Box::new(vector_sort));
|
||||
vector = true;
|
||||
}
|
||||
}
|
||||
crate::Criterion::Sort => {
|
||||
if sort {
|
||||
continue;
|
||||
}
|
||||
resolve_sort_criteria(
|
||||
sort_criteria,
|
||||
ctx,
|
||||
&mut ranking_rules,
|
||||
&mut sorted_fields,
|
||||
&mut geo_sorted,
|
||||
geo_strategy,
|
||||
)?;
|
||||
sort = true;
|
||||
}
|
||||
crate::Criterion::Asc(field_name) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
|
||||
}
|
||||
crate::Criterion::Desc(field_name) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ranking_rules)
|
||||
}
|
||||
|
||||
/// Return the list of initialised ranking rules to be used for a query graph search.
|
||||
fn get_ranking_rules_for_query_graph_search<'ctx>(
|
||||
ctx: &SearchContext<'ctx>,
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
terms_matching_strategy: TermsMatchingStrategy,
|
||||
) -> Result<Vec<BoxRankingRule<'ctx, QueryGraph>>> {
|
||||
// query graph search
|
||||
let mut words = false;
|
||||
let mut typo = false;
|
||||
let mut proximity = false;
|
||||
let mut sort = false;
|
||||
let mut attribute = false;
|
||||
let mut exactness = false;
|
||||
let mut sorted_fields = HashSet::new();
|
||||
let mut geo_sorted = false;
|
||||
|
||||
// Don't add the `words` ranking rule if the term matching strategy is `All`
|
||||
if matches!(terms_matching_strategy, TermsMatchingStrategy::All) {
|
||||
words = true;
|
||||
}
|
||||
|
||||
let mut ranking_rules: Vec<BoxRankingRule<'ctx, QueryGraph>> = vec![];
|
||||
let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
|
||||
for rr in settings_ranking_rules {
|
||||
// Add Words before any of: typo, proximity, attribute
|
||||
match rr {
|
||||
crate::Criterion::Typo
|
||||
| crate::Criterion::Attribute
|
||||
| crate::Criterion::Proximity
|
||||
| crate::Criterion::Exactness => {
|
||||
if !words {
|
||||
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
|
||||
words = true;
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
match rr {
|
||||
crate::Criterion::Words => {
|
||||
if words {
|
||||
continue;
|
||||
}
|
||||
ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
|
||||
words = true;
|
||||
}
|
||||
crate::Criterion::Typo => {
|
||||
if typo {
|
||||
continue;
|
||||
}
|
||||
typo = true;
|
||||
ranking_rules.push(Box::new(Typo::new(None)));
|
||||
}
|
||||
crate::Criterion::Proximity => {
|
||||
if proximity {
|
||||
continue;
|
||||
}
|
||||
proximity = true;
|
||||
ranking_rules.push(Box::new(Proximity::new(None)));
|
||||
}
|
||||
crate::Criterion::Attribute => {
|
||||
if attribute {
|
||||
continue;
|
||||
}
|
||||
attribute = true;
|
||||
ranking_rules.push(Box::new(Fid::new(None)));
|
||||
ranking_rules.push(Box::new(Position::new(None)));
|
||||
}
|
||||
crate::Criterion::Sort => {
|
||||
if sort {
|
||||
continue;
|
||||
}
|
||||
resolve_sort_criteria(
|
||||
sort_criteria,
|
||||
ctx,
|
||||
&mut ranking_rules,
|
||||
&mut sorted_fields,
|
||||
&mut geo_sorted,
|
||||
geo_strategy,
|
||||
)?;
|
||||
sort = true;
|
||||
}
|
||||
crate::Criterion::Exactness => {
|
||||
if exactness {
|
||||
continue;
|
||||
}
|
||||
ranking_rules.push(Box::new(ExactAttribute::new()));
|
||||
ranking_rules.push(Box::new(Exactness::new()));
|
||||
exactness = true;
|
||||
}
|
||||
crate::Criterion::Asc(field_name) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
|
||||
}
|
||||
crate::Criterion::Desc(field_name) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ranking_rules)
|
||||
}
|
||||
|
||||
fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
ctx: &SearchContext<'ctx>,
|
||||
ranking_rules: &mut Vec<BoxRankingRule<'ctx, Query>>,
|
||||
sorted_fields: &mut HashSet<String>,
|
||||
geo_sorted: &mut bool,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
) -> Result<()> {
|
||||
let sort_criteria = sort_criteria.clone().unwrap_or_default();
|
||||
ranking_rules.reserve(sort_criteria.len());
|
||||
for criterion in sort_criteria {
|
||||
match criterion {
|
||||
AscDesc::Asc(Member::Field(field_name)) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
|
||||
}
|
||||
AscDesc::Desc(Member::Field(field_name)) => {
|
||||
if sorted_fields.contains(&field_name) {
|
||||
continue;
|
||||
}
|
||||
sorted_fields.insert(field_name.clone());
|
||||
ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
|
||||
}
|
||||
AscDesc::Asc(Member::Geo(point)) => {
|
||||
if *geo_sorted {
|
||||
continue;
|
||||
}
|
||||
let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?;
|
||||
ranking_rules.push(Box::new(GeoSort::new(
|
||||
geo_strategy,
|
||||
geo_faceted_docids,
|
||||
point,
|
||||
true,
|
||||
)?));
|
||||
}
|
||||
AscDesc::Desc(Member::Geo(point)) => {
|
||||
if *geo_sorted {
|
||||
continue;
|
||||
}
|
||||
let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?;
|
||||
ranking_rules.push(Box::new(GeoSort::new(
|
||||
geo_strategy,
|
||||
geo_faceted_docids,
|
||||
point,
|
||||
false,
|
||||
)?));
|
||||
}
|
||||
};
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::universe")]
|
||||
pub fn filtered_universe(
|
||||
index: &Index,
|
||||
txn: &RoTxn<'_>,
|
||||
filters: &Option<Filter<'_>>,
|
||||
) -> Result<RoaringBitmap> {
|
||||
Ok(if let Some(filters) = filters {
|
||||
filters.evaluate(txn, index)?
|
||||
} else {
|
||||
index.documents_ids(txn)?
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn execute_vector_search(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
vector: &[f32],
|
||||
scoring_strategy: ScoringStrategy,
|
||||
universe: RoaringBitmap,
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
distinct: &Option<String>,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
from: usize,
|
||||
length: usize,
|
||||
embedder_name: &str,
|
||||
embedder: &Embedder,
|
||||
quantized: bool,
|
||||
time_budget: TimeBudget,
|
||||
ranking_score_threshold: Option<f64>,
|
||||
) -> Result<PartialSearchResult> {
|
||||
check_sort_criteria(ctx, sort_criteria.as_ref())?;
|
||||
|
||||
// FIXME: input universe = universe & documents_with_vectors
|
||||
// for now if we're computing embeddings for ALL documents, we can assume that this is just universe
|
||||
let ranking_rules = get_ranking_rules_for_vector(
|
||||
ctx,
|
||||
sort_criteria,
|
||||
geo_strategy,
|
||||
from + length,
|
||||
vector,
|
||||
embedder_name,
|
||||
embedder,
|
||||
quantized,
|
||||
)?;
|
||||
|
||||
let mut placeholder_search_logger = logger::DefaultSearchLogger;
|
||||
let placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery> =
|
||||
&mut placeholder_search_logger;
|
||||
|
||||
let BucketSortOutput { docids, scores, all_candidates, degraded } = bucket_sort(
|
||||
ctx,
|
||||
ranking_rules,
|
||||
&PlaceholderQuery,
|
||||
distinct.as_deref(),
|
||||
&universe,
|
||||
from,
|
||||
length,
|
||||
scoring_strategy,
|
||||
placeholder_search_logger,
|
||||
time_budget,
|
||||
ranking_score_threshold,
|
||||
)?;
|
||||
|
||||
Ok(PartialSearchResult {
|
||||
candidates: all_candidates,
|
||||
document_scores: scores,
|
||||
documents_ids: docids,
|
||||
located_query_terms: None,
|
||||
degraded,
|
||||
used_negative_operator: false,
|
||||
})
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::main")]
|
||||
pub fn execute_search(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
query: Option<&str>,
|
||||
terms_matching_strategy: TermsMatchingStrategy,
|
||||
scoring_strategy: ScoringStrategy,
|
||||
exhaustive_number_hits: bool,
|
||||
mut universe: RoaringBitmap,
|
||||
sort_criteria: &Option<Vec<AscDesc>>,
|
||||
distinct: &Option<String>,
|
||||
geo_strategy: geo_sort::Strategy,
|
||||
from: usize,
|
||||
length: usize,
|
||||
words_limit: Option<usize>,
|
||||
placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
|
||||
query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
time_budget: TimeBudget,
|
||||
ranking_score_threshold: Option<f64>,
|
||||
locales: Option<&Vec<Language>>,
|
||||
) -> Result<PartialSearchResult> {
|
||||
check_sort_criteria(ctx, sort_criteria.as_ref())?;
|
||||
|
||||
let mut used_negative_operator = false;
|
||||
let mut located_query_terms = None;
|
||||
let query_terms = if let Some(query) = query {
|
||||
let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder");
|
||||
let entered = span.enter();
|
||||
|
||||
// We make sure that the analyzer is aware of the stop words
|
||||
// this ensures that the query builder is able to properly remove them.
|
||||
let mut tokbuilder = TokenizerBuilder::new();
|
||||
let stop_words = ctx.index.stop_words(ctx.txn)?;
|
||||
if let Some(ref stop_words) = stop_words {
|
||||
tokbuilder.stop_words(stop_words);
|
||||
}
|
||||
|
||||
let separators = ctx.index.allowed_separators(ctx.txn)?;
|
||||
let separators: Option<Vec<_>> =
|
||||
separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
if let Some(ref separators) = separators {
|
||||
tokbuilder.separators(separators);
|
||||
}
|
||||
|
||||
let dictionary = ctx.index.dictionary(ctx.txn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
|
||||
if let Some(ref dictionary) = dictionary {
|
||||
tokbuilder.words_dict(dictionary);
|
||||
}
|
||||
|
||||
let db_locales;
|
||||
match locales {
|
||||
Some(locales) => {
|
||||
if !locales.is_empty() {
|
||||
tokbuilder.allow_list(locales);
|
||||
}
|
||||
}
|
||||
None => {
|
||||
// If no locales are specified, we use the locales specified in the localized attributes rules
|
||||
let localized_attributes_rules = ctx.index.localized_attributes_rules(ctx.txn)?;
|
||||
let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
|
||||
let searchable_fields = ctx.index.searchable_fields_ids(ctx.txn)?;
|
||||
|
||||
let localized_fields = match &ctx.restricted_fids {
|
||||
// if AttributeToSearchOn is set, use the restricted list of ids
|
||||
Some(restricted_fids) => {
|
||||
let iter = restricted_fids
|
||||
.exact
|
||||
.iter()
|
||||
.chain(restricted_fids.tolerant.iter())
|
||||
.map(|(fid, _)| *fid);
|
||||
|
||||
LocalizedFieldIds::new(&localized_attributes_rules, &fields_ids_map, iter)
|
||||
}
|
||||
// Otherwise use the full list of ids coming from the index searchable fields
|
||||
None => LocalizedFieldIds::new(
|
||||
&localized_attributes_rules,
|
||||
&fields_ids_map,
|
||||
searchable_fields.into_iter(),
|
||||
),
|
||||
};
|
||||
|
||||
db_locales = localized_fields.all_locales();
|
||||
if !db_locales.is_empty() {
|
||||
tokbuilder.allow_list(&db_locales);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let tokenizer = tokbuilder.build();
|
||||
drop(entered);
|
||||
|
||||
let span = tracing::trace_span!(target: "search::tokens", "tokenize");
|
||||
let entered = span.enter();
|
||||
let tokens = tokenizer.tokenize(query);
|
||||
drop(entered);
|
||||
|
||||
let ExtractedTokens { query_terms, negative_words, negative_phrases } =
|
||||
located_query_terms_from_tokens(ctx, tokens, words_limit)?;
|
||||
used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty();
|
||||
|
||||
let ignored_documents = resolve_negative_words(ctx, Some(&universe), &negative_words)?;
|
||||
let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?;
|
||||
|
||||
universe -= ignored_documents;
|
||||
universe -= ignored_phrases;
|
||||
|
||||
if query_terms.is_empty() {
|
||||
// Do a placeholder search instead
|
||||
None
|
||||
} else {
|
||||
Some(query_terms)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let bucket_sort_output = if let Some(query_terms) = query_terms {
|
||||
let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?;
|
||||
located_query_terms = Some(new_located_query_terms);
|
||||
|
||||
let ranking_rules = get_ranking_rules_for_query_graph_search(
|
||||
ctx,
|
||||
sort_criteria,
|
||||
geo_strategy,
|
||||
terms_matching_strategy,
|
||||
)?;
|
||||
|
||||
universe &=
|
||||
resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?;
|
||||
|
||||
bucket_sort(
|
||||
ctx,
|
||||
ranking_rules,
|
||||
&graph,
|
||||
distinct.as_deref(),
|
||||
&universe,
|
||||
from,
|
||||
length,
|
||||
scoring_strategy,
|
||||
query_graph_logger,
|
||||
time_budget,
|
||||
ranking_score_threshold,
|
||||
)?
|
||||
} else {
|
||||
let ranking_rules =
|
||||
get_ranking_rules_for_placeholder_search(ctx, sort_criteria, geo_strategy)?;
|
||||
bucket_sort(
|
||||
ctx,
|
||||
ranking_rules,
|
||||
&PlaceholderQuery,
|
||||
distinct.as_deref(),
|
||||
&universe,
|
||||
from,
|
||||
length,
|
||||
scoring_strategy,
|
||||
placeholder_search_logger,
|
||||
time_budget,
|
||||
ranking_score_threshold,
|
||||
)?
|
||||
};
|
||||
|
||||
let BucketSortOutput { docids, scores, mut all_candidates, degraded } = bucket_sort_output;
|
||||
let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
|
||||
|
||||
// The candidates is the universe unless the exhaustive number of hits
|
||||
// is requested and a distinct attribute is set.
|
||||
if exhaustive_number_hits {
|
||||
let distinct_field = match distinct.as_deref() {
|
||||
Some(distinct) => Some(distinct),
|
||||
None => ctx.index.distinct_field(ctx.txn)?,
|
||||
};
|
||||
|
||||
if let Some(f) = distinct_field {
|
||||
if let Some(distinct_fid) = fields_ids_map.id(f) {
|
||||
all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(PartialSearchResult {
|
||||
candidates: all_candidates,
|
||||
document_scores: scores,
|
||||
documents_ids: docids,
|
||||
located_query_terms,
|
||||
degraded,
|
||||
used_negative_operator,
|
||||
})
|
||||
}
|
||||
|
||||
fn check_sort_criteria(
|
||||
ctx: &SearchContext<'_>,
|
||||
sort_criteria: Option<&Vec<AscDesc>>,
|
||||
) -> Result<()> {
|
||||
let sort_criteria = if let Some(sort_criteria) = sort_criteria {
|
||||
sort_criteria
|
||||
} else {
|
||||
return Ok(());
|
||||
};
|
||||
|
||||
if sort_criteria.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// We check that the sort ranking rule exists and throw an
|
||||
// error if we try to use it and that it doesn't.
|
||||
let sort_ranking_rule_missing = !ctx.index.criteria(ctx.txn)?.contains(&crate::Criterion::Sort);
|
||||
if sort_ranking_rule_missing {
|
||||
return Err(UserError::SortRankingRuleMissing.into());
|
||||
}
|
||||
|
||||
// We check that we are allowed to use the sort criteria, we check
|
||||
// that they are declared in the sortable fields.
|
||||
let sortable_fields = ctx.index.sortable_fields(ctx.txn)?;
|
||||
for asc_desc in sort_criteria {
|
||||
match asc_desc.member() {
|
||||
Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => {
|
||||
let (valid_fields, hidden_fields) =
|
||||
ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?;
|
||||
|
||||
return Err(UserError::InvalidSortableAttribute {
|
||||
field: field.to_string(),
|
||||
valid_fields,
|
||||
hidden_fields,
|
||||
}
|
||||
.into());
|
||||
}
|
||||
Member::Geo(_) if !sortable_fields.contains("_geo") => {
|
||||
let (valid_fields, hidden_fields) =
|
||||
ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?;
|
||||
|
||||
return Err(UserError::InvalidSortableAttribute {
|
||||
field: "_geo".to_string(),
|
||||
valid_fields,
|
||||
hidden_fields,
|
||||
}
|
||||
.into());
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub struct PartialSearchResult {
|
||||
pub located_query_terms: Option<Vec<LocatedQueryTerm>>,
|
||||
pub candidates: RoaringBitmap,
|
||||
pub documents_ids: Vec<DocumentId>,
|
||||
pub document_scores: Vec<Vec<ScoreDetails>>,
|
||||
|
||||
pub degraded: bool,
|
||||
pub used_negative_operator: bool,
|
||||
}
|
||||
536
crates/milli/src/search/new/query_graph.rs
Normal file
536
crates/milli/src/search/new/query_graph.rs
Normal file
@@ -0,0 +1,536 @@
|
||||
use std::cmp::{Ordering, Reverse};
|
||||
use std::collections::BTreeMap;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
use fxhash::{FxHashMap, FxHasher};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::interner::{FixedSizeInterner, Interned};
|
||||
use super::query_term::{
|
||||
self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset,
|
||||
};
|
||||
use super::small_bitmap::SmallBitmap;
|
||||
use super::SearchContext;
|
||||
use crate::search::new::interner::Interner;
|
||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||
use crate::Result;
|
||||
|
||||
/// A node of the [`QueryGraph`].
|
||||
///
|
||||
/// There are four types of nodes:
|
||||
/// 1. `Start` : unique, represents the start of the query
|
||||
/// 2. `End` : unique, represents the end of a query
|
||||
/// 3. `Deleted` : represents a node that was deleted.
|
||||
/// All deleted nodes are unreachable from the start node.
|
||||
/// 4. `Term` is a regular node representing a word or combination of words
|
||||
/// from the user query.
|
||||
#[derive(Clone)]
|
||||
pub struct QueryNode {
|
||||
pub data: QueryNodeData,
|
||||
pub predecessors: SmallBitmap<QueryNode>,
|
||||
pub successors: SmallBitmap<QueryNode>,
|
||||
}
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub enum QueryNodeData {
|
||||
Term(LocatedQueryTermSubset),
|
||||
Deleted,
|
||||
Start,
|
||||
End,
|
||||
}
|
||||
|
||||
/**
|
||||
A graph representing all the ways to interpret the user's search query.
|
||||
|
||||
## Example 1
|
||||
For the search query `sunflower`, we need to register the following things:
|
||||
- we need to look for the exact word `sunflower`
|
||||
- but also any word which is 1 or 2 typos apart from `sunflower`
|
||||
- and every word that contains the prefix `sunflower`
|
||||
- and also the couple of adjacent words `sun flower`
|
||||
- as well as all the user-defined synonyms of `sunflower`
|
||||
|
||||
All these derivations of a word will be stored in [`QueryTerm`].
|
||||
|
||||
## Example 2:
|
||||
For the search query `summer house by`.
|
||||
|
||||
We also look for all word derivations of each term. And we also need to consider
|
||||
the potential n-grams `summerhouse`, `summerhouseby`, and `houseby`.
|
||||
Furthermore, we need to know which words these ngrams replace. This is done by creating the
|
||||
following graph, where each node also contains a list of derivations:
|
||||
```txt
|
||||
┌───────┐
|
||||
┌─│houseby│─────────┐
|
||||
│ └───────┘ │
|
||||
┌───────┐ ┌───────┐ │ ┌───────┐ ┌────┐ │ ┌───────┐
|
||||
│ START │─┬─│summer │─┴─│ house │┌─│ by │─┼─│ END │
|
||||
└───────┘ │ └───────┘ └───────┘│ └────┘ │ └───────┘
|
||||
│ ┌────────────┐ │ │
|
||||
├─│summerhouse │───────┘ │
|
||||
│ └────────────┘ │
|
||||
│ ┌─────────────┐ │
|
||||
└─────────│summerhouseby│───────┘
|
||||
└─────────────┘
|
||||
```
|
||||
Note also that each node has a range of positions associated with it,
|
||||
such that `summer` is known to be a word at the positions `0..=0` and `houseby`
|
||||
is registered with the positions `1..=2`. When two nodes are connected by an edge,
|
||||
it means that they are potentially next to each other in the user's search query
|
||||
(depending on the [`TermsMatchingStrategy`](crate::search::TermsMatchingStrategy)
|
||||
and the transformations that were done on the query graph).
|
||||
*/
|
||||
#[derive(Clone)]
|
||||
pub struct QueryGraph {
|
||||
/// The index of the start node within `self.nodes`
|
||||
pub root_node: Interned<QueryNode>,
|
||||
/// The index of the end node within `self.nodes`
|
||||
pub end_node: Interned<QueryNode>,
|
||||
/// The list of all query nodes
|
||||
pub nodes: FixedSizeInterner<QueryNode>,
|
||||
}
|
||||
|
||||
impl QueryGraph {
|
||||
/// Build the query graph from the parsed user search query, return an updated list of the located query terms
|
||||
/// which contains ngrams.
|
||||
pub fn from_query(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
// The terms here must be consecutive
|
||||
terms: &[LocatedQueryTerm],
|
||||
) -> Result<(QueryGraph, Vec<LocatedQueryTerm>)> {
|
||||
let mut new_located_query_terms = terms.to_vec();
|
||||
|
||||
let nbr_typos = number_of_typos_allowed(ctx)?;
|
||||
|
||||
let mut nodes_data: Vec<QueryNodeData> = vec![QueryNodeData::Start, QueryNodeData::End];
|
||||
let root_node = 0;
|
||||
let end_node = 1;
|
||||
|
||||
// Ee could consider generalizing to 4,5,6,7,etc. ngrams
|
||||
let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
|
||||
(vec![], vec![], vec![root_node]);
|
||||
|
||||
let original_terms_len = terms.len();
|
||||
for term_idx in 0..original_terms_len {
|
||||
let mut new_nodes = vec![];
|
||||
|
||||
let new_node_idx = add_node(
|
||||
&mut nodes_data,
|
||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||
term_subset: QueryTermSubset::full(terms[term_idx].value),
|
||||
positions: terms[term_idx].positions.clone(),
|
||||
term_ids: term_idx as u8..=term_idx as u8,
|
||||
}),
|
||||
);
|
||||
new_nodes.push(new_node_idx);
|
||||
|
||||
if !prev1.is_empty() {
|
||||
if let Some(ngram) =
|
||||
query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)?
|
||||
{
|
||||
new_located_query_terms.push(ngram.clone());
|
||||
let ngram_idx = add_node(
|
||||
&mut nodes_data,
|
||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||
term_subset: QueryTermSubset::full(ngram.value),
|
||||
positions: ngram.positions,
|
||||
term_ids: term_idx as u8 - 1..=term_idx as u8,
|
||||
}),
|
||||
);
|
||||
new_nodes.push(ngram_idx);
|
||||
}
|
||||
}
|
||||
if !prev2.is_empty() {
|
||||
if let Some(ngram) =
|
||||
query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)?
|
||||
{
|
||||
new_located_query_terms.push(ngram.clone());
|
||||
let ngram_idx = add_node(
|
||||
&mut nodes_data,
|
||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||
term_subset: QueryTermSubset::full(ngram.value),
|
||||
positions: ngram.positions,
|
||||
term_ids: term_idx as u8 - 2..=term_idx as u8,
|
||||
}),
|
||||
);
|
||||
new_nodes.push(ngram_idx);
|
||||
}
|
||||
}
|
||||
(prev0, prev1, prev2) = (new_nodes, prev0, prev1);
|
||||
}
|
||||
|
||||
let root_node = Interned::from_raw(root_node);
|
||||
let end_node = Interned::from_raw(end_node);
|
||||
let mut nodes = FixedSizeInterner::new(
|
||||
nodes_data.len() as u16,
|
||||
QueryNode {
|
||||
data: QueryNodeData::Deleted,
|
||||
predecessors: SmallBitmap::new(nodes_data.len() as u16),
|
||||
successors: SmallBitmap::new(nodes_data.len() as u16),
|
||||
},
|
||||
);
|
||||
for (node_idx, node_data) in nodes_data.into_iter().enumerate() {
|
||||
let node = nodes.get_mut(Interned::from_raw(node_idx as u16));
|
||||
node.data = node_data;
|
||||
}
|
||||
let mut graph = QueryGraph { root_node, end_node, nodes };
|
||||
graph.build_initial_edges();
|
||||
|
||||
Ok((graph, new_located_query_terms))
|
||||
}
|
||||
|
||||
/// Remove the given nodes, connecting all their predecessors to all their successors.
|
||||
pub fn remove_nodes_keep_edges(&mut self, nodes: &[Interned<QueryNode>]) {
|
||||
for &node_id in nodes {
|
||||
let node = self.nodes.get(node_id);
|
||||
let old_node_pred = node.predecessors.clone();
|
||||
let old_node_succ = node.successors.clone();
|
||||
for pred in old_node_pred.iter() {
|
||||
let pred_successors = &mut self.nodes.get_mut(pred).successors;
|
||||
pred_successors.remove(node_id);
|
||||
pred_successors.union(&old_node_succ);
|
||||
}
|
||||
for succ in old_node_succ.iter() {
|
||||
let succ_predecessors = &mut self.nodes.get_mut(succ).predecessors;
|
||||
succ_predecessors.remove(node_id);
|
||||
succ_predecessors.union(&old_node_pred);
|
||||
}
|
||||
let node = self.nodes.get_mut(node_id);
|
||||
node.data = QueryNodeData::Deleted;
|
||||
node.predecessors.clear();
|
||||
node.successors.clear();
|
||||
}
|
||||
}
|
||||
|
||||
/// Remove the given nodes and all their edges from the query graph.
|
||||
pub fn remove_nodes(&mut self, nodes: &[Interned<QueryNode>]) {
|
||||
for &node_id in nodes {
|
||||
let node = &self.nodes.get(node_id);
|
||||
let old_node_pred = node.predecessors.clone();
|
||||
let old_node_succ = node.successors.clone();
|
||||
|
||||
for pred in old_node_pred.iter() {
|
||||
self.nodes.get_mut(pred).successors.remove(node_id);
|
||||
}
|
||||
for succ in old_node_succ.iter() {
|
||||
self.nodes.get_mut(succ).predecessors.remove(node_id);
|
||||
}
|
||||
|
||||
let node = self.nodes.get_mut(node_id);
|
||||
node.data = QueryNodeData::Deleted;
|
||||
node.predecessors.clear();
|
||||
node.successors.clear();
|
||||
}
|
||||
}
|
||||
/// Simplify the query graph by removing all nodes that are disconnected from
|
||||
/// the start or end nodes.
|
||||
pub fn simplify(&mut self) {
|
||||
loop {
|
||||
let mut nodes_to_remove = vec![];
|
||||
for (node_idx, node) in self.nodes.iter() {
|
||||
if (!matches!(node.data, QueryNodeData::End | QueryNodeData::Deleted)
|
||||
&& node.successors.is_empty())
|
||||
|| (!matches!(node.data, QueryNodeData::Start | QueryNodeData::Deleted)
|
||||
&& node.predecessors.is_empty())
|
||||
{
|
||||
nodes_to_remove.push(node_idx);
|
||||
}
|
||||
}
|
||||
if nodes_to_remove.is_empty() {
|
||||
break;
|
||||
} else {
|
||||
self.remove_nodes(&nodes_to_remove);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn build_initial_edges(&mut self) {
|
||||
for (_, node) in self.nodes.iter_mut() {
|
||||
node.successors.clear();
|
||||
node.predecessors.clear();
|
||||
}
|
||||
for node_id in self.nodes.indexes() {
|
||||
let node = self.nodes.get(node_id);
|
||||
let end_prev_term_id = match &node.data {
|
||||
QueryNodeData::Term(term) => *term.term_ids.end() as i16,
|
||||
QueryNodeData::Start => -1,
|
||||
QueryNodeData::Deleted => continue,
|
||||
QueryNodeData::End => continue,
|
||||
};
|
||||
let successors = {
|
||||
let mut successors = SmallBitmap::for_interned_values_in(&self.nodes);
|
||||
let mut min = i16::MAX;
|
||||
for (node_id, node) in self.nodes.iter() {
|
||||
let start_next_term_id = match &node.data {
|
||||
QueryNodeData::Term(term) => *term.term_ids.start() as i16,
|
||||
QueryNodeData::End => i16::MAX,
|
||||
QueryNodeData::Start => continue,
|
||||
QueryNodeData::Deleted => continue,
|
||||
};
|
||||
if start_next_term_id <= end_prev_term_id {
|
||||
continue;
|
||||
}
|
||||
match start_next_term_id.cmp(&min) {
|
||||
Ordering::Less => {
|
||||
min = start_next_term_id;
|
||||
successors.clear();
|
||||
successors.insert(node_id);
|
||||
}
|
||||
Ordering::Equal => {
|
||||
successors.insert(node_id);
|
||||
}
|
||||
Ordering::Greater => continue,
|
||||
}
|
||||
}
|
||||
successors
|
||||
};
|
||||
let node = self.nodes.get_mut(node_id);
|
||||
node.successors = successors.clone();
|
||||
for successor in successors.iter() {
|
||||
let successor = self.nodes.get_mut(successor);
|
||||
successor.predecessors.insert(node_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn removal_order_for_terms_matching_strategy_frequency(
|
||||
&self,
|
||||
ctx: &mut SearchContext<'_>,
|
||||
) -> Result<Vec<SmallBitmap<QueryNode>>> {
|
||||
// lookup frequency for each term
|
||||
let mut term_with_frequency: Vec<(u8, u64)> = {
|
||||
let mut term_docids: BTreeMap<u8, RoaringBitmap> = Default::default();
|
||||
for (_, node) in self.nodes.iter() {
|
||||
match &node.data {
|
||||
QueryNodeData::Term(t) => {
|
||||
let docids = compute_query_term_subset_docids(ctx, None, &t.term_subset)?;
|
||||
for id in t.term_ids.clone() {
|
||||
term_docids
|
||||
.entry(id)
|
||||
.and_modify(|curr| *curr |= &docids)
|
||||
.or_insert_with(|| docids.clone());
|
||||
}
|
||||
}
|
||||
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
|
||||
}
|
||||
}
|
||||
term_docids
|
||||
.into_iter()
|
||||
.map(|(idx, docids)| match docids.len() {
|
||||
0 => (idx, u64::MAX),
|
||||
frequency => (idx, frequency),
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
term_with_frequency.sort_by_key(|(_, frequency)| Reverse(*frequency));
|
||||
let mut term_weight = BTreeMap::new();
|
||||
let mut weight: u16 = 1;
|
||||
let mut peekable = term_with_frequency.into_iter().peekable();
|
||||
while let Some((idx, frequency)) = peekable.next() {
|
||||
term_weight.insert(idx, weight);
|
||||
if peekable.peek().map_or(false, |(_, f)| frequency != *f) {
|
||||
weight += 1;
|
||||
}
|
||||
}
|
||||
let cost_of_term_idx = move |term_idx: u8| *term_weight.get(&term_idx).unwrap();
|
||||
Ok(self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx))
|
||||
}
|
||||
|
||||
pub fn removal_order_for_terms_matching_strategy_last(
|
||||
&self,
|
||||
ctx: &SearchContext<'_>,
|
||||
) -> Vec<SmallBitmap<QueryNode>> {
|
||||
let (first_term_idx, last_term_idx) = {
|
||||
let mut first_term_idx = u8::MAX;
|
||||
let mut last_term_idx = 0u8;
|
||||
for (_, node) in self.nodes.iter() {
|
||||
match &node.data {
|
||||
QueryNodeData::Term(t) => {
|
||||
if *t.term_ids.end() > last_term_idx {
|
||||
last_term_idx = *t.term_ids.end();
|
||||
}
|
||||
if *t.term_ids.start() < first_term_idx {
|
||||
first_term_idx = *t.term_ids.start();
|
||||
}
|
||||
}
|
||||
QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
|
||||
}
|
||||
}
|
||||
(first_term_idx, last_term_idx)
|
||||
};
|
||||
if first_term_idx >= last_term_idx {
|
||||
return vec![];
|
||||
}
|
||||
|
||||
let cost_of_term_idx = |term_idx: u8| {
|
||||
let rank = 1 + last_term_idx - term_idx;
|
||||
rank as u16
|
||||
};
|
||||
self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx)
|
||||
}
|
||||
|
||||
pub fn removal_order_for_terms_matching_strategy(
|
||||
&self,
|
||||
ctx: &SearchContext<'_>,
|
||||
order: impl Fn(u8) -> u16,
|
||||
) -> Vec<SmallBitmap<QueryNode>> {
|
||||
let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
|
||||
let mut at_least_one_mandatory_term = false;
|
||||
for (node_id, node) in self.nodes.iter() {
|
||||
let QueryNodeData::Term(t) = &node.data else { continue };
|
||||
if t.term_subset.original_phrase(ctx).is_some() || t.term_subset.is_mandatory() {
|
||||
at_least_one_mandatory_term = true;
|
||||
continue;
|
||||
}
|
||||
let mut cost = 0;
|
||||
for id in t.term_ids.clone() {
|
||||
cost = std::cmp::max(cost, order(id));
|
||||
}
|
||||
nodes_to_remove
|
||||
.entry(cost)
|
||||
.or_insert_with(|| SmallBitmap::for_interned_values_in(&self.nodes))
|
||||
.insert(node_id);
|
||||
}
|
||||
let mut res: Vec<_> = nodes_to_remove.into_values().collect();
|
||||
if !at_least_one_mandatory_term {
|
||||
res.pop();
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
/// Number of words in the phrases in this query graph
|
||||
pub(crate) fn words_in_phrases_count(&self, ctx: &SearchContext<'_>) -> usize {
|
||||
let mut word_count = 0;
|
||||
for (_, node) in self.nodes.iter() {
|
||||
match &node.data {
|
||||
QueryNodeData::Term(term) => {
|
||||
let Some(phrase) = term.term_subset.original_phrase(ctx) else { continue };
|
||||
let phrase = ctx.phrase_interner.get(phrase);
|
||||
word_count += phrase.words.iter().copied().filter(|a| a.is_some()).count()
|
||||
}
|
||||
_ => continue,
|
||||
}
|
||||
}
|
||||
word_count
|
||||
}
|
||||
}
|
||||
|
||||
/// Append `node_data` to `nodes_data` and return the index it was stored at.
fn add_node(nodes_data: &mut Vec<QueryNodeData>, node_data: QueryNodeData) -> u16 {
    nodes_data.push(node_data);
    (nodes_data.len() - 1) as u16
}
|
||||
|
||||
impl QueryGraph {
|
||||
/*
|
||||
Build a query graph from a list of paths
|
||||
|
||||
The paths are composed of source and dest terms.
|
||||
|
||||
For example, consider the following paths:
|
||||
```txt
|
||||
PATH 1 : a -> b1 -> c1 -> d -> e1
|
||||
PATH 2 : a -> b2 -> c2 -> d -> e2
|
||||
```
|
||||
Then the resulting graph will be:
|
||||
```txt
|
||||
┌────┐ ┌────┐ ┌────┐ ┌────┐
|
||||
┌──│ b1 │──│ c1 │───│ d │───│ e1 │
|
||||
┌────┐ │ └────┘ └────┘ └────┘ └────┘
|
||||
│ a │─┤
|
||||
└────┘ │ ┌────┐ ┌────┐ ┌────┐ ┌────┐
|
||||
└──│ b2 │──│ c2 │───│ d │───│ e2 │
|
||||
└────┘ └────┘ └────┘ └────┘
|
||||
```
|
||||
*/
|
||||
pub fn build_from_paths(
|
||||
paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
|
||||
) -> Self {
|
||||
let mut node_data = Interner::default();
|
||||
let root_node = node_data.push(QueryNodeData::Start);
|
||||
let end_node = node_data.push(QueryNodeData::End);
|
||||
|
||||
let mut paths_with_single_terms = vec![];
|
||||
|
||||
for path in paths {
|
||||
let mut processed_path = vec![];
|
||||
let mut prev_dest_term: Option<LocatedQueryTermSubset> = None;
|
||||
for (start_term, dest_term) in path {
|
||||
if let Some(prev_dest_term) = prev_dest_term.take() {
|
||||
if let Some(mut start_term) = start_term {
|
||||
if start_term.term_ids == prev_dest_term.term_ids {
|
||||
start_term.term_subset.intersect(&prev_dest_term.term_subset);
|
||||
processed_path.push(start_term);
|
||||
} else {
|
||||
processed_path.push(prev_dest_term);
|
||||
processed_path.push(start_term);
|
||||
}
|
||||
} else {
|
||||
processed_path.push(prev_dest_term);
|
||||
}
|
||||
} else if let Some(start_term) = start_term {
|
||||
processed_path.push(start_term);
|
||||
}
|
||||
prev_dest_term = Some(dest_term);
|
||||
}
|
||||
if let Some(prev_dest_term) = prev_dest_term {
|
||||
processed_path.push(prev_dest_term);
|
||||
}
|
||||
paths_with_single_terms.push(processed_path);
|
||||
}
|
||||
|
||||
let mut paths_with_single_terms_and_suffix_hash = vec![];
|
||||
for path in paths_with_single_terms {
|
||||
let mut hasher = FxHasher::default();
|
||||
let mut path_with_hash = vec![];
|
||||
for term in path.into_iter().rev() {
|
||||
term.hash(&mut hasher);
|
||||
path_with_hash.push((term, hasher.finish()));
|
||||
}
|
||||
path_with_hash.reverse();
|
||||
paths_with_single_terms_and_suffix_hash.push(path_with_hash);
|
||||
}
|
||||
|
||||
let mut node_data_id_for_term_and_suffix_hash =
|
||||
FxHashMap::<(LocatedQueryTermSubset, u64), Interned<QueryNodeData>>::default();
|
||||
|
||||
let mut paths_with_ids = vec![];
|
||||
for path in paths_with_single_terms_and_suffix_hash {
|
||||
let mut path_with_ids = vec![];
|
||||
for (term, suffix_hash) in path {
|
||||
let node_data_id = node_data_id_for_term_and_suffix_hash
|
||||
.entry((term.clone(), suffix_hash))
|
||||
.or_insert_with(|| node_data.push(QueryNodeData::Term(term)));
|
||||
path_with_ids.push(Interned::from_raw(node_data_id.into_raw()));
|
||||
}
|
||||
paths_with_ids.push(path_with_ids);
|
||||
}
|
||||
|
||||
let nodes_data = node_data.freeze();
|
||||
let nodes_data_len = nodes_data.len();
|
||||
let mut nodes = nodes_data.map_move(|n| QueryNode {
|
||||
data: n,
|
||||
predecessors: SmallBitmap::new(nodes_data_len),
|
||||
successors: SmallBitmap::new(nodes_data_len),
|
||||
});
|
||||
|
||||
let root_node = Interned::<QueryNode>::from_raw(root_node.into_raw());
|
||||
let end_node = Interned::<QueryNode>::from_raw(end_node.into_raw());
|
||||
|
||||
for path in paths_with_ids {
|
||||
let mut prev_node_id = root_node;
|
||||
for node_id in path {
|
||||
let prev_node = nodes.get_mut(prev_node_id);
|
||||
prev_node.successors.insert(node_id);
|
||||
let node = nodes.get_mut(node_id);
|
||||
node.predecessors.insert(prev_node_id);
|
||||
prev_node_id = node_id;
|
||||
}
|
||||
let prev_node = nodes.get_mut(prev_node_id);
|
||||
prev_node.successors.insert(end_node);
|
||||
let node = nodes.get_mut(end_node);
|
||||
node.predecessors.insert(prev_node_id);
|
||||
}
|
||||
|
||||
QueryGraph { root_node, end_node, nodes }
|
||||
}
|
||||
}
|
||||
428
crates/milli/src/search/new/query_term/compute_derivations.rs
Normal file
428
crates/milli/src/search/new/query_term/compute_derivations.rs
Normal file
@@ -0,0 +1,428 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::ControlFlow;
|
||||
|
||||
use fst::automaton::Str;
|
||||
use fst::{Automaton, IntoStreamer, Streamer};
|
||||
use heed::types::DecodeIgnore;
|
||||
|
||||
use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
|
||||
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
|
||||
use crate::search::new::interner::{DedupInterner, Interned};
|
||||
use crate::search::new::query_term::{Lazy, TwoTypoTerm};
|
||||
use crate::search::new::{limits, SearchContext};
|
||||
use crate::search::{build_dfa, get_first};
|
||||
use crate::{Result, MAX_WORD_LENGTH};
|
||||
|
||||
/// Number of typos separating a derived word from the original word,
/// as reported when searching derivations with up to two typos.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumberOfTypos {
    /// No typo.
    Zero,
    /// Exactly one typo.
    One,
    /// Exactly two typos.
    Two,
}
|
||||
|
||||
/// Number of typos separating a derived word from the original word,
/// as reported when searching derivations with at most one typo.
pub enum ZeroOrOneTypo {
    /// No typo.
    Zero,
    /// Exactly one typo.
    One,
}
|
||||
|
||||
impl Interned<QueryTerm> {
|
||||
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
|
||||
let s = ctx.term_interner.get_mut(self);
|
||||
if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() {
|
||||
assert!(s.two_typo.is_uninit());
|
||||
// Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
|
||||
self.initialize_one_typo_subterm(ctx)?;
|
||||
let s = ctx.term_interner.get_mut(self);
|
||||
assert!(s.one_typo.is_init());
|
||||
s.two_typo = Lazy::Init(TwoTypoTerm::default());
|
||||
} else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() {
|
||||
assert!(s.two_typo.is_uninit());
|
||||
self.initialize_one_and_two_typo_subterm(ctx)?;
|
||||
let s = ctx.term_interner.get_mut(self);
|
||||
assert!(s.one_typo.is_init() && s.two_typo.is_init());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn find_zero_typo_prefix_derivations(
|
||||
word_interned: Interned<String>,
|
||||
fst: fst::Set<Cow<'_, [u8]>>,
|
||||
word_interner: &mut DedupInterner<String>,
|
||||
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
|
||||
) -> Result<()> {
|
||||
let word = word_interner.get(word_interned).to_owned();
|
||||
let word = word.as_str();
|
||||
let prefix = Str::new(word).starts_with();
|
||||
let mut stream = fst.search(prefix).into_stream();
|
||||
|
||||
while let Some(derived_word) = stream.next() {
|
||||
let derived_word = std::str::from_utf8(derived_word)?.to_owned();
|
||||
let derived_word_interned = word_interner.insert(derived_word);
|
||||
if derived_word_interned != word_interned {
|
||||
let cf = visit(derived_word_interned)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_zero_one_typo_derivations(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
word_interned: Interned<String>,
|
||||
is_prefix: bool,
|
||||
mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
|
||||
) -> Result<()> {
|
||||
let fst = ctx.get_words_fst()?;
|
||||
let word = ctx.word_interner.get(word_interned).to_owned();
|
||||
let word = word.as_str();
|
||||
|
||||
let dfa = build_dfa(word, 1, is_prefix);
|
||||
let starts = StartsWith(Str::new(get_first(word)));
|
||||
let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
|
||||
|
||||
while let Some((derived_word, state)) = stream.next() {
|
||||
let derived_word = std::str::from_utf8(derived_word)?;
|
||||
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
|
||||
let d = dfa.distance(state.1);
|
||||
match d.to_u8() {
|
||||
0 => {
|
||||
if derived_word != word_interned {
|
||||
let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
1 => {
|
||||
let cf = visit(derived_word, ZeroOrOneTypo::One)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
unreachable!("One typo dfa produced multiple typos")
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn find_zero_one_two_typo_derivations(
|
||||
word_interned: Interned<String>,
|
||||
is_prefix: bool,
|
||||
fst: fst::Set<Cow<'_, [u8]>>,
|
||||
word_interner: &mut DedupInterner<String>,
|
||||
mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>,
|
||||
) -> Result<()> {
|
||||
let word = word_interner.get(word_interned).to_owned();
|
||||
let word = word.as_str();
|
||||
|
||||
let starts = StartsWith(Str::new(get_first(word)));
|
||||
let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
|
||||
let second_dfa = build_dfa(word, 2, is_prefix);
|
||||
let second = Intersection(&second_dfa, &starts);
|
||||
let automaton = Union(first, &second);
|
||||
|
||||
let mut stream = fst.search_with_state(automaton).into_stream();
|
||||
|
||||
while let Some((derived_word, state)) = stream.next() {
|
||||
let derived_word = std::str::from_utf8(derived_word)?;
|
||||
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
||||
// in the case the typo is on the first letter, we know the number of typo
|
||||
// is two
|
||||
if get_first(derived_word) != get_first(word) {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Else, we know that it is the second dfa that matched and compute the
|
||||
// correct distance
|
||||
let d = second_dfa.distance((state.1).0);
|
||||
match d.to_u8() {
|
||||
0 => {
|
||||
if derived_word_interned != word_interned {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
1 => {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::One)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
2 => {
|
||||
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
|
||||
if cf.is_break() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ => unreachable!("2 typos DFA produced a distance greater than 2"),
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn partially_initialized_term_from_word(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
word: &str,
|
||||
max_typo: u8,
|
||||
is_prefix: bool,
|
||||
is_ngram: bool,
|
||||
) -> Result<QueryTerm> {
|
||||
let word_interned = ctx.word_interner.insert(word.to_owned());
|
||||
|
||||
if word.len() > MAX_WORD_LENGTH {
|
||||
return Ok({
|
||||
QueryTerm {
|
||||
original: ctx.word_interner.insert(word.to_owned()),
|
||||
ngram_words: None,
|
||||
is_prefix: false,
|
||||
max_levenshtein_distance: 0,
|
||||
zero_typo: <_>::default(),
|
||||
one_typo: Lazy::Init(<_>::default()),
|
||||
two_typo: Lazy::Init(<_>::default()),
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
let fst = ctx.index.words_fst(ctx.txn)?;
|
||||
|
||||
let use_prefix_db = is_prefix
|
||||
&& (ctx
|
||||
.index
|
||||
.word_prefix_docids
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(ctx.txn, word)?
|
||||
.is_some()
|
||||
|| (!is_ngram
|
||||
&& ctx
|
||||
.index
|
||||
.exact_word_prefix_docids
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(ctx.txn, word)?
|
||||
.is_some()));
|
||||
let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
|
||||
|
||||
let mut zero_typo = None;
|
||||
let mut prefix_of = BTreeSet::new();
|
||||
|
||||
if fst.contains(word) {
|
||||
zero_typo = Some(word_interned);
|
||||
}
|
||||
|
||||
if is_prefix && use_prefix_db.is_none() {
|
||||
find_zero_typo_prefix_derivations(
|
||||
word_interned,
|
||||
fst,
|
||||
&mut ctx.word_interner,
|
||||
|derived_word| {
|
||||
if prefix_of.len() < limits::MAX_PREFIX_COUNT {
|
||||
prefix_of.insert(derived_word);
|
||||
Ok(ControlFlow::Continue(()))
|
||||
} else {
|
||||
Ok(ControlFlow::Break(()))
|
||||
}
|
||||
},
|
||||
)?;
|
||||
}
|
||||
let synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||
let mut synonym_word_count = 0;
|
||||
let synonyms = synonyms
|
||||
.get(&vec![word.to_owned()])
|
||||
.cloned()
|
||||
.unwrap_or_default()
|
||||
.into_iter()
|
||||
.take(limits::MAX_SYNONYM_PHRASE_COUNT)
|
||||
.filter_map(|words| {
|
||||
if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT {
|
||||
return None;
|
||||
}
|
||||
synonym_word_count += words.len();
|
||||
let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
|
||||
Some(ctx.phrase_interner.insert(Phrase { words }))
|
||||
})
|
||||
.collect();
|
||||
let zero_typo =
|
||||
ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db };
|
||||
|
||||
Ok(QueryTerm {
|
||||
original: word_interned,
|
||||
ngram_words: None,
|
||||
max_levenshtein_distance: max_typo,
|
||||
is_prefix,
|
||||
zero_typo,
|
||||
one_typo: Lazy::Uninit,
|
||||
two_typo: Lazy::Uninit,
|
||||
})
|
||||
}
|
||||
|
||||
fn find_split_words(ctx: &mut SearchContext<'_>, word: &str) -> Result<Option<Interned<Phrase>>> {
|
||||
if let Some((l, r)) = split_best_frequency(ctx, word)? {
|
||||
Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
impl Interned<QueryTerm> {
|
||||
fn initialize_one_typo_subterm(self, ctx: &mut SearchContext<'_>) -> Result<()> {
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
|
||||
let allows_split_words = self_mut.allows_split_words();
|
||||
let QueryTerm {
|
||||
original,
|
||||
is_prefix,
|
||||
one_typo,
|
||||
max_levenshtein_distance: max_nbr_typos,
|
||||
..
|
||||
} = self_mut;
|
||||
|
||||
let original = *original;
|
||||
let is_prefix = *is_prefix;
|
||||
// let original_str = ctx.word_interner.get(*original).to_owned();
|
||||
if one_typo.is_init() {
|
||||
return Ok(());
|
||||
}
|
||||
let mut one_typo_words = BTreeSet::new();
|
||||
|
||||
if *max_nbr_typos > 0 {
|
||||
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
|
||||
match nbr_typos {
|
||||
ZeroOrOneTypo::Zero => {}
|
||||
ZeroOrOneTypo::One => {
|
||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||
one_typo_words.insert(derived_word);
|
||||
} else {
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ControlFlow::Continue(()))
|
||||
})?;
|
||||
}
|
||||
|
||||
let split_words = if allows_split_words {
|
||||
let original_str = ctx.word_interner.get(original).to_owned();
|
||||
find_split_words(ctx, original_str.as_str())?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
|
||||
// Only add the split words to the derivations if:
|
||||
// 1. the term is neither an ngram nor a phrase; OR
|
||||
// 2. the term is an ngram, but the split words are different from the ngram's component words
|
||||
let split_words = if let Some((ngram_words, split_words)) =
|
||||
self_mut.ngram_words.as_ref().zip(split_words.as_ref())
|
||||
{
|
||||
let Phrase { words } = ctx.phrase_interner.get(*split_words);
|
||||
if ngram_words.iter().ne(words.iter().flatten()) {
|
||||
Some(*split_words)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
split_words
|
||||
};
|
||||
let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
|
||||
|
||||
self_mut.one_typo = Lazy::Init(one_typo);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext<'_>) -> Result<()> {
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
let QueryTerm {
|
||||
original,
|
||||
is_prefix,
|
||||
two_typo,
|
||||
max_levenshtein_distance: max_nbr_typos,
|
||||
..
|
||||
} = self_mut;
|
||||
let original_str = ctx.word_interner.get(*original).to_owned();
|
||||
if two_typo.is_init() {
|
||||
return Ok(());
|
||||
}
|
||||
let mut one_typo_words = BTreeSet::new();
|
||||
let mut two_typo_words = BTreeSet::new();
|
||||
|
||||
if *max_nbr_typos > 0 {
|
||||
find_zero_one_two_typo_derivations(
|
||||
*original,
|
||||
*is_prefix,
|
||||
ctx.index.words_fst(ctx.txn)?,
|
||||
&mut ctx.word_interner,
|
||||
|derived_word, nbr_typos| {
|
||||
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
|
||||
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
|
||||
{
|
||||
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
match nbr_typos {
|
||||
NumberOfTypos::Zero => {}
|
||||
NumberOfTypos::One => {
|
||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||
one_typo_words.insert(derived_word);
|
||||
}
|
||||
}
|
||||
NumberOfTypos::Two => {
|
||||
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
|
||||
two_typo_words.insert(derived_word);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ControlFlow::Continue(()))
|
||||
},
|
||||
)?;
|
||||
}
|
||||
|
||||
let split_words = find_split_words(ctx, original_str.as_str())?;
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
|
||||
let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
|
||||
|
||||
let two_typo = TwoTypoTerm { two_typos: two_typo_words };
|
||||
|
||||
self_mut.one_typo = Lazy::Init(one_typo);
|
||||
self_mut.two_typo = Lazy::Init(two_typo);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Split the original word into the two words that appear the
|
||||
/// most next to each other in the index.
|
||||
///
|
||||
/// Return `None` if the original word cannot be split.
|
||||
fn split_best_frequency(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
original: &str,
|
||||
) -> Result<Option<(Interned<String>, Interned<String>)>> {
|
||||
let chars = original.char_indices().skip(1);
|
||||
let mut best = None;
|
||||
|
||||
for (i, _) in chars {
|
||||
let (left, right) = original.split_at(i);
|
||||
let left = ctx.word_interner.insert(left.to_owned());
|
||||
let right = ctx.word_interner.insert(right.to_owned());
|
||||
|
||||
if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(None, left, right, 1)? {
|
||||
if best.map_or(true, |(old, _, _)| frequency > old) {
|
||||
best = Some((frequency, left, right));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(best.map(|(_, left, right)| (left, right)))
|
||||
}
|
||||
510
crates/milli/src/search/new/query_term/mod.rs
Normal file
510
crates/milli/src/search/new/query_term/mod.rs
Normal file
@@ -0,0 +1,510 @@
|
||||
mod compute_derivations;
|
||||
mod ntypo_subset;
|
||||
mod parse_query;
|
||||
mod phrase;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::iter::FromIterator;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use either::Either;
|
||||
pub use ntypo_subset::NTypoTermSubset;
|
||||
pub use parse_query::{
|
||||
located_query_terms_from_tokens, make_ngram, number_of_typos_allowed, ExtractedTokens,
|
||||
};
|
||||
pub use phrase::Phrase;
|
||||
|
||||
use super::interner::{DedupInterner, Interned};
|
||||
use super::{limits, SearchContext, Word};
|
||||
use crate::Result;
|
||||
|
||||
/// A set of word derivations attached to a location in the search query.
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct LocatedQueryTermSubset {
|
||||
pub term_subset: QueryTermSubset,
|
||||
pub positions: RangeInclusive<u16>,
|
||||
pub term_ids: RangeInclusive<u8>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct QueryTermSubset {
|
||||
original: Interned<QueryTerm>,
|
||||
zero_typo_subset: NTypoTermSubset,
|
||||
one_typo_subset: NTypoTermSubset,
|
||||
two_typo_subset: NTypoTermSubset,
|
||||
/// `true` if the term cannot be deleted through the term matching strategy
|
||||
///
|
||||
/// Note that there are other reasons for which a term cannot be deleted, such as
|
||||
/// being a phrase. In that case, this field could be set to `false`, but it
|
||||
/// still wouldn't be deleteable by the term matching strategy.
|
||||
mandatory: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct QueryTerm {
|
||||
original: Interned<String>,
|
||||
ngram_words: Option<Vec<Interned<String>>>,
|
||||
max_levenshtein_distance: u8,
|
||||
is_prefix: bool,
|
||||
zero_typo: ZeroTypoTerm,
|
||||
// May not be computed yet
|
||||
one_typo: Lazy<OneTypoTerm>,
|
||||
// May not be computed yet
|
||||
two_typo: Lazy<TwoTypoTerm>,
|
||||
}
|
||||
|
||||
// SubTerms will be in a dedup interner
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
struct ZeroTypoTerm {
|
||||
/// The original phrase, if any
|
||||
phrase: Option<Interned<Phrase>>,
|
||||
/// A single word equivalent to the original term, with zero typos
|
||||
exact: Option<Interned<String>>,
|
||||
/// All the words that contain the original word as prefix
|
||||
prefix_of: BTreeSet<Interned<String>>,
|
||||
/// All the synonyms of the original word or phrase
|
||||
synonyms: BTreeSet<Interned<Phrase>>,
|
||||
/// A prefix in the prefix databases matching the original word
|
||||
use_prefix_db: Option<Interned<String>>,
|
||||
}
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
struct OneTypoTerm {
|
||||
/// The original word split into multiple consecutive words
|
||||
split_words: Option<Interned<Phrase>>,
|
||||
/// Words that are 1 typo away from the original word
|
||||
one_typo: BTreeSet<Interned<String>>,
|
||||
}
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
struct TwoTypoTerm {
|
||||
/// Words that are 2 typos away from the original word
|
||||
two_typos: BTreeSet<Interned<String>>,
|
||||
}
|
||||
|
||||
/// A value that is either not computed yet (`Uninit`) or available (`Init`).
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Lazy<T> {
    Uninit,
    Init(T),
}

impl<T> Lazy<T> {
    /// Return `true` if the value has been computed.
    pub fn is_init(&self) -> bool {
        matches!(self, Lazy::Init(_))
    }

    /// Return `true` if the value has not been computed yet.
    pub fn is_uninit(&self) -> bool {
        matches!(self, Lazy::Uninit)
    }
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
pub enum ExactTerm {
|
||||
Phrase(Interned<Phrase>),
|
||||
Word(Interned<String>),
|
||||
}
|
||||
|
||||
impl ExactTerm {
|
||||
pub fn interned_words<'ctx>(
|
||||
&self,
|
||||
ctx: &'ctx SearchContext<'ctx>,
|
||||
) -> impl Iterator<Item = Option<Interned<String>>> + 'ctx {
|
||||
match *self {
|
||||
ExactTerm::Phrase(phrase) => {
|
||||
let phrase = ctx.phrase_interner.get(phrase);
|
||||
Either::Left(phrase.words.iter().copied())
|
||||
}
|
||||
ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryTermSubset {
|
||||
pub fn is_mandatory(&self) -> bool {
|
||||
self.mandatory
|
||||
}
|
||||
pub fn make_mandatory(&mut self) {
|
||||
self.mandatory = true;
|
||||
}
|
||||
pub fn exact_term(&self, ctx: &SearchContext<'_>) -> Option<ExactTerm> {
|
||||
let full_query_term = ctx.term_interner.get(self.original);
|
||||
if full_query_term.ngram_words.is_some() {
|
||||
return None;
|
||||
}
|
||||
if let Some(phrase) = full_query_term.zero_typo.phrase {
|
||||
self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
|
||||
} else if let Some(word) = full_query_term.zero_typo.exact {
|
||||
self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn empty(for_term: Interned<QueryTerm>) -> Self {
|
||||
Self {
|
||||
original: for_term,
|
||||
zero_typo_subset: NTypoTermSubset::Nothing,
|
||||
one_typo_subset: NTypoTermSubset::Nothing,
|
||||
two_typo_subset: NTypoTermSubset::Nothing,
|
||||
mandatory: false,
|
||||
}
|
||||
}
|
||||
pub fn full(for_term: Interned<QueryTerm>) -> Self {
|
||||
Self {
|
||||
original: for_term,
|
||||
zero_typo_subset: NTypoTermSubset::All,
|
||||
one_typo_subset: NTypoTermSubset::All,
|
||||
two_typo_subset: NTypoTermSubset::All,
|
||||
mandatory: false,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn union(&mut self, other: &Self) {
|
||||
assert!(self.original == other.original);
|
||||
self.zero_typo_subset.union(&other.zero_typo_subset);
|
||||
self.one_typo_subset.union(&other.one_typo_subset);
|
||||
self.two_typo_subset.union(&other.two_typo_subset);
|
||||
}
|
||||
pub fn intersect(&mut self, other: &Self) {
|
||||
assert!(self.original == other.original);
|
||||
self.zero_typo_subset.intersect(&other.zero_typo_subset);
|
||||
self.one_typo_subset.intersect(&other.one_typo_subset);
|
||||
self.two_typo_subset.intersect(&other.two_typo_subset);
|
||||
}
|
||||
|
||||
pub fn use_prefix_db(&self, ctx: &SearchContext<'_>) -> Option<Word> {
|
||||
let original = ctx.term_interner.get(self.original);
|
||||
let use_prefix_db = original.zero_typo.use_prefix_db?;
|
||||
let word = match &self.zero_typo_subset {
|
||||
NTypoTermSubset::All => Some(use_prefix_db),
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
if words.contains(&use_prefix_db) {
|
||||
Some(use_prefix_db)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
NTypoTermSubset::Nothing => None,
|
||||
};
|
||||
word.map(|word| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(word)
|
||||
} else {
|
||||
Word::Original(word)
|
||||
}
|
||||
})
|
||||
}
|
||||
pub fn all_single_words_except_prefix_db(
|
||||
&self,
|
||||
ctx: &mut SearchContext<'_>,
|
||||
) -> Result<BTreeSet<Word>> {
|
||||
let mut result = BTreeSet::default();
|
||||
if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
|
||||
self.original.compute_fully_if_needed(ctx)?;
|
||||
}
|
||||
|
||||
let original = ctx.term_interner.get_mut(self.original);
|
||||
match &self.zero_typo_subset {
|
||||
NTypoTermSubset::All => {
|
||||
let ZeroTypoTerm {
|
||||
phrase: _,
|
||||
exact: zero_typo,
|
||||
prefix_of,
|
||||
synonyms: _,
|
||||
use_prefix_db: _,
|
||||
} = &original.zero_typo;
|
||||
result.extend(zero_typo.iter().copied().map(|w| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(w)
|
||||
} else {
|
||||
Word::Original(w)
|
||||
}
|
||||
}));
|
||||
result.extend(prefix_of.iter().copied().map(|w| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(w)
|
||||
} else {
|
||||
Word::Original(w)
|
||||
}
|
||||
}));
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let ZeroTypoTerm {
|
||||
phrase: _,
|
||||
exact: zero_typo,
|
||||
prefix_of,
|
||||
synonyms: _,
|
||||
use_prefix_db: _,
|
||||
} = &original.zero_typo;
|
||||
if let Some(zero_typo) = zero_typo {
|
||||
if words.contains(zero_typo) {
|
||||
if original.ngram_words.is_some() {
|
||||
result.insert(Word::Derived(*zero_typo));
|
||||
} else {
|
||||
result.insert(Word::Original(*zero_typo));
|
||||
}
|
||||
}
|
||||
}
|
||||
result.extend(prefix_of.intersection(words).copied().map(|w| {
|
||||
if original.ngram_words.is_some() {
|
||||
Word::Derived(w)
|
||||
} else {
|
||||
Word::Original(w)
|
||||
}
|
||||
}));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
}
|
||||
|
||||
match &self.one_typo_subset {
|
||||
NTypoTermSubset::All => {
|
||||
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo
|
||||
else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(one_typo.iter().copied().map(Word::Derived))
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo
|
||||
else {
|
||||
panic!()
|
||||
};
|
||||
result.extend(one_typo.intersection(words).copied().map(Word::Derived));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
};
|
||||
|
||||
match &self.two_typo_subset {
|
||||
NTypoTermSubset::All => {
|
||||
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { panic!() };
|
||||
result.extend(two_typos.iter().copied().map(Word::Derived));
|
||||
}
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => {
|
||||
let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { panic!() };
|
||||
result.extend(two_typos.intersection(words).copied().map(Word::Derived));
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
};
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
pub fn all_phrases(&self, ctx: &mut SearchContext<'_>) -> Result<BTreeSet<Interned<Phrase>>> {
|
||||
let mut result = BTreeSet::default();
|
||||
|
||||
if !self.one_typo_subset.is_empty() {
|
||||
self.original.compute_fully_if_needed(ctx)?;
|
||||
}
|
||||
let original = ctx.term_interner.get_mut(self.original);
|
||||
|
||||
let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } =
|
||||
&original.zero_typo;
|
||||
result.extend(phrase.iter().copied());
|
||||
result.extend(synonyms.iter().copied());
|
||||
|
||||
match &self.one_typo_subset {
|
||||
NTypoTermSubset::All => {
|
||||
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo
|
||||
else {
|
||||
panic!();
|
||||
};
|
||||
result.extend(split_words.iter().copied());
|
||||
}
|
||||
NTypoTermSubset::Subset { phrases, .. } => {
|
||||
let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo
|
||||
else {
|
||||
panic!();
|
||||
};
|
||||
if let Some(split_words) = split_words {
|
||||
if phrases.contains(split_words) {
|
||||
result.insert(*split_words);
|
||||
}
|
||||
}
|
||||
}
|
||||
NTypoTermSubset::Nothing => {}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn original_phrase(&self, ctx: &SearchContext<'_>) -> Option<Interned<Phrase>> {
|
||||
let t = ctx.term_interner.get(self.original);
|
||||
if let Some(p) = t.zero_typo.phrase {
|
||||
if self.zero_typo_subset.contains_phrase(p) {
|
||||
return Some(p);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
pub fn max_typo_cost(&self, ctx: &SearchContext<'_>) -> u8 {
|
||||
let t = ctx.term_interner.get(self.original);
|
||||
match t.max_levenshtein_distance {
|
||||
0 => {
|
||||
if t.allows_split_words() {
|
||||
1
|
||||
} else {
|
||||
0
|
||||
}
|
||||
}
|
||||
1 => {
|
||||
if self.one_typo_subset.is_empty() {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
}
|
||||
2 => {
|
||||
if self.two_typo_subset.is_empty() {
|
||||
if self.one_typo_subset.is_empty() {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
}
|
||||
} else {
|
||||
2
|
||||
}
|
||||
}
|
||||
_ => panic!(),
|
||||
}
|
||||
}
|
||||
pub fn keep_only_exact_term(&mut self, ctx: &SearchContext<'_>) {
|
||||
if let Some(term) = self.exact_term(ctx) {
|
||||
match term {
|
||||
ExactTerm::Phrase(p) => {
|
||||
self.zero_typo_subset = NTypoTermSubset::Subset {
|
||||
words: BTreeSet::new(),
|
||||
phrases: BTreeSet::from_iter([p]),
|
||||
};
|
||||
self.clear_one_typo_subset();
|
||||
self.clear_two_typo_subset();
|
||||
}
|
||||
ExactTerm::Word(w) => {
|
||||
self.zero_typo_subset = NTypoTermSubset::Subset {
|
||||
words: BTreeSet::from_iter([w]),
|
||||
phrases: BTreeSet::new(),
|
||||
};
|
||||
self.clear_one_typo_subset();
|
||||
self.clear_two_typo_subset();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn clear_zero_typo_subset(&mut self) {
|
||||
self.zero_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
pub fn clear_one_typo_subset(&mut self) {
|
||||
self.one_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
pub fn clear_two_typo_subset(&mut self) {
|
||||
self.two_typo_subset = NTypoTermSubset::Nothing;
|
||||
}
|
||||
pub fn description(&self, ctx: &SearchContext<'_>) -> String {
|
||||
let t = ctx.term_interner.get(self.original);
|
||||
ctx.word_interner.get(t.original).to_owned()
|
||||
}
|
||||
}
|
||||
|
||||
impl ZeroTypoTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self;
|
||||
phrase.is_none()
|
||||
&& zero_typo.is_none()
|
||||
&& prefix_of.is_empty()
|
||||
&& synonyms.is_empty()
|
||||
&& use_prefix_db.is_none()
|
||||
}
|
||||
}
|
||||
impl OneTypoTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let OneTypoTerm { split_words, one_typo } = self;
|
||||
one_typo.is_empty() && split_words.is_none()
|
||||
}
|
||||
}
|
||||
impl TwoTypoTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let TwoTypoTerm { two_typos } = self;
|
||||
two_typos.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryTerm {
|
||||
fn is_empty(&self) -> bool {
|
||||
let Lazy::Init(one_typo) = &self.one_typo else {
|
||||
return false;
|
||||
};
|
||||
let Lazy::Init(two_typo) = &self.two_typo else {
|
||||
return false;
|
||||
};
|
||||
|
||||
self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
|
||||
}
|
||||
fn allows_split_words(&self) -> bool {
|
||||
self.zero_typo.phrase.is_none()
|
||||
}
|
||||
}
|
||||
|
||||
impl Interned<QueryTerm> {
|
||||
/// Return the original word from the given query term
|
||||
fn original_single_word(self, ctx: &SearchContext<'_>) -> Option<Interned<String>> {
|
||||
let self_ = ctx.term_interner.get(self);
|
||||
if self_.ngram_words.is_some() {
|
||||
None
|
||||
} else {
|
||||
Some(self_.original)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A query term coupled with its position in the user's search query.
|
||||
#[derive(Clone)]
|
||||
pub struct LocatedQueryTerm {
|
||||
pub value: Interned<QueryTerm>,
|
||||
pub positions: RangeInclusive<u16>,
|
||||
}
|
||||
|
||||
impl LocatedQueryTerm {
|
||||
/// Return `true` iff the term is empty
|
||||
pub fn is_empty(&self, interner: &DedupInterner<QueryTerm>) -> bool {
|
||||
interner.get(self.value).is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl QueryTerm {
|
||||
pub fn is_cached_prefix(&self) -> bool {
|
||||
self.zero_typo.use_prefix_db.is_some()
|
||||
}
|
||||
pub fn is_prefix(&self) -> bool {
|
||||
self.is_prefix
|
||||
}
|
||||
pub fn original_word(&self, ctx: &SearchContext<'_>) -> String {
|
||||
ctx.word_interner.get(self.original).clone()
|
||||
}
|
||||
|
||||
pub fn original_phrase(&self) -> Option<Interned<Phrase>> {
|
||||
self.zero_typo.phrase
|
||||
}
|
||||
|
||||
pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
|
||||
let mut words = BTreeSet::new();
|
||||
let mut phrases = BTreeSet::new();
|
||||
|
||||
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
|
||||
&self.zero_typo;
|
||||
words.extend(zero_typo.iter().copied());
|
||||
words.extend(prefix_of.iter().copied());
|
||||
phrases.extend(phrase.iter().copied());
|
||||
phrases.extend(synonyms.iter().copied());
|
||||
|
||||
if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = &self.one_typo {
|
||||
words.extend(one_typo.iter().copied());
|
||||
phrases.extend(split_words.iter().copied());
|
||||
};
|
||||
|
||||
if let Lazy::Init(TwoTypoTerm { two_typos }) = &self.two_typo {
|
||||
words.extend(two_typos.iter().copied());
|
||||
};
|
||||
|
||||
(words.into_iter().collect(), phrases.into_iter().collect())
|
||||
}
|
||||
}
|
||||
79
crates/milli/src/search/new/query_term/ntypo_subset.rs
Normal file
79
crates/milli/src/search/new/query_term/ntypo_subset.rs
Normal file
@@ -0,0 +1,79 @@
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use super::Phrase;
|
||||
use crate::search::new::interner::Interned;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub enum NTypoTermSubset {
|
||||
All,
|
||||
Subset {
|
||||
words: BTreeSet<Interned<String>>,
|
||||
phrases: BTreeSet<Interned<Phrase>>,
|
||||
// TODO: prefixes: BTreeSet<Interned<String>>,
|
||||
},
|
||||
Nothing,
|
||||
}
|
||||
|
||||
impl NTypoTermSubset {
|
||||
pub fn contains_word(&self, word: Interned<String>) -> bool {
|
||||
match self {
|
||||
NTypoTermSubset::All => true,
|
||||
NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word),
|
||||
NTypoTermSubset::Nothing => false,
|
||||
}
|
||||
}
|
||||
pub fn contains_phrase(&self, phrase: Interned<Phrase>) -> bool {
|
||||
match self {
|
||||
NTypoTermSubset::All => true,
|
||||
NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase),
|
||||
NTypoTermSubset::Nothing => false,
|
||||
}
|
||||
}
|
||||
pub fn is_empty(&self) -> bool {
|
||||
match self {
|
||||
NTypoTermSubset::All => false,
|
||||
NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(),
|
||||
NTypoTermSubset::Nothing => true,
|
||||
}
|
||||
}
|
||||
pub fn union(&mut self, other: &Self) {
|
||||
match self {
|
||||
Self::All => {}
|
||||
Self::Subset { words, phrases } => match other {
|
||||
Self::All => {
|
||||
*self = Self::All;
|
||||
}
|
||||
Self::Subset { words: w2, phrases: p2 } => {
|
||||
words.extend(w2);
|
||||
phrases.extend(p2);
|
||||
}
|
||||
Self::Nothing => {}
|
||||
},
|
||||
Self::Nothing => {
|
||||
*self = other.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
pub fn intersect(&mut self, other: &Self) {
|
||||
match self {
|
||||
Self::All => *self = other.clone(),
|
||||
Self::Subset { words, phrases } => match other {
|
||||
Self::All => {}
|
||||
Self::Subset { words: w2, phrases: p2 } => {
|
||||
let mut ws = BTreeSet::new();
|
||||
for w in words.intersection(w2) {
|
||||
ws.insert(*w);
|
||||
}
|
||||
let mut ps = BTreeSet::new();
|
||||
for p in phrases.intersection(p2) {
|
||||
ps.insert(*p);
|
||||
}
|
||||
*words = ws;
|
||||
*phrases = ps;
|
||||
}
|
||||
Self::Nothing => *self = Self::Nothing,
|
||||
},
|
||||
Self::Nothing => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
382
crates/milli/src/search/new/query_term/parse_query.rs
Normal file
382
crates/milli/src/search/new/query_term/parse_query.rs
Normal file
@@ -0,0 +1,382 @@
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use charabia::normalizer::NormalizedTokenIter;
|
||||
use charabia::{SeparatorKind, TokenKind};
|
||||
|
||||
use super::compute_derivations::partially_initialized_term_from_word;
|
||||
use super::{LocatedQueryTerm, ZeroTypoTerm};
|
||||
use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
|
||||
use crate::search::new::Word;
|
||||
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Extraction of the content of a query.
|
||||
pub struct ExtractedTokens {
|
||||
/// The terms to search for in the database.
|
||||
pub query_terms: Vec<LocatedQueryTerm>,
|
||||
/// The words that must not appear in the results.
|
||||
pub negative_words: Vec<Word>,
|
||||
/// The phrases that must not appear in the results.
|
||||
pub negative_phrases: Vec<LocatedQueryTerm>,
|
||||
}
|
||||
|
||||
/// Convert the tokenised search query into a list of located query terms.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
|
||||
pub fn located_query_terms_from_tokens(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
query: NormalizedTokenIter<'_, '_, '_, '_>,
|
||||
words_limit: Option<usize>,
|
||||
) -> Result<ExtractedTokens> {
|
||||
let nbr_typos = number_of_typos_allowed(ctx)?;
|
||||
|
||||
let mut query_terms = Vec::new();
|
||||
|
||||
let mut negative_phrase = false;
|
||||
let mut phrase: Option<PhraseBuilder> = None;
|
||||
let mut encountered_whitespace = true;
|
||||
let mut negative_next_token = false;
|
||||
let mut negative_words = Vec::new();
|
||||
let mut negative_phrases = Vec::new();
|
||||
|
||||
let parts_limit = words_limit.unwrap_or(usize::MAX);
|
||||
|
||||
// start with the last position as we will wrap around to position 0 at the beginning of the loop below.
|
||||
let mut position = u16::MAX;
|
||||
|
||||
let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable();
|
||||
while let Some(token) = peekable.next() {
|
||||
if token.lemma().is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// early return if word limit is exceeded
|
||||
if query_terms.len() >= parts_limit {
|
||||
return Ok(ExtractedTokens { query_terms, negative_words, negative_phrases });
|
||||
}
|
||||
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord => {
|
||||
// On first loop, goes from u16::MAX to 0, then normal increment.
|
||||
position = position.wrapping_add(1);
|
||||
|
||||
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
|
||||
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
|
||||
// 3. if the word is the last token of the query we push it as a prefix word.
|
||||
if let Some(phrase) = &mut phrase {
|
||||
phrase.push_word(ctx, &token, position)
|
||||
} else if negative_next_token {
|
||||
let word = token.lemma().to_string();
|
||||
let word = Word::Original(ctx.word_interner.insert(word));
|
||||
negative_words.push(word);
|
||||
negative_next_token = false;
|
||||
} else if peekable.peek().is_some() {
|
||||
match token.kind {
|
||||
TokenKind::Word => {
|
||||
let word = token.lemma();
|
||||
let term = partially_initialized_term_from_word(
|
||||
ctx,
|
||||
word,
|
||||
nbr_typos(word),
|
||||
false,
|
||||
false,
|
||||
)?;
|
||||
let located_term = LocatedQueryTerm {
|
||||
value: ctx.term_interner.push(term),
|
||||
positions: position..=position,
|
||||
};
|
||||
query_terms.push(located_term);
|
||||
}
|
||||
TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (),
|
||||
}
|
||||
} else {
|
||||
let word = token.lemma();
|
||||
let term = partially_initialized_term_from_word(
|
||||
ctx,
|
||||
word,
|
||||
nbr_typos(word),
|
||||
true,
|
||||
false,
|
||||
)?;
|
||||
let located_term = LocatedQueryTerm {
|
||||
value: ctx.term_interner.push(term),
|
||||
positions: position..=position,
|
||||
};
|
||||
query_terms.push(located_term);
|
||||
}
|
||||
}
|
||||
TokenKind::Separator(separator_kind) => {
|
||||
// add penalty for hard separators
|
||||
if let SeparatorKind::Hard = separator_kind {
|
||||
position = position.wrapping_add(7);
|
||||
}
|
||||
|
||||
phrase = 'phrase: {
|
||||
let phrase = phrase.take();
|
||||
|
||||
// If we have a hard separator inside a phrase, we immediately start a new phrase
|
||||
let phrase = if separator_kind == SeparatorKind::Hard {
|
||||
if let Some(phrase) = phrase {
|
||||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
// as we are evaluating a negative operator we put the phrase
|
||||
// in the negative one *but* we don't reset the negative operator
|
||||
// as we are immediately starting a new negative phrase.
|
||||
if negative_phrase {
|
||||
negative_phrases.push(located_query_term);
|
||||
} else {
|
||||
query_terms.push(located_query_term);
|
||||
}
|
||||
}
|
||||
Some(PhraseBuilder::empty())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
phrase
|
||||
};
|
||||
|
||||
// We close and start a new phrase depending on the number of double quotes
|
||||
let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
|
||||
if quote_count == 0 {
|
||||
break 'phrase phrase;
|
||||
}
|
||||
|
||||
// Consume the closing quote and the phrase
|
||||
if let Some(phrase) = phrase {
|
||||
// Per the check above, quote_count > 0
|
||||
quote_count -= 1;
|
||||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
// we were evaluating a negative operator so we
|
||||
// put the phrase in the negative phrases
|
||||
if negative_phrase {
|
||||
negative_phrases.push(located_query_term);
|
||||
negative_phrase = false;
|
||||
} else {
|
||||
query_terms.push(located_query_term);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start new phrase if the token ends with an opening quote
|
||||
if quote_count % 2 == 1 {
|
||||
negative_phrase = negative_next_token;
|
||||
Some(PhraseBuilder::empty())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
negative_next_token =
|
||||
phrase.is_none() && token.lemma() == "-" && encountered_whitespace;
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
encountered_whitespace =
|
||||
token.lemma().chars().last().filter(|c| c.is_whitespace()).is_some();
|
||||
}
|
||||
|
||||
// If a quote is never closed, we consider all of the end of the query as a phrase.
|
||||
if let Some(phrase) = phrase.take() {
|
||||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
// put the phrase in the negative set if we are evaluating a negative operator.
|
||||
if negative_phrase {
|
||||
negative_phrases.push(located_query_term);
|
||||
} else {
|
||||
query_terms.push(located_query_term);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ExtractedTokens { query_terms, negative_words, negative_phrases })
|
||||
}
|
||||
|
||||
pub fn number_of_typos_allowed<'ctx>(
|
||||
ctx: &SearchContext<'ctx>,
|
||||
) -> Result<impl Fn(&str) -> u8 + 'ctx> {
|
||||
let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
|
||||
let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
|
||||
let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
|
||||
|
||||
let exact_words = ctx.index.exact_words(ctx.txn)?;
|
||||
|
||||
Ok(Box::new(move |word: &str| {
|
||||
if !authorize_typos
|
||||
|| word.len() < min_len_one_typo as usize
|
||||
|| exact_words.as_ref().map_or(false, |fst| fst.contains(word))
|
||||
{
|
||||
0
|
||||
} else if word.len() < min_len_two_typos as usize {
|
||||
1
|
||||
} else {
|
||||
2
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn make_ngram(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
terms: &[LocatedQueryTerm],
|
||||
number_of_typos_allowed: &impl Fn(&str) -> u8,
|
||||
) -> Result<Option<LocatedQueryTerm>> {
|
||||
assert!(!terms.is_empty());
|
||||
for t in terms {
|
||||
if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
for ts in terms.windows(2) {
|
||||
let [t1, t2] = ts else { panic!() };
|
||||
if *t1.positions.end() != t2.positions.start() - 1 {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
let mut words_interned = vec![];
|
||||
for term in terms {
|
||||
if let Some(original_term_word) = term.value.original_single_word(ctx) {
|
||||
words_interned.push(original_term_word);
|
||||
} else {
|
||||
return Ok(None);
|
||||
}
|
||||
}
|
||||
let words =
|
||||
words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::<Vec<_>>();
|
||||
|
||||
let start = *terms.first().as_ref().unwrap().positions.start();
|
||||
let end = *terms.last().as_ref().unwrap().positions.end();
|
||||
let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix;
|
||||
let ngram_str = words.join("");
|
||||
if ngram_str.len() > MAX_WORD_LENGTH {
|
||||
return Ok(None);
|
||||
}
|
||||
let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone());
|
||||
|
||||
let max_nbr_typos =
|
||||
number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
|
||||
|
||||
let mut term =
|
||||
partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?;
|
||||
|
||||
// Now add the synonyms
|
||||
let index_synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||
|
||||
term.zero_typo.synonyms.extend(
|
||||
index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| {
|
||||
let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
|
||||
ctx.phrase_interner.insert(Phrase { words })
|
||||
}),
|
||||
);
|
||||
|
||||
let term = QueryTerm {
|
||||
original: ngram_str_interned,
|
||||
ngram_words: Some(words_interned),
|
||||
is_prefix,
|
||||
max_levenshtein_distance: max_nbr_typos,
|
||||
zero_typo: term.zero_typo,
|
||||
one_typo: Lazy::Uninit,
|
||||
two_typo: Lazy::Uninit,
|
||||
};
|
||||
|
||||
let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end };
|
||||
|
||||
Ok(Some(term))
|
||||
}
|
||||
|
||||
struct PhraseBuilder {
|
||||
words: Vec<Option<crate::search::new::Interned<String>>>,
|
||||
start: u16,
|
||||
end: u16,
|
||||
}
|
||||
|
||||
impl PhraseBuilder {
|
||||
fn empty() -> Self {
|
||||
Self { words: Default::default(), start: u16::MAX, end: u16::MAX }
|
||||
}
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.words.is_empty() || self.words.iter().all(Option::is_none)
|
||||
}
|
||||
|
||||
// precondition: token has kind Word or StopWord
|
||||
fn push_word(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'_>,
|
||||
token: &charabia::Token<'_>,
|
||||
position: u16,
|
||||
) {
|
||||
if self.is_empty() {
|
||||
self.start = position;
|
||||
}
|
||||
self.end = position;
|
||||
if let TokenKind::StopWord = token.kind {
|
||||
self.words.push(None);
|
||||
} else {
|
||||
// token has kind Word
|
||||
let word = ctx.word_interner.insert(token.lemma().to_string());
|
||||
self.words.push(Some(word));
|
||||
}
|
||||
}
|
||||
|
||||
fn build(self, ctx: &mut SearchContext<'_>) -> Option<LocatedQueryTerm> {
|
||||
if self.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some(LocatedQueryTerm {
|
||||
value: ctx.term_interner.push({
|
||||
let phrase = ctx.phrase_interner.insert(Phrase { words: self.words });
|
||||
let phrase_desc = phrase.description(ctx);
|
||||
QueryTerm {
|
||||
original: ctx.word_interner.insert(phrase_desc),
|
||||
ngram_words: None,
|
||||
max_levenshtein_distance: 0,
|
||||
is_prefix: false,
|
||||
zero_typo: ZeroTypoTerm {
|
||||
phrase: Some(phrase),
|
||||
exact: None,
|
||||
prefix_of: BTreeSet::default(),
|
||||
synonyms: BTreeSet::default(),
|
||||
use_prefix_db: None,
|
||||
},
|
||||
one_typo: Lazy::Uninit,
|
||||
two_typo: Lazy::Uninit,
|
||||
}
|
||||
}),
|
||||
positions: self.start..=self.end,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use charabia::TokenizerBuilder;

    use super::*;
    use crate::index::tests::TempIndex;

    /// Creates a temporary index and fills it with three small documents.
    fn temp_index_with_documents() -> TempIndex {
        let index = TempIndex::new();
        let docs = documents!([
            { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
            { "id": 2, "name": "Westfália" },
            { "id": 3, "name": "Ŵôřlḑôle" },
        ]);
        index.add_documents(docs).unwrap();
        index
    }

    /// A query made of a single hard separator must yield no query term.
    #[test]
    fn start_with_hard_separator() -> Result<()> {
        let mut tokenizer_builder = TokenizerBuilder::default();
        let tokenizer = tokenizer_builder.build();
        let tokens = tokenizer.tokenize(".");

        let index = temp_index_with_documents();
        let rtxn = index.read_txn()?;
        let mut ctx = SearchContext::new(&index, &rtxn)?;

        // panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
        let extracted = located_query_terms_from_tokens(&mut ctx, tokens, None)?;
        let ExtractedTokens { query_terms, .. } = extracted;
        assert!(query_terms.is_empty());

        Ok(())
    }
}
|
||||
21
crates/milli/src/search/new/query_term/phrase.rs
Normal file
21
crates/milli/src/search/new/query_term/phrase.rs
Normal file
@@ -0,0 +1,21 @@
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::search::new::interner::Interned;
|
||||
use crate::SearchContext;
|
||||
|
||||
/// A phrase in the user's search query, consisting of several words
|
||||
/// that must appear side-by-side in the search results.
|
||||
#[derive(Default, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct Phrase {
|
||||
pub words: Vec<Option<Interned<String>>>,
|
||||
}
|
||||
impl Interned<Phrase> {
|
||||
pub fn description(self, ctx: &SearchContext<'_>) -> String {
|
||||
let p = ctx.phrase_interner.get(self);
|
||||
p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ")
|
||||
}
|
||||
pub fn words(self, ctx: &SearchContext<'_>) -> Vec<Option<Interned<String>>> {
|
||||
let p = ctx.phrase_interner.get(self);
|
||||
p.words.clone()
|
||||
}
|
||||
}
|
||||
92
crates/milli/src/search/new/ranking_rule_graph/build.rs
Normal file
92
crates/milli/src/search/new/ranking_rule_graph/build.rs
Normal file
@@ -0,0 +1,92 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::search::new::interner::{DedupInterner, MappedInterner};
|
||||
use crate::search::new::query_graph::{QueryNode, QueryNodeData};
|
||||
use crate::search::new::small_bitmap::SmallBitmap;
|
||||
use crate::search::new::{QueryGraph, SearchContext};
|
||||
use crate::Result;
|
||||
|
||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
/// Build the ranking rule graph from the given query graph
|
||||
pub fn build(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
query_graph: QueryGraph,
|
||||
cost_of_ignoring_node: MappedInterner<QueryNode, Option<(u32, SmallBitmap<QueryNode>)>>,
|
||||
) -> Result<Self> {
|
||||
let QueryGraph { nodes: graph_nodes, .. } = &query_graph;
|
||||
|
||||
let mut conditions_interner = DedupInterner::default();
|
||||
|
||||
let mut edges_store = DedupInterner::default();
|
||||
let mut edges_of_node = query_graph.nodes.map(|_| HashSet::new());
|
||||
|
||||
for (source_id, source_node) in graph_nodes.iter() {
|
||||
let new_edges = edges_of_node.get_mut(source_id);
|
||||
|
||||
for dest_idx in source_node.successors.iter() {
|
||||
let src_term = match &source_node.data {
|
||||
QueryNodeData::Term(t) => Some(t),
|
||||
QueryNodeData::Start => None,
|
||||
QueryNodeData::Deleted | QueryNodeData::End => panic!(),
|
||||
};
|
||||
let dest_node = graph_nodes.get(dest_idx);
|
||||
let dest_term = match &dest_node.data {
|
||||
QueryNodeData::Term(t) => t,
|
||||
QueryNodeData::End => {
|
||||
let new_edge_id = edges_store.insert(Some(Edge {
|
||||
source_node: source_id,
|
||||
dest_node: dest_idx,
|
||||
cost: 0,
|
||||
condition: None,
|
||||
nodes_to_skip: SmallBitmap::for_interned_values_in(graph_nodes),
|
||||
}));
|
||||
new_edges.insert(new_edge_id);
|
||||
continue;
|
||||
}
|
||||
QueryNodeData::Deleted | QueryNodeData::Start => panic!(),
|
||||
};
|
||||
if let Some((cost_of_ignoring, forbidden_nodes)) =
|
||||
cost_of_ignoring_node.get(dest_idx)
|
||||
{
|
||||
let dest = graph_nodes.get(dest_idx);
|
||||
let dest_size = match &dest.data {
|
||||
QueryNodeData::Term(term) => term.term_ids.len(),
|
||||
_ => panic!(),
|
||||
};
|
||||
let new_edge_id = edges_store.insert(Some(Edge {
|
||||
source_node: source_id,
|
||||
dest_node: dest_idx,
|
||||
cost: *cost_of_ignoring * dest_size as u32,
|
||||
condition: None,
|
||||
nodes_to_skip: forbidden_nodes.clone(),
|
||||
}));
|
||||
new_edges.insert(new_edge_id);
|
||||
}
|
||||
|
||||
let edges = G::build_edges(ctx, &mut conditions_interner, src_term, dest_term)?;
|
||||
if edges.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
for (cost, condition) in edges {
|
||||
let new_edge_id = edges_store.insert(Some(Edge {
|
||||
source_node: source_id,
|
||||
dest_node: dest_idx,
|
||||
cost,
|
||||
condition: Some(condition),
|
||||
nodes_to_skip: SmallBitmap::for_interned_values_in(graph_nodes),
|
||||
}));
|
||||
new_edges.insert(new_edge_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
let edges_store = edges_store.freeze();
|
||||
let edges_of_node =
|
||||
edges_of_node.map(|edges| SmallBitmap::from_iter(edges.iter().copied(), &edges_store));
|
||||
|
||||
let conditions_interner = conditions_interner.freeze();
|
||||
|
||||
Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node, conditions_interner })
|
||||
}
|
||||
}
|
||||
400
crates/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
Normal file
400
crates/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
Normal file
@@ -0,0 +1,400 @@
|
||||
/** Implements a "PathVisitor" which finds all paths of a certain cost
|
||||
from the START to END node of a ranking rule graph.
|
||||
|
||||
A path is a list of conditions. A condition is the data associated with
|
||||
an edge, given by the ranking rule. Some edges don't have a condition associated
|
||||
with them, they are "unconditional". These kinds of edges are used to "skip" a node.
|
||||
|
||||
The algorithm uses a depth-first search. It benefits from two main optimisations:
|
||||
- The list of all possible costs to go from any node to the END node is precomputed
|
||||
- The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges
|
||||
untraversable depending on what other edges were selected.
|
||||
|
||||
These two optimisations are meant to avoid traversing edges that wouldn't lead
|
||||
to a valid path. In practically all cases, we avoid the exponential complexity
|
||||
that is inherent to depth-first search in a large ranking rule graph.
|
||||
|
||||
The DeadEndsCache is a sort of prefix tree which associates a list of forbidden
|
||||
conditions to a list of traversed conditions.
|
||||
For example, the DeadEndsCache could say the following:
|
||||
- Immediately, from the start, the conditions `[a,b]` are forbidden
|
||||
- if we take the condition `c`, then the conditions `[e]` are also forbidden
|
||||
- and if after that, we take `f`, then `[h,i]` are also forbidden
|
||||
- etc.
|
||||
- if we take `g`, then `[f]` is also forbidden
|
||||
- etc.
|
||||
- etc.
|
||||
As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden
|
||||
conditions in memory. Then, we know to avoid all edges which have a condition that is forbidden.
|
||||
|
||||
When a path is found from START to END, we give it to the `visit` closure.
|
||||
This closure takes a mutable reference to the `DeadEndsCache`. This means that
|
||||
the caller can update this cache. Therefore, we must handle the case where the
|
||||
DeadEndsCache has been updated. This means potentially backtracking up to the point
|
||||
where the traversed conditions are all allowed by the new DeadEndsCache.
|
||||
|
||||
The algorithm also implements the `TermsMatchingStrategy` logic.
|
||||
Some edges are augmented with a list of "nodes_to_skip". Skipping
|
||||
a node means "reaching this node through an unconditional edge". If we have
|
||||
already traversed (ie. not skipped) a node that is in this list, then we know that we
|
||||
can't traverse this edge. Otherwise, we traverse the edge but make sure to skip any
|
||||
future node that was present in the "nodes_to_skip" list.
|
||||
|
||||
The caller can decide to stop the path finding algorithm
|
||||
by returning a `ControlFlow::Break` from the `visit` closure.
|
||||
*/
|
||||
use std::collections::{BTreeSet, VecDeque};
|
||||
use std::iter::FromIterator;
|
||||
use std::ops::ControlFlow;
|
||||
|
||||
use fxhash::FxHashSet;
|
||||
|
||||
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::search::new::interner::{Interned, MappedInterner};
|
||||
use crate::search::new::query_graph::QueryNode;
|
||||
use crate::search::new::small_bitmap::SmallBitmap;
|
||||
use crate::Result;
|
||||
|
||||
/// Closure which processes a path found by the `PathVisitor`
|
||||
type VisitFn<'f, G> = &'f mut dyn FnMut(
|
||||
// the path as a list of conditions
|
||||
&[Interned<<G as RankingRuleGraphTrait>::Condition>],
|
||||
&mut RankingRuleGraph<G>,
|
||||
// a mutable reference to the DeadEndsCache, to update it in case the given
|
||||
// path doesn't resolve to any valid document ids
|
||||
&mut DeadEndsCache<<G as RankingRuleGraphTrait>::Condition>,
|
||||
) -> Result<ControlFlow<()>>;
|
||||
|
||||
/// A structure which is kept but not updated during the traversal of the graph.
|
||||
/// It can however be updated by the `visit` closure once a valid path has been found.
|
||||
struct VisitorContext<'a, G: RankingRuleGraphTrait> {
|
||||
graph: &'a mut RankingRuleGraph<G>,
|
||||
all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
|
||||
dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
|
||||
}
|
||||
|
||||
/// The internal state of the traversal algorithm
|
||||
struct VisitorState<G: RankingRuleGraphTrait> {
|
||||
/// Budget from the current node to the end node
|
||||
remaining_cost: u64,
|
||||
/// Previously visited conditions, in order.
|
||||
path: Vec<Interned<G::Condition>>,
|
||||
/// Previously visited conditions, as an efficient and compact set.
|
||||
visited_conditions: SmallBitmap<G::Condition>,
|
||||
/// Previously visited (ie not skipped) nodes, as an efficient and compact set.
|
||||
visited_nodes: SmallBitmap<QueryNode>,
|
||||
/// The conditions that cannot be visited anymore
|
||||
forbidden_conditions: SmallBitmap<G::Condition>,
|
||||
/// The nodes that cannot be visited anymore (they must be skipped)
|
||||
nodes_to_skip: SmallBitmap<QueryNode>,
|
||||
}
|
||||
|
||||
/// See module documentation
|
||||
pub struct PathVisitor<'a, G: RankingRuleGraphTrait> {
|
||||
state: VisitorState<G>,
|
||||
ctx: VisitorContext<'a, G>,
|
||||
}
|
||||
impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
|
||||
pub fn new(
|
||||
cost: u64,
|
||||
graph: &'a mut RankingRuleGraph<G>,
|
||||
all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
|
||||
dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
|
||||
) -> Self {
|
||||
Self {
|
||||
state: VisitorState {
|
||||
remaining_cost: cost,
|
||||
path: vec![],
|
||||
visited_conditions: SmallBitmap::for_interned_values_in(&graph.conditions_interner),
|
||||
visited_nodes: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
|
||||
forbidden_conditions: SmallBitmap::for_interned_values_in(
|
||||
&graph.conditions_interner,
|
||||
),
|
||||
nodes_to_skip: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
|
||||
},
|
||||
ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache },
|
||||
}
|
||||
}
|
||||
|
||||
/// See module documentation
|
||||
pub fn visit_paths(mut self, visit: VisitFn<'_, G>) -> Result<()> {
|
||||
let _ =
|
||||
self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<G: RankingRuleGraphTrait> VisitorState<G> {
|
||||
/// Visits a node: traverse all its valid conditional and unconditional edges.
|
||||
///
|
||||
/// Returns ControlFlow::Break if the path finding algorithm should stop.
|
||||
/// Returns whether a valid path was found from this node otherwise.
|
||||
fn visit_node(
|
||||
&mut self,
|
||||
from_node: Interned<QueryNode>,
|
||||
visit: VisitFn<'_, G>,
|
||||
ctx: &mut VisitorContext<'_, G>,
|
||||
) -> Result<ControlFlow<(), bool>> {
|
||||
// any valid path will be found from this point
|
||||
// if a valid path was found, then we know that the DeadEndsCache may have been updated,
|
||||
// and we will need to do more work to potentially backtrack
|
||||
let mut any_valid = false;
|
||||
|
||||
let edges = ctx.graph.edges_of_node.get(from_node).clone();
|
||||
for edge_idx in edges.iter() {
|
||||
// could be none if the edge was deleted
|
||||
let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue };
|
||||
|
||||
if self.remaining_cost < edge.cost as u64 {
|
||||
continue;
|
||||
}
|
||||
self.remaining_cost -= edge.cost as u64;
|
||||
|
||||
let cf = match edge.condition {
|
||||
Some(condition) => self.visit_condition(
|
||||
condition,
|
||||
edge.dest_node,
|
||||
&edge.nodes_to_skip,
|
||||
visit,
|
||||
ctx,
|
||||
)?,
|
||||
None => self.visit_no_condition(edge.dest_node, &edge.nodes_to_skip, visit, ctx)?,
|
||||
};
|
||||
self.remaining_cost += edge.cost as u64;
|
||||
|
||||
let ControlFlow::Continue(next_any_valid) = cf else {
|
||||
return Ok(ControlFlow::Break(()));
|
||||
};
|
||||
any_valid |= next_any_valid;
|
||||
if next_any_valid {
|
||||
// backtrack as much as possible if a valid path was found and the dead_ends_cache
|
||||
// was updated such that the current prefix is now invalid
|
||||
self.forbidden_conditions = ctx
|
||||
.dead_ends_cache
|
||||
.forbidden_conditions_for_all_prefixes_up_to(self.path.iter().copied());
|
||||
if self.visited_conditions.intersects(&self.forbidden_conditions) {
|
||||
return Ok(ControlFlow::Continue(true));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ControlFlow::Continue(any_valid))
|
||||
}
|
||||
|
||||
/// Visits an unconditional edge.
|
||||
///
|
||||
/// Returns ControlFlow::Break if the path finding algorithm should stop.
|
||||
/// Returns whether a valid path was found from this node otherwise.
|
||||
fn visit_no_condition(
|
||||
&mut self,
|
||||
dest_node: Interned<QueryNode>,
|
||||
edge_new_nodes_to_skip: &SmallBitmap<QueryNode>,
|
||||
visit: VisitFn<'_, G>,
|
||||
ctx: &mut VisitorContext<'_, G>,
|
||||
) -> Result<ControlFlow<(), bool>> {
|
||||
if !ctx
|
||||
.all_costs_from_node
|
||||
.get(dest_node)
|
||||
.iter()
|
||||
.any(|next_cost| *next_cost == self.remaining_cost)
|
||||
{
|
||||
return Ok(ControlFlow::Continue(false));
|
||||
}
|
||||
// We've reached the END node!
|
||||
if dest_node == ctx.graph.query_graph.end_node {
|
||||
let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?;
|
||||
// We could change the return type of the visit closure such that the caller
|
||||
// tells us whether the dead ends cache was updated or not.
|
||||
// Alternatively, maybe the DeadEndsCache should have a generation number
|
||||
// to it, so that we don't need to play with these booleans at all.
|
||||
match control_flow {
|
||||
ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)),
|
||||
ControlFlow::Break(_) => Ok(ControlFlow::Break(())),
|
||||
}
|
||||
} else {
|
||||
let old_fbct = self.nodes_to_skip.clone();
|
||||
self.nodes_to_skip.union(edge_new_nodes_to_skip);
|
||||
let cf = self.visit_node(dest_node, visit, ctx)?;
|
||||
self.nodes_to_skip = old_fbct;
|
||||
Ok(cf)
|
||||
}
|
||||
}
|
||||
/// Visits a conditional edge.
|
||||
///
|
||||
/// Returns ControlFlow::Break if the path finding algorithm should stop.
|
||||
/// Returns whether a valid path was found from this node otherwise.
|
||||
fn visit_condition(
|
||||
&mut self,
|
||||
condition: Interned<G::Condition>,
|
||||
dest_node: Interned<QueryNode>,
|
||||
edge_new_nodes_to_skip: &SmallBitmap<QueryNode>,
|
||||
visit: VisitFn<'_, G>,
|
||||
ctx: &mut VisitorContext<'_, G>,
|
||||
) -> Result<ControlFlow<(), bool>> {
|
||||
assert!(dest_node != ctx.graph.query_graph.end_node);
|
||||
|
||||
if self.forbidden_conditions.contains(condition)
|
||||
|| self.nodes_to_skip.contains(dest_node)
|
||||
|| edge_new_nodes_to_skip.intersects(&self.visited_nodes)
|
||||
{
|
||||
return Ok(ControlFlow::Continue(false));
|
||||
}
|
||||
|
||||
// Checking that from the destination node, there is at least
|
||||
// one cost that we can visit that corresponds to our remaining budget.
|
||||
if !ctx
|
||||
.all_costs_from_node
|
||||
.get(dest_node)
|
||||
.iter()
|
||||
.any(|next_cost| *next_cost == self.remaining_cost)
|
||||
{
|
||||
return Ok(ControlFlow::Continue(false));
|
||||
}
|
||||
|
||||
self.path.push(condition);
|
||||
self.visited_nodes.insert(dest_node);
|
||||
self.visited_conditions.insert(condition);
|
||||
|
||||
let old_forb_cond = self.forbidden_conditions.clone();
|
||||
if let Some(next_forbidden) =
|
||||
ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied())
|
||||
{
|
||||
self.forbidden_conditions.union(&next_forbidden);
|
||||
}
|
||||
let old_nodes_to_skip = self.nodes_to_skip.clone();
|
||||
self.nodes_to_skip.union(edge_new_nodes_to_skip);
|
||||
|
||||
let cf = self.visit_node(dest_node, visit, ctx)?;
|
||||
|
||||
self.nodes_to_skip = old_nodes_to_skip;
|
||||
self.forbidden_conditions = old_forb_cond;
|
||||
|
||||
self.visited_conditions.remove(condition);
|
||||
self.visited_nodes.remove(dest_node);
|
||||
self.path.pop();
|
||||
|
||||
Ok(cf)
|
||||
}
|
||||
}
|
||||
|
||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
/// Computes, for each node, the sorted and deduplicated list of costs of the
/// paths going from that node to the END node.
///
/// The END node itself gets the single cost `0`. Nodes that are never
/// visited by the backward traversal keep an empty list.
pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> {
    let end_node = self.query_graph.end_node;
    let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]);

    self.traverse_breadth_first_backward(end_node, |cur_node| {
        if cur_node == end_node {
            *costs_to_end.get_mut(end_node) = vec![0];
            return;
        }

        // Every cost of a successor, plus the cost of the edge leading to it.
        let mut node_costs = Vec::<u64>::new();
        for edge_idx in self.edges_of_node.get(cur_node).iter() {
            let edge = self.edges_store.get(edge_idx).as_ref().unwrap();
            let edge_cost = edge.cost as u64;
            let successor_costs = costs_to_end.get(edge.dest_node);
            node_costs.extend(successor_costs.iter().map(|succ_cost| edge_cost + succ_cost));
        }
        node_costs.sort_unstable();
        node_costs.dedup();

        *costs_to_end.get_mut(cur_node) = node_costs;
    });
    costs_to_end
}
|
||||
|
||||
pub fn update_all_costs_before_node(
|
||||
&self,
|
||||
node_with_removed_outgoing_conditions: Interned<QueryNode>,
|
||||
costs: &mut MappedInterner<QueryNode, Vec<u64>>,
|
||||
) {
|
||||
// Traverse the graph backward from the target node, recomputing the cost for each of its predecessors.
|
||||
// We first check that no other node is contributing the same total cost to a predecessor before removing
|
||||
// the cost from the predecessor.
|
||||
self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| {
|
||||
let mut costs_to_remove = FxHashSet::default();
|
||||
costs_to_remove.extend(costs.get(cur_node).iter().copied());
|
||||
|
||||
let cur_node_edges = &self.edges_of_node.get(cur_node);
|
||||
for edge_idx in cur_node_edges.iter() {
|
||||
let edge = self.edges_store.get(edge_idx).as_ref().unwrap();
|
||||
for cost in costs.get(edge.dest_node).iter() {
|
||||
costs_to_remove.remove(&(*cost + edge.cost as u64));
|
||||
if costs_to_remove.is_empty() {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
if costs_to_remove.is_empty() {
|
||||
return;
|
||||
}
|
||||
let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied());
|
||||
for c in costs_to_remove {
|
||||
new_costs.remove(&c);
|
||||
}
|
||||
*costs.get_mut(cur_node) = new_costs.into_iter().collect();
|
||||
});
|
||||
}
|
||||
|
||||
/// Traverse the graph backwards from the given node such that every time
|
||||
/// a node is visited, we are guaranteed that all its successors either:
|
||||
/// 1. have already been visited; OR
|
||||
/// 2. were not reachable from the given node
|
||||
pub fn traverse_breadth_first_backward(
|
||||
&self,
|
||||
from: Interned<QueryNode>,
|
||||
mut visit: impl FnMut(Interned<QueryNode>),
|
||||
) {
|
||||
let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
|
||||
{
|
||||
// go backward to get the set of all reachable nodes from the given node
|
||||
// the nodes that are not reachable will be set as `visited`
|
||||
let mut stack = VecDeque::new();
|
||||
let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
|
||||
enqueued.insert(from);
|
||||
stack.push_back(from);
|
||||
while let Some(n) = stack.pop_front() {
|
||||
if reachable.contains(n) {
|
||||
continue;
|
||||
}
|
||||
reachable.insert(n);
|
||||
for prev_node in self.query_graph.nodes.get(n).predecessors.iter() {
|
||||
if !enqueued.contains(prev_node) && !reachable.contains(prev_node) {
|
||||
stack.push_back(prev_node);
|
||||
enqueued.insert(prev_node);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
let mut unreachable_or_visited =
|
||||
SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
|
||||
for (n, _) in self.query_graph.nodes.iter() {
|
||||
if !reachable.contains(n) {
|
||||
unreachable_or_visited.insert(n);
|
||||
}
|
||||
}
|
||||
|
||||
let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
|
||||
let mut stack = VecDeque::new();
|
||||
|
||||
enqueued.insert(from);
|
||||
stack.push_back(from);
|
||||
|
||||
while let Some(cur_node) = stack.pop_front() {
|
||||
if !self.query_graph.nodes.get(cur_node).successors.is_subset(&unreachable_or_visited) {
|
||||
stack.push_back(cur_node);
|
||||
continue;
|
||||
}
|
||||
unreachable_or_visited.insert(cur_node);
|
||||
visit(cur_node);
|
||||
for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
|
||||
if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) {
|
||||
stack.push_back(prev_node);
|
||||
enqueued.insert(prev_node);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,58 @@
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{ComputedCondition, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::search::new::interner::Interned;
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::SearchContext;
|
||||
use crate::Result;
|
||||
|
||||
/// A cache storing the document ids associated with each ranking rule edge
|
||||
pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
|
||||
pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
|
||||
_phantom: PhantomData<G>,
|
||||
}
|
||||
impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
|
||||
fn default() -> Self {
|
||||
Self { cache: Default::default(), _phantom: Default::default() }
|
||||
}
|
||||
}
|
||||
impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
|
||||
pub fn get_subsets_used_by_condition(
|
||||
&mut self,
|
||||
interned_condition: Interned<G::Condition>,
|
||||
) -> (&Option<LocatedQueryTermSubset>, &LocatedQueryTermSubset) {
|
||||
let c = &self.cache[&interned_condition];
|
||||
(&c.start_term_subset, &c.end_term_subset)
|
||||
}
|
||||
/// Retrieve the document ids for the given edge condition.
|
||||
///
|
||||
/// If the cache does not yet contain these docids, they are computed
|
||||
/// and inserted in the cache.
|
||||
pub fn get_computed_condition<'s>(
|
||||
&'s mut self,
|
||||
ctx: &mut SearchContext<'_>,
|
||||
interned_condition: Interned<G::Condition>,
|
||||
graph: &mut RankingRuleGraph<G>,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<&'s ComputedCondition> {
|
||||
if self.cache.contains_key(&interned_condition) {
|
||||
let computed = self.cache.get_mut(&interned_condition).unwrap();
|
||||
if computed.universe_len == universe.len() {
|
||||
return Ok(computed);
|
||||
} else {
|
||||
computed.docids &= universe;
|
||||
computed.universe_len = universe.len();
|
||||
return Ok(computed);
|
||||
}
|
||||
}
|
||||
let condition = graph.conditions_interner.get_mut(interned_condition);
|
||||
let computed = G::resolve_condition(ctx, condition, universe)?;
|
||||
// Can we put an assert here for computed.universe_len == universe.len() ?
|
||||
let _ = self.cache.insert(interned_condition, computed);
|
||||
let computed = &self.cache[&interned_condition];
|
||||
Ok(computed)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,100 @@
|
||||
use crate::search::new::interner::{FixedSizeInterner, Interned};
|
||||
use crate::search::new::small_bitmap::SmallBitmap;
|
||||
|
||||
pub struct DeadEndsCache<T> {
|
||||
// conditions and next could/should be part of the same vector
|
||||
conditions: Vec<Interned<T>>,
|
||||
next: Vec<Self>,
|
||||
pub forbidden: SmallBitmap<T>,
|
||||
}
|
||||
impl<T> Clone for DeadEndsCache<T> {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
conditions: self.conditions.clone(),
|
||||
next: self.next.clone(),
|
||||
forbidden: self.forbidden.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl<T> DeadEndsCache<T> {
|
||||
pub fn new(for_interner: &FixedSizeInterner<T>) -> Self {
|
||||
Self {
|
||||
conditions: vec![],
|
||||
next: vec![],
|
||||
forbidden: SmallBitmap::for_interned_values_in(for_interner),
|
||||
}
|
||||
}
|
||||
pub fn forbid_condition(&mut self, condition: Interned<T>) {
|
||||
self.forbidden.insert(condition);
|
||||
}
|
||||
|
||||
fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
|
||||
if let Some(idx) = self.conditions.iter().position(|c| *c == condition) {
|
||||
Some(&mut self.next[idx])
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
pub fn forbidden_conditions_for_all_prefixes_up_to(
|
||||
&mut self,
|
||||
prefix: impl Iterator<Item = Interned<T>>,
|
||||
) -> SmallBitmap<T> {
|
||||
let mut forbidden = self.forbidden.clone();
|
||||
let mut cursor = self;
|
||||
for c in prefix {
|
||||
if let Some(next) = cursor.advance(c) {
|
||||
cursor = next;
|
||||
forbidden.union(&cursor.forbidden);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
forbidden
|
||||
}
|
||||
pub fn forbidden_conditions_after_prefix(
|
||||
&mut self,
|
||||
prefix: impl Iterator<Item = Interned<T>>,
|
||||
) -> Option<SmallBitmap<T>> {
|
||||
let mut cursor = self;
|
||||
for c in prefix {
|
||||
if let Some(next) = cursor.advance(c) {
|
||||
cursor = next;
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
}
|
||||
Some(cursor.forbidden.clone())
|
||||
}
|
||||
pub fn forbid_condition_after_prefix(
|
||||
&mut self,
|
||||
mut prefix: impl Iterator<Item = Interned<T>>,
|
||||
forbidden: Interned<T>,
|
||||
) {
|
||||
match prefix.next() {
|
||||
None => {
|
||||
self.forbidden.insert(forbidden);
|
||||
}
|
||||
Some(first_condition) => {
|
||||
if let Some(idx) = self.conditions.iter().position(|c| *c == first_condition) {
|
||||
return self.next[idx].forbid_condition_after_prefix(prefix, forbidden);
|
||||
}
|
||||
let mut rest = DeadEndsCache {
|
||||
conditions: vec![],
|
||||
next: vec![],
|
||||
forbidden: SmallBitmap::new(self.forbidden.universe_length()),
|
||||
};
|
||||
rest.forbid_condition_after_prefix(prefix, forbidden);
|
||||
self.conditions.push(first_condition);
|
||||
self.next.push(rest);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// pub fn debug_print(&self, indent: usize) {
|
||||
// println!("{} {:?}", " ".repeat(indent), self.forbidden.iter().collect::<Vec<_>>());
|
||||
// for (condition, next) in self.conditions.iter().zip(self.next.iter()) {
|
||||
// println!("{} {condition}:", " ".repeat(indent));
|
||||
// next.debug_print(indent + 2);
|
||||
// }
|
||||
// }
|
||||
}
|
||||
@@ -0,0 +1,92 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{ComputedCondition, RankingRuleGraphTrait};
|
||||
use crate::score_details::{self, Rank, ScoreDetails};
|
||||
use crate::search::new::interner::{DedupInterner, Interned};
|
||||
use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
|
||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||
use crate::search::new::Word;
|
||||
use crate::{Result, SearchContext};
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub enum ExactnessCondition {
|
||||
ExactInAttribute(LocatedQueryTermSubset),
|
||||
Any(LocatedQueryTermSubset),
|
||||
}
|
||||
|
||||
/// Uninhabited marker type carrying the `RankingRuleGraphTrait` implementation
/// whose conditions are `ExactnessCondition`s.
pub enum ExactnessGraph {}
|
||||
|
||||
fn compute_docids(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
dest_node: &LocatedQueryTermSubset,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) {
|
||||
exact_term
|
||||
} else {
|
||||
return Ok(Default::default());
|
||||
};
|
||||
|
||||
let candidates = match exact_term {
|
||||
// TODO I move the intersection here
|
||||
ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)? & universe,
|
||||
ExactTerm::Word(word) => {
|
||||
ctx.word_docids(Some(universe), Word::Original(word))?.unwrap_or_default()
|
||||
}
|
||||
};
|
||||
|
||||
Ok(candidates)
|
||||
}
|
||||
|
||||
impl RankingRuleGraphTrait for ExactnessGraph {
|
||||
type Condition = ExactnessCondition;
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::exactness")]
|
||||
fn resolve_condition(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
condition: &Self::Condition,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<ComputedCondition> {
|
||||
let (docids, end_term_subset) = match condition {
|
||||
ExactnessCondition::ExactInAttribute(dest_node) => {
|
||||
let mut end_term_subset = dest_node.clone();
|
||||
end_term_subset.term_subset.keep_only_exact_term(ctx);
|
||||
end_term_subset.term_subset.make_mandatory();
|
||||
(compute_docids(ctx, dest_node, universe)?, end_term_subset)
|
||||
}
|
||||
ExactnessCondition::Any(dest_node) => {
|
||||
let docids =
|
||||
compute_query_term_subset_docids(ctx, Some(universe), &dest_node.term_subset)?;
|
||||
(docids, dest_node.clone())
|
||||
}
|
||||
};
|
||||
|
||||
Ok(ComputedCondition {
|
||||
docids,
|
||||
universe_len: universe.len(),
|
||||
start_term_subset: None,
|
||||
end_term_subset,
|
||||
})
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::exactness")]
|
||||
fn build_edges(
|
||||
_ctx: &mut SearchContext<'_>,
|
||||
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||
_source_node: Option<&LocatedQueryTermSubset>,
|
||||
dest_node: &LocatedQueryTermSubset,
|
||||
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||
let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone());
|
||||
let exact_condition = conditions_interner.insert(exact_condition);
|
||||
|
||||
let skip_condition = ExactnessCondition::Any(dest_node.clone());
|
||||
let skip_condition = conditions_interner.insert(skip_condition);
|
||||
|
||||
Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)])
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::exactness")]
|
||||
fn rank_to_score(rank: Rank) -> ScoreDetails {
|
||||
ScoreDetails::ExactWords(score_details::ExactWords::from_rank(rank))
|
||||
}
|
||||
}
|
||||
112
crates/milli/src/search/new/ranking_rule_graph/fid/mod.rs
Normal file
112
crates/milli/src/search/new/ranking_rule_graph/fid/mod.rs
Normal file
@@ -0,0 +1,112 @@
|
||||
use fxhash::FxHashSet;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{ComputedCondition, RankingRuleGraphTrait};
|
||||
use crate::score_details::{Rank, ScoreDetails};
|
||||
use crate::search::new::interner::{DedupInterner, Interned};
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id;
|
||||
use crate::search::new::SearchContext;
|
||||
use crate::{FieldId, InternalError, Result};
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct FidCondition {
|
||||
term: LocatedQueryTermSubset,
|
||||
fid: Option<FieldId>,
|
||||
}
|
||||
|
||||
/// Marker type implementing [`RankingRuleGraphTrait`] with [`FidCondition`] as its
/// edge condition. It has no variants and is never instantiated.
pub enum FidGraph {}
|
||||
|
||||
impl RankingRuleGraphTrait for FidGraph {
|
||||
type Condition = FidCondition;
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::fid")]
|
||||
fn resolve_condition(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
condition: &Self::Condition,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<ComputedCondition> {
|
||||
let FidCondition { term, .. } = condition;
|
||||
|
||||
let docids = if let Some(fid) = condition.fid {
|
||||
compute_query_term_subset_docids_within_field_id(
|
||||
ctx,
|
||||
Some(universe),
|
||||
&term.term_subset,
|
||||
fid,
|
||||
)?
|
||||
} else {
|
||||
RoaringBitmap::new()
|
||||
};
|
||||
|
||||
Ok(ComputedCondition {
|
||||
docids,
|
||||
universe_len: universe.len(),
|
||||
start_term_subset: None,
|
||||
end_term_subset: term.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::fid")]
|
||||
fn build_edges(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||
_from: Option<&LocatedQueryTermSubset>,
|
||||
to_term: &LocatedQueryTermSubset,
|
||||
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||
let term = to_term;
|
||||
|
||||
let mut all_fields = FxHashSet::default();
|
||||
for word in term.term_subset.all_single_words_except_prefix_db(ctx)? {
|
||||
let fields = ctx.get_db_word_fids(word.interned())?;
|
||||
all_fields.extend(fields);
|
||||
}
|
||||
|
||||
for phrase in term.term_subset.all_phrases(ctx)? {
|
||||
for &word in phrase.words(ctx).iter().flatten() {
|
||||
let fields = ctx.get_db_word_fids(word)?;
|
||||
all_fields.extend(fields);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(word_prefix) = term.term_subset.use_prefix_db(ctx) {
|
||||
let fields = ctx.get_db_word_prefix_fids(word_prefix.interned())?;
|
||||
all_fields.extend(fields);
|
||||
}
|
||||
|
||||
let weights_map = ctx.index.fieldids_weights_map(ctx.txn)?;
|
||||
|
||||
let mut edges = vec![];
|
||||
for fid in all_fields.iter().copied() {
|
||||
let weight = weights_map
|
||||
.weight(fid)
|
||||
.ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?;
|
||||
edges.push((
|
||||
weight as u32 * term.term_ids.len() as u32,
|
||||
conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }),
|
||||
));
|
||||
}
|
||||
|
||||
// always lookup the max_fid if we don't already and add an artificial condition for max scoring
|
||||
let max_weight: Option<u16> = weights_map.max_weight();
|
||||
|
||||
if let Some(max_weight) = max_weight {
|
||||
if !all_fields.contains(&max_weight) {
|
||||
edges.push((
|
||||
max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
|
||||
conditions_interner.insert(FidCondition {
|
||||
term: term.clone(), // TODO remove this ugly clone
|
||||
fid: None,
|
||||
}),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(edges)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::fid")]
|
||||
fn rank_to_score(rank: Rank) -> ScoreDetails {
|
||||
ScoreDetails::Fid(rank)
|
||||
}
|
||||
}
|
||||
160
crates/milli/src/search/new/ranking_rule_graph/mod.rs
Normal file
160
crates/milli/src/search/new/ranking_rule_graph/mod.rs
Normal file
@@ -0,0 +1,160 @@
|
||||
/*! Module implementing the graph used for the graph-based ranking rules
|
||||
and its related algorithms.
|
||||
|
||||
A ranking rule graph is built on top of the [`QueryGraph`]: the nodes stay
|
||||
the same but the edges are replaced.
|
||||
*/
|
||||
|
||||
mod build;
|
||||
mod cheapest_paths;
|
||||
mod condition_docids_cache;
|
||||
mod dead_ends_cache;
|
||||
|
||||
/// Implementation of the `exactness` ranking rule
|
||||
mod exactness;
|
||||
/// Implementation of the `attribute` ranking rule
|
||||
mod fid;
|
||||
/// Implementation of the `position` ranking rule
|
||||
mod position;
|
||||
/// Implementation of the `proximity` ranking rule
|
||||
mod proximity;
|
||||
/// Implementation of the `typo` ranking rule
|
||||
mod typo;
|
||||
/// Implementation of the `words` ranking rule
|
||||
mod words;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::hash::Hash;
|
||||
|
||||
pub use cheapest_paths::PathVisitor;
|
||||
pub use condition_docids_cache::ConditionDocIdsCache;
|
||||
pub use dead_ends_cache::DeadEndsCache;
|
||||
pub use exactness::ExactnessGraph;
|
||||
pub use fid::{FidCondition, FidGraph};
|
||||
pub use position::{PositionCondition, PositionGraph};
|
||||
pub use proximity::{ProximityCondition, ProximityGraph};
|
||||
use roaring::RoaringBitmap;
|
||||
pub use typo::{TypoCondition, TypoGraph};
|
||||
pub use words::{WordsCondition, WordsGraph};
|
||||
|
||||
use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner};
|
||||
use super::query_term::LocatedQueryTermSubset;
|
||||
use super::small_bitmap::SmallBitmap;
|
||||
use super::{QueryGraph, QueryNode, SearchContext};
|
||||
use crate::score_details::{Rank, ScoreDetails};
|
||||
use crate::Result;
|
||||
|
||||
pub struct ComputedCondition {
|
||||
pub docids: RoaringBitmap,
|
||||
pub universe_len: u64,
|
||||
pub start_term_subset: Option<LocatedQueryTermSubset>,
|
||||
pub end_term_subset: LocatedQueryTermSubset,
|
||||
}
|
||||
|
||||
/// An edge in the ranking rule graph.
|
||||
///
|
||||
/// It contains:
|
||||
/// 1. The source and destination nodes
|
||||
/// 2. The cost of traversing this edge
|
||||
/// 3. The condition associated with it
|
||||
/// 4. The list of nodes that have to be skipped
|
||||
/// if this edge is traversed.
|
||||
#[derive(Clone)]
|
||||
pub struct Edge<E> {
|
||||
pub source_node: Interned<QueryNode>,
|
||||
pub dest_node: Interned<QueryNode>,
|
||||
pub cost: u32,
|
||||
pub condition: Option<Interned<E>>,
|
||||
pub nodes_to_skip: SmallBitmap<QueryNode>,
|
||||
}
|
||||
|
||||
impl<E> Hash for Edge<E> {
    /// Hashes the source node, destination node, cost and condition.
    ///
    /// `nodes_to_skip` is left out, mirroring the `PartialEq` implementation.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.source_node.hash(state);
        self.dest_node.hash(state);
        self.cost.hash(state);
        self.condition.hash(state);
    }
}
|
||||
|
||||
// Equality on edges is defined by the `PartialEq` implementation for `Edge`.
impl<E> Eq for Edge<E> {}
|
||||
|
||||
impl<E> PartialEq for Edge<E> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.source_node == other.source_node
|
||||
&& self.dest_node == other.dest_node
|
||||
&& self.cost == other.cost
|
||||
&& self.condition == other.condition
|
||||
}
|
||||
}
|
||||
|
||||
/// A trait to be implemented by a marker type to build a graph-based ranking rule.
|
||||
///
|
||||
/// It mostly describes how to:
|
||||
/// 1. Retrieve the set of edges (their cost and condition) between two nodes.
|
||||
/// 2. Compute the document ids satisfying a condition
|
||||
pub trait RankingRuleGraphTrait: Sized + 'static {
|
||||
type Condition: Sized + Clone + PartialEq + Eq + Hash;
|
||||
|
||||
/// Compute the document ids associated with the given edge condition,
|
||||
/// restricted to the given universe.
|
||||
fn resolve_condition(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
condition: &Self::Condition,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<ComputedCondition>;
|
||||
|
||||
/// Return the costs and conditions of the edges going from the source node to the destination node
|
||||
fn build_edges(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||
source_node: Option<&LocatedQueryTermSubset>,
|
||||
dest_node: &LocatedQueryTermSubset,
|
||||
) -> Result<Vec<(u32, Interned<Self::Condition>)>>;
|
||||
|
||||
/// Convert the rank of a path to its corresponding score for the ranking rule
|
||||
fn rank_to_score(rank: Rank) -> ScoreDetails;
|
||||
}
|
||||
|
||||
/// The graph used by graph-based ranking rules.
|
||||
///
|
||||
/// It is built on top of a [`QueryGraph`], keeping the same nodes
|
||||
/// but replacing the edges.
|
||||
pub struct RankingRuleGraph<G: RankingRuleGraphTrait> {
|
||||
pub query_graph: QueryGraph,
|
||||
pub edges_store: FixedSizeInterner<Option<Edge<G::Condition>>>,
|
||||
pub edges_of_node: MappedInterner<QueryNode, SmallBitmap<Option<Edge<G::Condition>>>>,
|
||||
pub conditions_interner: FixedSizeInterner<G::Condition>,
|
||||
}
|
||||
impl<G: RankingRuleGraphTrait> Clone for RankingRuleGraph<G> {
    /// Clones every component of the graph.
    fn clone(&self) -> Self {
        Self {
            query_graph: self.query_graph.clone(),
            edges_store: self.edges_store.clone(),
            edges_of_node: self.edges_of_node.clone(),
            conditions_interner: self.conditions_interner.clone(),
        }
    }
}
|
||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
/// Remove all edges with the given condition
|
||||
/// Return a set of all the source nodes of the removed edges
|
||||
pub fn remove_edges_with_condition(
|
||||
&mut self,
|
||||
condition_to_remove: Interned<G::Condition>,
|
||||
) -> BTreeSet<Interned<QueryNode>> {
|
||||
let mut source_nodes = BTreeSet::new();
|
||||
for (edge_id, edge_opt) in self.edges_store.iter_mut() {
|
||||
let Some(edge) = edge_opt.as_mut() else { continue };
|
||||
let Some(condition) = edge.condition else { continue };
|
||||
|
||||
if condition == condition_to_remove {
|
||||
let (source_node, _dest_node) = (edge.source_node, edge.dest_node);
|
||||
*edge_opt = None;
|
||||
self.edges_of_node.get_mut(source_node).remove(edge_id);
|
||||
source_nodes.insert(source_node);
|
||||
}
|
||||
}
|
||||
source_nodes
|
||||
}
|
||||
}
|
||||
143
crates/milli/src/search/new/ranking_rule_graph/position/mod.rs
Normal file
143
crates/milli/src/search/new/ranking_rule_graph/position/mod.rs
Normal file
@@ -0,0 +1,143 @@
|
||||
use fxhash::{FxHashMap, FxHashSet};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{ComputedCondition, RankingRuleGraphTrait};
|
||||
use crate::score_details::{Rank, ScoreDetails};
|
||||
use crate::search::new::interner::{DedupInterner, Interned};
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_position;
|
||||
use crate::search::new::SearchContext;
|
||||
use crate::Result;
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct PositionCondition {
|
||||
term: LocatedQueryTermSubset,
|
||||
positions: Vec<u16>,
|
||||
}
|
||||
|
||||
/// Marker type implementing [`RankingRuleGraphTrait`] with [`PositionCondition`] as its
/// edge condition. It has no variants and is never instantiated.
pub enum PositionGraph {}
|
||||
|
||||
impl RankingRuleGraphTrait for PositionGraph {
|
||||
type Condition = PositionCondition;
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::position")]
|
||||
fn resolve_condition(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
condition: &Self::Condition,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<ComputedCondition> {
|
||||
let PositionCondition { term, positions } = condition;
|
||||
let mut docids = RoaringBitmap::new();
|
||||
// TODO use MultiOps to do the big union
|
||||
for position in positions {
|
||||
// maybe compute_query_term_subset_docids_within_position should accept a universe as argument
|
||||
docids |= compute_query_term_subset_docids_within_position(
|
||||
ctx,
|
||||
Some(universe),
|
||||
&term.term_subset,
|
||||
*position,
|
||||
)?;
|
||||
}
|
||||
Ok(ComputedCondition {
|
||||
docids,
|
||||
universe_len: universe.len(),
|
||||
start_term_subset: None,
|
||||
end_term_subset: term.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::position")]
|
||||
fn build_edges(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||
_from: Option<&LocatedQueryTermSubset>,
|
||||
to_term: &LocatedQueryTermSubset,
|
||||
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||
let term = to_term;
|
||||
|
||||
let mut all_positions = FxHashSet::default();
|
||||
for word in term.term_subset.all_single_words_except_prefix_db(ctx)? {
|
||||
let positions = ctx.get_db_word_positions(word.interned())?;
|
||||
all_positions.extend(positions);
|
||||
}
|
||||
|
||||
for phrase in term.term_subset.all_phrases(ctx)? {
|
||||
// Only check the position of the first word in the phrase
|
||||
// this is not correct, but it is the best we can do, since
|
||||
// it is difficult/impossible to know the expected position
|
||||
// of a word in a phrase.
|
||||
// There is probably a more correct way to do it though.
|
||||
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
|
||||
let positions = ctx.get_db_word_positions(*word)?;
|
||||
all_positions.extend(positions);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(word_prefix) = term.term_subset.use_prefix_db(ctx) {
|
||||
let positions = ctx.get_db_word_prefix_positions(word_prefix.interned())?;
|
||||
all_positions.extend(positions);
|
||||
}
|
||||
|
||||
let mut positions_for_costs = FxHashMap::<u32, Vec<u16>>::default();
|
||||
|
||||
for position in all_positions {
|
||||
// FIXME: bucketed position???
|
||||
let distance = position.abs_diff(*term.positions.start());
|
||||
let cost = {
|
||||
let mut cost = 0;
|
||||
for i in 0..term.term_ids.len() {
|
||||
// This is actually not fully correct and slightly penalises ngrams unfairly.
|
||||
// Because if two words are in the same bucketed position (e.g. 32) and consecutive,
|
||||
// then their position cost will be 32+32=64, but an ngram of these two words at the
|
||||
// same position will have a cost of 32+32+1=65
|
||||
cost += cost_from_distance(distance as u32 + i as u32);
|
||||
}
|
||||
cost
|
||||
};
|
||||
positions_for_costs.entry(cost).or_default().push(position);
|
||||
}
|
||||
|
||||
let max_cost = term.term_ids.len() as u32 * 10;
|
||||
let max_cost_exists = positions_for_costs.contains_key(&max_cost);
|
||||
|
||||
let mut edges = vec![];
|
||||
for (cost, positions) in positions_for_costs {
|
||||
edges.push((
|
||||
cost,
|
||||
conditions_interner.insert(PositionCondition { term: term.clone(), positions }),
|
||||
));
|
||||
}
|
||||
|
||||
if !max_cost_exists {
|
||||
// artificial empty condition for computing max cost
|
||||
edges.push((
|
||||
max_cost,
|
||||
conditions_interner
|
||||
.insert(PositionCondition { term: term.clone(), positions: Vec::default() }),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(edges)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::position")]
|
||||
fn rank_to_score(rank: Rank) -> ScoreDetails {
|
||||
ScoreDetails::Position(rank)
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts a distance into a cost between 0 and 10.
///
/// The buckets widen as the distance grows; any distance above 1024 costs 10.
fn cost_from_distance(distance: u32) -> u32 {
    match distance {
        0 => 0,
        1 => 1,
        2..=4 => 2,
        5..=7 => 3,
        8..=11 => 4,
        12..=16 => 5,
        17..=24 => 6,
        25..=64 => 7,
        65..=256 => 8,
        257..=1024 => 9,
        _ => 10,
    }
}
|
||||
@@ -0,0 +1,56 @@
|
||||
#![allow(clippy::too_many_arguments)]
|
||||
|
||||
use super::ProximityCondition;
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::search::new::interner::{DedupInterner, Interned};
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::SearchContext;
|
||||
use crate::Result;
|
||||
|
||||
pub fn build_edges(
|
||||
_ctx: &mut SearchContext<'_>,
|
||||
conditions_interner: &mut DedupInterner<ProximityCondition>,
|
||||
left_term: Option<&LocatedQueryTermSubset>,
|
||||
right_term: &LocatedQueryTermSubset,
|
||||
) -> Result<Vec<(u32, Interned<ProximityCondition>)>> {
|
||||
let right_ngram_max = right_term.term_ids.len().saturating_sub(1);
|
||||
|
||||
let Some(left_term) = left_term else {
|
||||
return Ok(vec![(
|
||||
right_ngram_max as u32,
|
||||
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
|
||||
)]);
|
||||
};
|
||||
|
||||
if left_term.positions.end() + 1 != *right_term.positions.start() {
|
||||
// We want to ignore this pair of terms
|
||||
// Unconditionally walk through the edge without computing the docids
|
||||
// This can happen when, in a query like `the sun flowers are beautiful`, the term
|
||||
// `flowers` is removed by the `words` ranking rule.
|
||||
// The remaining query graph represents `the sun .. are beautiful`
|
||||
// but `sun` and `are` have no proximity condition between them
|
||||
return Ok(vec![(
|
||||
right_ngram_max as u32,
|
||||
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
|
||||
)]);
|
||||
}
|
||||
|
||||
let mut conditions = vec![];
|
||||
for cost in right_ngram_max..(((MAX_DISTANCE as usize) - 1) + right_ngram_max) {
|
||||
conditions.push((
|
||||
cost as u32,
|
||||
conditions_interner.insert(ProximityCondition::Uninit {
|
||||
left_term: left_term.clone(),
|
||||
right_term: right_term.clone(),
|
||||
cost: (cost + 1) as u8,
|
||||
}),
|
||||
))
|
||||
}
|
||||
|
||||
conditions.push((
|
||||
((MAX_DISTANCE - 1) + (right_ngram_max as u32)),
|
||||
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
|
||||
));
|
||||
|
||||
Ok(conditions)
|
||||
}
|
||||
@@ -0,0 +1,251 @@
|
||||
#![allow(clippy::too_many_arguments)]
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::ProximityCondition;
|
||||
use crate::search::new::interner::Interned;
|
||||
use crate::search::new::query_term::{Phrase, QueryTermSubset};
|
||||
use crate::search::new::ranking_rule_graph::ComputedCondition;
|
||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||
use crate::search::new::{SearchContext, Word};
|
||||
use crate::Result;
|
||||
|
||||
pub fn compute_docids(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
condition: &ProximityCondition,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<ComputedCondition> {
|
||||
let (left_term, right_term, cost) = match condition {
|
||||
ProximityCondition::Uninit { left_term, right_term, cost } => {
|
||||
(left_term, right_term, *cost)
|
||||
}
|
||||
ProximityCondition::Term { term } => {
|
||||
return Ok(ComputedCondition {
|
||||
docids: compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?,
|
||||
universe_len: universe.len(),
|
||||
start_term_subset: None,
|
||||
end_term_subset: term.clone(),
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
let right_term_ngram_len = right_term.term_ids.len() as u8;
|
||||
|
||||
// e.g. for the simple words `sun .. flower`
|
||||
// the cost is 5
|
||||
// the forward proximity is 5
|
||||
// the backward proximity is 4
|
||||
//
|
||||
// for the 2gram `the sunflower`
|
||||
// the cost is 5
|
||||
// the forward proximity is 4
|
||||
// the backward proximity is 3
|
||||
let forward_proximity = 1 + cost - right_term_ngram_len;
|
||||
let backward_proximity = cost - right_term_ngram_len;
|
||||
|
||||
let mut docids = RoaringBitmap::new();
|
||||
|
||||
if let Some(right_prefix) = right_term.term_subset.use_prefix_db(ctx) {
|
||||
for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)?
|
||||
{
|
||||
compute_prefix_edges(
|
||||
ctx,
|
||||
left_word.interned(),
|
||||
right_prefix.interned(),
|
||||
left_phrase,
|
||||
forward_proximity,
|
||||
backward_proximity,
|
||||
&mut docids,
|
||||
universe,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
|
||||
// Before computing the edges, check that the left word and left phrase
|
||||
// aren't disjoint with the universe, but only do it if there is more than
|
||||
// one word derivation to the right.
|
||||
//
|
||||
// This is an optimisation to avoid checking for an excessive number of
|
||||
// pairs.
|
||||
let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?;
|
||||
if right_derivs.len() > 1 {
|
||||
let universe = &universe;
|
||||
if let Some(left_phrase) = left_phrase {
|
||||
if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
|
||||
continue;
|
||||
}
|
||||
} else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? {
|
||||
if left_word_docids.is_empty() {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (right_word, right_phrase) in right_derivs {
|
||||
compute_non_prefix_edges(
|
||||
ctx,
|
||||
left_word.interned(),
|
||||
right_word,
|
||||
left_phrase,
|
||||
right_phrase,
|
||||
forward_proximity,
|
||||
backward_proximity,
|
||||
&mut docids,
|
||||
universe,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ComputedCondition {
|
||||
docids,
|
||||
universe_len: universe.len(),
|
||||
start_term_subset: Some(left_term.clone()),
|
||||
end_term_subset: right_term.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
fn compute_prefix_edges(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
left_word: Interned<String>,
|
||||
right_prefix: Interned<String>,
|
||||
left_phrase: Option<Interned<Phrase>>,
|
||||
forward_proximity: u8,
|
||||
backward_proximity: u8,
|
||||
docids: &mut RoaringBitmap,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
let mut used_left_words = BTreeSet::new();
|
||||
let mut used_left_phrases = BTreeSet::new();
|
||||
let mut used_right_prefix = BTreeSet::new();
|
||||
|
||||
let mut universe = universe.clone();
|
||||
if let Some(phrase) = left_phrase {
|
||||
// TODO we can clearly give the universe to this method
|
||||
// Unfortunately, it is deserializing/computing stuff and
|
||||
// keeping the result as a materialized bitmap.
|
||||
let phrase_docids = ctx.get_phrase_docids(phrase)?;
|
||||
if !phrase_docids.is_empty() {
|
||||
used_left_phrases.insert(phrase);
|
||||
}
|
||||
universe &= phrase_docids;
|
||||
if universe.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(new_docids) = ctx.get_db_word_prefix_pair_proximity_docids(
|
||||
Some(&universe),
|
||||
left_word,
|
||||
right_prefix,
|
||||
forward_proximity,
|
||||
)? {
|
||||
if !new_docids.is_empty() {
|
||||
used_left_words.insert(left_word);
|
||||
used_right_prefix.insert(right_prefix);
|
||||
*docids |= new_docids;
|
||||
}
|
||||
}
|
||||
|
||||
// No swapping when computing the proximity between a phrase and a word
|
||||
if left_phrase.is_none() {
|
||||
if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
|
||||
Some(&universe),
|
||||
right_prefix,
|
||||
left_word,
|
||||
backward_proximity,
|
||||
)? {
|
||||
if !new_docids.is_empty() {
|
||||
used_left_words.insert(left_word);
|
||||
used_right_prefix.insert(right_prefix);
|
||||
*docids |= new_docids;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn compute_non_prefix_edges(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
word1: Interned<String>,
|
||||
word2: Interned<String>,
|
||||
left_phrase: Option<Interned<Phrase>>,
|
||||
right_phrase: Option<Interned<Phrase>>,
|
||||
forward_proximity: u8,
|
||||
backward_proximity: u8,
|
||||
docids: &mut RoaringBitmap,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
let mut universe = universe.clone();
|
||||
|
||||
for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
|
||||
universe &= ctx.get_phrase_docids(phrase)?;
|
||||
if universe.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(new_docids) =
|
||||
ctx.get_db_word_pair_proximity_docids(Some(&universe), word1, word2, forward_proximity)?
|
||||
{
|
||||
if !new_docids.is_empty() {
|
||||
*docids |= new_docids;
|
||||
}
|
||||
}
|
||||
if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() {
|
||||
if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(
|
||||
Some(&universe),
|
||||
word2,
|
||||
word1,
|
||||
backward_proximity,
|
||||
)? {
|
||||
if !new_docids.is_empty() {
|
||||
*docids |= new_docids;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn last_words_of_term_derivations(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
t: &QueryTermSubset,
|
||||
) -> Result<BTreeSet<(Option<Interned<Phrase>>, Word)>> {
|
||||
let mut result = BTreeSet::new();
|
||||
|
||||
for w in t.all_single_words_except_prefix_db(ctx)? {
|
||||
result.insert((None, w));
|
||||
}
|
||||
for p in t.all_phrases(ctx)? {
|
||||
let phrase = ctx.phrase_interner.get(p);
|
||||
let last_term_of_phrase = phrase.words.last().unwrap();
|
||||
if let Some(last_word) = last_term_of_phrase {
|
||||
result.insert((Some(p), Word::Original(*last_word)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
fn first_word_of_term_iter(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
t: &QueryTermSubset,
|
||||
) -> Result<BTreeSet<(Interned<String>, Option<Interned<Phrase>>)>> {
|
||||
let mut result = BTreeSet::new();
|
||||
let all_words = t.all_single_words_except_prefix_db(ctx)?;
|
||||
for w in all_words {
|
||||
result.insert((w.interned(), None));
|
||||
}
|
||||
for p in t.all_phrases(ctx)? {
|
||||
let phrase = ctx.phrase_interner.get(p);
|
||||
let first_term_of_phrase = phrase.words.first().unwrap();
|
||||
if let Some(first_word) = first_term_of_phrase {
|
||||
result.insert((*first_word, Some(p)));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
pub mod build;
|
||||
pub mod compute_docids;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{ComputedCondition, RankingRuleGraphTrait};
|
||||
use crate::score_details::{Rank, ScoreDetails};
|
||||
use crate::search::new::interner::{DedupInterner, Interned};
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::SearchContext;
|
||||
use crate::Result;
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub enum ProximityCondition {
|
||||
Uninit { left_term: LocatedQueryTermSubset, right_term: LocatedQueryTermSubset, cost: u8 },
|
||||
Term { term: LocatedQueryTermSubset },
|
||||
}
|
||||
|
||||
/// Marker type implementing [`RankingRuleGraphTrait`] with [`ProximityCondition`] as its
/// edge condition. It has no variants and is never instantiated.
pub enum ProximityGraph {}
|
||||
|
||||
impl RankingRuleGraphTrait for ProximityGraph {
|
||||
type Condition = ProximityCondition;
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::proximity")]
|
||||
fn resolve_condition(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
condition: &Self::Condition,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<ComputedCondition> {
|
||||
compute_docids::compute_docids(ctx, condition, universe)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::proximity")]
|
||||
fn build_edges(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||
source_term: Option<&LocatedQueryTermSubset>,
|
||||
dest_term: &LocatedQueryTermSubset,
|
||||
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||
build::build_edges(ctx, conditions_interner, source_term, dest_term)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::proximity")]
|
||||
fn rank_to_score(rank: Rank) -> ScoreDetails {
|
||||
ScoreDetails::Proximity(rank)
|
||||
}
|
||||
}
|
||||
85
crates/milli/src/search/new/ranking_rule_graph/typo/mod.rs
Normal file
85
crates/milli/src/search/new/ranking_rule_graph/typo/mod.rs
Normal file
@@ -0,0 +1,85 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{ComputedCondition, RankingRuleGraphTrait};
|
||||
use crate::score_details::{self, Rank, ScoreDetails};
|
||||
use crate::search::new::interner::{DedupInterner, Interned};
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||
use crate::search::new::SearchContext;
|
||||
use crate::Result;
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct TypoCondition {
|
||||
term: LocatedQueryTermSubset,
|
||||
nbr_typos: u8,
|
||||
}
|
||||
|
||||
/// Marker type implementing [`RankingRuleGraphTrait`] with [`TypoCondition`] as its
/// edge condition. It has no variants and is never instantiated.
pub enum TypoGraph {}
|
||||
|
||||
impl RankingRuleGraphTrait for TypoGraph {
|
||||
type Condition = TypoCondition;
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::typo")]
|
||||
fn resolve_condition(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
condition: &Self::Condition,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<ComputedCondition> {
|
||||
let TypoCondition { term, .. } = condition;
|
||||
// maybe compute_query_term_subset_docids should accept a universe as argument
|
||||
let docids = compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?;
|
||||
|
||||
Ok(ComputedCondition {
|
||||
docids,
|
||||
universe_len: universe.len(),
|
||||
start_term_subset: None,
|
||||
end_term_subset: term.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::typo")]
|
||||
fn build_edges(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||
_from: Option<&LocatedQueryTermSubset>,
|
||||
to_term: &LocatedQueryTermSubset,
|
||||
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||
let term = to_term;
|
||||
|
||||
let mut edges = vec![];
|
||||
// Ngrams have a base typo cost
|
||||
// 2-gram -> equivalent to 1 typo
|
||||
// 3-gram -> equivalent to 2 typos
|
||||
let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
|
||||
|
||||
for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) {
|
||||
let mut term = term.clone();
|
||||
match nbr_typos {
|
||||
0 => {
|
||||
term.term_subset.clear_one_typo_subset();
|
||||
term.term_subset.clear_two_typo_subset();
|
||||
}
|
||||
1 => {
|
||||
term.term_subset.clear_zero_typo_subset();
|
||||
term.term_subset.clear_two_typo_subset();
|
||||
}
|
||||
2 => {
|
||||
term.term_subset.clear_zero_typo_subset();
|
||||
term.term_subset.clear_one_typo_subset();
|
||||
}
|
||||
_ => panic!(),
|
||||
};
|
||||
|
||||
edges.push((
|
||||
nbr_typos as u32 + base_cost,
|
||||
conditions_interner.insert(TypoCondition { term, nbr_typos }),
|
||||
));
|
||||
}
|
||||
Ok(edges)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::typo")]
|
||||
fn rank_to_score(rank: Rank) -> ScoreDetails {
|
||||
ScoreDetails::Typo(score_details::Typo::from_rank(rank))
|
||||
}
|
||||
}
|
||||
53
crates/milli/src/search/new/ranking_rule_graph/words/mod.rs
Normal file
53
crates/milli/src/search/new/ranking_rule_graph/words/mod.rs
Normal file
@@ -0,0 +1,53 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{ComputedCondition, RankingRuleGraphTrait};
|
||||
use crate::score_details::{self, Rank, ScoreDetails};
|
||||
use crate::search::new::interner::{DedupInterner, Interned};
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
|
||||
use crate::search::new::SearchContext;
|
||||
use crate::Result;
|
||||
|
||||
#[derive(Clone, PartialEq, Eq, Hash)]
|
||||
pub struct WordsCondition {
|
||||
term: LocatedQueryTermSubset,
|
||||
}
|
||||
|
||||
pub enum WordsGraph {}
|
||||
|
||||
impl RankingRuleGraphTrait for WordsGraph {
|
||||
type Condition = WordsCondition;
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::words")]
|
||||
fn resolve_condition(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
condition: &Self::Condition,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<ComputedCondition> {
|
||||
let WordsCondition { term, .. } = condition;
|
||||
// maybe compute_query_term_subset_docids should accept a universe as argument
|
||||
let docids = compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?;
|
||||
|
||||
Ok(ComputedCondition {
|
||||
docids,
|
||||
universe_len: universe.len(),
|
||||
start_term_subset: None,
|
||||
end_term_subset: term.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::words")]
|
||||
fn build_edges(
|
||||
_ctx: &mut SearchContext<'_>,
|
||||
conditions_interner: &mut DedupInterner<Self::Condition>,
|
||||
_from: Option<&LocatedQueryTermSubset>,
|
||||
to_term: &LocatedQueryTermSubset,
|
||||
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
|
||||
Ok(vec![(0, conditions_interner.insert(WordsCondition { term: to_term.clone() }))])
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::words")]
|
||||
fn rank_to_score(rank: Rank) -> ScoreDetails {
|
||||
ScoreDetails::Words(score_details::Words::from_rank(rank))
|
||||
}
|
||||
}
|
||||
72
crates/milli/src/search/new/ranking_rules.rs
Normal file
72
crates/milli/src/search/new/ranking_rules.rs
Normal file
@@ -0,0 +1,72 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::logger::SearchLogger;
|
||||
use super::{QueryGraph, SearchContext};
|
||||
use crate::score_details::ScoreDetails;
|
||||
use crate::Result;
|
||||
|
||||
/// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`]
|
||||
pub trait RankingRuleQueryTrait: Sized + Clone + 'static {}
|
||||
|
||||
/// A type describing a placeholder search
|
||||
#[derive(Clone)]
|
||||
pub struct PlaceholderQuery;
|
||||
impl RankingRuleQueryTrait for PlaceholderQuery {}
|
||||
impl RankingRuleQueryTrait for QueryGraph {}
|
||||
|
||||
pub type BoxRankingRule<'ctx, Query> = Box<dyn RankingRule<'ctx, Query> + 'ctx>;
|
||||
|
||||
/// A trait that must be implemented by all ranking rules.
|
||||
///
|
||||
/// It is generic over `'ctx`, the lifetime of the search context
|
||||
/// (i.e. the read transaction and the cache) and over `Query`, which
|
||||
/// can be either [`PlaceholderQuery`] or [`QueryGraph`].
|
||||
pub trait RankingRule<'ctx, Query: RankingRuleQueryTrait> {
|
||||
fn id(&self) -> String;
|
||||
|
||||
/// Prepare the ranking rule such that it can start iterating over its
|
||||
/// buckets using [`next_bucket`](RankingRule::next_bucket).
|
||||
///
|
||||
/// The given universe is the universe that will be given to [`next_bucket`](RankingRule::next_bucket).
|
||||
fn start_iteration(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
logger: &mut dyn SearchLogger<Query>,
|
||||
universe: &RoaringBitmap,
|
||||
query: &Query,
|
||||
) -> Result<()>;
|
||||
|
||||
/// Return the next bucket of this ranking rule.
|
||||
///
|
||||
/// The returned candidates MUST be a subset of the given universe.
|
||||
///
|
||||
/// The universe given as argument is either:
|
||||
/// - a subset of the universe given to the previous call to [`next_bucket`](RankingRule::next_bucket); OR
|
||||
/// - the universe given to [`start_iteration`](RankingRule::start_iteration)
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
logger: &mut dyn SearchLogger<Query>,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<Query>>>;
|
||||
|
||||
/// Finish iterating over the buckets, which yields control to the parent ranking rule
|
||||
/// The next call to this ranking rule, if any, will be [`start_iteration`](RankingRule::start_iteration).
|
||||
fn end_iteration(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
logger: &mut dyn SearchLogger<Query>,
|
||||
);
|
||||
}
|
||||
|
||||
/// Output of a ranking rule, consisting of the query to be used
|
||||
/// by the child ranking rule and a set of document ids.
|
||||
#[derive(Debug)]
|
||||
pub struct RankingRuleOutput<Q> {
|
||||
/// The query corresponding to the current bucket for the child ranking rule
|
||||
pub query: Q,
|
||||
/// The allowed candidates for the child ranking rule
|
||||
pub candidates: RoaringBitmap,
|
||||
/// The score for the candidates of the current bucket
|
||||
pub score: ScoreDetails,
|
||||
}
|
||||
260
crates/milli/src/search/new/resolve_query_graph.rs
Normal file
260
crates/milli/src/search/new/resolve_query_graph.rs
Normal file
@@ -0,0 +1,260 @@
|
||||
#![allow(clippy::too_many_arguments)]
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
use roaring::{MultiOps, RoaringBitmap};
|
||||
|
||||
use super::interner::Interned;
|
||||
use super::query_graph::QueryNodeData;
|
||||
use super::query_term::{Phrase, QueryTermSubset};
|
||||
use super::small_bitmap::SmallBitmap;
|
||||
use super::{QueryGraph, SearchContext, Word};
|
||||
use crate::search::new::query_term::LocatedQueryTermSubset;
|
||||
use crate::Result;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct PhraseDocIdsCache {
|
||||
pub cache: FxHashMap<Interned<Phrase>, RoaringBitmap>,
|
||||
}
|
||||
impl<'ctx> SearchContext<'ctx> {
|
||||
/// Get the document ids associated with the given phrase
|
||||
pub fn get_phrase_docids(&mut self, phrase: Interned<Phrase>) -> Result<&RoaringBitmap> {
|
||||
if self.phrase_docids.cache.contains_key(&phrase) {
|
||||
return Ok(&self.phrase_docids.cache[&phrase]);
|
||||
};
|
||||
let docids = compute_phrase_docids(self, phrase)?;
|
||||
// TODO can we improve that? Because there is an issue, we keep that in cache...
|
||||
let _ = self.phrase_docids.cache.insert(phrase, docids);
|
||||
let docids = &self.phrase_docids.cache[&phrase];
|
||||
Ok(docids)
|
||||
}
|
||||
}
|
||||
pub fn compute_query_term_subset_docids(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
term: &QueryTermSubset,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut docids = RoaringBitmap::new();
|
||||
// TODO use the MultiOps trait to do large intersections
|
||||
for word in term.all_single_words_except_prefix_db(ctx)? {
|
||||
if let Some(word_docids) = ctx.word_docids(universe, word)? {
|
||||
docids |= word_docids;
|
||||
}
|
||||
}
|
||||
for phrase in term.all_phrases(ctx)? {
|
||||
docids |= ctx.get_phrase_docids(phrase)?;
|
||||
}
|
||||
|
||||
if let Some(prefix) = term.use_prefix_db(ctx) {
|
||||
if let Some(prefix_docids) = ctx.word_prefix_docids(universe, prefix)? {
|
||||
docids |= prefix_docids;
|
||||
}
|
||||
}
|
||||
|
||||
match universe {
|
||||
Some(universe) => Ok(docids & universe),
|
||||
None => Ok(docids),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn compute_query_term_subset_docids_within_field_id(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
term: &QueryTermSubset,
|
||||
fid: u16,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for word in term.all_single_words_except_prefix_db(ctx)? {
|
||||
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, word.interned(), fid)? {
|
||||
docids |= word_fid_docids;
|
||||
}
|
||||
}
|
||||
|
||||
for phrase in term.all_phrases(ctx)? {
|
||||
// There may be false positives when resolving a phrase, so we're not
|
||||
// guaranteed that all of its words are within a single fid.
|
||||
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
|
||||
if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, *word, fid)? {
|
||||
docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(word_prefix) = term.use_prefix_db(ctx) {
|
||||
if let Some(word_fid_docids) =
|
||||
ctx.get_db_word_prefix_fid_docids(universe, word_prefix.interned(), fid)?
|
||||
{
|
||||
docids |= word_fid_docids;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
pub fn compute_query_term_subset_docids_within_position(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
term: &QueryTermSubset,
|
||||
position: u16,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut docids = RoaringBitmap::new();
|
||||
for word in term.all_single_words_except_prefix_db(ctx)? {
|
||||
if let Some(word_position_docids) =
|
||||
ctx.get_db_word_position_docids(universe, word.interned(), position)?
|
||||
{
|
||||
docids |= word_position_docids;
|
||||
}
|
||||
}
|
||||
|
||||
for phrase in term.all_phrases(ctx)? {
|
||||
// It's difficult to know the expected position of the words in the phrase,
|
||||
// so instead we just check the first one.
|
||||
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
|
||||
if let Some(word_position_docids) =
|
||||
ctx.get_db_word_position_docids(universe, *word, position)?
|
||||
{
|
||||
docids |= ctx.get_phrase_docids(phrase)? & word_position_docids;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(word_prefix) = term.use_prefix_db(ctx) {
|
||||
if let Some(word_position_docids) =
|
||||
ctx.get_db_word_prefix_position_docids(universe, word_prefix.interned(), position)?
|
||||
{
|
||||
docids |= word_position_docids;
|
||||
}
|
||||
}
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
/// Returns the subset of the input universe that satisfies the contraints of the input query graph.
|
||||
pub fn compute_query_graph_docids(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
q: &QueryGraph,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes);
|
||||
let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new());
|
||||
|
||||
let mut next_nodes_to_visit = VecDeque::new();
|
||||
next_nodes_to_visit.push_back(q.root_node);
|
||||
|
||||
while let Some(node_id) = next_nodes_to_visit.pop_front() {
|
||||
let node = q.nodes.get(node_id);
|
||||
let predecessors = &node.predecessors;
|
||||
if !predecessors.is_subset(&nodes_resolved) {
|
||||
next_nodes_to_visit.push_back(node_id);
|
||||
continue;
|
||||
}
|
||||
// Take union of all predecessors
|
||||
let predecessors_docids =
|
||||
MultiOps::union(predecessors.iter().map(|p| path_nodes_docids.get(p)));
|
||||
|
||||
let node_docids = match &node.data {
|
||||
QueryNodeData::Term(LocatedQueryTermSubset {
|
||||
term_subset,
|
||||
positions: _,
|
||||
term_ids: _,
|
||||
}) => compute_query_term_subset_docids(ctx, Some(&predecessors_docids), term_subset)?,
|
||||
QueryNodeData::Deleted => {
|
||||
panic!()
|
||||
}
|
||||
QueryNodeData::Start => universe.clone(),
|
||||
QueryNodeData::End => {
|
||||
return Ok(predecessors_docids);
|
||||
}
|
||||
};
|
||||
nodes_resolved.insert(node_id);
|
||||
*path_nodes_docids.get_mut(node_id) = node_docids;
|
||||
|
||||
for succ in node.successors.iter() {
|
||||
if !next_nodes_to_visit.contains(&succ) && !nodes_resolved.contains(succ) {
|
||||
next_nodes_to_visit.push_back(succ);
|
||||
}
|
||||
}
|
||||
|
||||
for prec in node.predecessors.iter() {
|
||||
if q.nodes.get(prec).successors.is_subset(&nodes_resolved) {
|
||||
path_nodes_docids.get_mut(prec).clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
panic!()
|
||||
}
|
||||
|
||||
pub fn compute_phrase_docids(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
phrase: Interned<Phrase>,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let Phrase { words } = ctx.phrase_interner.get(phrase).clone();
|
||||
|
||||
if words.is_empty() {
|
||||
return Ok(RoaringBitmap::new());
|
||||
}
|
||||
let mut candidates = RoaringBitmap::new();
|
||||
for word in words.iter().flatten().copied() {
|
||||
if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? {
|
||||
candidates |= word_docids;
|
||||
} else {
|
||||
return Ok(RoaringBitmap::new());
|
||||
}
|
||||
}
|
||||
|
||||
let winsize = words.len().min(3);
|
||||
|
||||
for win in words.windows(winsize) {
|
||||
// Get all the documents with the matching distance for each word pairs.
|
||||
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
|
||||
for (offset, &s1) in win
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
|
||||
{
|
||||
for (dist, &s2) in win
|
||||
.iter()
|
||||
.skip(offset + 1)
|
||||
.enumerate()
|
||||
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
|
||||
{
|
||||
if dist == 0 {
|
||||
match ctx.get_db_word_pair_proximity_docids(None, s1, s2, 1)? {
|
||||
Some(m) => bitmaps.push(m),
|
||||
// If there are no documents for this pair, there will be no
|
||||
// results for the phrase query.
|
||||
None => return Ok(RoaringBitmap::new()),
|
||||
}
|
||||
} else {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
for dist in 0..=dist {
|
||||
if let Some(m) =
|
||||
ctx.get_db_word_pair_proximity_docids(None, s1, s2, dist as u8 + 1)?
|
||||
{
|
||||
bitmap |= m;
|
||||
}
|
||||
}
|
||||
if bitmap.is_empty() {
|
||||
return Ok(bitmap);
|
||||
} else {
|
||||
bitmaps.push(bitmap);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We sort the bitmaps so that we perform the small intersections first, which is faster.
|
||||
bitmaps.sort_unstable_by_key(|a| a.len());
|
||||
|
||||
// TODO use MultiOps intersection which and remove the above sort
|
||||
for bitmap in bitmaps {
|
||||
candidates &= bitmap;
|
||||
|
||||
// There will be no match, return early
|
||||
if candidates.is_empty() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(candidates)
|
||||
}
|
||||
414
crates/milli/src/search/new/small_bitmap.rs
Normal file
414
crates/milli/src/search/new/small_bitmap.rs
Normal file
@@ -0,0 +1,414 @@
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use super::interner::{FixedSizeInterner, Interned};
|
||||
|
||||
/// A compact set of [`Interned<T>`]
|
||||
///
|
||||
/// This set optimizes storage by storing the set of values in a bitmap, and further optimizes
|
||||
/// for bitmaps where the highest possible index (describing the limits of the "universe")
|
||||
/// is smaller than 64 by storing them as a `u64`.
|
||||
pub struct SmallBitmap<T> {
|
||||
// internals are not typed as they only represent the indexes that are set
|
||||
internal: SmallBitmapInternal,
|
||||
// restores typing with a tag
|
||||
_phantom: PhantomData<T>,
|
||||
}
|
||||
|
||||
// manual implementation for when `T` is not Clone.
|
||||
impl<T> Clone for SmallBitmap<T> {
|
||||
fn clone(&self) -> Self {
|
||||
Self { internal: self.internal.clone(), _phantom: PhantomData }
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> SmallBitmap<T> {
|
||||
/// Constructs a new, **empty**, `SmallBitmap<T>` with an universe large enough to hold all elements
|
||||
/// from `interner`.
|
||||
///
|
||||
/// The constructed bitmap does not refer to any element in the interner, use [`from_iter`] if there should be
|
||||
/// some interned values in the bitmap after construction.
|
||||
pub fn for_interned_values_in(interner: &FixedSizeInterner<T>) -> Self {
|
||||
Self::new(interner.len())
|
||||
}
|
||||
|
||||
/// Constructs a new, **empty**, `SmallBitmap<T>` with an universe at least as large as specified.
|
||||
///
|
||||
/// If the passed universe length is not a multiple of 64, it will be rounded up to the next multiple of 64.
|
||||
pub fn new(universe_length: u16) -> Self {
|
||||
if universe_length <= 64 {
|
||||
Self { internal: SmallBitmapInternal::Tiny(0), _phantom: PhantomData }
|
||||
} else {
|
||||
Self {
|
||||
internal: SmallBitmapInternal::Small(
|
||||
vec![0; 1 + (universe_length - 1) as usize / 64].into_boxed_slice(),
|
||||
),
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// The highest index that can be set in this bitmap.
|
||||
///
|
||||
/// The universe length is always a multiple of 64, and may be higher than the value passed to [`Self::new`].
|
||||
pub fn universe_length(&self) -> u16 {
|
||||
self.internal.universe_length()
|
||||
}
|
||||
|
||||
/// Constructs a new `SmallBitmap<T>` with an universe large enough to hold all elements
|
||||
/// from `from_interner`, and containing all the `Interned<T>` produced by `xs`.
|
||||
///
|
||||
/// It is a logic error to pass an iterator producing `Interned<T>`s that don't belong to the passed interner.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - If `xs` produces an element that doesn't fit the universe length obtained from `for_interner`.
|
||||
pub fn from_iter(
|
||||
xs: impl Iterator<Item = Interned<T>>,
|
||||
for_interner: &FixedSizeInterner<T>,
|
||||
) -> Self {
|
||||
Self {
|
||||
internal: SmallBitmapInternal::from_iter(xs.map(|x| x.into_raw()), for_interner.len()),
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `true` if this bitmap does not contain any `Interned<T>`.
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.internal.is_empty()
|
||||
}
|
||||
|
||||
/// Removes all `Interned<T>` from this bitmap, such that it [`is_empty`] returns `true` after this call.
|
||||
pub fn clear(&mut self) {
|
||||
self.internal.clear()
|
||||
}
|
||||
|
||||
/// Whether `x` is part of the bitmap.
|
||||
///
|
||||
/// It is a logic error to pass an `Interned<T>` from a different interner that the one this bitmap references.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if `x` does not fit in [`universe_length`]
|
||||
pub fn contains(&self, x: Interned<T>) -> bool {
|
||||
self.internal.contains(x.into_raw())
|
||||
}
|
||||
|
||||
/// Adds `x` to the bitmap, such that [`contains(x)`] returns `true` after this call.
|
||||
///
|
||||
/// It is a logic error to pass an `Interned<T>` from a different interner that the one this bitmap references.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if `x` does not fit in [`universe_length`]
|
||||
pub fn insert(&mut self, x: Interned<T>) {
|
||||
self.internal.insert(x.into_raw())
|
||||
}
|
||||
|
||||
/// Removes `x` from the bitmap, such that [`contains(x)`] returns `false` after this call.
|
||||
///
|
||||
/// It is a logic error to pass an `Interned<T>` from a different interner that the one this bitmap references.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if `x` does not fit in [`universe_length`]
|
||||
pub fn remove(&mut self, x: Interned<T>) {
|
||||
self.internal.remove(x.into_raw())
|
||||
}
|
||||
|
||||
/// Modifies in place this bitmap to retain only the elements that are also present in `other`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if the universe lengths of `self` and `other` differ
|
||||
pub fn intersection(&mut self, other: &Self) {
|
||||
self.internal.intersection(&other.internal)
|
||||
}
|
||||
|
||||
/// Modifies in place this bitmap to add the elements that are present in `other`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if the universe lengths of `self` and `other` differ
|
||||
pub fn union(&mut self, other: &Self) {
|
||||
self.internal.union(&other.internal)
|
||||
}
|
||||
|
||||
/// Modifies in place this bitmap to remove the elements that are also present in `other`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if the universe lengths of `self` and `other` differ
|
||||
pub fn subtract(&mut self, other: &Self) {
|
||||
self.internal.subtract(&other.internal)
|
||||
}
|
||||
|
||||
/// Whether all the elements of `self` are contained in `other`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if the universe lengths of `self` and `other` differ
|
||||
pub fn is_subset(&self, other: &Self) -> bool {
|
||||
self.internal.is_subset(&other.internal)
|
||||
}
|
||||
|
||||
/// Whether any element of `self` is contained in `other`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if the universe lengths of `self` and `other` differ
|
||||
pub fn intersects(&self, other: &Self) -> bool {
|
||||
self.internal.intersects(&other.internal)
|
||||
}
|
||||
|
||||
/// Returns an iterator of the `Interned<T>` that are present in this bitmap.
|
||||
pub fn iter(&self) -> impl Iterator<Item = Interned<T>> + '_ {
|
||||
self.internal.iter().map(|x| Interned::from_raw(x))
|
||||
}
|
||||
}
|
||||
/// Untyped storage behind `SmallBitmap`: a set of `u16` indexes.
///
/// Universes of at most 64 indexes are stored inline in a single `u64` (`Tiny`);
/// larger ones use one `u64` word per 64 indexes (`Small`).
#[derive(Clone)]
enum SmallBitmapInternal {
    Tiny(u64),
    Small(Box<[u64]>),
}

impl SmallBitmapInternal {
    /// Creates an empty bitmap able to hold the indexes `0..universe_length`.
    ///
    /// The universe is rounded up to the next multiple of 64. The word count is the
    /// same as the one used by `SmallBitmap::new`, so that bitmaps created through
    /// either constructor for the same universe have the same length and can be
    /// combined with each other.
    fn new(universe_length: u16) -> Self {
        if universe_length <= 64 {
            Self::Tiny(0)
        } else {
            // ceil(universe_length / 64). The former `1 + universe_length / 64`
            // allocated one word too many when the length was a multiple of 64.
            let word_count = 1 + (universe_length - 1) as usize / 64;
            Self::Small(vec![0; word_count].into_boxed_slice())
        }
    }

    /// Creates a bitmap for the given universe that contains every index produced by `xs`.
    ///
    /// # Panics
    ///
    /// Panics if `xs` produces an index that does not fit in the universe.
    fn from_iter(xs: impl Iterator<Item = u16>, universe_length: u16) -> Self {
        let mut s = Self::new(universe_length);
        for x in xs {
            s.insert(x);
        }
        s
    }

    /// Returns `true` if no index is set.
    pub fn is_empty(&self) -> bool {
        match self {
            SmallBitmapInternal::Tiny(set) => *set == 0,
            SmallBitmapInternal::Small(sets) => sets.iter().all(|set| *set == 0),
        }
    }

    /// Unsets every index.
    pub fn clear(&mut self) {
        match self {
            SmallBitmapInternal::Tiny(set) => *set = 0,
            SmallBitmapInternal::Small(sets) => sets.fill(0),
        }
    }

    /// The number of indexes this bitmap can hold, always a multiple of 64.
    // NOTE(review): the product is computed as a `u16`, so it does not fit when the
    // bitmap has 1024 words or more (universe lengths above 65472).
    pub fn universe_length(&self) -> u16 {
        match &self {
            SmallBitmapInternal::Tiny(_) => 64,
            SmallBitmapInternal::Small(xs) => 64 * xs.len() as u16,
        }
    }

    /// Returns a copy of the word that holds index `x`, and the bit of `x` in that word.
    ///
    /// # Panics
    ///
    /// Panics if `x` does not fit in the universe.
    fn get_set_index(&self, x: u16) -> (u64, u16) {
        match self {
            SmallBitmapInternal::Tiny(set) => {
                assert!(
                    x < 64,
                    "index out of bounds: the universe length is 64 but the index is {}",
                    x
                );
                (*set, x)
            }
            SmallBitmapInternal::Small(set) => {
                let idx = (x as usize) / 64;
                assert!(
                    idx < set.len(),
                    "index out of bounds: the universe length is {} but the index is {}",
                    self.universe_length(),
                    x
                );
                (set[idx], x % 64)
            }
        }
    }

    /// Returns the word that holds index `x`, mutably, and the bit of `x` in that word.
    ///
    /// # Panics
    ///
    /// Panics if `x` does not fit in the universe.
    fn get_set_index_mut(&mut self, x: u16) -> (&mut u64, u16) {
        match self {
            SmallBitmapInternal::Tiny(set) => {
                assert!(
                    x < 64,
                    "index out of bounds: the universe length is 64 but the index is {}",
                    x
                );
                (set, x)
            }
            SmallBitmapInternal::Small(set) => {
                let idx = (x as usize) / 64;
                assert!(
                    idx < set.len(),
                    "index out of bounds: the universe length is {} but the index is {}",
                    64 * set.len() as u16,
                    x
                );
                (&mut set[idx], x % 64)
            }
        }
    }

    /// Whether index `x` is set. Panics if `x` does not fit in the universe.
    pub fn contains(&self, x: u16) -> bool {
        let (set, bit) = self.get_set_index(x);
        set & (1u64 << bit) != 0
    }

    /// Sets index `x`. Panics if `x` does not fit in the universe.
    pub fn insert(&mut self, x: u16) {
        let (set, bit) = self.get_set_index_mut(x);
        *set |= 1u64 << bit;
    }

    /// Unsets index `x`. Panics if `x` does not fit in the universe.
    pub fn remove(&mut self, x: u16) {
        let (set, bit) = self.get_set_index_mut(x);
        *set &= !(1u64 << bit);
    }

    /// Keeps only the indexes that are also set in `other`.
    pub fn intersection(&mut self, other: &SmallBitmapInternal) {
        self.apply_op(other, |a, b| *a &= b);
    }

    /// Adds the indexes that are set in `other`.
    pub fn union(&mut self, other: &SmallBitmapInternal) {
        self.apply_op(other, |a, b| *a |= b);
    }

    /// Removes the indexes that are set in `other`.
    pub fn subtract(&mut self, other: &SmallBitmapInternal) {
        self.apply_op(other, |a, b| *a &= !b);
    }

    /// Applies `op` to every pair of corresponding words of `self` and `other`.
    ///
    /// # Panics
    ///
    /// Panics if the two bitmaps do not have the same universe length.
    pub fn apply_op(&mut self, other: &SmallBitmapInternal, op: impl Fn(&mut u64, u64)) {
        match (self, other) {
            (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(a, *b),
            (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => {
                assert!(
                    a.len() == b.len(),
                    "universe length mismatch: left is {}, but right is {}",
                    a.len() * 64,
                    other.universe_length()
                );
                for (a, b) in a.iter_mut().zip(b.iter()) {
                    op(a, *b);
                }
            }
            (this, other) => {
                panic!(
                    "universe length mismatch: left is {}, but right is {}",
                    this.universe_length(),
                    other.universe_length()
                );
            }
        }
    }

    /// Whether `op` holds for every pair of corresponding words of `self` and `other`.
    ///
    /// # Panics
    ///
    /// Panics if the two bitmaps do not have the same universe length.
    fn all_satisfy_op(&self, other: &SmallBitmapInternal, op: impl Fn(u64, u64) -> bool) -> bool {
        match (self, other) {
            (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(*a, *b),
            (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => {
                assert!(
                    a.len() == b.len(),
                    "universe length mismatch: left is {}, but right is {}",
                    a.len() * 64,
                    other.universe_length()
                );
                a.iter().zip(b.iter()).all(|(a, b)| op(*a, *b))
            }
            _ => {
                panic!(
                    "universe length mismatch: left is {}, but right is {}",
                    self.universe_length(),
                    other.universe_length()
                );
            }
        }
    }

    /// Whether `op` holds for at least one pair of corresponding words of `self` and `other`.
    ///
    /// # Panics
    ///
    /// Panics if the two bitmaps do not have the same universe length.
    fn any_satisfy_op(&self, other: &SmallBitmapInternal, op: impl Fn(u64, u64) -> bool) -> bool {
        match (self, other) {
            (SmallBitmapInternal::Tiny(a), SmallBitmapInternal::Tiny(b)) => op(*a, *b),
            (SmallBitmapInternal::Small(a), SmallBitmapInternal::Small(b)) => {
                assert!(
                    a.len() == b.len(),
                    "universe length mismatch: left is {}, but right is {}",
                    a.len() * 64,
                    other.universe_length()
                );
                a.iter().zip(b.iter()).any(|(a, b)| op(*a, *b))
            }
            _ => {
                panic!(
                    "universe length mismatch: left is {}, but right is {}",
                    self.universe_length(),
                    other.universe_length()
                );
            }
        }
    }

    /// Whether every index set in `self` is also set in `other`.
    pub fn is_subset(&self, other: &SmallBitmapInternal) -> bool {
        self.all_satisfy_op(other, |a, b| a & !b == 0)
    }

    /// Whether at least one index is set in both `self` and `other`.
    pub fn intersects(&self, other: &SmallBitmapInternal) -> bool {
        self.any_satisfy_op(other, |a, b| a & b != 0)
    }

    /// Iterates over the set indexes, in increasing order.
    pub fn iter(&self) -> SmallBitmapInternalIter<'_> {
        match self {
            SmallBitmapInternal::Tiny(x) => SmallBitmapInternalIter::Tiny(*x),
            SmallBitmapInternal::Small(xs) => match xs.split_first() {
                Some((first, rest)) => {
                    SmallBitmapInternalIter::Small { cur: *first, next: rest, base: 0 }
                }
                // A word-less bitmap has nothing to yield; avoid indexing into it.
                None => SmallBitmapInternalIter::Small { cur: 0, next: &[], base: 0 },
            },
        }
    }
}

/// Iterator over the indexes set in a `SmallBitmapInternal`.
///
/// `Tiny` holds the bits that remain to be yielded. `Small` holds the remaining bits
/// of the current word (`cur`), the words that follow it (`next`) and the index of
/// the first bit of the current word (`base`).
pub enum SmallBitmapInternalIter<'b> {
    Tiny(u64),
    Small { cur: u64, next: &'b [u64], base: u16 },
}

impl<'b> Iterator for SmallBitmapInternalIter<'b> {
    type Item = u16;

    fn next(&mut self) -> Option<Self::Item> {
        match self {
            SmallBitmapInternalIter::Tiny(set) => {
                if *set == 0 {
                    return None;
                }
                let idx = set.trailing_zeros() as u16;
                // Clear the lowest set bit.
                *set &= *set - 1;
                Some(idx)
            }
            SmallBitmapInternalIter::Small { cur, next, base } => loop {
                if *cur != 0 {
                    let idx = cur.trailing_zeros() as u16;
                    // Clear the lowest set bit.
                    *cur &= *cur - 1;
                    return Some(idx + *base);
                }
                // The current word is exhausted: move on to the next one, if any.
                let remaining: &'b [u64] = *next;
                let (first, rest) = remaining.split_first()?;
                *base += 64;
                *cur = *first;
                *next = rest;
            },
        }
    }
}
|
||||
226
crates/milli/src/search/new/sort.rs
Normal file
226
crates/milli/src/search/new/sort.rs
Normal file
@@ -0,0 +1,226 @@
|
||||
use heed::BytesDecode;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::logger::SearchLogger;
|
||||
use super::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait, SearchContext};
|
||||
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
|
||||
use crate::heed_codec::{BytesRefCodec, StrRefCodec};
|
||||
use crate::score_details::{self, ScoreDetails};
|
||||
use crate::search::facet::{ascending_facet_sort, descending_facet_sort};
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
pub trait RankingRuleOutputIter<'ctx, Query> {
|
||||
fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;
|
||||
}
|
||||
|
||||
pub struct RankingRuleOutputIterWrapper<'ctx, Query> {
|
||||
iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'ctx>,
|
||||
}
|
||||
impl<'ctx, Query> RankingRuleOutputIterWrapper<'ctx, Query> {
|
||||
pub fn new(iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'ctx>) -> Self {
|
||||
Self { iter }
|
||||
}
|
||||
}
|
||||
impl<'ctx, Query> RankingRuleOutputIter<'ctx, Query> for RankingRuleOutputIterWrapper<'ctx, Query> {
|
||||
fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>> {
|
||||
match self.iter.next() {
|
||||
Some(x) => x.map(Some),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// `Query` type parameter: the same as the type parameter to bucket_sort
|
||||
// implements RankingRuleQuery trait, either querygraph or placeholdersearch
|
||||
// The sort ranking rule doesn't need the query parameter, it is doing the same thing
|
||||
// whether we're doing a querygraph or placeholder search.
|
||||
//
|
||||
// Query Stored anyway because every ranking rule must return a query from next_bucket
|
||||
// ---
|
||||
// "Mismatch" between new/old impl.:
|
||||
// - old impl: roaring bitmap as input, ranking rule iterates over all the buckets
|
||||
// - new impl: still works like that but it shouldn't, because the universe may change for every call to next_bucket, itself due to:
|
||||
// 1. elements that were already returned by the ranking rule are subtracted from the universe, also done in the old impl (subtracted from the candidates)
|
||||
// 2. NEW in the new impl.: distinct rule might have been applied btwn calls to next_bucket
|
||||
// new impl ignores docs removed in (2), which is a missed perf opt issue, see `next_bucket`
|
||||
// this perf problem is P2
|
||||
// mostly happens when many documents map to the same distinct attribute value.
|
||||
pub struct Sort<'ctx, Query> {
|
||||
field_name: String,
|
||||
field_id: Option<FieldId>,
|
||||
is_ascending: bool,
|
||||
original_query: Option<Query>,
|
||||
iter: Option<RankingRuleOutputIterWrapper<'ctx, Query>>,
|
||||
must_redact: bool,
|
||||
}
|
||||
impl<'ctx, Query> Sort<'ctx, Query> {
|
||||
pub fn new(
|
||||
index: &Index,
|
||||
rtxn: &'ctx heed::RoTxn<'ctx>,
|
||||
field_name: String,
|
||||
is_ascending: bool,
|
||||
) -> Result<Self> {
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let field_id = fields_ids_map.id(&field_name);
|
||||
let must_redact = Self::must_redact(index, rtxn, &field_name)?;
|
||||
|
||||
Ok(Self {
|
||||
field_name,
|
||||
field_id,
|
||||
is_ascending,
|
||||
original_query: None,
|
||||
iter: None,
|
||||
must_redact,
|
||||
})
|
||||
}
|
||||
|
||||
fn must_redact(index: &Index, rtxn: &'ctx heed::RoTxn<'ctx>, field_name: &str) -> Result<bool> {
|
||||
let Some(displayed_fields) = index.displayed_fields(rtxn)? else {
|
||||
return Ok(false);
|
||||
};
|
||||
|
||||
Ok(!displayed_fields.iter().any(|&field| field == field_name))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, Query> {
|
||||
fn id(&self) -> String {
|
||||
let Self { field_name, is_ascending, .. } = self;
|
||||
format!("{field_name}:{}", if *is_ascending { "asc" } else { "desc" })
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::sort")]
|
||||
fn start_iteration(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<Query>,
|
||||
parent_candidates: &RoaringBitmap,
|
||||
parent_query: &Query,
|
||||
) -> Result<()> {
|
||||
let iter: RankingRuleOutputIterWrapper<'ctx, Query> = match self.field_id {
|
||||
Some(field_id) => {
|
||||
let number_db = ctx
|
||||
.index
|
||||
.facet_id_f64_docids
|
||||
.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
|
||||
let string_db = ctx
|
||||
.index
|
||||
.facet_id_string_docids
|
||||
.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
|
||||
|
||||
let (number_iter, string_iter) = if self.is_ascending {
|
||||
let number_iter = ascending_facet_sort(
|
||||
ctx.txn,
|
||||
number_db,
|
||||
field_id,
|
||||
parent_candidates.clone(),
|
||||
)?;
|
||||
let string_iter = ascending_facet_sort(
|
||||
ctx.txn,
|
||||
string_db,
|
||||
field_id,
|
||||
parent_candidates.clone(),
|
||||
)?;
|
||||
|
||||
(itertools::Either::Left(number_iter), itertools::Either::Left(string_iter))
|
||||
} else {
|
||||
let number_iter = descending_facet_sort(
|
||||
ctx.txn,
|
||||
number_db,
|
||||
field_id,
|
||||
parent_candidates.clone(),
|
||||
)?;
|
||||
let string_iter = descending_facet_sort(
|
||||
ctx.txn,
|
||||
string_db,
|
||||
field_id,
|
||||
parent_candidates.clone(),
|
||||
)?;
|
||||
|
||||
(itertools::Either::Right(number_iter), itertools::Either::Right(string_iter))
|
||||
};
|
||||
let number_iter = number_iter.map(|r| -> Result<_> {
|
||||
let (docids, bytes) = r?;
|
||||
Ok((
|
||||
docids,
|
||||
serde_json::Value::Number(
|
||||
serde_json::Number::from_f64(
|
||||
OrderedF64Codec::bytes_decode(bytes).expect("some number"),
|
||||
)
|
||||
.expect("too big float"),
|
||||
),
|
||||
))
|
||||
});
|
||||
let string_iter = string_iter.map(|r| -> Result<_> {
|
||||
let (docids, bytes) = r?;
|
||||
Ok((
|
||||
docids,
|
||||
serde_json::Value::String(
|
||||
StrRefCodec::bytes_decode(bytes).expect("some string").to_owned(),
|
||||
),
|
||||
))
|
||||
});
|
||||
|
||||
let query_graph = parent_query.clone();
|
||||
let ascending = self.is_ascending;
|
||||
let field_name = self.field_name.clone();
|
||||
let must_redact = self.must_redact;
|
||||
RankingRuleOutputIterWrapper::new(Box::new(number_iter.chain(string_iter).map(
|
||||
move |r| {
|
||||
let (docids, value) = r?;
|
||||
Ok(RankingRuleOutput {
|
||||
query: query_graph.clone(),
|
||||
candidates: docids,
|
||||
score: ScoreDetails::Sort(score_details::Sort {
|
||||
field_name: field_name.clone(),
|
||||
ascending,
|
||||
redacted: must_redact,
|
||||
value,
|
||||
}),
|
||||
})
|
||||
},
|
||||
)))
|
||||
}
|
||||
None => RankingRuleOutputIterWrapper::new(Box::new(std::iter::empty())),
|
||||
};
|
||||
self.original_query = Some(parent_query.clone());
|
||||
self.iter = Some(iter);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::sort")]
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
_ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<Query>,
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<Option<RankingRuleOutput<Query>>> {
|
||||
let iter = self.iter.as_mut().unwrap();
|
||||
if let Some(mut bucket) = iter.next_bucket()? {
|
||||
bucket.candidates &= universe;
|
||||
Ok(Some(bucket))
|
||||
} else {
|
||||
let query = self.original_query.as_ref().unwrap().clone();
|
||||
Ok(Some(RankingRuleOutput {
|
||||
query,
|
||||
candidates: universe.clone(),
|
||||
score: ScoreDetails::Sort(score_details::Sort {
|
||||
field_name: self.field_name.clone(),
|
||||
ascending: self.is_ascending,
|
||||
redacted: self.must_redact,
|
||||
value: serde_json::Value::Null,
|
||||
}),
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::sort")]
|
||||
fn end_iteration(
|
||||
&mut self,
|
||||
_ctx: &mut SearchContext<'ctx>,
|
||||
_logger: &mut dyn SearchLogger<Query>,
|
||||
) {
|
||||
self.original_query = None;
|
||||
self.iter = None;
|
||||
}
|
||||
}
|
||||
159
crates/milli/src/search/new/tests/attribute_fid.rs
Normal file
159
crates/milli/src/search/new/tests/attribute_fid.rs
Normal file
@@ -0,0 +1,159 @@
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::{db_snap, Criterion, Search, SearchResult, TermsMatchingStrategy};
|
||||
|
||||
fn create_index() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec![
|
||||
"title".to_owned(),
|
||||
"description".to_owned(),
|
||||
"plot".to_owned(),
|
||||
]);
|
||||
s.set_criteria(vec![Criterion::Attribute]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"title": "",
|
||||
"description": "",
|
||||
"plot": "the quick brown fox jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"title": "",
|
||||
"description": "the quick brown foxes jump over the lazy dog",
|
||||
"plot": "",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "the quick brown fox jumps over the lazy dog",
|
||||
"description": "",
|
||||
"plot": "",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "the",
|
||||
"description": "quick brown fox jumps over the lazy dog",
|
||||
"plot": "",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "the quick",
|
||||
"description": "brown fox jumps over the lazy dog",
|
||||
"plot": "",
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "the quick brown",
|
||||
"description": "fox jumps over the lazy dog",
|
||||
"plot": "",
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "the quick brown fox",
|
||||
"description": "jumps over the lazy dog",
|
||||
"plot": "",
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "the quick",
|
||||
"description": "brown fox jumps",
|
||||
"plot": "over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "the quick brown",
|
||||
"description": "fox",
|
||||
"plot": "jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "the quick brown",
|
||||
"description": "fox jumps",
|
||||
"plot": "over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "",
|
||||
"description": "the quick brown fox",
|
||||
"plot": "jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "the quick",
|
||||
"description": "",
|
||||
"plot": "brown fox jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "",
|
||||
"description": "the quickbrownfox",
|
||||
"plot": "jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"title": "",
|
||||
"description": "the quick brown fox",
|
||||
"plot": "jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"title": "",
|
||||
"description": "the quickbrownfox",
|
||||
"plot": "jumps overthelazy dog",
|
||||
},
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
/// Runs the full sentence with the `All` matching strategy and snapshots
/// every returned document id together with its detailed scores.
#[test]
fn test_attribute_fid_simple() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.query("the quick brown fox jumps over the lazy dog");
    search.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);

    let result = search.execute().unwrap();
    let document_ids_scores: Vec<_> =
        result.documents_ids.iter().zip(result.document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
}
|
||||
|
||||
/// Checks the field-id, searchable-field and weight maps of the fixture,
/// then snapshots the detailed scores of the full-sentence query.
#[test]
fn test_attribute_fid_ngrams() {
    let index = create_index();
    db_snap!(index, fields_ids_map, @r###"
    0 id |
    1 title |
    2 description |
    3 plot |
    "###);
    db_snap!(index, searchable_fields, @r###"["title", "description", "plot"]"###);
    db_snap!(index, fieldids_weights_map, @r###"
    fid weight
    1 0 |
    2 1 |
    3 2 |
    "###);

    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.query("the quick brown fox jumps over the lazy dog");
    search.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);

    let result = search.execute().unwrap();

    let document_ids_scores: Vec<_> =
        result.documents_ids.iter().zip(result.document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
}
|
||||
196
crates/milli/src/search/new/tests/attribute_position.rs
Normal file
196
crates/milli/src/search/new/tests/attribute_position.rs
Normal file
@@ -0,0 +1,196 @@
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::{db_snap, Criterion, Search, SearchResult, TermsMatchingStrategy};
|
||||
|
||||
fn create_index() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec![
|
||||
"text".to_owned(),
|
||||
"text2".to_owned(),
|
||||
"other".to_owned(),
|
||||
]);
|
||||
s.set_criteria(vec![Criterion::Attribute]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "do you know about the quick and talented brown fox",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "do you know about the quick brown fox",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "the quick and talented brown fox",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "fox brown quick the",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"text": "the quick brown fox",
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
the quick brown fox",
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"text": "quick a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
brown",
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
quickbrown",
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
quick brown",
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a
|
||||
quickbrown",
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"text": "quick brown",
|
||||
"text2": "brown quick",
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"text": "quickbrown",
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"text": "quick brown",
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"text": "quickbrown",
|
||||
},
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
/// Pins the hash of the `word_position_docids` database, then snapshots the
/// detailed scores returned for the query `quick brown`.
#[test]
fn test_attribute_position_simple() {
    let index = create_index();

    db_snap!(index, word_position_docids, @"1ad58847d772924d8aab5e92be8cf0cc");

    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.query("quick brown");
    search.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);

    let result = search.execute().unwrap();

    let document_ids_scores: Vec<_> =
        result.documents_ids.iter().zip(result.document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
}
|
||||
/// Snapshots the detailed scores for a query made of one word repeated
/// five times (`a a a a a`).
#[test]
fn test_attribute_position_repeated() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.query("a a a a a");
    search.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);

    let result = search.execute().unwrap();

    let document_ids_scores: Vec<_> =
        result.documents_ids.iter().zip(result.document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
}
|
||||
|
||||
/// Snapshots the detailed scores for the query `quick brown` under this
/// test's own snapshot name.
#[test]
fn test_attribute_position_different_fields() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.query("quick brown");
    search.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);

    let result = search.execute().unwrap();

    let document_ids_scores: Vec<_> =
        result.documents_ids.iter().zip(result.document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
}
|
||||
|
||||
/// Snapshots the detailed scores for the query `quick brown` under this
/// test's own snapshot name.
#[test]
fn test_attribute_position_ngrams() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.query("quick brown");
    search.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);

    let result = search.execute().unwrap();

    let document_ids_scores: Vec<_> =
        result.documents_ids.iter().zip(result.document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
}
|
||||
429
crates/milli/src/search/new/tests/cutoff.rs
Normal file
429
crates/milli/src/search/new/tests/cutoff.rs
Normal file
@@ -0,0 +1,429 @@
|
||||
//! This module tests the search cutoff and ensures a few things:
|
||||
//! 1. A basic test works and marks the search as degraded
|
||||
//! 2. A test that ensures the filters are effectively applied even with a cutoff of 0
|
||||
//! 3. A test that ensures the cutoff works well with the ranking scores
|
||||
|
||||
use std::time::Duration;
|
||||
|
||||
use big_s::S;
|
||||
use maplit::hashset;
|
||||
use meili_snap::snapshot;
|
||||
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::score_details::{ScoreDetails, ScoringStrategy};
|
||||
use crate::{Criterion, Filter, Search, TimeBudget};
|
||||
|
||||
fn create_index() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_filterable_fields(hashset! { S("id") });
|
||||
s.set_criteria(vec![Criterion::Words, Criterion::Typo]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
// reverse the ID / insertion order so we see better what was sorted from what got the insertion order ordering
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 4,
|
||||
"text": "hella puppo kefir",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "hella puppy kefir",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "hello",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "hello puppy",
|
||||
},
|
||||
{
|
||||
"id": 0,
|
||||
"text": "hello puppy kefir",
|
||||
},
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
/// A search given a zero-millisecond time budget must report itself as degraded.
#[test]
fn basic_degraded_search() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.query("hello puppy kefir");
    search.limit(3);
    search.time_budget(TimeBudget::new(Duration::from_millis(0)));

    let degraded = search.execute().unwrap().degraded;
    assert!(degraded);
}
|
||||
|
||||
/// Even with a zero-millisecond time budget, the filter must still restrict
/// both the candidates and the returned documents.
#[test]
fn degraded_search_cannot_skip_filter() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.query("hello puppy kefir");
    search.limit(100);
    search.time_budget(TimeBudget::new(Duration::from_millis(0)));
    let id_filter = Filter::from_str("id > 2").unwrap().unwrap();
    search.filter(id_filter);

    let result = search.execute().unwrap();
    assert!(result.degraded);
    snapshot!(format!("{:?}\n{:?}", result.candidates, result.documents_ids), @r###"
    RoaringBitmap<[0, 1]>
    [0, 1]
    "###);
}
|
||||
|
||||
/// Runs the same search with a growing number of allowed loop iterations and
/// snapshots the ids, global scores and score details after each run.
#[test]
#[allow(clippy::format_collect)] // the test is already quite big
fn degraded_search_and_score_details() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    // Renders a search result the same way for every snapshot below.
    let render = |result: &crate::SearchResult| {
        format!(
            "IDs: {:?}\nScores: {}\nScore Details:\n{:#?}",
            result.documents_ids,
            result
                .document_scores
                .iter()
                .map(|scores| format!("{:.4} ", ScoreDetails::global_score(scores.iter())))
                .collect::<String>(),
            result.document_scores
        )
    };

    let mut search = Search::new(&rtxn, &index);
    search.query("hello puppy kefir");
    search.limit(4);
    search.scoring_strategy(ScoringStrategy::Detailed);
    search.time_budget(TimeBudget::max());

    let result = search.execute().unwrap();
    snapshot!(render(&result), @r###"
    IDs: [4, 1, 0, 3]
    Scores: 1.0000 0.9167 0.8333 0.6667
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 2,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 2,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 2,
                },
            ),
        ],
    ]
    "###);

    // Do ONE loop iteration. Not much can be deduced, almost everyone matched the words first bucket.
    search.time_budget(TimeBudget::max().with_stop_after(1));

    let result = search.execute().unwrap();
    snapshot!(render(&result), @r###"
    IDs: [0, 1, 4, 2]
    Scores: 0.6667 0.6667 0.6667 0.0000
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Skipped,
        ],
    ]
    "###);

    // Do TWO loop iterations. The first document should be entirely sorted
    search.time_budget(TimeBudget::max().with_stop_after(2));

    let result = search.execute().unwrap();
    snapshot!(render(&result), @r###"
    IDs: [4, 0, 1, 2]
    Scores: 1.0000 0.6667 0.6667 0.0000
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Skipped,
        ],
    ]
    "###);

    // Do THREE loop iterations. The second document should be entirely sorted as well
    search.time_budget(TimeBudget::max().with_stop_after(3));

    let result = search.execute().unwrap();
    snapshot!(render(&result), @r###"
    IDs: [4, 1, 0, 2]
    Scores: 1.0000 0.9167 0.6667 0.0000
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
        [
            Skipped,
        ],
    ]
    "###);

    // Do FOUR loop iterations. The third document should be entirely sorted as well
    // The words bucket have still not progressed thus the last document doesn't have any info yet.
    search.time_budget(TimeBudget::max().with_stop_after(4));

    let result = search.execute().unwrap();
    snapshot!(render(&result), @r###"
    IDs: [4, 1, 0, 2]
    Scores: 1.0000 0.9167 0.8333 0.0000
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 2,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Skipped,
        ],
    ]
    "###);

    // After SIX loop iteration. The words ranking rule gave us a new bucket.
    // Since we reached the limit we were able to early exit without checking the typo ranking rule.
    search.time_budget(TimeBudget::max().with_stop_after(6));

    let result = search.execute().unwrap();
    snapshot!(render(&result), @r###"
    IDs: [4, 1, 0, 3]
    Scores: 1.0000 0.9167 0.8333 0.3333
    Score Details:
    [
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 1,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 3,
                    max_matching_words: 3,
                },
            ),
            Typo(
                Typo {
                    typo_count: 2,
                    max_typo_count: 3,
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 2,
                    max_matching_words: 3,
                },
            ),
            Skipped,
        ],
    ]
    "###);
}
|
||||
634
crates/milli/src/search/new/tests/distinct.rs
Normal file
634
crates/milli/src/search/new/tests/distinct.rs
Normal file
@@ -0,0 +1,634 @@
|
||||
/*!
|
||||
This module tests the "distinct attribute" feature, and its
|
||||
interaction with other ranking rules.
|
||||
|
||||
1. no duplicate distinct attributes are ever returned
|
||||
2. only the best document (according to the search rules) for each distinct value appears in the result
|
||||
3. if a document does not have a distinct attribute, then the distinct rule does not apply to it
|
||||
|
||||
It doesn't test properly:
|
||||
- combination of distinct + exhaustive_nbr_hits (because we know it's incorrect)
|
||||
- distinct attributes with arrays (because we know it's incorrect as well)
|
||||
*/
|
||||
|
||||
use std::collections::HashSet;
|
||||
|
||||
use big_s::S;
|
||||
use heed::RoTxn;
|
||||
use maplit::hashset;
|
||||
|
||||
use super::collect_field_values;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy};
|
||||
|
||||
fn create_index() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_sortable_fields(hashset! { S("rank1"), S("letter") });
|
||||
s.set_distinct_field("letter".to_owned());
|
||||
s.set_criteria(vec![Criterion::Words]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"letter": "A",
|
||||
"rank1": 0,
|
||||
"text": "the quick brown fox jamps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"letter": "A",
|
||||
"rank1": 1,
|
||||
"text": "the quick brown fox jumpes over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"letter": "B",
|
||||
"rank1": 0,
|
||||
"text": "the quick brown foxjumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"letter": "B",
|
||||
"rank1": 1,
|
||||
"text": "the quick brown fox jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"letter": "B",
|
||||
"rank1": 2,
|
||||
"text": "the quick brown fox jumps over the lazy",
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"letter": "C",
|
||||
"rank1": 0,
|
||||
"text": "the quickbrownfox jumps over the lazy",
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"letter": "C",
|
||||
"rank1": 1,
|
||||
"text": "the quick brown fox jumpss over the lazy",
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"letter": "C",
|
||||
"rank1": 2,
|
||||
"text": "the quick brown fox jumps over the lazy",
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"letter": "D",
|
||||
"rank1": 0,
|
||||
"text": "the quick brown fox jumps over the lazy",
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"letter": "E",
|
||||
"rank1": 0,
|
||||
"text": "the quick brown fox jumps over the lazy",
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"letter": "E",
|
||||
"rank1": 1,
|
||||
"text": "the quackbrown foxjunps over",
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"letter": "E",
|
||||
"rank1": 2,
|
||||
"text": "the quicko browno fox junps over",
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"letter": "E",
|
||||
"rank1": 3,
|
||||
"text": "the quicko browno fox jumps over",
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"letter": "E",
|
||||
"rank1": 4,
|
||||
"text": "the quick brewn fox jumps over",
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"letter": "E",
|
||||
"rank1": 5,
|
||||
"text": "the quick brown fox jumps over",
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"letter": "F",
|
||||
"rank1": 0,
|
||||
"text": "the quick brownf fox jumps over",
|
||||
},
|
||||
{
|
||||
"id": 16,
|
||||
"letter": "F",
|
||||
"rank1": 1,
|
||||
"text": "the quic brown fox jamps over",
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"letter": "F",
|
||||
"rank1": 2,
|
||||
"text": "thequick browns fox jimps",
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"letter": "G",
|
||||
"rank1": 0,
|
||||
"text": "the qick brown fox jumps",
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"letter": "G",
|
||||
"rank1": 1,
|
||||
"text": "the quick brownfoxjumps",
|
||||
},
|
||||
{
|
||||
"id": 20,
|
||||
"letter": "H",
|
||||
"rank1": 0,
|
||||
"text": "the quick brow fox jumps",
|
||||
},
|
||||
{
|
||||
"id": 21,
|
||||
"letter": "I",
|
||||
"rank1": 0,
|
||||
"text": "the quick brown fox jpmps",
|
||||
},
|
||||
{
|
||||
"id": 22,
|
||||
"letter": "I",
|
||||
"rank1": 1,
|
||||
"text": "the quick brown fox jumps",
|
||||
},
|
||||
{
|
||||
"id": 23,
|
||||
"letter": "I",
|
||||
"rank1": 2,
|
||||
"text": "the quick",
|
||||
},
|
||||
{
|
||||
"id": 24,
|
||||
"rank1": 0,
|
||||
"text": "the quick",
|
||||
},
|
||||
{
|
||||
"id": 25,
|
||||
"rank1": 1,
|
||||
"text": "the quick brown",
|
||||
},
|
||||
{
|
||||
"id": 26,
|
||||
"rank1": 2,
|
||||
"text": "the quick brown fox",
|
||||
},
|
||||
{
|
||||
"id": 26,
|
||||
"rank1": 3,
|
||||
"text": "the quick brown fox jumps over the lazy dog",
|
||||
},
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
fn verify_distinct(
|
||||
index: &Index,
|
||||
txn: &RoTxn<'_>,
|
||||
distinct: Option<&str>,
|
||||
docids: &[u32],
|
||||
) -> Vec<String> {
|
||||
let vs = collect_field_values(
|
||||
index,
|
||||
txn,
|
||||
distinct.or_else(|| index.distinct_field(txn).unwrap()).unwrap(),
|
||||
docids,
|
||||
);
|
||||
|
||||
let mut unique = HashSet::new();
|
||||
for v in vs.iter() {
|
||||
if v == "__does_not_exist__" {
|
||||
continue;
|
||||
}
|
||||
assert!(unique.insert(v.clone()));
|
||||
}
|
||||
|
||||
vs
|
||||
}
|
||||
|
||||
/// Placeholder search (no query string) with `letter` passed as the distinct attribute
/// on the search itself, after the index-level distinct attribute has been reset.
#[test]
fn test_distinct_placeholder_no_ranking_rules() {
    let index = create_index();

    // Declare `letter` as filterable and drop the distinct attribute stored in the index.
    index
        .update_settings(|settings| {
            settings.set_filterable_fields(hashset! { S("letter") });
            settings.reset_distinct_field();
        })
        .unwrap();

    let txn = index.read_txn().unwrap();

    let mut search = Search::new(&txn, &index);
    search.distinct(S("letter"));
    let SearchResult { documents_ids, .. } = search.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");

    // The distinct attribute is named explicitly since the index no longer stores one.
    let letters = verify_distinct(&index, &txn, Some("letter"), &documents_ids);
    insta::assert_debug_snapshot!(letters, @r###"
    [
        "\"A\"",
        "\"B\"",
        "\"C\"",
        "\"D\"",
        "\"E\"",
        "\"F\"",
        "\"G\"",
        "\"H\"",
        "\"I\"",
        "__does_not_exist__",
        "__does_not_exist__",
        "__does_not_exist__",
    ]
    "###);
}
|
||||
|
||||
/// Placeholder search (no query string) on the index as built by `create_index`, without
/// any search-level distinct attribute; the distinct values are checked against the
/// distinct field read from the index.
#[test]
fn test_distinct_at_search_placeholder_no_ranking_rules() {
    let index = create_index();
    let txn = index.read_txn().unwrap();

    let search = Search::new(&txn, &index);
    let SearchResult { documents_ids, .. } = search.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");

    // `None` makes `verify_distinct` fall back to the index's own distinct field.
    let letters = verify_distinct(&index, &txn, None, &documents_ids);
    insta::assert_debug_snapshot!(letters, @r###"
    [
        "\"A\"",
        "\"B\"",
        "\"C\"",
        "\"D\"",
        "\"E\"",
        "\"F\"",
        "\"G\"",
        "\"H\"",
        "\"I\"",
        "__does_not_exist__",
        "__does_not_exist__",
        "__does_not_exist__",
    ]
    "###);
}
|
||||
|
||||
/// Placeholder search with `sort` as the only ranking rule, run three times: sorted by
/// descending `rank1`, by descending `letter`, and by descending `letter` then `rank1`.
/// Each run checks the returned ids, the distinct values and the `rank1` values.
#[test]
fn test_distinct_placeholder_sort() {
    let index = create_index();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![Criterion::Sort]);
        })
        .unwrap();

    let txn = index.read_txn().unwrap();

    // 1. Descending `rank1`.
    let by_rank = vec![AscDesc::Desc(Member::Field(S("rank1")))];
    let mut search = Search::new(&txn, &index);
    search.sort_criteria(by_rank);

    let SearchResult { documents_ids, .. } = search.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
    let letters = verify_distinct(&index, &txn, None, &documents_ids);
    insta::assert_debug_snapshot!(letters, @r###"
    [
        "\"E\"",
        "__does_not_exist__",
        "\"B\"",
        "\"C\"",
        "\"F\"",
        "\"I\"",
        "\"A\"",
        "\"G\"",
        "__does_not_exist__",
        "\"D\"",
        "\"H\"",
        "__does_not_exist__",
    ]
    "###);
    let ranks = collect_field_values(&index, &txn, "rank1", &documents_ids);
    insta::assert_debug_snapshot!(ranks, @r###"
    [
        "5",
        "3",
        "2",
        "2",
        "2",
        "2",
        "1",
        "1",
        "1",
        "0",
        "0",
        "0",
    ]
    "###);

    // 2. Descending `letter`.
    let by_letter = vec![AscDesc::Desc(Member::Field(S("letter")))];
    let mut search = Search::new(&txn, &index);
    search.sort_criteria(by_letter);

    let SearchResult { documents_ids, .. } = search.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]");
    let letters = verify_distinct(&index, &txn, None, &documents_ids);
    insta::assert_debug_snapshot!(letters, @r###"
    [
        "\"I\"",
        "\"H\"",
        "\"G\"",
        "\"F\"",
        "\"E\"",
        "\"D\"",
        "\"C\"",
        "\"B\"",
        "\"A\"",
        "__does_not_exist__",
        "__does_not_exist__",
        "__does_not_exist__",
    ]
    "###);
    let ranks = collect_field_values(&index, &txn, "rank1", &documents_ids);
    insta::assert_debug_snapshot!(ranks, @r###"
    [
        "0",
        "0",
        "0",
        "0",
        "0",
        "0",
        "0",
        "0",
        "0",
        "0",
        "1",
        "3",
    ]
    "###);

    // 3. Descending `letter`, ties broken by descending `rank1`.
    let by_letter_then_rank = vec![
        AscDesc::Desc(Member::Field(S("letter"))),
        AscDesc::Desc(Member::Field(S("rank1"))),
    ];
    let mut search = Search::new(&txn, &index);
    search.sort_criteria(by_letter_then_rank);

    let SearchResult { documents_ids, .. } = search.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]");
    let letters = verify_distinct(&index, &txn, None, &documents_ids);
    insta::assert_debug_snapshot!(letters, @r###"
    [
        "\"I\"",
        "\"H\"",
        "\"G\"",
        "\"F\"",
        "\"E\"",
        "\"D\"",
        "\"C\"",
        "\"B\"",
        "\"A\"",
        "__does_not_exist__",
        "__does_not_exist__",
        "__does_not_exist__",
    ]
    "###);
    let ranks = collect_field_values(&index, &txn, "rank1", &documents_ids);
    insta::assert_debug_snapshot!(ranks, @r###"
    [
        "2",
        "0",
        "1",
        "2",
        "5",
        "0",
        "2",
        "2",
        "1",
        "3",
        "1",
        "0",
    ]
    "###);
}
|
||||
|
||||
/// Query search with `words` as the only ranking rule and the `Last` terms matching
/// strategy; checks the returned ids, their distinct values and their `text` values.
#[test]
fn test_distinct_words() {
    let index = create_index();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![Criterion::Words]);
        })
        .unwrap();

    let txn = index.read_txn().unwrap();

    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("the quick brown fox jumps over the lazy dog");

    let SearchResult { documents_ids, .. } = search.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]");

    let letters = verify_distinct(&index, &txn, None, &documents_ids);
    insta::assert_debug_snapshot!(letters, @r###"
    [
        "\"A\"",
        "\"B\"",
        "__does_not_exist__",
        "\"C\"",
        "\"D\"",
        "\"E\"",
        "\"F\"",
        "\"G\"",
        "\"H\"",
        "\"I\"",
        "__does_not_exist__",
        "__does_not_exist__",
    ]
    "###);

    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the quick brown fox jamps over the lazy dog\"",
        "\"the quick brown foxjumps over the lazy dog\"",
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"the quickbrownfox jumps over the lazy\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brownf fox jumps over\"",
        "\"the qick brown fox jumps\"",
        "\"the quick brow fox jumps\"",
        "\"the quick brown fox jpmps\"",
        "\"the quick brown\"",
        "\"the quick\"",
    ]
    "###);
}
|
||||
|
||||
/// Query search with the ranking rules `sort`, `words`, `rank1:desc`, sorted at search
/// time by descending `letter`; checks ids, distinct values, `rank1` and `text` values.
#[test]
fn test_distinct_sort_words() {
    let index = create_index();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![Criterion::Sort, Criterion::Words, Criterion::Desc(S("rank1"))]);
        })
        .unwrap();

    let txn = index.read_txn().unwrap();

    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("the quick brown fox jumps over the lazy dog");
    search.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]);

    let SearchResult { documents_ids, .. } = search.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]");

    let letters = verify_distinct(&index, &txn, None, &documents_ids);
    insta::assert_debug_snapshot!(letters, @r###"
    [
        "\"I\"",
        "\"H\"",
        "\"G\"",
        "\"F\"",
        "\"E\"",
        "\"D\"",
        "\"C\"",
        "\"B\"",
        "\"A\"",
        "__does_not_exist__",
        "__does_not_exist__",
        "__does_not_exist__",
    ]
    "###);

    let ranks = collect_field_values(&index, &txn, "rank1", &documents_ids);
    insta::assert_debug_snapshot!(ranks, @r###"
    [
        "1",
        "0",
        "1",
        "1",
        "0",
        "0",
        "2",
        "1",
        "1",
        "3",
        "1",
        "0",
    ]
    "###);

    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the quick brown fox jumps\"",
        "\"the quick brow fox jumps\"",
        "\"the quick brownfoxjumps\"",
        "\"the quic brown fox jamps over\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"the quick brown fox jumpes over the lazy dog\"",
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"the quick brown\"",
        "\"the quick\"",
    ]
    "###);
}
|
||||
|
||||
/// Placeholder search sorted by descending `rank1` with exhaustive hit counting enabled;
/// snapshots both the returned ids and the candidate set reported by the search.
#[test]
fn test_distinct_all_candidates() {
    let index = create_index();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![Criterion::Sort]);
        })
        .unwrap();

    let txn = index.read_txn().unwrap();

    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank1")))]);
    search.exhaustive_number_hits(true);

    let SearchResult { documents_ids, candidates, .. } = search.execute().unwrap();
    let candidates: Vec<_> = candidates.iter().collect();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
    // This is incorrect, but unfortunately impossible to do better efficiently.
    insta::assert_snapshot!(format!("{candidates:?}"), @"[1, 4, 7, 8, 14, 17, 19, 20, 23, 24, 25, 26]");
}
|
||||
|
||||
/// Query search with the ranking rules `words` then `typo` and the `Last` terms matching
/// strategy; checks the returned ids, their distinct values and their `text` values.
#[test]
fn test_distinct_typo() {
    let index = create_index();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![Criterion::Words, Criterion::Typo]);
        })
        .unwrap();

    let txn = index.read_txn().unwrap();

    let mut search = Search::new(&txn, &index);
    search.query("the quick brown fox jumps over the lazy dog");
    search.terms_matching_strategy(TermsMatchingStrategy::Last);

    let SearchResult { documents_ids, .. } = search.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]");

    let letters = verify_distinct(&index, &txn, None, &documents_ids);
    insta::assert_debug_snapshot!(letters, @r###"
    [
        "\"B\"",
        "__does_not_exist__",
        "\"A\"",
        "\"C\"",
        "\"D\"",
        "\"E\"",
        "\"F\"",
        "\"I\"",
        "\"G\"",
        "\"H\"",
        "__does_not_exist__",
        "__does_not_exist__",
    ]
    "###);

    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"the quick brown fox jamps over the lazy dog\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brownf fox jumps over\"",
        "\"the quick brown fox jumps\"",
        "\"the qick brown fox jumps\"",
        "\"the quick brow fox jumps\"",
        "\"the quick brown\"",
        "\"the quick\"",
    ]
    "###);
}
|
||||
920
crates/milli/src/search/new/tests/exactness.rs
Normal file
920
crates/milli/src/search/new/tests/exactness.rs
Normal file
@@ -0,0 +1,920 @@
|
||||
/*!
|
||||
This module tests the following properties about the exactness ranking rule:
|
||||
|
||||
- it sorts documents as follows:
|
||||
1. documents which have an attribute which is equal to the whole query
|
||||
2. documents which have an attribute which starts with the whole query
|
||||
3. documents which contain the most exact words from the query
|
||||
|
||||
- the `exactness` ranking rule must be preceded by the `words` ranking rule
|
||||
|
||||
- if `words` has already removed terms from the query, then exactness will sort documents as follows:
|
||||
1. those that have an attribute which is equal to the whole remaining query, if this query does not have any "gap"
|
||||
2. those that have an attribute which starts with the whole remaining query, if this query does not have any "gap"
|
||||
3. those that contain the most exact words from the remaining query
|
||||
|
||||
- if it is followed by other graph-based ranking rules (`typo`, `proximity`, `attribute`),
|
||||
then these rules will only work with
|
||||
1. the exact terms selected by `exactness`
|
||||
2. the full query term otherwise
|
||||
*/
|
||||
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::search::new::tests::collect_field_values;
|
||||
use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy};
|
||||
|
||||
fn create_index_simple_ordered() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Exactness]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "the",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "the quick",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "the quick brown",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"text": "the quick brown fox",
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"text": "the quick brown fox jumps",
|
||||
},
|
||||
|
||||
{
|
||||
"id": 6,
|
||||
"text": "the quick brown fox jumps over",
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"text": "the quick brown fox jumps over the",
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"text": "the quick brown fox jumps over the lazy",
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"text": "the quick brown fox jumps over the lazy dog",
|
||||
},
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
fn create_index_simple_reversed() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Exactness]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "dog",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"text": "over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"text": "jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"text": "fox jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"text": "brown fox jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"text": "quick brown fox jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"text": "the quick brown fox jumps over the lazy dog",
|
||||
}
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
fn create_index_simple_random() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Exactness]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "over",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "jump dog",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "brown the lazy",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"text": "jump dog quick the",
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"text": "fox the lazy dog brown",
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"text": "jump fox quick lazy the dog",
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"text": "the dog brown over jumps quick lazy",
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"text": "the jumps dog quick over brown lazy fox",
|
||||
}
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
fn create_index_attribute_starts_with() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Exactness]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "what a lovely view from this balcony, I love it",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "this balcony is overlooking the sea",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "this balcony",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "over looking the sea is a beautiful balcony",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"text": "a beautiful balcony is overlooking the sea",
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"text": "overlooking the sea is a beautiful balcony, I love it",
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"text": "overlooking the sea is a beautiful balcony",
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"text": "overlooking",
|
||||
},
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
fn create_index_simple_ordered_with_typos() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Exactness]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "the",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "the quack",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "the quack briwn",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"text": "the quack briwn fox",
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"text": "the quack briwn fox jlmps",
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"text": "the quack briwn fox jlmps over",
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"text": "the quack briwn fox jlmps over the",
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"text": "the quack briwn fox jlmps over the lazy",
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"text": "the quack briwn fox jlmps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"text": "",
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"text": "the",
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"text": "the quick",
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"text": "the quick brown",
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"text": "the quick brown fox",
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"text": "the quick brown fox jumps",
|
||||
},
|
||||
|
||||
{
|
||||
"id": 16,
|
||||
"text": "the quick brown fox jumps over",
|
||||
},
|
||||
{
|
||||
"id": 17,
|
||||
"text": "the quick brown fox jumps over the",
|
||||
},
|
||||
{
|
||||
"id": 18,
|
||||
"text": "the quick brown fox jumps over the lazy",
|
||||
},
|
||||
{
|
||||
"id": 19,
|
||||
"text": "the quick brown fox jumps over the lazy dog",
|
||||
},
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
fn create_index_with_varying_proximities() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "lazy jumps dog brown quick the over fox the",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "the quick brown fox jumps over the very lazy dog"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "the quick brown fox jumps over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "dog brown quick the over fox the lazy",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"text": "the quick brown fox over the very lazy dog"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"text": "the quick brown fox over the lazy dog",
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"text": "brown quick the over fox",
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"text": "the very quick brown fox over"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"text": "the quick brown fox over",
|
||||
},
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
fn create_index_with_typo_and_prefix() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Exactness]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "expraordinarily quick brown fox",
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "extraordinarily quick brown fox",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "extra quick brown fox",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "expraordinarily quack brown fox",
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"text": "expraordinapily quick brown fox",
|
||||
}
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
fn create_index_all_equal_except_proximity_between_ignored_terms() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "lazy jumps dog brown quick the over fox the"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "lazy jumps dog brown quick the over fox the. quack briwn jlmps",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "lazy jumps dog brown quick the over fox the. quack briwn jlmps overt",
|
||||
},
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
/// Searches the full sentence in the "ordered" index with the `Last` terms matching
/// strategy and detailed scoring; snapshots the (id, score details) pairs and the `text`
/// of the hits.
#[test]
fn test_exactness_simple_ordered() {
    use crate::score_details::ScoringStrategy;

    let index = create_index_simple_ordered();
    let txn = index.read_txn().unwrap();

    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("the quick brown fox jumps over the lazy dog");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    // Un-named snapshot: keep the expression text as is.
    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));

    let hit_texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(hit_texts, @r###"
    [
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brown fox jumps over the\"",
        "\"the quick brown fox jumps over\"",
        "\"the quick brown fox jumps\"",
        "\"the quick brown fox\"",
        "\"the quick brown\"",
        "\"the quick\"",
        "\"the\"",
    ]
    "###);
}
|
||||
|
||||
/// Searches the full sentence in the "reversed" index with the `Last` terms matching
/// strategy and detailed scoring; snapshots the (id, score details) pairs and the `text`
/// of the hits. The search is executed twice with the same configuration.
#[test]
fn test_exactness_simple_reversed() {
    use crate::score_details::ScoringStrategy;

    let index = create_index_simple_reversed();
    let txn = index.read_txn().unwrap();

    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("the quick brown fox jumps over the lazy dog");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));

    let hit_texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(hit_texts, @r###"
    [
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"quick brown fox jumps over the lazy dog\"",
        "\"the lazy dog\"",
        "\"over the lazy dog\"",
        "\"jumps over the lazy dog\"",
        "\"fox jumps over the lazy dog\"",
        "\"brown fox jumps over the lazy dog\"",
    ]
    "###);

    // NOTE(review): this second search is configured exactly like the first one and expects
    // the same texts; it looks like a leftover. Confirm before removing it — the un-named
    // snapshot assertion below presumably has its own stored snapshot.
    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("the quick brown fox jumps over the lazy dog");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));

    let hit_texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(hit_texts, @r###"
    [
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"quick brown fox jumps over the lazy dog\"",
        "\"the lazy dog\"",
        "\"over the lazy dog\"",
        "\"jumps over the lazy dog\"",
        "\"fox jumps over the lazy dog\"",
        "\"brown fox jumps over the lazy dog\"",
    ]
    "###);
}
|
||||
|
||||
/// Searches the full sentence in the "random" index with the `Last` terms matching
/// strategy and detailed scoring; snapshots the (id, score details) pairs and the `text`
/// of the hits.
#[test]
fn test_exactness_simple_random() {
    use crate::score_details::ScoringStrategy;

    let index = create_index_simple_random();
    let txn = index.read_txn().unwrap();

    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("the quick brown fox jumps over the lazy dog");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    // Un-named snapshot: keep the expression text as is.
    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));

    let hit_texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(hit_texts, @r###"
    [
        "\"the jumps dog quick over brown lazy fox\"",
        "\"the dog brown over jumps quick lazy\"",
        "\"jump dog quick the\"",
        "\"jump fox quick lazy the dog\"",
        "\"brown the lazy\"",
        "\"fox the lazy dog brown\"",
    ]
    "###);
}
|
||||
|
||||
/// Searches "this balcony" in the "attribute starts with" index with the `Last` terms
/// matching strategy and detailed scoring; snapshots the (id, score details) pairs and
/// the `text` of the hits.
#[test]
fn test_exactness_attribute_starts_with_simple() {
    use crate::score_details::ScoringStrategy;

    let index = create_index_attribute_starts_with();
    let txn = index.read_txn().unwrap();

    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("this balcony");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    // Un-named snapshot: keep the expression text as is.
    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));

    let hit_texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(hit_texts, @r###"
    [
        "\"this balcony\"",
        "\"this balcony is overlooking the sea\"",
        "\"what a lovely view from this balcony, I love it\"",
    ]
    "###);
}
|
||||
|
||||
/// Searches the "attribute starts with" index twice with the `Last` terms matching
/// strategy and detailed scoring: first with "overlooking the sea" quoted as a phrase,
/// then with the same words unquoted. Each run snapshots the (id, score details) pairs
/// and the `text` of the hits.
#[test]
fn test_exactness_attribute_starts_with_phrase() {
    use crate::score_details::ScoringStrategy;

    let index = create_index_attribute_starts_with();
    let txn = index.read_txn().unwrap();

    // 1. The leading words are quoted in the query.
    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("\"overlooking the sea\" is a beautiful balcony");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));

    let hit_texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(hit_texts, @r###"
    [
        "\"overlooking the sea is a beautiful balcony\"",
        "\"overlooking the sea is a beautiful balcony, I love it\"",
        "\"a beautiful balcony is overlooking the sea\"",
        "\"this balcony is overlooking the sea\"",
    ]
    "###);

    // 2. Same words, no quotes.
    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("overlooking the sea is a beautiful balcony");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));

    let hit_texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(hit_texts, @r###"
    [
        "\"overlooking the sea is a beautiful balcony\"",
        "\"overlooking the sea is a beautiful balcony, I love it\"",
        "\"a beautiful balcony is overlooking the sea\"",
        "\"over looking the sea is a beautiful balcony\"",
        "\"this balcony is overlooking the sea\"",
        "\"overlooking\"",
    ]
    "###);
}
|
||||
|
||||
/// Searches the "attribute starts with" index for a query whose first word is misspelled
/// ("overlocking"), with the `Last` terms matching strategy and detailed scoring;
/// snapshots the (id, score details) pairs and the `text` of the hits.
#[test]
fn test_exactness_all_candidates_with_typo() {
    use crate::score_details::ScoringStrategy;

    let index = create_index_attribute_starts_with();
    let txn = index.read_txn().unwrap();

    let mut search = Search::new(&txn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("overlocking the sea is a beautiful balcony");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    // Un-named snapshot: keep the expression text as is.
    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));

    let hit_texts = collect_field_values(&index, &txn, "text", &documents_ids);
    // "overlooking" is returned here because the term matching strategy allows it
    // but it has the worst exactness score (0 exact words)
    insta::assert_debug_snapshot!(hit_texts, @r###"
    [
        "\"a beautiful balcony is overlooking the sea\"",
        "\"overlooking the sea is a beautiful balcony, I love it\"",
        "\"overlooking the sea is a beautiful balcony\"",
        "\"this balcony is overlooking the sea\"",
        "\"overlooking\"",
    ]
    "###);
}
|
||||
|
||||
/// Ranks with `Words` first and `Exactness` second, and snapshots the resulting
/// scores and document texts for a nine-word query.
#[test]
fn test_exactness_after_words() {
    use crate::score_details::ScoringStrategy;

    let index = create_index_simple_ordered_with_typos();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![Criterion::Words, Criterion::Exactness]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("the quick brown fox jumps over the lazy dog");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    // Detailed scores go to the file snapshot, keyed by document id.
    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));

    let texts = collect_field_values(&index, &rtxn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"the quack briwn fox jlmps over the lazy dog\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quack briwn fox jlmps over the lazy\"",
        "\"the quick brown fox jumps over the\"",
        "\"the quick brown fox jumps over\"",
        "\"the quack briwn fox jlmps over\"",
        "\"the quack briwn fox jlmps over the\"",
        "\"the quick brown fox jumps\"",
        "\"the quack briwn fox jlmps\"",
        "\"the quick brown fox\"",
        "\"the quack briwn fox\"",
        "\"the quick brown\"",
        "\"the quack briwn\"",
        "\"the quick\"",
        "\"the quack\"",
        "\"the\"",
        "\"the\"",
    ]
    "###);
}
|
||||
|
||||
/// Ranks with `Exactness` first and `Words` second, and snapshots the scores,
/// the returned document ids and their texts for a nine-word query.
#[test]
fn test_words_after_exactness() {
    use crate::score_details::ScoringStrategy;

    let index = create_index_simple_ordered_with_typos();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![Criterion::Exactness, Criterion::Words]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("the quick brown fox jumps over the lazy dog");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    // Detailed scores go to the file snapshot; ids and texts are checked inline.
    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 9, 18, 8, 17, 16, 6, 7, 15, 5, 14, 4, 13, 3, 12, 2, 1, 11]");

    let texts = collect_field_values(&index, &rtxn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"the quack briwn fox jlmps over the lazy dog\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quack briwn fox jlmps over the lazy\"",
        "\"the quick brown fox jumps over the\"",
        "\"the quick brown fox jumps over\"",
        "\"the quack briwn fox jlmps over\"",
        "\"the quack briwn fox jlmps over the\"",
        "\"the quick brown fox jumps\"",
        "\"the quack briwn fox jlmps\"",
        "\"the quick brown fox\"",
        "\"the quack briwn fox\"",
        "\"the quick brown\"",
        "\"the quack briwn\"",
        "\"the quick\"",
        "\"the quack\"",
        "\"the\"",
        "\"the\"",
    ]
    "###);
}
|
||||
|
||||
/// Ranks with `Exactness`, `Words`, then `Proximity` on two different indexes and
/// snapshots scores, ids and texts for each of them.
#[test]
fn test_proximity_after_exactness() {
    use crate::score_details::ScoringStrategy;

    // --- first index: documents with varying proximities
    let index = create_index_with_varying_proximities();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![
                Criterion::Exactness,
                Criterion::Words,
                Criterion::Proximity,
            ]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("the quick brown fox jumps over the lazy dog");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 4, 5, 8, 7, 3, 6]");

    let texts = collect_field_values(&index, &rtxn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"the quick brown fox jumps over the very lazy dog\"",
        "\"lazy jumps dog brown quick the over fox the\"",
        "\"the quick brown fox over the very lazy dog\"",
        "\"the quick brown fox over the lazy dog\"",
        "\"the quick brown fox over\"",
        "\"the very quick brown fox over\"",
        "\"dog brown quick the over fox the lazy\"",
        "\"brown quick the over fox\"",
    ]
    "###);

    // --- second index: same settings and query; the bindings below shadow the first ones
    let index = create_index_all_equal_except_proximity_between_ignored_terms();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![
                Criterion::Exactness,
                Criterion::Words,
                Criterion::Proximity,
            ]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("the quick brown fox jumps over the lazy dog");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2]");

    let texts = collect_field_values(&index, &rtxn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"lazy jumps dog brown quick the over fox the\"",
        "\"lazy jumps dog brown quick the over fox the. quack briwn jlmps\"",
        "\"lazy jumps dog brown quick the over fox the. quack briwn jlmps overt\"",
    ]
    "###);
}
|
||||
|
||||
/// Ranks with `Exactness`, `Words`, then `Typo` and snapshots the order in which
/// the exact, prefix and typo variants of "extra" are returned.
#[test]
fn test_exactness_followed_by_typo_prefer_no_typo_prefix() {
    use crate::score_details::ScoringStrategy;

    let index = create_index_with_typo_and_prefix();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Typo]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("quick brown fox extra");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    // Detailed scores go to the file snapshot; ids and texts are checked inline.
    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 4, 3]");

    let texts = collect_field_values(&index, &rtxn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"extra quick brown fox\"",
        "\"extraordinarily quick brown fox\"",
        "\"expraordinarily quick brown fox\"",
        "\"expraordinapily quick brown fox\"",
        "\"expraordinarily quack brown fox\"",
    ]
    "###);
}
|
||||
|
||||
/// Ranks with `Words`, `Typo`, then `Exactness` and snapshots the scores, ids and
/// texts returned for the query "extraordinarily quick brown fox".
#[test]
fn test_typo_followed_by_exactness() {
    use crate::score_details::ScoringStrategy;

    let index = create_index_with_typo_and_prefix();
    index
        .update_settings(|settings| {
            settings.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Exactness]);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::Last);
    search.query("extraordinarily quick brown fox");
    search.scoring_strategy(ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    // Detailed scores go to the file snapshot; ids and texts are checked inline.
    let document_ids_scores: Vec<_> = documents_ids.iter().zip(document_scores).collect();
    insta::assert_snapshot!(format!("{document_ids_scores:#?}"));
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 4, 3]");

    let texts = collect_field_values(&index, &rtxn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"extraordinarily quick brown fox\"",
        "\"expraordinarily quick brown fox\"",
        "\"expraordinapily quick brown fox\"",
        "\"expraordinarily quack brown fox\"",
    ]
    "###);
}
|
||||
309
crates/milli/src/search/new/tests/geo_sort.rs
Normal file
309
crates/milli/src/search/new/tests/geo_sort.rs
Normal file
@@ -0,0 +1,309 @@
|
||||
/*!
|
||||
This module tests the `geo_sort` ranking rule
|
||||
*/
|
||||
|
||||
use big_s::S;
|
||||
use heed::RoTxn;
|
||||
use maplit::hashset;
|
||||
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::score_details::ScoreDetails;
|
||||
use crate::search::new::tests::collect_field_values;
|
||||
use crate::{AscDesc, Criterion, GeoSortStrategy, Member, Search, SearchResult};
|
||||
|
||||
fn create_index() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_sortable_fields(hashset! { S("_geo") });
|
||||
s.set_criteria(vec![Criterion::Words, Criterion::Sort]);
|
||||
})
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
fn execute_iterative_and_rtree_returns_the_same<'a>(
|
||||
rtxn: &RoTxn<'a>,
|
||||
index: &TempIndex,
|
||||
search: &mut Search<'a>,
|
||||
) -> (Vec<usize>, Vec<Vec<ScoreDetails>>) {
|
||||
search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(2));
|
||||
let SearchResult { documents_ids, document_scores: iterative_scores_bucketed, .. } =
|
||||
search.execute().unwrap();
|
||||
let iterative_ids_bucketed = collect_field_values(index, rtxn, "id", &documents_ids);
|
||||
|
||||
search.geo_sort_strategy(GeoSortStrategy::AlwaysIterative(1000));
|
||||
let SearchResult { documents_ids, document_scores: iterative_scores, .. } =
|
||||
search.execute().unwrap();
|
||||
let iterative_ids = collect_field_values(index, rtxn, "id", &documents_ids);
|
||||
|
||||
assert_eq!(iterative_ids_bucketed, iterative_ids, "iterative bucket");
|
||||
assert_eq!(iterative_scores_bucketed, iterative_scores, "iterative bucket score");
|
||||
|
||||
search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(2));
|
||||
let SearchResult { documents_ids, document_scores: rtree_scores_bucketed, .. } =
|
||||
search.execute().unwrap();
|
||||
let rtree_ids_bucketed = collect_field_values(index, rtxn, "id", &documents_ids);
|
||||
|
||||
search.geo_sort_strategy(GeoSortStrategy::AlwaysRtree(1000));
|
||||
let SearchResult { documents_ids, document_scores: rtree_scores, .. } =
|
||||
search.execute().unwrap();
|
||||
let rtree_ids = collect_field_values(index, rtxn, "id", &documents_ids);
|
||||
|
||||
assert_eq!(rtree_ids_bucketed, rtree_ids, "rtree bucket");
|
||||
assert_eq!(rtree_scores_bucketed, rtree_scores, "rtree bucket score");
|
||||
|
||||
assert_eq!(iterative_ids, rtree_ids, "iterative vs rtree");
|
||||
assert_eq!(iterative_scores, rtree_scores, "iterative vs rtree scores");
|
||||
|
||||
(iterative_ids.into_iter().map(|id| id.parse().unwrap()).collect(), iterative_scores)
|
||||
}
|
||||
|
||||
/// Sorts documents by distance to the point (0, 0) in ascending and descending order.
/// In both expected orders the documents without a `_geo` field (6, 8, 7, 10, 9)
/// come after the geo-located ones.
#[test]
fn test_geo_sort() {
    use crate::score_details::ScoringStrategy;

    let index = create_index();
    index
        .add_documents(documents!([
            { "id": 2, "_geo": { "lat": 2, "lng": -1 } },
            { "id": 3, "_geo": { "lat": -2, "lng": -2 } },
            { "id": 5, "_geo": { "lat": 6, "lng": -5 } },
            { "id": 4, "_geo": { "lat": 3, "lng": 5 } },
            { "id": 0, "_geo": { "lat": 0, "lng": 0 } },
            { "id": 1, "_geo": { "lat": 1, "lng": 1 } },
            { "id": 6 }, { "id": 8 }, { "id": 7 }, { "id": 10 }, { "id": 9 },
        ]))
        .unwrap();

    let txn = index.read_txn().unwrap();
    let mut search = Search::new(&txn, &index);
    search.scoring_strategy(ScoringStrategy::Detailed);

    // --- asc
    search.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 0.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[0, 1, 2, 3, 4, 5, 6, 8, 7, 10, 9]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    // --- desc
    search.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 0.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[5, 4, 3, 2, 1, 0, 6, 8, 7, 10, 9]");
    insta::assert_snapshot!(format!("{scores:#?}"));
}
|
||||
|
||||
/// Sorts documents placed near the latitude and longitude extremes from several
/// target points, in both directions, to check that latitude does not wrap around
/// while longitude does.
#[test]
fn test_geo_sort_around_the_edge_of_the_flat_earth() {
    use crate::score_details::ScoringStrategy;

    let index = create_index();
    index
        .add_documents(documents!([
            { "id": 0, "_geo": { "lat": 0, "lng": 0 } },
            { "id": 1, "_geo": { "lat": 88, "lng": 0 } },
            { "id": 2, "_geo": { "lat": -89, "lng": 0 } },
            { "id": 3, "_geo": { "lat": 0, "lng": 178 } },
            { "id": 4, "_geo": { "lat": 0, "lng": -179 } },
        ]))
        .unwrap();

    let txn = index.read_txn().unwrap();
    let mut search = Search::new(&txn, &index);
    search.scoring_strategy(ScoringStrategy::Detailed);

    // --- ascending
    search.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 0.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[0, 1, 2, 3, 4]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    // the latitude must not wrap around
    search.sort_criteria(vec![AscDesc::Asc(Member::Geo([85., 0.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[1, 0, 3, 4, 2]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    search.sort_criteria(vec![AscDesc::Asc(Member::Geo([-85., 0.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[2, 0, 3, 4, 1]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    // the longitude must wrap around
    search.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 175.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[3, 4, 2, 1, 0]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    search.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., -175.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[4, 3, 2, 1, 0]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    // --- descending
    search.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 0.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[4, 3, 2, 1, 0]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    // the latitude must not wrap around
    search.sort_criteria(vec![AscDesc::Desc(Member::Geo([85., 0.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[2, 4, 3, 0, 1]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    search.sort_criteria(vec![AscDesc::Desc(Member::Geo([-85., 0.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[1, 4, 3, 0, 2]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    // the longitude must wrap around
    search.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., 175.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[0, 1, 2, 4, 3]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    search.sort_criteria(vec![AscDesc::Desc(Member::Geo([0., -175.]))]);
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[0, 1, 2, 3, 4]");
    insta::assert_snapshot!(format!("{scores:#?}"));
}
|
||||
|
||||
/// Combines a word query with an ascending geo sort around (0, 0) and snapshots the
/// returned ids and score details for the queries "jean", "bob" and "intel".
#[test]
fn geo_sort_mixed_with_words() {
    use crate::score_details::ScoringStrategy;

    let index = create_index();
    index
        .add_documents(documents!([
            { "id": 0, "doggo": "jean", "_geo": { "lat": 0, "lng": 0 } },
            { "id": 1, "doggo": "intel", "_geo": { "lat": 88, "lng": 0 } },
            { "id": 2, "doggo": "jean bob", "_geo": { "lat": -89, "lng": 0 } },
            { "id": 3, "doggo": "jean michel", "_geo": { "lat": 0, "lng": 178 } },
            { "id": 4, "doggo": "bob marley", "_geo": { "lat": 0, "lng": -179 } },
        ]))
        .unwrap();

    let txn = index.read_txn().unwrap();
    let mut search = Search::new(&txn, &index);
    search.scoring_strategy(ScoringStrategy::Detailed);
    search.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 0.]))]);

    // --- "jean": scores are stored in the file snapshot
    search.query("jean");
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[0, 2, 3]");
    insta::assert_snapshot!(format!("{scores:#?}"));

    // --- "bob": scores are checked inline
    search.query("bob");
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[2, 4]");
    insta::assert_snapshot!(format!("{scores:#?}"), @r###"
    [
        [
            Words(
                Words {
                    matching_words: 1,
                    max_matching_words: 1,
                },
            ),
            GeoSort(
                GeoSort {
                    target_point: [
                        0.0,
                        0.0,
                    ],
                    ascending: true,
                    value: Some(
                        [
                            -89.0,
                            0.0,
                        ],
                    ),
                },
            ),
        ],
        [
            Words(
                Words {
                    matching_words: 1,
                    max_matching_words: 1,
                },
            ),
            GeoSort(
                GeoSort {
                    target_point: [
                        0.0,
                        0.0,
                    ],
                    ascending: true,
                    value: Some(
                        [
                            0.0,
                            -179.0,
                        ],
                    ),
                },
            ),
        ],
    ]
    "###);

    // --- "intel": a single matching document
    search.query("intel");
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[1]");
    insta::assert_snapshot!(format!("{scores:#?}"), @r###"
    [
        [
            Words(
                Words {
                    matching_words: 1,
                    max_matching_words: 1,
                },
            ),
            GeoSort(
                GeoSort {
                    target_point: [
                        0.0,
                        0.0,
                    ],
                    ascending: true,
                    value: Some(
                        [
                            88.0,
                            0.0,
                        ],
                    ),
                },
            ),
        ],
    ]
    "###);
}
|
||||
|
||||
/// Applies a geo sort to an index in which no document has a `_geo` field and
/// snapshots the ids and scores returned for the query "jean".
#[test]
fn geo_sort_without_any_geo_faceted_documents() {
    use crate::score_details::ScoringStrategy;

    let index = create_index();
    index
        .add_documents(documents!([
            { "id": 0, "doggo": "jean" },
            { "id": 1, "doggo": "intel" },
            { "id": 2, "doggo": "jean bob" },
            { "id": 3, "doggo": "jean michel" },
            { "id": 4, "doggo": "bob marley" },
        ]))
        .unwrap();

    let txn = index.read_txn().unwrap();
    let mut search = Search::new(&txn, &index);
    search.scoring_strategy(ScoringStrategy::Detailed);
    search.sort_criteria(vec![AscDesc::Asc(Member::Geo([0., 0.]))]);

    search.query("jean");
    let (ids, scores) = execute_iterative_and_rtree_returns_the_same(&txn, &index, &mut search);
    insta::assert_snapshot!(format!("{ids:?}"), @"[0, 2, 3]");
    insta::assert_snapshot!(format!("{scores:#?}"));
}
|
||||
75
crates/milli/src/search/new/tests/integration.rs
Normal file
75
crates/milli/src/search/new/tests/integration.rs
Normal file
@@ -0,0 +1,75 @@
|
||||
use std::io::Cursor;
|
||||
|
||||
use big_s::S;
|
||||
use heed::EnvOpenOptions;
|
||||
use maplit::{btreemap, hashset};
|
||||
|
||||
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
|
||||
use crate::{db_snap, Criterion, Index, Object};
|
||||
pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson");
|
||||
|
||||
pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let mut options = EnvOpenOptions::new();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
let index = Index::new(options, &path).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let config = IndexerConfig::default();
|
||||
|
||||
let mut builder = Settings::new(&mut wtxn, &index, &config);
|
||||
|
||||
builder.set_criteria(criteria.to_vec());
|
||||
builder.set_filterable_fields(hashset! {
|
||||
S("tag"),
|
||||
S("asc_desc_rank"),
|
||||
S("_geo"),
|
||||
S("opt1"),
|
||||
S("opt1.opt2"),
|
||||
S("tag_in")
|
||||
});
|
||||
builder.set_sortable_fields(hashset! {
|
||||
S("tag"),
|
||||
S("asc_desc_rank"),
|
||||
});
|
||||
builder.set_synonyms(btreemap! {
|
||||
S("hello") => vec![S("good morning")],
|
||||
S("world") => vec![S("earth")],
|
||||
S("america") => vec![S("the united states")],
|
||||
});
|
||||
builder.set_searchable_fields(vec![S("title"), S("description")]);
|
||||
builder.execute(|_| (), || false).unwrap();
|
||||
|
||||
// index documents
|
||||
let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
|
||||
let indexing_config = IndexDocumentsConfig::default();
|
||||
|
||||
let builder =
|
||||
IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
|
||||
let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
|
||||
let reader = Cursor::new(CONTENT.as_bytes());
|
||||
|
||||
for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
|
||||
let object = result.unwrap();
|
||||
documents_builder.append_json_object(&object).unwrap();
|
||||
}
|
||||
|
||||
let vector = documents_builder.into_inner().unwrap();
|
||||
|
||||
// index documents
|
||||
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
|
||||
let (builder, user_error) = builder.add_documents(content).unwrap();
|
||||
user_error.unwrap();
|
||||
builder.execute().unwrap();
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
index
|
||||
}
|
||||
|
||||
/// Indexes the integration dataset with the `Attribute` criterion and checks the
/// snapshot hash of the `word_position_docids` database.
#[test]
fn snapshot_integration_dataset() {
    let criteria = [Criterion::Attribute];
    let index = setup_search_index_with_criteria(&criteria);

    db_snap!(index, word_position_docids, @"3c9347a767bceef3beb31465f1e5f3ae");
}
|
||||
23
crates/milli/src/search/new/tests/language.rs
Normal file
23
crates/milli/src/search/new/tests/language.rs
Normal file
@@ -0,0 +1,23 @@
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::{Search, SearchResult};
|
||||
|
||||
/// Indexes one English, one Japanese and one Hebrew document and checks that the
/// query "東京" returns only the Japanese document (id 1).
#[test]
fn test_kanji_language_detection() {
    let index = TempIndex::new();

    index
        .add_documents(documents!([
            { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
            { "id": 1, "title": "東京のお寿司。" },
            { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
        ]))
        .unwrap();

    // The search only reads from the index, so a read transaction is enough;
    // there is no reason to hold a write transaction here.
    let txn = index.read_txn().unwrap();
    let mut search = Search::new(&txn, &index);

    search.query("東京");
    let SearchResult { documents_ids, .. } = search.execute().unwrap();

    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]");
}
|
||||
38
crates/milli/src/search/new/tests/mod.rs
Normal file
38
crates/milli/src/search/new/tests/mod.rs
Normal file
@@ -0,0 +1,38 @@
|
||||
pub mod attribute_fid;
|
||||
pub mod attribute_position;
|
||||
pub mod cutoff;
|
||||
pub mod distinct;
|
||||
pub mod exactness;
|
||||
pub mod geo_sort;
|
||||
pub mod integration;
|
||||
#[cfg(feature = "all-tokenizations")]
|
||||
#[cfg(not(feature = "chinese-pinyin"))]
|
||||
pub mod language;
|
||||
pub mod ngram_split_words;
|
||||
pub mod proximity;
|
||||
pub mod proximity_typo;
|
||||
pub mod sort;
|
||||
pub mod stop_words;
|
||||
pub mod typo;
|
||||
pub mod typo_proximity;
|
||||
pub mod words_tms;
|
||||
|
||||
fn collect_field_values(
|
||||
index: &crate::Index,
|
||||
txn: &heed::RoTxn<'_>,
|
||||
fid: &str,
|
||||
docids: &[u32],
|
||||
) -> Vec<String> {
|
||||
let mut values = vec![];
|
||||
let fid = index.fields_ids_map(txn).unwrap().id(fid).unwrap();
|
||||
for doc in index.documents(txn, docids.iter().copied()).unwrap() {
|
||||
if let Some(v) = doc.1.get(fid) {
|
||||
let v: serde_json::Value = serde_json::from_slice(v).unwrap();
|
||||
let v = v.to_string();
|
||||
values.push(v);
|
||||
} else {
|
||||
values.push("__does_not_exist__".to_owned());
|
||||
}
|
||||
}
|
||||
values
|
||||
}
|
||||
428
crates/milli/src/search/new/tests/ngram_split_words.rs
Normal file
428
crates/milli/src/search/new/tests/ngram_split_words.rs
Normal file
@@ -0,0 +1,428 @@
|
||||
/*!
|
||||
This module tests the following properties:
|
||||
|
||||
1. Two consecutive words from a query can be combined into a "2gram"
|
||||
2. Three consecutive words from a query can be combined into a "3gram"
|
||||
3. A word from the query can be split into two consecutive words (split words), no matter how short it is
|
||||
4. A 2gram can be split into two words
|
||||
5. A 3gram can be split into two words
|
||||
6. 2grams can contain up to 1 typo
|
||||
7. 3grams cannot have typos
|
||||
8. 2grams and 3grams can be prefix tolerant
|
||||
9. Disabling typo tolerance also disables the split words feature
|
||||
10. Disabling typo tolerance does not disable prefix tolerance
|
||||
11. Disabling typo tolerance does not disable ngram tolerance
|
||||
12. Prefix tolerance is disabled for the last word if a space follows it
|
||||
13. Ngrams cannot be formed by combining a phrase and a word or two phrases
|
||||
14. Split words are not disabled by the `disableOnAttributes` or `disableOnWords` typo settings
|
||||
*/
|
||||
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::search::new::tests::collect_field_values;
|
||||
use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy};
|
||||
|
||||
fn create_index() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Words]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "the sun flowers are pretty"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "the sun flower is tall"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "the sunflowers are pretty"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "the sunflower is tall"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"text": "the sunflawer is tall"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"text": "sunflowering is not a verb"
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"text": "xy z"
|
||||
}
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
/// With `set_autorize_typos(false)`, the query "sun flower" must still match documents
/// containing the combined word "sunflower" and words starting with it.
#[test]
fn test_2gram_simple() {
    use crate::score_details::ScoringStrategy;

    let index = create_index();
    index
        .update_settings(|settings| {
            settings.set_autorize_typos(false);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.scoring_strategy(ScoringStrategy::Detailed);
    search.query("sun flower");

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    // will also match documents with "sunflower" + prefix tolerance
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]");
    // scores are empty because the only rule is Words with All matching strategy
    insta::assert_snapshot!(format!("{document_scores:?}"), @"[[], [], [], [], []]");

    let texts = collect_field_values(&index, &rtxn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the sun flowers are pretty\"",
        "\"the sun flower is tall\"",
        "\"the sunflowers are pretty\"",
        "\"the sunflower is tall\"",
        "\"sunflowering is not a verb\"",
    ]
    "###);
}
|
||||
/// With `set_autorize_typos(false)`, the query "sun flower s are" must match the
/// documents containing "sun flowers are" and "sunflowers are".
#[test]
fn test_3gram_simple() {
    let index = create_index();
    index
        .update_settings(|settings| {
            settings.set_autorize_typos(false);
        })
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.query("sun flower s are");

    let SearchResult { documents_ids, .. } = search.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2]");

    let texts = collect_field_values(&index, &rtxn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the sun flowers are pretty\"",
        "\"the sunflowers are pretty\"",
    ]
    "###);
}
|
||||
|
||||
/// With the index's default typo settings, the query "sun flawer" must match every
/// "sun flower" / "sunflower" variant, including the misspelled "sunflawer".
#[test]
fn test_2gram_typo() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.query("sun flawer");

    let SearchResult { documents_ids, .. } = search.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]");

    let texts = collect_field_values(&index, &rtxn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the sun flowers are pretty\"",
        "\"the sun flower is tall\"",
        "\"the sunflowers are pretty\"",
        "\"the sunflower is tall\"",
        "\"the sunflawer is tall\"",
        "\"sunflowering is not a verb\"",
    ]
    "###);
}
|
||||
|
||||
#[test]
|
||||
fn test_no_disable_ngrams() {
|
||||
let index = create_index();
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_autorize_typos(false);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("sun flower ");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
// documents containing `sun flower` or the 2gram `sunflower`
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sun flower is tall\"",
|
||||
"\"the sunflower is tall\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_2gram_prefix() {
|
||||
let index = create_index();
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_autorize_typos(false);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("sun flow");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
// documents containing words beginning with `sunflow`
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sun flowers are pretty\"",
|
||||
"\"the sun flower is tall\"",
|
||||
"\"the sunflowers are pretty\"",
|
||||
"\"the sunflower is tall\"",
|
||||
"\"sunflowering is not a verb\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_3gram_prefix() {
|
||||
let index = create_index();
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_autorize_typos(false);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("su nf l");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
|
||||
// documents containing a word beginning with sunfl
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 5]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sunflowers are pretty\"",
|
||||
"\"the sunflower is tall\"",
|
||||
"\"the sunflawer is tall\"",
|
||||
"\"sunflowering is not a verb\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_words() {
|
||||
let index = create_index();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("sunflower ");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
|
||||
// all the documents with either `sunflower` or `sun flower`, allowing for typos
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sun flower is tall\"",
|
||||
"\"the sunflowers are pretty\"",
|
||||
"\"the sunflower is tall\"",
|
||||
"\"the sunflawer is tall\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_disable_split_words() {
|
||||
let index = create_index();
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_autorize_typos(false);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("sunflower ");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
// split words stay enabled even with typos turned off: the `sun flower` document (id 1) still matches
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sun flower is tall\"",
|
||||
"\"the sunflower is tall\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_2gram_split_words() {
|
||||
let index = create_index();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("sunf lower");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
|
||||
// all the documents with "sunflower", "sun flower", (sunflower + 1 typo), or (sunflower as prefix)
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4, 5]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sun flower is tall\"",
|
||||
"\"the sunflowers are pretty\"",
|
||||
"\"the sunflower is tall\"",
|
||||
"\"the sunflawer is tall\"",
|
||||
"\"sunflowering is not a verb\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_3gram_no_split_words() {
|
||||
let index = create_index();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("sunf lo wer");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
|
||||
// NOTE(review): despite the test name, the `sun flower` document (id 1) is in the expected results
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 5]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sun flower is tall\"",
|
||||
"\"the sunflowers are pretty\"",
|
||||
"\"the sunflower is tall\"",
|
||||
"\"sunflowering is not a verb\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_3gram_no_typos() {
|
||||
let index = create_index();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("sunf la wer");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sunflawer is tall\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_no_ngram_phrases() {
|
||||
let index = create_index();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("\"sun\" flower");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sun flowers are pretty\"",
|
||||
"\"the sun flower is tall\"",
|
||||
]
|
||||
"###);
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("\"sun\" \"flower\"");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sun flower is tall\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_short_split_words() {
|
||||
let index = create_index();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("xyz");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"xy z\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_words_never_disabled() {
|
||||
let index = create_index();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_exact_words(["sunflower"].iter().map(ToString::to_string).collect());
|
||||
s.set_exact_attributes(["text"].iter().map(ToString::to_string).collect());
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("the sunflower is tall");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the sun flower is tall\"",
|
||||
"\"the sunflower is tall\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
490
crates/milli/src/search/new/tests/proximity.rs
Normal file
490
crates/milli/src/search/new/tests/proximity.rs
Normal file
@@ -0,0 +1,490 @@
|
||||
/*!
|
||||
This module tests the Proximity ranking rule:
|
||||
|
||||
1. A proximity of >7 always has the same cost.
|
||||
|
||||
2. Phrase terms can be in proximity to other terms via their start and end words,
|
||||
but we need to make sure that the phrase exists in the document that meets this
|
||||
proximity condition. This is especially relevant with split words and synonyms.
|
||||
|
||||
3. An ngram has the same proximity cost as its component words being consecutive.
|
||||
e.g. `sunflower` equivalent to `sun flower`.
|
||||
|
||||
4. The prefix databases can be used to find the proximity between two words, but
|
||||
they store fewer proximities than the regular word proximity DB.
|
||||
|
||||
*/
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::search::new::tests::collect_field_values;
|
||||
use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy};
|
||||
|
||||
fn create_simple_index() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{
|
||||
"id": 0,
|
||||
"text": "the very quick dark brown and smart fox did jump over the terribly lazy and small dog"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "the. quick brown fox jumps over the lazy. dog"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "the quick brown fox jumps over the lazy. dog"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"text": "dog the quick brown fox jumps over the lazy"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"text": "the quickbrown fox jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"text": "brown quick fox jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"text": "the really quick brown fox jumps over the very lazy dog"
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"text": "the really quick brown fox jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"text": "the quick brown fox jumps over the lazy"
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"text": "the quack brown fox jumps over the lazy"
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"text": "the quack brown fox jumps over the lazy dog"
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"text": "the quick brown fox jumps over the lazy dog"
|
||||
}
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
fn create_edge_cases_index() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index.add_documents(documents!([
|
||||
{
|
||||
// This document will insert "s" in the prefix database
|
||||
"id": 0,
|
||||
"text": "
|
||||
saa sab sac sae saf sag sah sai saj sak sal sam san sao sap saq sar sasa sat sau sav saw sax say saz
|
||||
sba sbb sbc sbe sbf sbg sbh sbi sbj sbk sbl sbm sbn sbo sbp sbq sbr sbsb sbt sbu sbv sbw sbx sby sbz
|
||||
sca scb scc sce scf scg sch sci scj sck scl scm scn sco scp scq scr scsc sct scu scv scw scx scy scz
|
||||
sda sdb sdc sde sdf sdg sdh sdi sdj sdk sdl sdm sdn sdo sdp sdq sdr sdsd sdt sdu sdv sdw sdx sdy sdz
|
||||
sea seb sec see sef seg seh sei sej sek sel sem sen seo sep seq ser sese set seu sev sew sex sey sez
|
||||
sfa sfb sfc sfe sff sfg sfh sfi sfj sfk sfl sfm sfn sfo sfp sfq sfr sfsf sft sfu sfv sfw sfx sfy sfz
|
||||
sga sgb sgc sge sgf sgg sgh sgi sgj sgk sgl sgm sgn sgo sgp sgq sgr sgsg sgt sgu sgv sgw sgx sgy sgz
|
||||
ska skb skc ske skf skg skh ski skj skk skl skm skn sko skp skq skr sksk skt sku skv skw skx sky skz
|
||||
sla slb slc sle slf slg slh sli slj slk sll slm sln slo slp slq slr slsl slt slu slv slw slx sly slz
|
||||
sma smb smc sme smf smg smh smi smj smk sml smm smn smo smp smq smr smsm smt smu smv smw smx smy smz
|
||||
sna snb snc sne snf sng snh sni snj snk snl snm snn sno snp snq snr snsn snt snu snv snw snx sny snz
|
||||
soa sob soc soe sof sog soh soi soj sok sol som son soo sop soq sor soso sot sou sov sow sox soy soz
|
||||
spa spb spc spe spf spg sph spi spj spk spl spm spn spo spp spq spr spsp spt spu spv spw spx spy spz
|
||||
sqa sqb sqc sqe sqf sqg sqh sqi sqj sqk sql sqm sqn sqo sqp sqq sqr sqsq sqt squ sqv sqw sqx sqy sqz
|
||||
sra srb src sre srf srg srh sri srj srk srl srm srn sro srp srq srr srsr srt sru srv srw srx sry srz
|
||||
ssa ssb ssc sse ssf ssg ssh ssi ssj ssk ssl ssm ssn sso ssp ssq ssr ssss sst ssu ssv ssw ssx ssy ssz
|
||||
sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz
|
||||
"
|
||||
},
|
||||
// The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`.
|
||||
// If the search query is "sunflower", the split word "Sun Flower" will match some documents.
|
||||
// If the query is `sunflower wilting`, then we should make sure that
|
||||
// the proximity condition `flower wilting: prox N` also comes with the condition
|
||||
// `sun wilting: prox N+1`, but this is not the exact condition we use for now.
|
||||
// We only check that the phrase `sun flower` exists and `flower wilting: prox N`, which
|
||||
// is better than nothing but not the best.
|
||||
{
|
||||
"id": 1,
|
||||
"text": "Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat."
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"text": "Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat."
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
// This document matches the query `sunflower wilting`, but the proximity condition
|
||||
// between `sunflower` and `wilting` cannot be through the split-word `Sun Flower`
|
||||
// which would reduce to only `flower` and `wilting` being in proximity.
|
||||
"text": "A flower wilting under the sun, unlike a sunflower"
|
||||
},
|
||||
{
|
||||
// This should be the best document for `sunflower wilting`
|
||||
"id": 4,
|
||||
"text": "sun flower wilting under the heat"
|
||||
},
|
||||
{
|
||||
// This is also the best document for `sunflower wilting`
|
||||
"id": 5,
|
||||
"text": "sunflower wilting under the heat"
|
||||
},
|
||||
{
|
||||
// Prox MAX between `best` and `s` prefix
|
||||
"id": 6,
|
||||
"text": "this is the best meal I have ever had in such a beautiful summer day"
|
||||
},
|
||||
{
|
||||
// Prox 5 between `best` and `s` prefix
|
||||
"id": 7,
|
||||
"text": "this is the best cooked meal of the summer"
|
||||
},
|
||||
{
|
||||
// Prox 4 between `best` and `s` prefix
|
||||
"id": 8,
|
||||
"text": "this is the best meal of the summer"
|
||||
},
|
||||
{
|
||||
// Prox 3 between `best` and `s` prefix
|
||||
"id": 9,
|
||||
"text": "this is the best meal of summer"
|
||||
},
|
||||
{
|
||||
// Prox 1 between `best` and `s` prefix
|
||||
"id": 10,
|
||||
"text": "this is the best summer meal"
|
||||
},
|
||||
{
|
||||
// Reverse Prox 3 between `best` and `s` prefix
|
||||
"id": 11,
|
||||
"text": "summer x y best"
|
||||
},
|
||||
{
|
||||
// Reverse Prox 2 between `best` and `s` prefix
|
||||
"id": 12,
|
||||
"text": "summer x best"
|
||||
},
|
||||
{
|
||||
// Reverse Prox 1 between `best` and `s` prefix
|
||||
"id": 13,
|
||||
"text": "summer best"
|
||||
},
|
||||
{
|
||||
// This document will insert "win" in the prefix database
|
||||
"id": 14,
|
||||
"text": "
|
||||
winaa winab winac winae winaf winag winah winai winaj winak winal winam winan winao winap winaq winar winasa winat winau winav winaw winax winay winaz
|
||||
winba winbb winbc winbe winbf winbg winbh winbi winbj winbk winbl winbm winbn winbo winbp winbq winbr winbsb winbt winbu winbv winbw winbx winby winbz
|
||||
winca wincb wincc wince wincf wincg winch winci wincj winck wincl wincm wincn winco wincp wincq wincr wincsc winct wincu wincv wincw wincx wincy wincz
|
||||
winda windb windc winde windf windg windh windi windj windk windl windm windn windo windp windq windr windsd windt windu windv windw windx windy windz
|
||||
winea wineb winec winee winef wineg wineh winei winej winek winel winem winen wineo winep wineq winer winese winet wineu winev winew winex winey winez
|
||||
winfa winfb winfc winfe winff winfg winfh winfi winfj winfk winfl winfm winfn winfo winfp winfq winfr winfsf winft winfu winfv winfw winfx winfy winfz
|
||||
winga wingb wingc winge wingf wingg wingh wingi wingj wingk wingl wingm wingn wingo wingp wingq wingr wingsg wingt wingu wingv wingw wingx wingy wingz
|
||||
winka winkb winkc winke winkf winkg winkh winki winkj winkk winkl winkm winkn winko winkp winkq winkr winksk winkt winku winkv winkw winkx winky winkz
|
||||
winla winlb winlc winle winlf winlg winlh winli winlj winlk winll winlm winln winlo winlp winlq winlr winlsl winlt winlu winlv winlw winlx winly winlz
|
||||
winma winmb winmc winme winmf winmg winmh winmi winmj winmk winml winmm winmn winmo winmp winmq winmr winmsm winmt winmu winmv winmw winmx winmy winmz
|
||||
winna winnb winnc winne winnf winng winnh winni winnj winnk winnl winnm winnn winno winnp winnq winnr winnsn winnt winnu winnv winnw winnx winny winnz
|
||||
winoa winob winoc winoe winof winog winoh winoi winoj winok winol winom winon winoo winop winoq winor winoso winot winou winov winow winox winoy winoz
|
||||
winpa winpb winpc winpe winpf winpg winph winpi winpj winpk winpl winpm winpn winpo winpp winpq winpr winpsp winpt winpu winpv winpw winpx winpy winpz
|
||||
winqa winqb winqc winqe winqf winqg winqh winqi winqj winqk winql winqm winqn winqo winqp winqq winqr winqsq winqt winqu winqv winqw winqx winqy winqz
|
||||
winra winrb winrc winre winrf winrg winrh winri winrj winrk winrl winrm winrn winro winrp winrq winrr winrsr winrt winru winrv winrw winrx winry winrz
|
||||
winsa winsb winsc winse winsf winsg winsh winsi winsj winsk winsl winsm winsn winso winsp winsq winsr winsss winst winsu winsv winsw winsx winsy winsz
|
||||
winta wintb wintc winte wintf wintg winth winti wintj wintk wintl wintm wintn winto wintp wintq wintr wintst wintt wintu wintv wintw wintx winty wintz
|
||||
"
|
||||
},
|
||||
{
|
||||
// Prox MAX between `best` and `win` prefix
|
||||
"id": 15,
|
||||
"text": "this is the best meal I have ever had in such a beautiful winter day"
|
||||
},
|
||||
{
|
||||
// Prox 5 between `best` and `win` prefix
|
||||
"id": 16,
|
||||
"text": "this is the best cooked meal of the winter"
|
||||
},
|
||||
{
|
||||
// Prox 4 between `best` and `win` prefix
|
||||
"id": 17,
|
||||
"text": "this is the best meal of the winter"
|
||||
},
|
||||
{
|
||||
// Prox 3 between `best` and `win` prefix
|
||||
"id": 18,
|
||||
"text": "this is the best meal of winter"
|
||||
},
|
||||
{
|
||||
// Prox 1 between `best` and `win` prefix
|
||||
"id": 19,
|
||||
"text": "this is the best winter meal"
|
||||
},
|
||||
{
|
||||
// Reverse Prox 3 between `best` and `win` prefix
|
||||
"id": 20,
|
||||
"text": "winter x y best"
|
||||
},
|
||||
{
|
||||
// Reverse Prox 2 between `best` and `win` prefix
|
||||
"id": 21,
|
||||
"text": "winter x best"
|
||||
},
|
||||
{
|
||||
// Reverse Prox 1 between `best` and `win` prefix
|
||||
"id": 22,
|
||||
"text": "winter best"
|
||||
},
|
||||
])).unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_proximity_simple() {
|
||||
let index = create_simple_index();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.query("the quick brown fox jumps over the lazy dog");
|
||||
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 4, 7, 6, 2, 3, 5, 1, 0]");
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"the quack brown fox jumps over the lazy dog\"",
|
||||
"\"the quick brown fox jumps over the lazy dog\"",
|
||||
"\"the quickbrown fox jumps over the lazy dog\"",
|
||||
"\"the really quick brown fox jumps over the lazy dog\"",
|
||||
"\"the really quick brown fox jumps over the very lazy dog\"",
|
||||
"\"the quick brown fox jumps over the lazy. dog\"",
|
||||
"\"dog the quick brown fox jumps over the lazy\"",
|
||||
"\"brown quick fox jumps over the lazy dog\"",
|
||||
"\"the. quick brown fox jumps over the lazy. dog\"",
|
||||
"\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_proximity_split_word() {
|
||||
let index = create_edge_cases_index();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("sunflower wilting");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
// "2" and "4" should be swapped ideally
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
|
||||
"\"sun flower wilting under the heat\"",
|
||||
"\"sunflower wilting under the heat\"",
|
||||
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
|
||||
"\"A flower wilting under the sun, unlike a sunflower\"",
|
||||
]
|
||||
"###);
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("\"sun flower\" wilting");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
// "2" and "4" should be swapped ideally
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
|
||||
"\"sun flower wilting under the heat\"",
|
||||
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
|
||||
]
|
||||
"###);
|
||||
drop(txn);
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
let mut syns = BTreeMap::new();
|
||||
syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]);
|
||||
s.set_synonyms(syns);
|
||||
})
|
||||
.unwrap();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("xyz wilting");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
// "2" and "4" should be swapped ideally
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
|
||||
"\"sun flower wilting under the heat\"",
|
||||
"\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_proximity_prefix_db() {
|
||||
let index = create_edge_cases_index();
|
||||
let txn = index.read_txn().unwrap();
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best s");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
// This test illustrates the loss of precision from using the prefix DB
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"this is the best summer meal\"",
|
||||
"\"this is the best meal of summer\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
||||
"\"this is the best cooked meal of the summer\"",
|
||||
"\"this is the best meal of the summer\"",
|
||||
"\"summer x y best\"",
|
||||
"\"summer x best\"",
|
||||
"\"summer best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
]
|
||||
"###);
|
||||
|
||||
// Difference when using the `su` prefix, which is not in the prefix DB
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best su");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"this is the best summer meal\"",
|
||||
"\"summer best\"",
|
||||
"\"this is the best meal of summer\"",
|
||||
"\"summer x best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful summer day\"",
|
||||
"\"this is the best cooked meal of the summer\"",
|
||||
"\"this is the best meal of the summer\"",
|
||||
"\"summer x y best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
]
|
||||
"###);
|
||||
|
||||
// Note that there is a case where a prefix is in the prefix DB but not in the
|
||||
// **proximity** prefix DB. In that case, its proximity score will always be
|
||||
// the maximum. This happens for prefixes that are larger than 2 bytes.
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best win");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"this is the best winter meal\"",
|
||||
"\"this is the best meal of winter\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
"\"this is the best cooked meal of the winter\"",
|
||||
"\"this is the best meal of the winter\"",
|
||||
"\"winter x y best\"",
|
||||
"\"winter x best\"",
|
||||
"\"winter best\"",
|
||||
]
|
||||
"###);
|
||||
|
||||
// Now using `wint`, which is not in the prefix DB:
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best wint");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"this is the best winter meal\"",
|
||||
"\"winter best\"",
|
||||
"\"this is the best meal of winter\"",
|
||||
"\"winter x best\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
"\"this is the best cooked meal of the winter\"",
|
||||
"\"this is the best meal of the winter\"",
|
||||
"\"winter x y best\"",
|
||||
]
|
||||
"###);
|
||||
|
||||
// and using `wi` which is in the prefix DB and proximity prefix DB
|
||||
|
||||
let mut s = Search::new(&txn, &index);
|
||||
s.terms_matching_strategy(TermsMatchingStrategy::All);
|
||||
s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
|
||||
s.query("best wi");
|
||||
let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
|
||||
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
|
||||
insta::assert_snapshot!(format!("{document_scores:#?}"));
|
||||
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
|
||||
|
||||
insta::assert_debug_snapshot!(texts, @r###"
|
||||
[
|
||||
"\"this is the best winter meal\"",
|
||||
"\"this is the best meal of winter\"",
|
||||
"\"this is the best meal I have ever had in such a beautiful winter day\"",
|
||||
"\"this is the best cooked meal of the winter\"",
|
||||
"\"this is the best meal of the winter\"",
|
||||
"\"winter x y best\"",
|
||||
"\"winter x best\"",
|
||||
"\"winter best\"",
|
||||
]
|
||||
"###);
|
||||
}
|
||||
106
crates/milli/src/search/new/tests/proximity_typo.rs
Normal file
106
crates/milli/src/search/new/tests/proximity_typo.rs
Normal file
@@ -0,0 +1,106 @@
|
||||
/*!
|
||||
This module tests the interactions between the proximity and typo ranking rules.
|
||||
|
||||
The proximity ranking rule should transform the query graph such that it
|
||||
only contains the word pairs that it used to compute its bucket, but this is not currently
|
||||
implemented.
|
||||
*/
|
||||
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::search::new::tests::collect_field_values;
|
||||
use crate::{Criterion, Search, SearchResult, TermsMatchingStrategy};
|
||||
|
||||
fn create_index() -> TempIndex {
|
||||
let index = TempIndex::new();
|
||||
|
||||
index
|
||||
.update_settings(|s| {
|
||||
s.set_primary_key("id".to_owned());
|
||||
s.set_searchable_fields(vec!["text".to_owned()]);
|
||||
s.set_criteria(vec![Criterion::Words, Criterion::Proximity, Criterion::Typo]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
// Basic trap.
|
||||
//
|
||||
// We have one document with the perfect word pair: `sommer - holiday`
|
||||
// and another with the perfect word pair: `sommer holidty`.
|
||||
//
|
||||
// The proximity ranking rule will put them both in the same bucket, and it
|
||||
// should minify the query graph to make it represent:
|
||||
// EITHER:
|
||||
// sommer + holiday
|
||||
// OR:
|
||||
// sommer + holidty
|
||||
//
|
||||
// Such that the child typo ranking rule does not find any match
|
||||
// for its zero-typo bucket `summer + holiday`, even though both documents
|
||||
// contain these two exact words.
|
||||
{
|
||||
"id": 0,
|
||||
"text": "summer. holiday. sommer holidty"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"text": "summer. holiday. sommer holiday"
|
||||
},
|
||||
|
||||
]))
|
||||
.unwrap();
|
||||
index
|
||||
}
|
||||
|
||||
/// Searches `summer holiday` on the trap index with detailed scoring and pins the
/// current result order and score details through inline snapshots.
///
/// The pinned order is known to be wrong (document 1 should come before document 0);
/// the snapshots record today's behaviour so that any change shows up in review.
#[test]
fn test_trap_basic() {
    let index = create_index();
    let rtxn = index.read_txn().unwrap();

    let mut search = Search::new(&rtxn, &index);
    search.query("summer holiday");
    search.terms_matching_strategy(TermsMatchingStrategy::All);
    search.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);

    let SearchResult { documents_ids, document_scores, .. } = search.execute().unwrap();

    insta::assert_snapshot!(format!("{:?}", documents_ids), @"[0, 1]");
    insta::assert_snapshot!(format!("{:#?}", document_scores), @r###"
    [
        [
            Proximity(
                Rank {
                    rank: 4,
                    max_rank: 4,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 2,
                },
            ),
        ],
        [
            Proximity(
                Rank {
                    rank: 4,
                    max_rank: 4,
                },
            ),
            Typo(
                Typo {
                    typo_count: 0,
                    max_typo_count: 2,
                },
            ),
        ],
    ]
    "###);

    let texts = collect_field_values(&index, &rtxn, "text", &documents_ids);
    // This is incorrect, 1 should come before 0
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"summer. holiday. sommer holidty\"",
        "\"summer. holiday. sommer holiday\"",
    ]
    "###);
}
|
||||
@@ -0,0 +1,244 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/attribute_fid.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
2,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 91,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 14,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 79,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 13,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 77,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 12,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 83,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
9,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 11,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 75,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 79,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 73,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
11,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 7,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 77,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
10,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
13,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
12,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 78,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
14,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 75,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 91,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,244 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/attribute_fid.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
2,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 91,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 14,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 79,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 13,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 77,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 12,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 83,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
9,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 11,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 75,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 79,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 73,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
11,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 7,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 77,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
10,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
13,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 81,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
12,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 6,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 78,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
14,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 75,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 19,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 91,
|
||||
max_rank: 91,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,244 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/attribute_position.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
10,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 21,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
12,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 21,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
11,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 20,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
13,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 20,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
2,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 18,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 13,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
9,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,244 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/attribute_position.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
10,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 21,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
12,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 21,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
11,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 20,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
13,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 20,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
2,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 18,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 13,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
9,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/attribute_position.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
5,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 11,
|
||||
max_rank: 11,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 51,
|
||||
max_rank: 51,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 11,
|
||||
max_rank: 11,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 51,
|
||||
max_rank: 51,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 11,
|
||||
max_rank: 11,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 51,
|
||||
max_rank: 51,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
9,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 11,
|
||||
max_rank: 11,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 51,
|
||||
max_rank: 51,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 11,
|
||||
max_rank: 11,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 50,
|
||||
max_rank: 51,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,244 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/attribute_position.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
10,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 21,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
12,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 21,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
11,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 20,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
13,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 20,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 19,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
2,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 18,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 15,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 13,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
9,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 5,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 21,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,366 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
19,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
9,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 6,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
18,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 8,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 8,
|
||||
max_matching_words: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 8,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 5,
|
||||
max_matching_words: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
17,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
16,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
15,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 5,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 5,
|
||||
max_matching_words: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 5,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
14,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
13,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 3,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 3,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
12,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 2,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
2,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 2,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
11,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,106 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 6,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 6,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 6,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 0,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,126 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
6,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
MatchesStart,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 6,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,86 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
6,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 5,
|
||||
max_matching_words: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
MatchesStart,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 5,
|
||||
max_matching_words: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 5,
|
||||
max_matching_words: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,66 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
2,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
MatchesStart,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,136 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
2,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 0,
|
||||
max_typo_count: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 0,
|
||||
max_typo_count: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 1,
|
||||
max_typo_count: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 1,
|
||||
max_typo_count: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 2,
|
||||
max_typo_count: 3,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,186 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
9,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 8,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 8,
|
||||
max_matching_words: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 5,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 5,
|
||||
max_matching_words: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 3,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
2,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 2,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,126 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
8,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 3,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 2,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 2,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,146 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
9,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
MatchesStart,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,146 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
9,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
MatchesStart,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,84 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
0,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 9,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 9,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
2,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 9,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,240 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
2,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 25,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 24,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 9,
|
||||
max_rank: 25,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
MatchesStart,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
MatchesStart,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
MatchesStart,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 10,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 9,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 5,
|
||||
max_rank: 10,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,110 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 0,
|
||||
max_typo_count: 5,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
0,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 1,
|
||||
max_typo_count: 5,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 2,
|
||||
max_typo_count: 5,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 2,
|
||||
max_typo_count: 5,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,366 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/exactness.rs
|
||||
expression: "format!(\"{document_ids_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
(
|
||||
19,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
9,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 9,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 6,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
18,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 8,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 8,
|
||||
max_matching_words: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 8,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 5,
|
||||
max_matching_words: 8,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
17,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
16,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 7,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
6,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
7,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 7,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 7,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
15,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 5,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 5,
|
||||
max_matching_words: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
5,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 5,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
14,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 4,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
4,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 4,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
13,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 3,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
3,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 3,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
12,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 2,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 2,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
2,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 2,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 2,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
1,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
(
|
||||
11,
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 9,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
ExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
],
|
||||
),
|
||||
]
|
||||
@@ -0,0 +1,168 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
1.0,
|
||||
1.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
2.0,
|
||||
-1.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
-2.0,
|
||||
-2.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
3.0,
|
||||
5.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
6.0,
|
||||
-5.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,168 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
6.0,
|
||||
-5.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
3.0,
|
||||
5.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
-2.0,
|
||||
-2.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
2.0,
|
||||
-1.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
1.0,
|
||||
1.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
-175.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
-179.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
-175.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
-175.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
-175.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
88.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
-175.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
-179.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
88.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
-179.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
88.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
-85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
88.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
-85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
-179.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
-85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
-85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
-85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
175.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
175.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
88.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
175.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
175.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
-179.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
175.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
88.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
-179.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
-175.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
-175.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
88.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
-175.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
-175.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
-175.0,
|
||||
],
|
||||
ascending: false,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
-179.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
88.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
-179.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
-85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
-85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
-85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
-85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
-179.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
-85.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
88.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,91 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
175.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
175.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
-179.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
175.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
175.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
88.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
175.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,75 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
-89.0,
|
||||
0.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: Some(
|
||||
[
|
||||
0.0,
|
||||
178.0,
|
||||
],
|
||||
),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,60 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/geo_sort.rs
|
||||
expression: "format!(\"{scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 1,
|
||||
max_matching_words: 1,
|
||||
},
|
||||
),
|
||||
GeoSort(
|
||||
GeoSort {
|
||||
target_point: [
|
||||
0.0,
|
||||
0.0,
|
||||
],
|
||||
ascending: true,
|
||||
value: None,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,70 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/proximity.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 3,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,70 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/proximity.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,78 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/proximity.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,78 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/proximity.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 3,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,70 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/proximity.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 2,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,46 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/proximity.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,30 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/proximity.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,30 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/proximity.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 4,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 4,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,206 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/sort.rs
|
||||
expression: document_scores_json
|
||||
---
|
||||
[
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": 0.0
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": 1.0
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": 1.0
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": 1.0
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": 1.1367
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": 1.2367
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": 1.5673
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": "0"
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": "1"
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": "false"
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": "false"
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": "true"
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": "true"
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": null
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": null
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": null
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": null
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": null
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": null
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
},
|
||||
{
|
||||
"vague:asc": {
|
||||
"order": 0,
|
||||
"value": null
|
||||
},
|
||||
"<hidden-rule-1>": {
|
||||
"order": 1,
|
||||
"value": "<hidden>"
|
||||
}
|
||||
}
|
||||
]
|
||||
@@ -0,0 +1,206 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/sort.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(2.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.5673),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.2367),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.1367),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(0.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("true"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("true"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("false"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("false"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("1"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("0"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,206 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/sort.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("i"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("i"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("i"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("h"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("g"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("g"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("f"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("f"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("f"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("e"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("e"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("e"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("e"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("e"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("e"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("d"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("c"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("c"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("c"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "letter",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: String("b"),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,206 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/sort.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(5.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(4.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(3.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(2.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(2.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(2.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(2.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(2.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(0.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(0.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(0.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(0.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "rank",
|
||||
ascending: false,
|
||||
redacted: false,
|
||||
value: Number(0.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,206 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/sort.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Number(0.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Number(1.0),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Number(1.1367),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Number(1.2367),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Number(1.5673),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: String("0"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: String("1"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: String("false"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: String("false"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: String("true"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: String("true"),
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Sort(
|
||||
Sort {
|
||||
field_name: "vague",
|
||||
ascending: true,
|
||||
redacted: false,
|
||||
value: Null,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,129 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/stop_words.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 3,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 0,
|
||||
max_typo_count: 2,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 1,
|
||||
},
|
||||
),
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 1,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 31,
|
||||
max_rank: 31,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 3,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 0,
|
||||
max_typo_count: 2,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 1,
|
||||
},
|
||||
),
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 1,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 31,
|
||||
max_rank: 31,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Words(
|
||||
Words {
|
||||
matching_words: 3,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 0,
|
||||
max_typo_count: 2,
|
||||
},
|
||||
),
|
||||
Proximity(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 1,
|
||||
},
|
||||
),
|
||||
Fid(
|
||||
Rank {
|
||||
rank: 1,
|
||||
max_rank: 1,
|
||||
},
|
||||
),
|
||||
Position(
|
||||
Rank {
|
||||
rank: 27,
|
||||
max_rank: 31,
|
||||
},
|
||||
),
|
||||
ExactAttribute(
|
||||
NoExactMatch,
|
||||
),
|
||||
ExactWords(
|
||||
ExactWords {
|
||||
matching_words: 3,
|
||||
max_matching_words: 3,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,13 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/stop_words.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
]
|
||||
@@ -0,0 +1,12 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/typo.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
[],
|
||||
]
|
||||
@@ -0,0 +1,54 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/typo.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 0,
|
||||
max_typo_count: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 0,
|
||||
max_typo_count: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 1,
|
||||
max_typo_count: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 1,
|
||||
max_typo_count: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 2,
|
||||
max_typo_count: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 5,
|
||||
max_typo_count: 5,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
@@ -0,0 +1,54 @@
|
||||
---
|
||||
source: milli/src/search/new/tests/typo.rs
|
||||
expression: "format!(\"{document_scores:#?}\")"
|
||||
---
|
||||
[
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 0,
|
||||
max_typo_count: 6,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 0,
|
||||
max_typo_count: 6,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 2,
|
||||
max_typo_count: 6,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 2,
|
||||
max_typo_count: 6,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 3,
|
||||
max_typo_count: 6,
|
||||
},
|
||||
),
|
||||
],
|
||||
[
|
||||
Typo(
|
||||
Typo {
|
||||
typo_count: 4,
|
||||
max_typo_count: 6,
|
||||
},
|
||||
),
|
||||
],
|
||||
]
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user