Merge branch 'main' into indexer-edition-2024
crates/milli/src/search/facet/facet_distribution.rs (new file, 846 lines)
@@ -0,0 +1,846 @@
use std::collections::{BTreeMap, HashMap, HashSet};
use std::fmt::Display;
use std::ops::ControlFlow;
use std::{fmt, mem};

use heed::types::Bytes;
use heed::BytesDecode;
use indexmap::IndexMap;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};

use crate::error::UserError;
use crate::facet::FacetType;
use crate::heed_codec::facet::{
    FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec,
};
use crate::heed_codec::{BytesRefCodec, StrRefCodec};
use crate::search::facet::facet_distribution_iter::{
    count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution,
};
use crate::{FieldId, Index, Result};

/// The default number of values per facet that will
/// be fetched from the key-value store.
pub const DEFAULT_VALUES_PER_FACET: usize = 100;

/// Threshold on the number of candidates that makes
/// the system choose between one algorithm or another.
const CANDIDATES_THRESHOLD: u64 = 3000;

/// How should we fetch the facets?
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum OrderBy {
    /// By lexicographic order...
    #[default]
    Lexicographic,
    /// Or by number of docids in common?
    Count,
}

impl Display for OrderBy {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            OrderBy::Lexicographic => f.write_str("alphabetically"),
            OrderBy::Count => f.write_str("by count"),
        }
    }
}

pub struct FacetDistribution<'a> {
    facets: Option<HashMap<String, OrderBy>>,
    candidates: Option<RoaringBitmap>,
    max_values_per_facet: usize,
    default_order_by: OrderBy,
    rtxn: &'a heed::RoTxn<'a>,
    index: &'a Index,
}

impl<'a> FacetDistribution<'a> {
    pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> FacetDistribution<'a> {
        FacetDistribution {
            facets: None,
            candidates: None,
            max_values_per_facet: DEFAULT_VALUES_PER_FACET,
            default_order_by: OrderBy::default(),
            rtxn,
            index,
        }
    }

    pub fn facets<I: IntoIterator<Item = (A, OrderBy)>, A: AsRef<str>>(
        &mut self,
        names_ordered_by: I,
    ) -> &mut Self {
        self.facets = Some(
            names_ordered_by
                .into_iter()
                .map(|(name, order_by)| (name.as_ref().to_string(), order_by))
                .collect(),
        );
        self
    }

    pub fn max_values_per_facet(&mut self, max: usize) -> &mut Self {
        self.max_values_per_facet = max;
        self
    }

    pub fn default_order_by(&mut self, order_by: OrderBy) -> &mut Self {
        self.default_order_by = order_by;
        self
    }

    pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self {
        self.candidates = Some(candidates);
        self
    }

    /// There is a small number of candidates OR we ask for facet string values, so we
    /// decide to iterate over the facet values of each one of them, one by one.
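    /// (This is the path taken by `facet_values` below when the number of candidates is at
    /// most `CANDIDATES_THRESHOLD`.)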
    fn facet_distribution_from_documents(
        &self,
        field_id: FieldId,
        facet_type: FacetType,
        candidates: &RoaringBitmap,
        distribution: &mut IndexMap<String, u64>,
    ) -> heed::Result<()> {
        match facet_type {
            FacetType::Number => {
                let mut lexicographic_distribution = BTreeMap::new();
                let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();

                let db = self.index.field_id_docid_facet_f64s;
                for docid in candidates {
                    key_buffer.truncate(mem::size_of::<FieldId>());
                    key_buffer.extend_from_slice(&docid.to_be_bytes());
                    let iter = db
                        .remap_key_type::<Bytes>()
                        .prefix_iter(self.rtxn, &key_buffer)?
                        .remap_key_type::<FieldDocIdFacetF64Codec>();

                    for result in iter {
                        let ((_, _, value), ()) = result?;
                        *lexicographic_distribution.entry(value.to_string()).or_insert(0) += 1;
                    }
                }

                distribution.extend(
                    lexicographic_distribution
                        .into_iter()
                        .take(self.max_values_per_facet.saturating_sub(distribution.len())),
                );
            }
            FacetType::String => {
                let mut normalized_distribution = BTreeMap::new();
                let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();

                let db = self.index.field_id_docid_facet_strings;
                for docid in candidates {
                    key_buffer.truncate(mem::size_of::<FieldId>());
                    key_buffer.extend_from_slice(&docid.to_be_bytes());
                    let iter = db
                        .remap_key_type::<Bytes>()
                        .prefix_iter(self.rtxn, &key_buffer)?
                        .remap_key_type::<FieldDocIdFacetStringCodec>();

                    for result in iter {
                        let ((_, _, normalized_value), original_value) = result?;
                        let (_, count) = normalized_distribution
                            .entry(normalized_value)
                            .or_insert_with(|| (original_value, 0));
                        *count += 1;

                        // we'd like to break here if we have enough facet values, but we are collecting them by increasing docid,
                        // so higher ranked facets could be in later docids
                    }
                }

                let iter = normalized_distribution
                    .into_iter()
                    .take(self.max_values_per_facet.saturating_sub(distribution.len()))
                    .map(|(_normalized, (original, count))| (original.to_string(), count));
                distribution.extend(iter);
            }
        }

        Ok(())
    }

    /// There are too many documents, so we use the facet levels to move through
    /// the facet values and find the associated candidates.
    fn facet_numbers_distribution_from_facet_levels(
        &self,
        field_id: FieldId,
        candidates: &RoaringBitmap,
        order_by: OrderBy,
        distribution: &mut IndexMap<String, u64>,
    ) -> heed::Result<()> {
        let search_function = match order_by {
            OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
            OrderBy::Count => count_iterate_over_facet_distribution,
        };

        search_function(
            self.rtxn,
            self.index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
            field_id,
            candidates,
            |facet_key, nbr_docids, _| {
                let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap();
                distribution.insert(facet_key.to_string(), nbr_docids);
                if distribution.len() == self.max_values_per_facet {
                    Ok(ControlFlow::Break(()))
                } else {
                    Ok(ControlFlow::Continue(()))
                }
            },
        )
    }

    fn facet_strings_distribution_from_facet_levels(
        &self,
        field_id: FieldId,
        candidates: &RoaringBitmap,
        order_by: OrderBy,
        distribution: &mut IndexMap<String, u64>,
    ) -> heed::Result<()> {
        let search_function = match order_by {
            OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
            OrderBy::Count => count_iterate_over_facet_distribution,
        };

        search_function(
            self.rtxn,
            self.index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
            field_id,
            candidates,
            |facet_key, nbr_docids, any_docid| {
                let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap();

                let key: (FieldId, _, &str) = (field_id, any_docid, facet_key);
                let original_string = self
                    .index
                    .field_id_docid_facet_strings
                    .get(self.rtxn, &key)?
                    .unwrap()
                    .to_owned();

                distribution.insert(original_string, nbr_docids);
                if distribution.len() == self.max_values_per_facet {
                    Ok(ControlFlow::Break(()))
                } else {
                    Ok(ControlFlow::Continue(()))
                }
            },
        )
    }

    fn facet_values(
        &self,
        field_id: FieldId,
        order_by: OrderBy,
    ) -> heed::Result<IndexMap<String, u64>> {
        use FacetType::{Number, String};

        let mut distribution = IndexMap::new();
        match (order_by, &self.candidates) {
            (OrderBy::Lexicographic, Some(cnd)) if cnd.len() <= CANDIDATES_THRESHOLD => {
                // Classic search, candidates were specified, we must return facet values only related
                // to those candidates. We also enter here for facet strings for performance reasons.
                self.facet_distribution_from_documents(field_id, Number, cnd, &mut distribution)?;
                self.facet_distribution_from_documents(field_id, String, cnd, &mut distribution)?;
            }
            _ => {
                let universe;
                let candidates = match &self.candidates {
                    Some(cnd) => cnd,
                    None => {
                        universe = self.index.documents_ids(self.rtxn)?;
                        &universe
                    }
                };

                self.facet_numbers_distribution_from_facet_levels(
                    field_id,
                    candidates,
                    order_by,
                    &mut distribution,
                )?;
                self.facet_strings_distribution_from_facet_levels(
                    field_id,
                    candidates,
                    order_by,
                    &mut distribution,
                )?;
            }
        };

        Ok(distribution)
    }

    pub fn compute_stats(&self) -> Result<BTreeMap<String, (f64, f64)>> {
        let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
        let filterable_fields = self.index.filterable_fields(self.rtxn)?;
        let candidates = if let Some(candidates) = self.candidates.clone() {
            candidates
        } else {
            return Ok(Default::default());
        };

        let fields = match &self.facets {
            Some(facets) => {
                let invalid_fields: HashSet<_> = facets
                    .iter()
                    .map(|(name, _)| name)
                    .filter(|facet| !crate::is_faceted(facet, &filterable_fields))
                    .collect();
                if !invalid_fields.is_empty() {
                    return Err(UserError::InvalidFacetsDistribution {
                        invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
                        valid_facets_name: filterable_fields.into_iter().collect(),
                    }
                    .into());
                } else {
                    facets.iter().map(|(name, _)| name).cloned().collect()
                }
            }
            None => filterable_fields,
        };

        let mut distribution = BTreeMap::new();
        for (fid, name) in fields_ids_map.iter() {
            if crate::is_faceted(name, &fields) {
                let min_value = if let Some(min_value) = crate::search::facet::facet_min_value(
                    self.index,
                    self.rtxn,
                    fid,
                    candidates.clone(),
                )? {
                    min_value
                } else {
                    continue;
                };
                let max_value = if let Some(max_value) = crate::search::facet::facet_max_value(
                    self.index,
                    self.rtxn,
                    fid,
                    candidates.clone(),
                )? {
                    max_value
                } else {
                    continue;
                };

                distribution.insert(name.to_string(), (min_value, max_value));
            }
        }

        Ok(distribution)
    }

    pub fn execute(&self) -> Result<BTreeMap<String, IndexMap<String, u64>>> {
        let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
        let filterable_fields = self.index.filterable_fields(self.rtxn)?;

        let fields = match self.facets {
            Some(ref facets) => {
                let invalid_fields: HashSet<_> = facets
                    .iter()
                    .map(|(name, _)| name)
                    .filter(|facet| !crate::is_faceted(facet, &filterable_fields))
                    .collect();
                if !invalid_fields.is_empty() {
                    return Err(UserError::InvalidFacetsDistribution {
                        invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
                        valid_facets_name: filterable_fields.into_iter().collect(),
                    }
                    .into());
                } else {
                    facets.iter().map(|(name, _)| name).cloned().collect()
                }
            }
            None => filterable_fields,
        };

        let mut distribution = BTreeMap::new();
        for (fid, name) in fields_ids_map.iter() {
            if crate::is_faceted(name, &fields) {
                let order_by = self
                    .facets
                    .as_ref()
                    .and_then(|facets| facets.get(name).copied())
                    .unwrap_or(self.default_order_by);
                let values = self.facet_values(fid, order_by)?;
                distribution.insert(name.to_string(), values);
            }
        }

        Ok(distribution)
    }
}

impl fmt::Debug for FacetDistribution<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let FacetDistribution {
            facets,
            candidates,
            max_values_per_facet,
            default_order_by,
            rtxn: _,
            index: _,
        } = self;

        f.debug_struct("FacetDistribution")
            .field("facets", facets)
            .field("candidates", candidates)
            .field("max_values_per_facet", max_values_per_facet)
            .field("default_order_by", default_order_by)
            .finish()
    }
}

#[cfg(test)]
mod tests {
    use std::iter;

    use big_s::S;
    use maplit::hashset;

    use crate::documents::documents_batch_reader_from_objects;
    use crate::index::tests::TempIndex;
    use crate::{milli_snap, FacetDistribution, OrderBy};

    #[test]
    fn few_candidates_few_facet_values() {
        // All the tests here avoid using the code in `facet_distribution_iter` because there aren't
        // enough candidates.

        let mut index = TempIndex::new();
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let documents = documents!([
            { "colour": "Blue" },
            { "colour": " blue" },
            { "colour": "RED" }
        ]);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates([0, 1, 2].iter().copied().collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates([1, 2].iter().copied().collect())
            .execute()
            .unwrap();

        // I think it would be fine if " blue" was "Blue" instead.
        // We just need to get any non-normalised string I think, even if it's not in
        // the candidates
        milli_snap!(format!("{map:?}"), @r###"{"colour": {" blue": 1, "RED": 1}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates([2].iter().copied().collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"RED": 1}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates([0, 1, 2].iter().copied().collect())
            .max_values_per_facet(1)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::Count)))
            .candidates([0, 1, 2].iter().copied().collect())
            .max_values_per_facet(1)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2}}"###);
    }

    #[test]
    fn many_candidates_few_facet_values() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = ["Red", "RED", " red ", "Blue", "BLUE"];

        let mut documents = vec![];
        for i in 0..10_000 {
            let document = serde_json::json!({
                "colour": facet_values[i % 5],
            })
            .as_object()
            .unwrap()
            .clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .max_values_per_facet(1)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..10_000).collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..5_000).collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..5_000).collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..5_000).collect())
            .max_values_per_facet(1)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::Count)))
            .candidates((0..5_000).collect())
            .max_values_per_facet(1)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), @r###"{"colour": {"Red": 3000}}"###);
    }

    #[test]
    fn many_candidates_many_facet_values() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = (0..1000).map(|x| format!("{x:x}")).collect::<Vec<_>>();

        let mut documents = vec![];
        for i in 0..10_000 {
            let document = serde_json::json!({
                "colour": facet_values[i % 1000],
            })
            .as_object()
            .unwrap()
            .clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates", @"ac9229ed5964d893af96a7076e2f8af5");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .max_values_per_facet(2)
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates_with_max_2", @r###"{"colour": {"0": 10, "1": 10}}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..10_000).collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_10_000", @"ac9229ed5964d893af96a7076e2f8af5");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..5_000).collect())
            .execute()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_5_000", @"825f23a4090d05756f46176987b7d992");
    }

    #[test]
    fn facet_stats() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = (0..1000).collect::<Vec<_>>();

        let mut documents = vec![];
        for i in 0..1000 {
            let document = serde_json::json!({
                "colour": facet_values[i % 1000],
            })
            .as_object()
            .unwrap()
            .clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates", @"{}");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..1000).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 999.0)}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((217..777).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (217.0, 776.0)}"###);
    }

    #[test]
    fn facet_stats_array() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = (0..1000).collect::<Vec<_>>();

        let mut documents = vec![];
        for i in 0..1000 {
            let document = serde_json::json!({
                "colour": [facet_values[i % 1000], facet_values[i % 1000] + 1000],
            })
            .as_object()
            .unwrap()
            .clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates", @"{}");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..1000).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 1999.0)}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((217..777).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (217.0, 1776.0)}"###);
    }

    #[test]
    fn facet_stats_mixed_array() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = (0..1000).collect::<Vec<_>>();

        let mut documents = vec![];
        for i in 0..1000 {
            let document = serde_json::json!({
                "colour": [facet_values[i % 1000], format!("{}", facet_values[i % 1000] + 1000)],
            })
            .as_object()
            .unwrap()
            .clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates", @"{}");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..1000).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 999.0)}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((217..777).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (217.0, 776.0)}"###);
    }

    #[test]
    fn facet_mixed_values() {
        let mut index = TempIndex::new_with_map_size(4096 * 10_000);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
            .unwrap();

        let facet_values = (0..1000).collect::<Vec<_>>();

        let mut documents = vec![];
        for i in 0..1000 {
            let document = if i % 2 == 0 {
                serde_json::json!({
                    "colour": [facet_values[i % 1000], facet_values[i % 1000] + 1000],
                })
            } else {
                serde_json::json!({
                    "colour": format!("{}", facet_values[i % 1000] + 10000),
                })
            };
            let document = document.as_object().unwrap().clone();
            documents.push(document);
        }

        let documents = documents_batch_reader_from_objects(documents);

        index.add_documents(documents).unwrap();

        let txn = index.read_txn().unwrap();

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "no_candidates", @"{}");

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((0..1000).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_0_1000", @r###"{"colour": (0.0, 1998.0)}"###);

        let map = FacetDistribution::new(&txn, &index)
            .facets(iter::once(("colour", OrderBy::default())))
            .candidates((217..777).collect())
            .compute_stats()
            .unwrap();

        milli_snap!(format!("{map:?}"), "candidates_217_777", @r###"{"colour": (218.0, 1776.0)}"###);
    }
}

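For orientation, this is how the builder above is typically driven — a minimal sketch modelled on the tests, where `rtxn` and `index` are assumed to be an open read transaction and `Index`:

use std::iter;

// Count-ordered distribution of the `colour` facet over a docid subset,
// capped at 10 values per facet; the `?` assumes an enclosing fn returning Result.
let distribution = FacetDistribution::new(&rtxn, &index)
    .facets(iter::once(("colour", OrderBy::Count)))
    .candidates((0..5_000).collect())
    .max_values_per_facet(10)
    .execute()?;
// -> BTreeMap<String, IndexMap<String, u64>>, e.g. {"colour": {"Red": 3000, "Blue": 2000}}
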
crates/milli/src/search/facet/facet_distribution_iter.rs (new file, 301 lines)
@@ -0,0 +1,301 @@
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::ops::ControlFlow;

use heed::Result;
use roaring::RoaringBitmap;

use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{
    FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, DocumentId};

/// Call the given closure on the facet distribution of the candidate documents.
///
/// The arguments to the closure are:
/// - the facet value, as a byte slice
/// - the number of documents among the candidates that contain this facet value
/// - the id of a document which contains the facet value. Note that this document
///   is not necessarily from the list of candidates, it is simply *any* document which
///   contains this facet value.
///
/// The return value of the closure is a `ControlFlow<()>` which indicates whether we should
/// keep iterating over the different facet values or stop.
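///
/// For example, the `filter_distribution_all_stop_early` test below passes a callback that
/// counts how many values it has seen and returns `Ok(ControlFlow::Break(()))` after 100 of
/// them, and `Ok(ControlFlow::Continue(()))` otherwise.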
pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: &RoaringBitmap,
    callback: CB,
) -> Result<()>
where
    CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
    let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
    let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
    let highest_level = get_highest_level(rtxn, db, field_id)?;

    if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
        fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
        Ok(())
    } else {
        Ok(())
    }
}

pub fn count_iterate_over_facet_distribution<'t, CB>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: &RoaringBitmap,
    mut callback: CB,
) -> Result<()>
where
    CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
    /// # Important
    /// The order of the fields determines the order in which the facet values will be returned.
    /// This struct is inserted in a BinaryHeap and popped later on.
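    /// E.g. on `pop` from the max-heap, an entry with `count: 7` comes out before one with
    /// `count: 4` regardless of their levels; on equal counts, `Reverse(level)` makes the
    /// entry with the smaller (more precise) level come out first.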
    #[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
    struct LevelEntry<'t> {
        /// The number of candidates in this entry.
        count: u64,
        /// The key level of the entry.
        level: Reverse<u8>,
        /// The left bound key.
        left_bound: &'t [u8],
        /// The number of keys we must look for after `left_bound`.
        group_size: u8,
        /// Any docid in the set of matching documents. Used to find the original facet string.
        any_docid: u32,
    }

    // Represents the list of keys that we must explore.
    let mut heap = BinaryHeap::new();
    let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
    let highest_level = get_highest_level(rtxn, db, field_id)?;

    if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
        // We first fill the heap with values from the highest level
        let starting_key =
            FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
        for el in db.range(rtxn, &(&starting_key..))?.take(usize::MAX) {
            let (key, value) = el?;
            // The range is unbounded on the right and the group size for the highest level is MAX,
            // so we need to check that we are not iterating over the next field id
            if key.field_id != field_id {
                break;
            }
            let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
                value.bitmap_bytes,
                candidates,
            )?;
            let count = intersection.len();
            if count != 0 {
                heap.push(LevelEntry {
                    count,
                    level: Reverse(key.level),
                    left_bound: key.left_bound,
                    group_size: value.size,
                    any_docid: intersection.min().unwrap(),
                });
            }
        }

        while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop()
        {
            if let Reverse(0) = level {
                match (callback)(left_bound, count, any_docid)? {
                    ControlFlow::Continue(_) => (),
                    ControlFlow::Break(_) => return Ok(()),
                }
            } else {
                let starting_key = FacetGroupKey { field_id, level: level.0 - 1, left_bound };
                for el in db.range(rtxn, &(&starting_key..))?.take(group_size as usize) {
                    let (key, value) = el?;
                    // The range is unbounded on the right and the group size for the highest level is MAX,
                    // so we need to check that we are not iterating over the next field id
                    if key.field_id != field_id {
                        break;
                    }
                    let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
                        value.bitmap_bytes,
                        candidates,
                    )?;
                    let count = intersection.len();
                    if count != 0 {
                        heap.push(LevelEntry {
                            count,
                            level: Reverse(key.level),
                            left_bound: key.left_bound,
                            group_size: value.size,
                            any_docid: intersection.min().unwrap(),
                        });
                    }
                }
            }
        }
    }

    Ok(())
}

/// Iterate over the facet values in lexicographic order.
struct LexicographicFacetDistribution<'t, CB>
where
    CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
    field_id: u16,
    callback: CB,
}

impl<'t, CB> LexicographicFacetDistribution<'t, CB>
where
    CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
    fn iterate_level_0(
        &mut self,
        candidates: &RoaringBitmap,
        starting_bound: &'t [u8],
        group_size: usize,
    ) -> Result<ControlFlow<()>> {
        let starting_key =
            FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_bound };
        let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size);
        for el in iter {
            let (key, value) = el?;
            // The range is unbounded on the right and the group size for the highest level is MAX,
            // so we need to check that we are not iterating over the next field id
            if key.field_id != self.field_id {
                return Ok(ControlFlow::Break(()));
            }
            let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
                value.bitmap_bytes,
                candidates,
            )?;
            if !docids_in_common.is_empty() {
                let any_docid_in_common = docids_in_common.min().unwrap();
                match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
                {
                    ControlFlow::Continue(_) => (),
                    ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
                }
            }
        }
        Ok(ControlFlow::Continue(()))
    }

    fn iterate(
        &mut self,
        candidates: &RoaringBitmap,
        level: u8,
        starting_bound: &'t [u8],
        group_size: usize,
    ) -> Result<ControlFlow<()>> {
        if level == 0 {
            return self.iterate_level_0(candidates, starting_bound, group_size);
        }
        let starting_key =
            FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound };
        let iter = self.db.range(self.rtxn, &(&starting_key..))?.take(group_size);

        for el in iter {
            let (key, value) = el?;
            // The range is unbounded on the right and the group size for the highest level is MAX,
            // so we need to check that we are not iterating over the next field id
            if key.field_id != self.field_id {
                return Ok(ControlFlow::Break(()));
            }
            let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
                value.bitmap_bytes,
                candidates,
            )?;
            if !docids_in_common.is_empty() {
                let cf = self.iterate(
                    &docids_in_common,
                    level - 1,
                    key.left_bound,
                    value.size as usize,
                )?;
                match cf {
                    ControlFlow::Continue(_) => (),
                    ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
                }
            }
        }
        Ok(ControlFlow::Continue(()))
    }
}

#[cfg(test)]
mod tests {
    use std::ops::ControlFlow;

    use heed::BytesDecode;
    use roaring::RoaringBitmap;

    use super::lexicographically_iterate_over_facet_distribution;
    use crate::heed_codec::facet::OrderedF64Codec;
    use crate::milli_snap;
    use crate::search::facet::tests::{get_random_looking_index, get_simple_index};

    #[test]
    fn filter_distribution_all() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (0..=255).collect::<RoaringBitmap>();
            let mut results = String::new();
            lexicographically_iterate_over_facet_distribution(
                &txn,
                index.content,
                0,
                &candidates,
                |facet, count, _| {
                    let facet = OrderedF64Codec::bytes_decode(facet).unwrap();
                    results.push_str(&format!("{facet}: {count}\n"));
                    Ok(ControlFlow::Continue(()))
                },
            )
            .unwrap();
            milli_snap!(results, i);

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_distribution_all_stop_early() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (0..=255).collect::<RoaringBitmap>();
            let mut results = String::new();
            let mut nbr_facets = 0;
            lexicographically_iterate_over_facet_distribution(
                &txn,
                index.content,
                0,
                &candidates,
                |facet, count, _| {
                    let facet = OrderedF64Codec::bytes_decode(facet).unwrap();
                    if nbr_facets == 100 {
                        Ok(ControlFlow::Break(()))
                    } else {
                        nbr_facets += 1;
                        results.push_str(&format!("{facet}: {count}\n"));
                        Ok(ControlFlow::Continue(()))
                    }
                },
            )
            .unwrap();
            milli_snap!(results, i);

            txn.commit().unwrap();
        }
    }
}

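A standalone, runnable sketch (not part of the diff, with made-up names) of the field-order trick used by `LevelEntry` above: deriving `Ord` compares fields from top to bottom, so a `BinaryHeap` pops entries by `count` first and breaks ties with `Reverse(level)`.

use std::cmp::Reverse;
use std::collections::BinaryHeap;

#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
struct Entry {
    count: u64,
    level: Reverse<u8>,
}

fn main() {
    let mut heap = BinaryHeap::new();
    heap.push(Entry { count: 4, level: Reverse(0) });
    heap.push(Entry { count: 7, level: Reverse(2) });
    heap.push(Entry { count: 7, level: Reverse(1) });

    // Pops: count 7 / level 1, then count 7 / level 2, then count 4.
    while let Some(entry) = heap.pop() {
        println!("{entry:?}");
    }
}
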
crates/milli/src/search/facet/facet_range_search.rs (new file, 688 lines)
@@ -0,0 +1,688 @@
use std::ops::{Bound, RangeBounds};

use heed::BytesEncode;
use roaring::RoaringBitmap;

use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
use crate::heed_codec::facet::{
    FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, Result};

/// Find all the document ids for which the given field contains a value contained within
/// the two bounds.
pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BoundCodec>, FacetGroupValueCodec>,
    field_id: u16,
    left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
    right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
    universe: Option<&RoaringBitmap>,
    docids: &mut RoaringBitmap,
) -> Result<()>
where
    BoundCodec: for<'a> BytesEncode<'a>,
    for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Sized,
{
    let inner;
    let left = match left {
        Bound::Included(left) => {
            inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?;
            Bound::Included(inner.as_ref())
        }
        Bound::Excluded(left) => {
            inner = BoundCodec::bytes_encode(left).map_err(heed::Error::Encoding)?;
            Bound::Excluded(inner.as_ref())
        }
        Bound::Unbounded => Bound::Unbounded,
    };
    let inner;
    let right = match right {
        Bound::Included(right) => {
            inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?;
            Bound::Included(inner.as_ref())
        }
        Bound::Excluded(right) => {
            inner = BoundCodec::bytes_encode(right).map_err(heed::Error::Encoding)?;
            Bound::Excluded(inner.as_ref())
        }
        Bound::Unbounded => Bound::Unbounded,
    };
    let db = db.remap_types::<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>();
    let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids };
    let highest_level = get_highest_level(rtxn, db, field_id)?;

    if let Some(starting_left_bound) =
        get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?
    {
        let rightmost_bound =
            Bound::Included(get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded
        let group_size = usize::MAX;
        f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?;
        Ok(())
    } else {
        Ok(())
    }
}

/// Fetch the document ids that have a facet with a value between the two given bounds
struct FacetRangeSearch<'t, 'b, 'bitmap> {
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
    field_id: u16,
    left: Bound<&'b [u8]>,
    right: Bound<&'b [u8]>,
    /// The subset of documents ids that are useful for this search.
    /// Great performance optimizations can be achieved by only fetching values matching this subset.
    universe: Option<&'bitmap RoaringBitmap>,
    docids: &'bitmap mut RoaringBitmap,
}

impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
    fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> {
        let left_key =
            FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound };
        let iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size);
        for el in iter {
            let (key, value) = el?;
            // the right side of the iter range is unbounded, so we need to make sure that we are not iterating
            // on the next field id
            if key.field_id != self.field_id {
                return Ok(());
            }
            let should_skip = {
                match self.left {
                    Bound::Included(left) => left > key.left_bound,
                    Bound::Excluded(left) => left >= key.left_bound,
                    Bound::Unbounded => false,
                }
            };
            if should_skip {
                continue;
            }
            let should_stop = {
                match self.right {
                    Bound::Included(right) => right < key.left_bound,
                    Bound::Excluded(right) => right <= key.left_bound,
                    Bound::Unbounded => false,
                }
            };
            if should_stop {
                break;
            }

            if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) {
                *self.docids |= match self.universe {
                    Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
                        value.bitmap_bytes,
                        universe,
                    )?,
                    None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?,
                };
            }
        }
        Ok(())
    }

    /// Recursive part of the algorithm for level > 0.
    ///
    /// It works by visiting a slice of a level and checking whether the range associated
    /// with each visited element is contained within the bounds.
    ///
    /// 1. So long as the element's range is less than the left bound, we do nothing and keep iterating
    /// 2. If the element's range is fully contained by the bounds, then all of its docids are added to
    ///    the roaring bitmap.
    /// 3. If the element's range merely intersects the bounds, then we call the algorithm recursively
    ///    on the children of the element from the level below.
    /// 4. If the element's range is greater than the right bound, we do nothing and stop iterating.
    ///    Note that the right bound is found through either the `left_bound` of the *next* element,
    ///    or from the `rightmost_bound` argument.
    ///
    /// ## Arguments
    /// - `level`: the level being visited
    /// - `starting_left_bound`: the left_bound of the first element to visit
    /// - `rightmost_bound`: the right bound of the last element that should be visited
    /// - `group_size`: the number of elements that should be visited
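    ///
    /// For intuition, with search bounds `4 ..= 42`: a group covering keys `[7, 20)` falls
    /// entirely within the bounds, so its whole bitmap is taken without recursing (case 2);
    /// a group covering `[40, 50)` straddles the right bound, so we recurse into its
    /// children on the level below (case 3); and a group covering `[50, 60)` lies entirely
    /// past the right bound, so we stop (case 4).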
    fn run(
        &mut self,
        level: u8,
        starting_left_bound: &'t [u8],
        rightmost_bound: Bound<&'t [u8]>,
        group_size: usize,
    ) -> Result<()> {
        if level == 0 {
            return self.run_level_0(starting_left_bound, group_size);
        }

        let left_key =
            FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound };
        let mut iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size);

        // We iterate over the range while keeping in memory the previous value
        let (mut previous_key, mut previous_value) = iter.next().unwrap()?;
        for el in iter {
            let (next_key, next_value) = el?;
            // the right of the iter range is potentially unbounded (e.g. if `group_size` is usize::MAX),
            // so we need to make sure that we are not iterating on the next field id
            if next_key.field_id != self.field_id {
                break;
            }
            // now, do we skip, stop, or visit?
            let should_skip = {
                match self.left {
                    Bound::Included(left) => left >= next_key.left_bound,
                    Bound::Excluded(left) => left >= next_key.left_bound,
                    Bound::Unbounded => false,
                }
            };
            if should_skip {
                previous_key = next_key;
                previous_value = next_value;
                continue;
            }

            // should we stop?
            // We should if the search range doesn't include any
            // element from the previous key or its successors
            let should_stop = {
                match self.right {
                    Bound::Included(right) => right < previous_key.left_bound,
                    Bound::Excluded(right) => right <= previous_key.left_bound,
                    Bound::Unbounded => false,
                }
            };
            if should_stop {
                return Ok(());
            }
            // should we take the whole thing, without recursing down?
            let should_take_whole_group = {
                let left_condition = match self.left {
                    Bound::Included(left) => previous_key.left_bound >= left,
                    Bound::Excluded(left) => previous_key.left_bound > left,
                    Bound::Unbounded => true,
                };
                let right_condition = match self.right {
                    Bound::Included(right) => next_key.left_bound <= right,
                    Bound::Excluded(right) => next_key.left_bound <= right,
                    Bound::Unbounded => true,
                };
                left_condition && right_condition
            };
            if should_take_whole_group {
                *self.docids |= match self.universe {
                    Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
                        previous_value.bitmap_bytes,
                        universe,
                    )?,
                    None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
                };
                previous_key = next_key;
                previous_value = next_value;
                continue;
            }
            // from here, we should visit the children of the previous element and
            // call the function recursively

            let level = level - 1;
            let starting_left_bound = previous_key.left_bound;
            let rightmost_bound = Bound::Excluded(next_key.left_bound);
            let group_size = previous_value.size as usize;

            self.run(level, starting_left_bound, rightmost_bound, group_size)?;

            previous_key = next_key;
            previous_value = next_value;
        }
        // previous_key/previous_value are the last element's key/value

        // now, do we skip, stop, or visit?
        let should_skip = {
            match (self.left, rightmost_bound) {
                (Bound::Included(left), Bound::Included(right)) => left > right,
                (Bound::Included(left), Bound::Excluded(right)) => left >= right,
                (Bound::Excluded(left), Bound::Included(right) | Bound::Excluded(right)) => {
                    left >= right
                }
                (Bound::Unbounded, _) => false,
                (_, Bound::Unbounded) => false, // should never run?
            }
        };
        if should_skip {
            return Ok(());
        }

        // should we stop?
        // We should if the search range doesn't include any
        // element from the previous key or its successors
        let should_stop = {
            match self.right {
                Bound::Included(right) => right < previous_key.left_bound,
                Bound::Excluded(right) => right <= previous_key.left_bound,
                Bound::Unbounded => false,
            }
        };
        if should_stop {
            return Ok(());
        }
        // should we take the whole thing, without recursing down?
        let should_take_whole_group = {
            let left_condition = match self.left {
                Bound::Included(left) => previous_key.left_bound >= left,
                Bound::Excluded(left) => previous_key.left_bound > left,
                Bound::Unbounded => true,
            };
            let right_condition = match (self.right, rightmost_bound) {
                (Bound::Included(right), Bound::Included(rightmost)) => {
                    // we need to stay within the bound ..=right
                    // the element's range goes to ..=rightmost
                    // so the element fits entirely within the bound if rightmost <= right
                    rightmost <= right
                }
                (Bound::Included(right), Bound::Excluded(rightmost)) => {
                    // we need to stay within the bound ..=right
                    // the element's range goes to ..rightmost
                    // so the element fits entirely within the bound if rightmost <= right
                    rightmost <= right
                }
                (Bound::Excluded(right), Bound::Included(rightmost)) => {
                    // we need to stay within the bound ..right
                    // the element's range goes to ..=rightmost
                    // so the element fits entirely within the bound if rightmost < right
                    rightmost < right
                }
                (Bound::Excluded(right), Bound::Excluded(rightmost)) => {
                    // we need to stay within the bound ..right
                    // the element's range goes to ..rightmost
                    // so the element fits entirely within the bound if rightmost <= right
                    rightmost <= right
                }
                (Bound::Unbounded, _) => {
                    // we need to stay within the bound ..inf
                    // so the element always fits entirely within the bound
                    true
                }
                (_, Bound::Unbounded) => {
                    // we need to stay within a finite bound
                    // but the element's range goes to ..inf
                    // so the element never fits entirely within the bound
                    false
                }
            };
            left_condition && right_condition
        };
        if should_take_whole_group {
            *self.docids |= match self.universe {
                Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
                    previous_value.bitmap_bytes,
                    universe,
                )?,
                None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
            };
        } else {
            let level = level - 1;
            let starting_left_bound = previous_key.left_bound;
            let group_size = previous_value.size as usize;

            self.run(level, starting_left_bound, rightmost_bound, group_size)?;
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests {
    use std::ops::Bound;

    use roaring::RoaringBitmap;

    use super::find_docids_of_facet_within_bounds;
    use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
    use crate::milli_snap;
    use crate::search::facet::tests::{
        get_random_looking_index, get_random_looking_index_with_multiple_field_ids,
        get_simple_index, get_simple_index_with_multiple_field_ids,
    };
    use crate::snapshot_tests::display_bitmap;

    #[test]
    fn random_looking_index_snap() {
        let index = get_random_looking_index();
        milli_snap!(format!("{index}"), @"3256c76a7c1b768a013e78d5fa6e9ff9");
    }

    #[test]
    fn random_looking_index_with_multiple_field_ids_snap() {
        let index = get_random_looking_index_with_multiple_field_ids();
        milli_snap!(format!("{index}"), @"c3e5fe06a8f1c404ed4935b32c90a89b");
    }

    #[test]
    fn simple_index_snap() {
        let index = get_simple_index();
        milli_snap!(format!("{index}"), @"5dbfa134cc44abeb3ab6242fc182e48e");
    }

    #[test]
    fn simple_index_with_multiple_field_ids_snap() {
        let index = get_simple_index_with_multiple_field_ids();
        milli_snap!(format!("{index}"), @"a4893298218f682bc76357f46777448c");
    }

    #[test]
    fn filter_range_increasing() {
        let indexes = [
            get_simple_index(),
            get_random_looking_index(),
            get_simple_index_with_multiple_field_ids(),
            get_random_looking_index_with_multiple_field_ids(),
        ];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let mut results = String::new();
            for i in 0..=255 {
                let i = i as f64;
                let start = Bound::Included(0.);
                let end = Bound::Included(i);
                let mut docids = RoaringBitmap::new();
                find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                    0,
                    &start,
                    &end,
                    None,
                    &mut docids,
                )
                .unwrap();
                #[allow(clippy::format_push_string)]
                results.push_str(&format!("0 <= . <= {i} : {}\n", display_bitmap(&docids)));
            }
            milli_snap!(results, format!("included_{i}"));
            let mut results = String::new();
            for i in 0..=255 {
                let i = i as f64;
                let start = Bound::Excluded(0.);
                let end = Bound::Excluded(i);
                let mut docids = RoaringBitmap::new();
                find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                    0,
                    &start,
                    &end,
                    None,
                    &mut docids,
                )
                .unwrap();
                #[allow(clippy::format_push_string)]
                results.push_str(&format!("0 < . < {i} : {}\n", display_bitmap(&docids)));
            }
            milli_snap!(results, format!("excluded_{i}"));
            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_range_decreasing() {
        let indexes = [
            get_simple_index(),
            get_random_looking_index(),
            get_simple_index_with_multiple_field_ids(),
            get_random_looking_index_with_multiple_field_ids(),
        ];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();

            let mut results = String::new();

            for i in (0..=255).rev() {
                let i = i as f64;
                let start = Bound::Included(i);
                let end = Bound::Included(255.);
                let mut docids = RoaringBitmap::new();
                find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                    0,
                    &start,
                    &end,
                    None,
                    &mut docids,
                )
                .unwrap();
                results.push_str(&format!("{i} <= . <= 255 : {}\n", display_bitmap(&docids)));
            }

            milli_snap!(results, format!("included_{i}"));

            let mut results = String::new();

            for i in (0..=255).rev() {
                let i = i as f64;
                let start = Bound::Excluded(i);
                let end = Bound::Excluded(255.);
                let mut docids = RoaringBitmap::new();
                find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                    0,
                    &start,
                    &end,
                    None,
                    &mut docids,
                )
                .unwrap();
                results.push_str(&format!("{i} < . < 255 : {}\n", display_bitmap(&docids)));
            }

            milli_snap!(results, format!("excluded_{i}"));

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_range_pinch() {
        let indexes = [
            get_simple_index(),
            get_random_looking_index(),
            get_simple_index_with_multiple_field_ids(),
            get_random_looking_index_with_multiple_field_ids(),
        ];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();

            let mut results = String::new();

            for i in (0..=128).rev() {
                let i = i as f64;
                let start = Bound::Included(i);
                let end = Bound::Included(255. - i);
                let mut docids = RoaringBitmap::new();
                find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                    0,
                    &start,
                    &end,
                    None,
                    &mut docids,
                )
                .unwrap();
                results.push_str(&format!(
                    "{i} <= . <= {r} : {docids}\n",
                    r = 255. - i,
                    docids = display_bitmap(&docids)
                ));
            }

            milli_snap!(results, format!("included_{i}"));

            let mut results = String::new();

            for i in (0..=128).rev() {
                let i = i as f64;
                let start = Bound::Excluded(i);
                let end = Bound::Excluded(255. - i);
                let mut docids = RoaringBitmap::new();
                find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                    0,
                    &start,
                    &end,
                    None,
                    &mut docids,
                )
                .unwrap();
                results.push_str(&format!(
                    "{i} < . < {r} {docids}\n",
                    r = 255. - i,
                    docids = display_bitmap(&docids)
                ));
            }

            milli_snap!(results, format!("excluded_{i}"));

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_range_unbounded() {
        let indexes = [
            get_simple_index(),
            get_random_looking_index(),
            get_simple_index_with_multiple_field_ids(),
            get_random_looking_index_with_multiple_field_ids(),
        ];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let mut results = String::new();
            for i in 0..=255 {
                let i = i as f64;
                let start = Bound::Included(i);
                let end = Bound::Unbounded;
                let mut docids = RoaringBitmap::new();
                find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
                    0,
                    &start,
                    &end,
                    None,
                    &mut docids,
                )
                .unwrap();
                #[allow(clippy::format_push_string)]
                results.push_str(&format!(">= {i}: {}\n", display_bitmap(&docids)));
            }
            milli_snap!(results, format!("start_from_included_{i}"));
            let mut results = String::new();
            for i in 0..=255 {
                let i = i as f64;
                let start = Bound::Unbounded;
                let end = Bound::Included(i);
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
#[allow(clippy::format_push_string)]
|
||||
results.push_str(&format!("<= {i}: {}\n", display_bitmap(&docids)));
|
||||
}
|
||||
milli_snap!(results, format!("end_at_included_{i}"));
|
||||
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&Bound::Unbounded,
|
||||
&Bound::Unbounded,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
milli_snap!(
|
||||
&format!("all field_id 0: {}\n", display_bitmap(&docids)),
|
||||
format!("unbounded_field_id_0_{i}")
|
||||
);
|
||||
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
1,
|
||||
&Bound::Unbounded,
|
||||
&Bound::Unbounded,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
milli_snap!(
|
||||
&format!("all field_id 1: {}\n", display_bitmap(&docids)),
|
||||
format!("unbounded_field_id_1_{i}")
|
||||
);
|
||||
|
||||
drop(txn);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn filter_range_exact() {
|
||||
let indexes = [
|
||||
get_simple_index(),
|
||||
get_random_looking_index(),
|
||||
get_simple_index_with_multiple_field_ids(),
|
||||
get_random_looking_index_with_multiple_field_ids(),
|
||||
];
|
||||
for (i, index) in indexes.iter().enumerate() {
|
||||
let txn = index.env.read_txn().unwrap();
|
||||
let mut results_0 = String::new();
|
||||
let mut results_1 = String::new();
|
||||
for i in 0..=255 {
|
||||
let i = i as f64;
|
||||
let start = Bound::Included(i);
|
||||
let end = Bound::Included(i);
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
#[allow(clippy::format_push_string)]
|
||||
results_0.push_str(&format!("{i}: {}\n", display_bitmap(&docids)));
|
||||
|
||||
let mut docids = RoaringBitmap::new();
|
||||
find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
&txn,
|
||||
index.content.remap_key_type::<FacetGroupKeyCodec<OrderedF64Codec>>(),
|
||||
1,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
#[allow(clippy::format_push_string)]
|
||||
results_1.push_str(&format!("{i}: {}\n", display_bitmap(&docids)));
|
||||
}
|
||||
milli_snap!(results_0, format!("field_id_0_exact_{i}"));
|
||||
milli_snap!(results_1, format!("field_id_1_exact_{i}"));
|
||||
|
||||
drop(txn);
|
||||
}
|
||||
}
|
||||
}
|
||||
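The four `filter_range_*` tests above all drive the same primitive with different `std::ops::Bound` combinations. As a minimal, standard-library-only sketch (no milli API involved) of the containment semantics the snapshots encode:

```rust
use std::ops::{Bound, RangeBounds};

fn main() {
    // Included/Excluded bounds behave like <= / < comparisons:
    // `0 <= x <= 10` matches both endpoints, `0 < x < 10` matches neither.
    let included = (Bound::Included(0.0_f64), Bound::Included(10.0_f64));
    let excluded = (Bound::Excluded(0.0_f64), Bound::Excluded(10.0_f64));
    assert!(included.contains(&10.0));
    assert!(!excluded.contains(&10.0));

    // An Unbounded side matches everything on that side, which is what
    // the `filter_range_unbounded` snapshots record.
    let from_five = (Bound::Included(5.0_f64), Bound::Unbounded);
    assert!(from_five.contains(&1e9));
}
```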
230
crates/milli/src/search/facet/facet_sort_ascending.rs
Normal file
@@ -0,0 +1,230 @@
use heed::Result;
use roaring::RoaringBitmap;

use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{
    FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;

/// Return an iterator which iterates over the given candidate documents in
/// ascending order of their facet value for the given field id.
///
/// The documents returned by the iterator are grouped by the facet values that
/// determined their rank. For example, given the documents:
///
/// ```text
/// 0: { "colour": ["blue", "green"] }
/// 1: { "colour": ["blue", "red"] }
/// 2: { "colour": ["orange", "red"] }
/// 3: { "colour": ["green", "red"] }
/// 4: { "colour": ["blue", "orange", "red"] }
/// ```
/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
/// over the following elements:
/// ```text
/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
/// [3] // same for "green"
/// [2] // same for "orange"
/// END
/// ```
/// Note that once a document id is returned by the iterator, it is never returned again.
pub fn ascending_facet_sort<'t>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
    let highest_level = get_highest_level(rtxn, db, field_id)?;
    if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
        let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
        let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX);

        Ok(itertools::Either::Left(AscendingFacetSort {
            rtxn,
            db,
            field_id,
            stack: vec![(candidates, iter)],
        }))
    } else {
        Ok(itertools::Either::Right(std::iter::empty()))
    }
}

struct AscendingFacetSort<'t, 'e> {
    rtxn: &'t heed::RoTxn<'e>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    #[allow(clippy::type_complexity)]
    stack: Vec<(
        RoaringBitmap,
        std::iter::Take<heed::RoRange<'t, FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>>,
    )>,
}

impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
    type Item = Result<(RoaringBitmap, &'t [u8])>;

    fn next(&mut self) -> Option<Self::Item> {
        'outer: loop {
            let (documents_ids, deepest_iter) = self.stack.last_mut()?;
            for result in deepest_iter {
                let (
                    FacetGroupKey { level, left_bound, field_id },
                    FacetGroupValue { size: group_size, mut bitmap },
                ) = result.unwrap();
                // The range is unbounded on the right and the group size for the highest level is MAX,
                // so we need to check that we are not iterating over the next field id
                if field_id != self.field_id {
                    return None;
                }

                // If the last iterator found an empty set of documents it means
                // that we found all the documents in the sub level iterations already,
                // we can pop this level iterator.
                if documents_ids.is_empty() {
                    // break out of the for loop into the end of the 'outer loop, which
                    // pops the stack
                    break;
                }

                bitmap &= &*documents_ids;
                if !bitmap.is_empty() {
                    *documents_ids -= &bitmap;

                    if level == 0 {
                        // Since the level is 0, the left_bound is the exact value.
                        return Some(Ok((bitmap, left_bound)));
                    }
                    let starting_key_below =
                        FacetGroupKey { field_id: self.field_id, level: level - 1, left_bound };
                    let iter = match self.db.range(self.rtxn, &(starting_key_below..)) {
                        Ok(iter) => iter,
                        Err(e) => return Some(Err(e)),
                    }
                    .take(group_size as usize);

                    self.stack.push((bitmap, iter));
                    continue 'outer;
                }
            }
            self.stack.pop();
        }
    }
}

#[cfg(test)]
mod tests {
    use roaring::RoaringBitmap;

    use crate::milli_snap;
    use crate::search::facet::facet_sort_ascending::ascending_facet_sort;
    use crate::search::facet::tests::{
        get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids,
        get_simple_index, get_simple_string_index_with_multiple_field_ids,
    };
    use crate::snapshot_tests::display_bitmap;

    #[test]
    fn filter_sort_ascending() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (200..=300).collect::<RoaringBitmap>();
            let mut results = String::new();
            let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            milli_snap!(results, i);

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_sort_ascending_multiple_field_ids() {
        let indexes = [
            get_simple_string_index_with_multiple_field_ids(),
            get_random_looking_string_index_with_multiple_field_ids(),
        ];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (200..=300).collect::<RoaringBitmap>();
            let mut results = String::new();
            let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            milli_snap!(results, format!("{i}-0"));

            let mut results = String::new();
            let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            milli_snap!(results, format!("{i}-1"));

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_sort_ascending_with_no_candidates() {
        let indexes = [
            get_simple_string_index_with_multiple_field_ids(),
            get_random_looking_string_index_with_multiple_field_ids(),
        ];
        for index in indexes {
            let txn = index.env.read_txn().unwrap();
            let candidates = RoaringBitmap::new();
            let mut results = String::new();
            let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            assert!(results.is_empty());

            let mut results = String::new();
            let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            assert!(results.is_empty());

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_sort_ascending_with_inexisting_field_id() {
        let indexes = [
            get_simple_string_index_with_multiple_field_ids(),
            get_random_looking_string_index_with_multiple_field_ids(),
        ];
        for index in indexes {
            let txn = index.env.read_txn().unwrap();
            let candidates = RoaringBitmap::new();
            let mut results = String::new();
            let iter = ascending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            assert!(results.is_empty());

            txn.commit().unwrap();
        }
    }
}
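To make the shape of the grouped iterator concrete, here is a hypothetical caller that flattens it into a ranked list of document ids. `ranked_docids` is an illustration, not part of this diff; it reuses only the types declared in the file above, and `rtxn`, `db`, and `field_id` stand in for values obtained from an open milli index:

```rust
use heed::Result;
use roaring::RoaringBitmap;

// Sketch only: the item type matches `ascending_facet_sort` above.
fn ranked_docids<'t>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: RoaringBitmap,
) -> Result<Vec<u32>> {
    let mut ranked = Vec::new();
    for group in ascending_facet_sort(rtxn, db, field_id, candidates)? {
        let (docids, _left_bound) = group?;
        // Groups arrive in ascending facet-value order; within a group the
        // bitmap yields docids in ascending id order.
        ranked.extend(docids);
    }
    Ok(ranked)
}
```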
244
crates/milli/src/search/facet/facet_sort_descending.rs
Normal file
@@ -0,0 +1,244 @@
use std::ops::Bound;

use heed::Result;
use roaring::RoaringBitmap;

use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
use crate::heed_codec::facet::{
    FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;

/// See the documentation for [`ascending_facet_sort`](super::ascending_facet_sort).
///
/// This function does the same thing, but in the opposite order.
pub fn descending_facet_sort<'t>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
    let highest_level = get_highest_level(rtxn, db, field_id)?;
    if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
        let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
        let last_bound = get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap();
        let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound };
        let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX);
        Ok(itertools::Either::Left(DescendingFacetSort {
            rtxn,
            db,
            field_id,
            stack: vec![(candidates, iter, Bound::Included(last_bound))],
        }))
    } else {
        Ok(itertools::Either::Right(std::iter::empty()))
    }
}

struct DescendingFacetSort<'t> {
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    #[allow(clippy::type_complexity)]
    stack: Vec<(
        RoaringBitmap,
        std::iter::Take<
            heed::RoRevRange<'t, FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
        >,
        Bound<&'t [u8]>,
    )>,
}

impl<'t> Iterator for DescendingFacetSort<'t> {
    type Item = Result<(RoaringBitmap, &'t [u8])>;

    fn next(&mut self) -> Option<Self::Item> {
        'outer: loop {
            let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?;
            for result in deepest_iter.by_ref() {
                let (
                    FacetGroupKey { level, left_bound, field_id },
                    FacetGroupValue { size: group_size, mut bitmap },
                ) = result.unwrap();
                // The range is unbounded on the right and the group size for the highest level is MAX,
                // so we need to check that we are not iterating over the next field id
                if field_id != self.field_id {
                    return None;
                }
                // If the last iterator found an empty set of documents it means
                // that we found all the documents in the sub level iterations already,
                // we can pop this level iterator.
                if documents_ids.is_empty() {
                    break;
                }

                bitmap &= &*documents_ids;
                if !bitmap.is_empty() {
                    *documents_ids -= &bitmap;

                    if level == 0 {
                        // Since we're at the level 0 the left_bound is the exact value.
                        return Some(Ok((bitmap, left_bound)));
                    }
                    let starting_key_below =
                        FacetGroupKey { field_id, level: level - 1, left_bound };

                    let end_key_below = match *right_bound {
                        Bound::Included(right) => Bound::Included(FacetGroupKey {
                            field_id,
                            level: level - 1,
                            left_bound: right,
                        }),
                        Bound::Excluded(right) => Bound::Excluded(FacetGroupKey {
                            field_id,
                            level: level - 1,
                            left_bound: right,
                        }),
                        Bound::Unbounded => Bound::Unbounded,
                    };
                    let prev_right_bound = *right_bound;
                    *right_bound = Bound::Excluded(left_bound);
                    let iter = match self
                        .db
                        .remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
                        .rev_range(self.rtxn, &(Bound::Included(starting_key_below), end_key_below))
                    {
                        Ok(iter) => iter,
                        Err(e) => return Some(Err(e)),
                    }
                    .take(group_size as usize);

                    self.stack.push((bitmap, iter, prev_right_bound));
                    continue 'outer;
                }
                *right_bound = Bound::Excluded(left_bound);
            }
            self.stack.pop();
        }
    }
}

#[cfg(test)]
mod tests {
    use roaring::RoaringBitmap;

    use crate::heed_codec::facet::FacetGroupKeyCodec;
    use crate::heed_codec::BytesRefCodec;
    use crate::milli_snap;
    use crate::search::facet::facet_sort_descending::descending_facet_sort;
    use crate::search::facet::tests::{
        get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids,
        get_simple_index, get_simple_index_with_multiple_field_ids,
        get_simple_string_index_with_multiple_field_ids,
    };
    use crate::snapshot_tests::display_bitmap;

    #[test]
    fn filter_sort_descending() {
        let indexes = [
            get_simple_index(),
            get_random_looking_index(),
            get_simple_index_with_multiple_field_ids(),
        ];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (200..=300).collect::<RoaringBitmap>();
            let mut results = String::new();
            let db = index.content.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
            let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            milli_snap!(results, i);

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_sort_descending_multiple_field_ids() {
        let indexes = [
            get_simple_string_index_with_multiple_field_ids(),
            get_random_looking_string_index_with_multiple_field_ids(),
        ];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (200..=300).collect::<RoaringBitmap>();
            let mut results = String::new();
            let db = index.content.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
            let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            milli_snap!(results, format!("{i}-0"));

            let mut results = String::new();

            let iter = descending_facet_sort(&txn, db, 1, candidates).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            milli_snap!(results, format!("{i}-1"));

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_sort_ascending_with_no_candidates() {
        let indexes = [
            get_simple_string_index_with_multiple_field_ids(),
            get_random_looking_string_index_with_multiple_field_ids(),
        ];
        for index in indexes {
            let txn = index.env.read_txn().unwrap();
            let candidates = RoaringBitmap::new();
            let mut results = String::new();
            let iter = descending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            assert!(results.is_empty());

            let mut results = String::new();
            let iter = descending_facet_sort(&txn, index.content, 1, candidates).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            assert!(results.is_empty());

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_sort_ascending_with_inexisting_field_id() {
        let indexes = [
            get_simple_string_index_with_multiple_field_ids(),
            get_random_looking_string_index_with_multiple_field_ids(),
        ];
        for index in indexes {
            let txn = index.env.read_txn().unwrap();
            let candidates = RoaringBitmap::new();
            let mut results = String::new();
            let iter = descending_facet_sort(&txn, index.content, 3, candidates.clone()).unwrap();
            for el in iter {
                let (docids, _) = el.unwrap();
                results.push_str(&display_bitmap(&docids));
                results.push('\n');
            }
            assert!(results.is_empty());

            txn.commit().unwrap();
        }
    }
}
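The `right_bound` bookkeeping in `DescendingFacetSort` is what keeps sibling sub-ranges disjoint while walking right to left: once a group starting at `left_bound` has been descended into, no later group at the same level may reach past it. A standalone toy model of that shrinking bound, independent of heed (the byte strings are made up):

```rust
use std::ops::Bound;

fn main() {
    // Pretend these are the left bounds of three level-n groups.
    let left_bounds: [&[u8]; 3] = [b"a", b"g", b"m"];
    let mut right: Bound<&[u8]> = Bound::Unbounded;
    for lb in left_bounds.iter().rev() {
        println!("descend into [{:?} .. {:?})", lb, right);
        right = Bound::Excluded(*lb); // the next sibling must stop before us
    }
}
```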
1211
crates/milli/src/search/facet/filter.rs
Normal file
File diff suppressed because it is too large
227
crates/milli/src/search/facet/mod.rs
Normal file
@@ -0,0 +1,227 @@
pub use facet_sort_ascending::ascending_facet_sort;
pub use facet_sort_descending::descending_facet_sort;
use heed::types::{Bytes, DecodeIgnore};
use heed::{BytesDecode, RoTxn};
use roaring::RoaringBitmap;

pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::filter::{BadGeoError, Filter};
pub use self::search::{FacetValueHit, SearchForFacetValues};
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
use crate::heed_codec::BytesRefCodec;
use crate::{Index, Result};

mod facet_distribution;
mod facet_distribution_iter;
mod facet_range_search;
mod facet_sort_ascending;
mod facet_sort_descending;
mod filter;
mod search;

fn facet_extreme_value<'t>(
    mut extreme_it: impl Iterator<Item = heed::Result<(RoaringBitmap, &'t [u8])>> + 't,
) -> Result<Option<f64>> {
    let extreme_value =
        if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) };
    let (_, extreme_value) = extreme_value?;
    OrderedF64Codec::bytes_decode(extreme_value)
        .map(Some)
        .map_err(heed::Error::Decoding)
        .map_err(Into::into)
}

pub fn facet_min_value<'t>(
    index: &'t Index,
    rtxn: &'t heed::RoTxn<'t>,
    field_id: u16,
    candidates: RoaringBitmap,
) -> Result<Option<f64>> {
    let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
    let it = ascending_facet_sort(rtxn, db, field_id, candidates)?;
    facet_extreme_value(it)
}

pub fn facet_max_value<'t>(
    index: &'t Index,
    rtxn: &'t heed::RoTxn<'t>,
    field_id: u16,
    candidates: RoaringBitmap,
) -> Result<Option<f64>> {
    let db = index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
    let it = descending_facet_sort(rtxn, db, field_id, candidates)?;
    facet_extreme_value(it)
}

/// Get the first facet value in the facet database
pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>(
    txn: &'t RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
    field_id: u16,
) -> heed::Result<Option<BoundCodec::DItem>>
where
    BoundCodec: BytesDecode<'t>,
{
    let mut level0prefix = vec![];
    level0prefix.extend_from_slice(&field_id.to_be_bytes());
    level0prefix.push(0);
    let mut level0_iter_forward =
        db.remap_types::<Bytes, DecodeIgnore>().prefix_iter(txn, level0prefix.as_slice())?;
    if let Some(first) = level0_iter_forward.next() {
        let (first_key, _) = first?;
        let first_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(first_key)
            .map_err(heed::Error::Decoding)?;
        Ok(Some(first_key.left_bound))
    } else {
        Ok(None)
    }
}

/// Get the last facet value in the facet database
pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>(
    txn: &'t RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
    field_id: u16,
) -> heed::Result<Option<BoundCodec::DItem>>
where
    BoundCodec: BytesDecode<'t>,
{
    let mut level0prefix = vec![];
    level0prefix.extend_from_slice(&field_id.to_be_bytes());
    level0prefix.push(0);
    let mut level0_iter_backward =
        db.remap_types::<Bytes, DecodeIgnore>().rev_prefix_iter(txn, level0prefix.as_slice())?;
    if let Some(last) = level0_iter_backward.next() {
        let (last_key, _) = last?;
        let last_key = FacetGroupKeyCodec::<BoundCodec>::bytes_decode(last_key)
            .map_err(heed::Error::Decoding)?;
        Ok(Some(last_key.left_bound))
    } else {
        Ok(None)
    }
}

/// Get the height of the highest level in the facet database
pub(crate) fn get_highest_level<'t, DC>(
    txn: &'t RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
    field_id: u16,
) -> heed::Result<u8> {
    let field_id_prefix = &field_id.to_be_bytes();
    Ok(db
        .remap_types::<Bytes, DecodeIgnore>()
        .rev_prefix_iter(txn, field_id_prefix)?
        .next()
        .map(|el| {
            let (key, _) = el.unwrap();
            let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key).unwrap();
            key.level
        })
        .unwrap_or(0))
}

#[cfg(test)]
pub(crate) mod tests {
    use rand::{Rng, SeedableRng};
    use roaring::RoaringBitmap;

    use crate::heed_codec::facet::OrderedF64Codec;
    use crate::heed_codec::StrRefCodec;
    use crate::update::facet::test_helpers::FacetIndex;

    pub fn get_simple_index() -> FacetIndex<OrderedF64Codec> {
        let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
        let mut txn = index.env.write_txn().unwrap();
        for i in 0..256u16 {
            let mut bitmap = RoaringBitmap::new();
            bitmap.insert(i as u32);
            index.insert(&mut txn, 0, &(i as f64), &bitmap);
        }
        txn.commit().unwrap();
        index
    }

    pub fn get_random_looking_index() -> FacetIndex<OrderedF64Codec> {
        let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
        let mut txn = index.env.write_txn().unwrap();
        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);

        for key in std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128) {
            let mut bitmap = RoaringBitmap::new();
            bitmap.insert(key);
            bitmap.insert(key + 100);
            index.insert(&mut txn, 0, &(key as f64), &bitmap);
        }
        txn.commit().unwrap();
        index
    }

    pub fn get_simple_index_with_multiple_field_ids() -> FacetIndex<OrderedF64Codec> {
        let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
        let mut txn = index.env.write_txn().unwrap();
        for fid in 0..2 {
            for i in 0..256u16 {
                let mut bitmap = RoaringBitmap::new();
                bitmap.insert(i as u32);
                index.insert(&mut txn, fid, &(i as f64), &bitmap);
            }
        }
        txn.commit().unwrap();
        index
    }

    pub fn get_random_looking_index_with_multiple_field_ids() -> FacetIndex<OrderedF64Codec> {
        let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
        let mut txn = index.env.write_txn().unwrap();

        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
        let keys =
            std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();
        for fid in 0..2 {
            for &key in &keys {
                let mut bitmap = RoaringBitmap::new();
                bitmap.insert(key);
                bitmap.insert(key + 100);
                index.insert(&mut txn, fid, &(key as f64), &bitmap);
            }
        }
        txn.commit().unwrap();
        index
    }

    pub fn get_simple_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> {
        let index = FacetIndex::<StrRefCodec>::new(4, 8, 5);
        let mut txn = index.env.write_txn().unwrap();
        for fid in 0..2 {
            for i in 0..256u16 {
                let mut bitmap = RoaringBitmap::new();
                bitmap.insert(i as u32);
                if i % 2 == 0 {
                    index.insert(&mut txn, fid, &format!("{i}").as_str(), &bitmap);
                } else {
                    index.insert(&mut txn, fid, &"", &bitmap);
                }
            }
        }
        txn.commit().unwrap();
        index
    }

    pub fn get_random_looking_string_index_with_multiple_field_ids() -> FacetIndex<StrRefCodec> {
        let index = FacetIndex::<StrRefCodec>::new(4, 8, 5);
        let mut txn = index.env.write_txn().unwrap();

        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
        let keys =
            std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();
        for fid in 0..2 {
            for &key in &keys {
                let mut bitmap = RoaringBitmap::new();
                bitmap.insert(key);
                bitmap.insert(key + 100);
                if key % 2 == 0 {
                    index.insert(&mut txn, fid, &format!("{key}").as_str(), &bitmap);
                } else {
                    index.insert(&mut txn, fid, &"", &bitmap);
                }
            }
        }
        txn.commit().unwrap();
        index
    }
}
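Taken together, `facet_min_value` and `facet_max_value` are just "first group of the ascending walk" and "first group of the descending walk". A hypothetical helper computing a facet's numeric range, illustrative only (the transaction and index plumbing come from elsewhere in milli):

```rust
// Sketch: combine the two public helpers above into a (min, max) pair.
fn facet_numeric_range(
    index: &Index,
    rtxn: &heed::RoTxn<'_>,
    field_id: u16,
    candidates: RoaringBitmap,
) -> Result<Option<(f64, f64)>> {
    let min = facet_min_value(index, rtxn, field_id, candidates.clone())?;
    let max = facet_max_value(index, rtxn, field_id, candidates)?;
    // Both are Some as soon as the field has at least one numeric value.
    Ok(min.zip(max))
}
```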
358
crates/milli/src/search/facet/search.rs
Normal file
@@ -0,0 +1,358 @@
use std::cmp::{Ordering, Reverse};
use std::collections::BinaryHeap;
use std::ops::ControlFlow;

use charabia::normalizer::NormalizerOption;
use charabia::{Language, Normalize, StrDetection, Token};
use fst::automaton::{Automaton, Str};
use fst::{IntoStreamer, Streamer};
use roaring::RoaringBitmap;
use tracing::error;

use crate::error::UserError;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::search::build_dfa;
use crate::{DocumentId, FieldId, OrderBy, Result, Search};

/// The maximum number of values per facet returned by the facet search route.
const DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET: usize = 100;

pub struct SearchForFacetValues<'a> {
    query: Option<String>,
    facet: String,
    search_query: Search<'a>,
    max_values: usize,
    is_hybrid: bool,
    locales: Option<Vec<Language>>,
}

impl<'a> SearchForFacetValues<'a> {
    pub fn new(
        facet: String,
        search_query: Search<'a>,
        is_hybrid: bool,
    ) -> SearchForFacetValues<'a> {
        SearchForFacetValues {
            query: None,
            facet,
            search_query,
            max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET,
            is_hybrid,
            locales: None,
        }
    }

    pub fn query(&mut self, query: impl Into<String>) -> &mut Self {
        self.query = Some(query.into());
        self
    }

    pub fn max_values(&mut self, max: usize) -> &mut Self {
        self.max_values = max;
        self
    }

    pub fn locales(&mut self, locales: Vec<Language>) -> &mut Self {
        self.locales = Some(locales);
        self
    }

    fn one_original_value_of(
        &self,
        field_id: FieldId,
        facet_str: &str,
        any_docid: DocumentId,
    ) -> Result<Option<String>> {
        let index = self.search_query.index;
        let rtxn = self.search_query.rtxn;
        let key: (FieldId, _, &str) = (field_id, any_docid, facet_str);
        Ok(index.field_id_docid_facet_strings.get(rtxn, &key)?.map(|v| v.to_owned()))
    }

    pub fn execute(&self) -> Result<Vec<FacetValueHit>> {
        let index = self.search_query.index;
        let rtxn = self.search_query.rtxn;

        let filterable_fields = index.filterable_fields(rtxn)?;
        if !filterable_fields.contains(&self.facet) {
            let (valid_fields, hidden_fields) =
                index.remove_hidden_fields(rtxn, filterable_fields)?;

            return Err(UserError::InvalidFacetSearchFacetName {
                field: self.facet.clone(),
                valid_fields,
                hidden_fields,
            }
            .into());
        }

        let fields_ids_map = index.fields_ids_map(rtxn)?;
        let fid = match fields_ids_map.id(&self.facet) {
            Some(fid) => fid,
            // we return an empty list of results when the attribute has been
            // set as filterable but no document contains this field (yet).
            None => return Ok(Vec::new()),
        };

        let fst = match self.search_query.index.facet_id_string_fst.get(rtxn, &fid)? {
            Some(fst) => fst,
            None => return Ok(Vec::new()),
        };

        let search_candidates = self.search_query.execute_for_candidates(
            self.is_hybrid
                || self
                    .search_query
                    .semantic
                    .as_ref()
                    .and_then(|semantic| semantic.vector.as_ref())
                    .is_some(),
        )?;

        let mut results = match index.sort_facet_values_by(rtxn)?.get(&self.facet) {
            OrderBy::Lexicographic => ValuesCollection::by_lexicographic(self.max_values),
            OrderBy::Count => ValuesCollection::by_count(self.max_values),
        };

        match self.query.as_ref() {
            Some(query) => {
                let query = normalize_facet_string(query, self.locales.as_deref());
                let query = query.as_ref();

                let authorize_typos = self.search_query.index.authorize_typos(rtxn)?;
                let field_authorizes_typos =
                    !self.search_query.index.exact_attributes_ids(rtxn)?.contains(&fid);

                if authorize_typos && field_authorizes_typos {
                    let exact_words_fst = self.search_query.index.exact_words(rtxn)?;
                    if exact_words_fst.map_or(false, |fst| fst.contains(query)) {
                        if fst.contains(query) {
                            self.fetch_original_facets_using_normalized(
                                fid,
                                query,
                                query,
                                &search_candidates,
                                &mut results,
                            )?;
                        }
                    } else {
                        let one_typo = self.search_query.index.min_word_len_one_typo(rtxn)?;
                        let two_typos = self.search_query.index.min_word_len_two_typos(rtxn)?;

                        let is_prefix = true;
                        let automaton = if query.len() < one_typo as usize {
                            build_dfa(query, 0, is_prefix)
                        } else if query.len() < two_typos as usize {
                            build_dfa(query, 1, is_prefix)
                        } else {
                            build_dfa(query, 2, is_prefix)
                        };

                        let mut stream = fst.search(automaton).into_stream();
                        while let Some(facet_value) = stream.next() {
                            let value = std::str::from_utf8(facet_value)?;
                            if self
                                .fetch_original_facets_using_normalized(
                                    fid,
                                    value,
                                    query,
                                    &search_candidates,
                                    &mut results,
                                )?
                                .is_break()
                            {
                                break;
                            }
                        }
                    }
                } else {
                    let automaton = Str::new(query).starts_with();
                    let mut stream = fst.search(automaton).into_stream();
                    while let Some(facet_value) = stream.next() {
                        let value = std::str::from_utf8(facet_value)?;
                        if self
                            .fetch_original_facets_using_normalized(
                                fid,
                                value,
                                query,
                                &search_candidates,
                                &mut results,
                            )?
                            .is_break()
                        {
                            break;
                        }
                    }
                }
            }
            None => {
                let prefix = FacetGroupKey { field_id: fid, level: 0, left_bound: "" };
                for result in index.facet_id_string_docids.prefix_iter(rtxn, &prefix)? {
                    let (FacetGroupKey { left_bound, .. }, FacetGroupValue { bitmap, .. }) =
                        result?;
                    let count = search_candidates.intersection_len(&bitmap);
                    if count != 0 {
                        let value = self
                            .one_original_value_of(fid, left_bound, bitmap.min().unwrap())?
                            .unwrap_or_else(|| left_bound.to_string());
                        if results.insert(FacetValueHit { value, count }).is_break() {
                            break;
                        }
                    }
                }
            }
        }

        Ok(results.into_sorted_vec())
    }

    fn fetch_original_facets_using_normalized(
        &self,
        fid: FieldId,
        value: &str,
        query: &str,
        search_candidates: &RoaringBitmap,
        results: &mut ValuesCollection,
    ) -> Result<ControlFlow<()>> {
        let index = self.search_query.index;
        let rtxn = self.search_query.rtxn;

        let database = index.facet_id_normalized_string_strings;
        let key = (fid, value);
        let original_strings = match database.get(rtxn, &key)? {
            Some(original_strings) => original_strings,
            None => {
                error!("the facet value is missing from the facet database: {key:?}");
                return Ok(ControlFlow::Continue(()));
            }
        };
        for original in original_strings {
            let key = FacetGroupKey { field_id: fid, level: 0, left_bound: original.as_str() };
            let docids = match index.facet_id_string_docids.get(rtxn, &key)? {
                Some(FacetGroupValue { bitmap, .. }) => bitmap,
                None => {
                    error!("the facet value is missing from the facet database: {key:?}");
                    return Ok(ControlFlow::Continue(()));
                }
            };
            let count = search_candidates.intersection_len(&docids);
            if count != 0 {
                let value = self
                    .one_original_value_of(fid, &original, docids.min().unwrap())?
                    .unwrap_or_else(|| query.to_string());
                if results.insert(FacetValueHit { value, count }).is_break() {
                    break;
                }
            }
        }

        Ok(ControlFlow::Continue(()))
    }
}
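The typo budget in `execute` is a step function of the query length, mirroring the main search pipeline. A minimal sketch of that selection rule; the thresholds come from the index settings (`min_word_len_one_typo` / `min_word_len_two_typos`), and the 5/9 values below are only illustrative defaults, not read from this diff:

```rust
// Illustrative only: milli reads these thresholds from the index settings.
fn allowed_typos(query_len: usize, one_typo: usize, two_typos: usize) -> u8 {
    if query_len < one_typo {
        0
    } else if query_len < two_typos {
        1
    } else {
        2
    }
}

fn main() {
    assert_eq!(allowed_typos(3, 5, 9), 0); // too short for any typo
    assert_eq!(allowed_typos(6, 5, 9), 1);
    assert_eq!(allowed_typos(12, 5, 9), 2);
}
```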

#[derive(Debug, Clone, serde::Serialize, PartialEq)]
pub struct FacetValueHit {
    /// The original facet value
    pub value: String,
    /// The number of documents associated to this facet
    pub count: u64,
}

impl PartialOrd for FacetValueHit {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for FacetValueHit {
    fn cmp(&self, other: &Self) -> Ordering {
        self.count.cmp(&other.count).then_with(|| self.value.cmp(&other.value))
    }
}

impl Eq for FacetValueHit {}

/// A wrapper type that collects the best facet values by
/// lexicographic or number of associated values.
enum ValuesCollection {
    /// Keeps the top values according to the lexicographic order.
    Lexicographic { max: usize, content: Vec<FacetValueHit> },
    /// Keeps the top values according to the number of values associated to them.
    ///
    /// Note that it is a max heap and we need to move the smallest counts
    /// at the top to be able to pop them when we reach the max_values limit.
    Count { max: usize, content: BinaryHeap<Reverse<FacetValueHit>> },
}

impl ValuesCollection {
    pub fn by_lexicographic(max: usize) -> Self {
        ValuesCollection::Lexicographic { max, content: Vec::new() }
    }

    pub fn by_count(max: usize) -> Self {
        ValuesCollection::Count { max, content: BinaryHeap::new() }
    }

    pub fn insert(&mut self, value: FacetValueHit) -> ControlFlow<()> {
        match self {
            ValuesCollection::Lexicographic { max, content } => {
                if content.len() < *max {
                    content.push(value);
                    if content.len() < *max {
                        return ControlFlow::Continue(());
                    }
                }
                ControlFlow::Break(())
            }
            ValuesCollection::Count { max, content } => {
                if content.len() == *max {
                    // Peeking gives us the worst value in the list as
                    // this is a max-heap and we reversed it.
                    let Some(mut peek) = content.peek_mut() else { return ControlFlow::Break(()) };
                    if peek.0.count <= value.count {
                        // Replace the current worst value in the heap
                        // with the new one we received that is better.
                        *peek = Reverse(value);
                    }
                } else {
                    content.push(Reverse(value));
                }
                ControlFlow::Continue(())
            }
        }
    }

    /// Returns the list of facet values in descending order of, either,
    /// count or lexicographic order of the value depending on the type.
    pub fn into_sorted_vec(self) -> Vec<FacetValueHit> {
        match self {
            ValuesCollection::Lexicographic { content, .. } => content.into_iter().collect(),
            ValuesCollection::Count { content, .. } => {
                // Convert the heap into a vec of hits by removing the Reverse wrapper.
                // Hits are already in the right order as they were reversed and they
                // are output in ascending order.
                content.into_sorted_vec().into_iter().map(|Reverse(hit)| hit).collect()
            }
        }
    }
}

fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
    let options = NormalizerOption { lossy: true, ..Default::default() };
    let mut detection = StrDetection::new(facet_string, locales);

    // Detect the language of the facet string only if several locales are explicitly provided.
    let language = match locales {
        Some(&[language]) => Some(language),
        Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
        _ => None,
    };

    let token = Token {
        lemma: std::borrow::Cow::Borrowed(facet_string),
        script: detection.script(),
        language,
        ..Default::default()
    };

    token.normalize(&options).lemma.into_owned()
}
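The `Count` variant of `ValuesCollection` is the classic top-k pattern: a bounded min-heap obtained by wrapping hits in `Reverse` inside Rust's max-heap `BinaryHeap`. A standalone sketch of the same idea over plain integers:

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Keep the `k` largest values seen, in O(n log k) time and O(k) space.
fn top_k(values: impl IntoIterator<Item = u64>, k: usize) -> Vec<u64> {
    let mut heap = BinaryHeap::new(); // max-heap of Reverse == min-heap
    for v in values {
        if heap.len() < k {
            heap.push(Reverse(v));
        } else if let Some(mut smallest) = heap.peek_mut() {
            if smallest.0 < v {
                *smallest = Reverse(v); // evict the current worst
            }
        }
    }
    // `into_sorted_vec` is ascending for Reverse, i.e. descending values.
    heap.into_sorted_vec().into_iter().map(|Reverse(v)| v).collect()
}

fn main() {
    assert_eq!(top_k([5, 1, 9, 3, 7], 3), vec![9, 7, 5]);
}
```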
@@ -0,0 +1,260 @@
---
source: milli/src/search/facet/facet_distribution_iter.rs
---
0: 1
1: 1
2: 1
3: 1
4: 1
5: 1
6: 1
7: 1
8: 1
9: 1
10: 1
11: 1
12: 1
13: 1
14: 1
15: 1
16: 1
17: 1
18: 1
19: 1
20: 1
21: 1
22: 1
23: 1
24: 1
25: 1
26: 1
27: 1
28: 1
29: 1
30: 1
31: 1
32: 1
33: 1
34: 1
35: 1
36: 1
37: 1
38: 1
39: 1
40: 1
41: 1
42: 1
43: 1
44: 1
45: 1
46: 1
47: 1
48: 1
49: 1
50: 1
51: 1
52: 1
53: 1
54: 1
55: 1
56: 1
57: 1
58: 1
59: 1
60: 1
61: 1
62: 1
63: 1
64: 1
65: 1
66: 1
67: 1
68: 1
69: 1
70: 1
71: 1
72: 1
73: 1
74: 1
75: 1
76: 1
77: 1
78: 1
79: 1
80: 1
81: 1
82: 1
83: 1
84: 1
85: 1
86: 1
87: 1
88: 1
89: 1
90: 1
91: 1
92: 1
93: 1
94: 1
95: 1
96: 1
97: 1
98: 1
99: 1
100: 1
101: 1
102: 1
103: 1
104: 1
105: 1
106: 1
107: 1
108: 1
109: 1
110: 1
111: 1
112: 1
113: 1
114: 1
115: 1
116: 1
117: 1
118: 1
119: 1
120: 1
121: 1
122: 1
123: 1
124: 1
125: 1
126: 1
127: 1
128: 1
129: 1
130: 1
131: 1
132: 1
133: 1
134: 1
135: 1
136: 1
137: 1
138: 1
139: 1
140: 1
141: 1
142: 1
143: 1
144: 1
145: 1
146: 1
147: 1
148: 1
149: 1
150: 1
151: 1
152: 1
153: 1
154: 1
155: 1
156: 1
157: 1
158: 1
159: 1
160: 1
161: 1
162: 1
163: 1
164: 1
165: 1
166: 1
167: 1
168: 1
169: 1
170: 1
171: 1
172: 1
173: 1
174: 1
175: 1
176: 1
177: 1
178: 1
179: 1
180: 1
181: 1
182: 1
183: 1
184: 1
185: 1
186: 1
187: 1
188: 1
189: 1
190: 1
191: 1
192: 1
193: 1
194: 1
195: 1
196: 1
197: 1
198: 1
199: 1
200: 1
201: 1
202: 1
203: 1
204: 1
205: 1
206: 1
207: 1
208: 1
209: 1
210: 1
211: 1
212: 1
213: 1
214: 1
215: 1
216: 1
217: 1
218: 1
219: 1
220: 1
221: 1
222: 1
223: 1
224: 1
225: 1
226: 1
227: 1
228: 1
229: 1
230: 1
231: 1
232: 1
233: 1
234: 1
235: 1
236: 1
237: 1
238: 1
239: 1
240: 1
241: 1
242: 1
243: 1
244: 1
245: 1
246: 1
247: 1
248: 1
249: 1
250: 1
251: 1
252: 1
253: 1
254: 1
255: 1

@@ -0,0 +1,105 @@
---
source: milli/src/search/facet/facet_distribution_iter.rs
---
3: 2
5: 2
6: 2
9: 2
10: 2
11: 2
14: 2
18: 2
19: 2
24: 2
26: 2
28: 2
29: 2
32: 2
33: 2
35: 2
36: 2
37: 2
38: 2
39: 2
41: 2
46: 2
47: 2
49: 2
52: 2
53: 2
55: 2
59: 2
61: 2
64: 2
68: 2
71: 2
74: 2
75: 2
76: 2
81: 2
83: 2
85: 2
86: 2
88: 2
90: 2
91: 2
92: 2
98: 2
99: 2
101: 2
102: 2
103: 2
107: 2
111: 2
115: 2
119: 2
123: 2
124: 2
130: 2
131: 2
133: 2
135: 2
136: 2
137: 2
139: 2
141: 2
143: 2
144: 2
147: 2
150: 2
156: 1
158: 1
160: 1
162: 1
163: 1
164: 1
167: 1
169: 1
173: 1
177: 1
178: 1
179: 1
181: 1
182: 1
186: 1
189: 1
192: 1
193: 1
195: 1
197: 1
205: 1
206: 1
207: 1
208: 1
209: 1
210: 1
216: 1
219: 1
220: 1
223: 1
226: 1
235: 1
236: 1
238: 1
243: 1

@@ -0,0 +1,104 @@
---
source: milli/src/search/facet/facet_distribution_iter.rs
---
0: 1
1: 1
2: 1
3: 1
4: 1
5: 1
6: 1
7: 1
8: 1
9: 1
10: 1
11: 1
12: 1
13: 1
14: 1
15: 1
16: 1
17: 1
18: 1
19: 1
20: 1
21: 1
22: 1
23: 1
24: 1
25: 1
26: 1
27: 1
28: 1
29: 1
30: 1
31: 1
32: 1
33: 1
34: 1
35: 1
36: 1
37: 1
38: 1
39: 1
40: 1
41: 1
42: 1
43: 1
44: 1
45: 1
46: 1
47: 1
48: 1
49: 1
50: 1
51: 1
52: 1
53: 1
54: 1
55: 1
56: 1
57: 1
58: 1
59: 1
60: 1
61: 1
62: 1
63: 1
64: 1
65: 1
66: 1
67: 1
68: 1
69: 1
70: 1
71: 1
72: 1
73: 1
74: 1
75: 1
76: 1
77: 1
78: 1
79: 1
80: 1
81: 1
82: 1
83: 1
84: 1
85: 1
86: 1
87: 1
88: 1
89: 1
90: 1
91: 1
92: 1
93: 1
94: 1
95: 1
96: 1
97: 1
98: 1
99: 1

@@ -0,0 +1,104 @@
---
source: milli/src/search/facet/facet_distribution_iter.rs
---
3: 2
5: 2
6: 2
9: 2
10: 2
11: 2
14: 2
18: 2
19: 2
24: 2
26: 2
28: 2
29: 2
32: 2
33: 2
35: 2
36: 2
37: 2
38: 2
39: 2
41: 2
46: 2
47: 2
49: 2
52: 2
53: 2
55: 2
59: 2
61: 2
64: 2
68: 2
71: 2
74: 2
75: 2
76: 2
81: 2
83: 2
85: 2
86: 2
88: 2
90: 2
91: 2
92: 2
98: 2
99: 2
101: 2
102: 2
103: 2
107: 2
111: 2
115: 2
119: 2
123: 2
124: 2
130: 2
131: 2
133: 2
135: 2
136: 2
137: 2
139: 2
141: 2
143: 2
144: 2
147: 2
150: 2
156: 1
158: 1
160: 1
162: 1
163: 1
164: 1
167: 1
169: 1
173: 1
177: 1
178: 1
179: 1
181: 1
182: 1
186: 1
189: 1
192: 1
193: 1
195: 1
197: 1
205: 1
206: 1
207: 1
208: 1
209: 1
210: 1
216: 1
219: 1
220: 1
223: 1
226: 1
235: 1
236: 1
238: 1

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
adf484f467a31ee9460dec539621938a
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c9939aa4977fcd4bfd35852e102dbc82
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
adf484f467a31ee9460dec539621938a
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c9939aa4977fcd4bfd35852e102dbc82
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
618738d28ff1386b6e93d171a5acb08f
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ffb62ab3eef55c2254c13dc0f4099849
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
618738d28ff1386b6e93d171a5acb08f
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ffb62ab3eef55c2254c13dc0f4099849
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9c25261cec7275cb5cfd85835904d023
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2f97f18c15e915853e4df879be6e1f63
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9c25261cec7275cb5cfd85835904d023
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2f97f18c15e915853e4df879be6e1f63
@@ -0,0 +1,260 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
0: []
1: []
2: []
3: []
4: []
5: []
6: []
7: []
8: []
9: []
10: []
11: []
12: []
13: []
14: []
15: []
16: []
17: []
18: []
19: []
20: []
21: []
22: []
23: []
24: []
25: []
26: []
27: []
28: []
29: []
30: []
31: []
32: []
33: []
34: []
35: []
36: []
37: []
38: []
39: []
40: []
41: []
42: []
43: []
44: []
45: []
46: []
47: []
48: []
49: []
50: []
51: []
52: []
53: []
54: []
55: []
56: []
57: []
58: []
59: []
60: []
61: []
62: []
63: []
64: []
65: []
66: []
67: []
68: []
69: []
70: []
71: []
72: []
73: []
74: []
75: []
76: []
77: []
78: []
79: []
80: []
81: []
82: []
83: []
84: []
85: []
86: []
87: []
88: []
89: []
90: []
91: []
92: []
93: []
94: []
95: []
96: []
97: []
98: []
99: []
100: []
101: []
102: []
103: []
104: []
105: []
106: []
107: []
108: []
109: []
110: []
111: []
112: []
113: []
114: []
115: []
116: []
117: []
118: []
119: []
120: []
121: []
122: []
123: []
124: []
125: []
126: []
127: []
128: []
129: []
130: []
131: []
132: []
133: []
134: []
135: []
136: []
137: []
138: []
139: []
140: []
141: []
142: []
143: []
144: []
145: []
146: []
147: []
148: []
149: []
150: []
151: []
152: []
153: []
154: []
155: []
156: []
157: []
158: []
159: []
160: []
161: []
162: []
163: []
164: []
165: []
166: []
167: []
168: []
169: []
170: []
171: []
172: []
173: []
174: []
175: []
176: []
177: []
178: []
179: []
180: []
181: []
182: []
183: []
184: []
185: []
186: []
187: []
188: []
189: []
190: []
191: []
192: []
193: []
194: []
195: []
196: []
197: []
198: []
199: []
200: []
201: []
202: []
203: []
204: []
205: []
206: []
207: []
208: []
209: []
210: []
211: []
212: []
213: []
214: []
215: []
216: []
217: []
218: []
219: []
220: []
221: []
222: []
223: []
224: []
225: []
226: []
227: []
228: []
229: []
230: []
231: []
232: []
233: []
234: []
235: []
236: []
237: []
238: []
239: []
240: []
241: []
242: []
243: []
244: []
245: []
246: []
247: []
248: []
249: []
250: []
251: []
252: []
253: []
254: []
255: []

@@ -0,0 +1,260 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
0: []
1: []
2: []
3: []
4: []
5: []
6: []
7: []
8: []
9: []
10: []
11: []
12: []
13: []
14: []
15: []
16: []
17: []
18: []
19: []
20: []
21: []
22: []
23: []
24: []
25: []
26: []
27: []
28: []
29: []
30: []
31: []
32: []
33: []
34: []
35: []
36: []
37: []
38: []
39: []
40: []
41: []
42: []
43: []
44: []
45: []
46: []
47: []
48: []
49: []
50: []
51: []
52: []
53: []
54: []
55: []
56: []
57: []
58: []
59: []
60: []
61: []
62: []
63: []
64: []
65: []
66: []
67: []
68: []
69: []
70: []
71: []
72: []
73: []
74: []
75: []
76: []
77: []
78: []
79: []
80: []
81: []
82: []
83: []
84: []
85: []
86: []
87: []
88: []
89: []
90: []
91: []
92: []
93: []
94: []
95: []
96: []
97: []
98: []
99: []
100: []
101: []
102: []
103: []
104: []
105: []
106: []
107: []
108: []
109: []
110: []
111: []
112: []
113: []
114: []
115: []
116: []
117: []
118: []
119: []
120: []
121: []
122: []
123: []
124: []
125: []
126: []
127: []
128: []
129: []
130: []
131: []
132: []
133: []
134: []
135: []
136: []
137: []
138: []
139: []
140: []
141: []
142: []
143: []
144: []
145: []
146: []
147: []
148: []
149: []
150: []
151: []
152: []
153: []
154: []
155: []
156: []
157: []
158: []
159: []
160: []
161: []
162: []
163: []
164: []
165: []
166: []
167: []
168: []
169: []
170: []
171: []
172: []
173: []
174: []
175: []
176: []
177: []
178: []
179: []
180: []
181: []
182: []
183: []
184: []
185: []
186: []
187: []
188: []
189: []
190: []
191: []
192: []
193: []
194: []
195: []
196: []
197: []
198: []
199: []
200: []
201: []
202: []
203: []
204: []
205: []
206: []
207: []
208: []
209: []
210: []
211: []
212: []
213: []
214: []
215: []
216: []
217: []
218: []
219: []
220: []
221: []
222: []
223: []
224: []
225: []
226: []
227: []
228: []
229: []
230: []
231: []
232: []
233: []
234: []
235: []
236: []
237: []
238: []
239: []
240: []
241: []
242: []
243: []
244: []
245: []
246: []
247: []
248: []
249: []
250: []
251: []
252: []
253: []
254: []
255: []

@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9c25261cec7275cb5cfd85835904d023
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2f97f18c15e915853e4df879be6e1f63
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
e849066b0e43d5c456f086c552372afc
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
8cc5e82995b0443b660f419bb9ea2e85
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
e849066b0e43d5c456f086c552372afc
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
8cc5e82995b0443b660f419bb9ea2e85
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
73b48005dc57b04f0939bbf21a68dab6
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
3c23d35627667dcee98468bfdecf09d3
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
73b48005dc57b04f0939bbf21a68dab6
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
3c23d35627667dcee98468bfdecf09d3
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c3f8b0b858a4820a508b25b42328cedd
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
38a42f5dc25e99d7a5312a63ce94ed30
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
c3f8b0b858a4820a508b25b42328cedd
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
38a42f5dc25e99d7a5312a63ce94ed30
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2049930204498b323885c91de88e44ca
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
7f0ca8c0fc6494f3dba46e8eb9699045
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
2049930204498b323885c91de88e44ca
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
7f0ca8c0fc6494f3dba46e8eb9699045
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ad8fc873747aaf1d3590e7ccab735985
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
7c6cc88697da835d33877b2df41fa1cb
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
ad8fc873747aaf1d3590e7ccab735985
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
7c6cc88697da835d33877b2df41fa1cb
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9a8c7343b4735d37704748cabcd51ff2
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
898a7dc25a1441bc3e7e2a8a62d99090
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
9a8c7343b4735d37704748cabcd51ff2
@@ -0,0 +1,4 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
898a7dc25a1441bc3e7e2a8a62d99090
@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ]

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 0: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ]

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 0: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ]

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 0: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ]

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 1: []

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 1: []

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 1: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, ]

@@ -0,0 +1,5 @@
---
source: milli/src/search/facet/facet_range_search.rs
---
all field_id 1: [3, 5, 6, 9, 10, 11, 14, 18, 19, 24, 26, 28, 29, 32, 33, 35, 36, 37, 38, 39, 41, 46, 47, 49, 52, 53, 55, 59, 61, 64, 68, 71, 74, 75, 76, 81, 83, 85, 86, 88, 90, 91, 92, 98, 99, 101, 102, 103, 105, 106, 107, 109, 110, 111, 114, 115, 118, 119, 123, 124, 126, 128, 129, 130, 131, 132, 133, 135, 136, 137, 138, 139, 141, 143, 144, 146, 147, 149, 150, 152, 153, 155, 156, 158, 159, 160, 161, 162, 163, 164, 167, 168, 169, 171, 173, 174, 175, 176, 177, 178, 179, 181, 182, 183, 185, 186, 188, 189, 190, 191, 192, 193, 195, 197, 198, 199, 201, 202, 203, 205, 206, 207, 208, 209, 210, 211, 215, 216, 219, 220, 223, 224, 226, 230, 231, 233, 235, 236, 237, 238, 239, 241, 243, 244, 247, 250, 256, 258, 260, 262, 263, 264, 267, 269, 273, 277, 278, 279, 281, 282, 286, 289, 292, 293, 295, 297, 305, 306, 307, 308, 309, 310, 316, 319, 320, 323, 326, 335, 336, 338, 343, ]

@@ -0,0 +1,60 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[200, ]
[201, ]
[202, ]
[203, ]
[204, ]
[205, ]
[206, ]
[207, ]
[208, ]
[209, ]
[210, ]
[211, ]
[212, ]
[213, ]
[214, ]
[215, ]
[216, ]
[217, ]
[218, ]
[219, ]
[220, ]
[221, ]
[222, ]
[223, ]
[224, ]
[225, ]
[226, ]
[227, ]
[228, ]
[229, ]
[230, ]
[231, ]
[232, ]
[233, ]
[234, ]
[235, ]
[236, ]
[237, ]
[238, ]
[239, ]
[240, ]
[241, ]
[242, ]
[243, ]
[244, ]
[245, ]
[246, ]
[247, ]
[248, ]
[249, ]
[250, ]
[251, ]
[252, ]
[253, ]
[254, ]
[255, ]

@@ -0,0 +1,54 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, ]
[202, ]
[203, ]
[207, ]
[211, ]
[215, ]
[219, ]
[223, ]
[224, ]
[230, ]
[231, ]
[233, ]
[235, ]
[236, ]
[237, ]
[239, ]
[241, ]
[243, ]
[244, ]
[247, ]
[250, ]
[256, ]
[258, ]
[260, ]
[262, ]
[263, ]
[264, ]
[267, ]
[269, ]
[273, ]
[277, ]
[278, ]
[279, ]
[281, ]
[282, ]
[286, ]
[289, ]
[292, ]
[293, ]
[295, ]
[297, ]
[205, ]
[206, ]
[208, ]
[209, ]
[210, ]
[216, ]
[220, ]
[226, ]
[238, ]

@@ -0,0 +1,33 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ]
[200, ]
[202, ]
[204, ]
[206, ]
[208, ]
[210, ]
[212, ]
[214, ]
[216, ]
[218, ]
[220, ]
[222, ]
[224, ]
[226, ]
[228, ]
[230, ]
[232, ]
[234, ]
[236, ]
[238, ]
[240, ]
[242, ]
[244, ]
[246, ]
[248, ]
[250, ]
[252, ]
[254, ]

@@ -0,0 +1,33 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ]
[200, ]
[202, ]
[204, ]
[206, ]
[208, ]
[210, ]
[212, ]
[214, ]
[216, ]
[218, ]
[220, ]
[222, ]
[224, ]
[226, ]
[228, ]
[230, ]
[232, ]
[234, ]
[236, ]
[238, ]
[240, ]
[242, ]
[244, ]
[246, ]
[248, ]
[250, ]
[252, ]
[254, ]

@@ -0,0 +1,27 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ]
[202, ]
[224, ]
[230, ]
[236, ]
[244, ]
[250, ]
[256, ]
[258, ]
[260, ]
[262, ]
[264, ]
[278, ]
[282, ]
[286, ]
[292, ]
[206, ]
[208, ]
[210, ]
[216, ]
[220, ]
[226, ]
[238, ]

@@ -0,0 +1,27 @@
---
source: milli/src/search/facet/facet_sort_ascending.rs
---
[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ]
[202, ]
[224, ]
[230, ]
[236, ]
[244, ]
[250, ]
[256, ]
[258, ]
[260, ]
[262, ]
[264, ]
[278, ]
[282, ]
[286, ]
[292, ]
[206, ]
[208, ]
[210, ]
[216, ]
[220, ]
[226, ]
[238, ]

@@ -0,0 +1,60 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[255, ]
[254, ]
[253, ]
[252, ]
[251, ]
[250, ]
[249, ]
[248, ]
[247, ]
[246, ]
[245, ]
[244, ]
[243, ]
[242, ]
[241, ]
[240, ]
[239, ]
[238, ]
[237, ]
[236, ]
[235, ]
[234, ]
[233, ]
[232, ]
[231, ]
[230, ]
[229, ]
[228, ]
[227, ]
[226, ]
[225, ]
[224, ]
[223, ]
[222, ]
[221, ]
[220, ]
[219, ]
[218, ]
[217, ]
[216, ]
[215, ]
[214, ]
[213, ]
[212, ]
[211, ]
[210, ]
[209, ]
[208, ]
[207, ]
[206, ]
[205, ]
[204, ]
[203, ]
[202, ]
[201, ]
[200, ]

@@ -0,0 +1,54 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[243, ]
[238, ]
[236, ]
[235, ]
[226, ]
[223, ]
[220, ]
[219, ]
[216, ]
[210, ]
[209, ]
[208, ]
[207, ]
[206, ]
[205, ]
[297, ]
[295, ]
[293, ]
[292, ]
[289, ]
[286, ]
[282, ]
[281, ]
[279, ]
[278, ]
[277, ]
[273, ]
[269, ]
[267, ]
[264, ]
[263, ]
[262, ]
[260, ]
[258, ]
[256, ]
[250, ]
[247, ]
[244, ]
[241, ]
[239, ]
[237, ]
[233, ]
[231, ]
[230, ]
[224, ]
[215, ]
[211, ]
[203, ]
[202, ]
[201, ]

@@ -0,0 +1,60 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[255, ]
[254, ]
[253, ]
[252, ]
[251, ]
[250, ]
[249, ]
[248, ]
[247, ]
[246, ]
[245, ]
[244, ]
[243, ]
[242, ]
[241, ]
[240, ]
[239, ]
[238, ]
[237, ]
[236, ]
[235, ]
[234, ]
[233, ]
[232, ]
[231, ]
[230, ]
[229, ]
[228, ]
[227, ]
[226, ]
[225, ]
[224, ]
[223, ]
[222, ]
[221, ]
[220, ]
[219, ]
[218, ]
[217, ]
[216, ]
[215, ]
[214, ]
[213, ]
[212, ]
[211, ]
[210, ]
[209, ]
[208, ]
[207, ]
[206, ]
[205, ]
[204, ]
[203, ]
[202, ]
[201, ]
[200, ]

@@ -0,0 +1,33 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[254, ]
[252, ]
[250, ]
[248, ]
[246, ]
[244, ]
[242, ]
[240, ]
[238, ]
[236, ]
[234, ]
[232, ]
[230, ]
[228, ]
[226, ]
[224, ]
[222, ]
[220, ]
[218, ]
[216, ]
[214, ]
[212, ]
[210, ]
[208, ]
[206, ]
[204, ]
[202, ]
[200, ]
[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ]

@@ -0,0 +1,33 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[254, ]
[252, ]
[250, ]
[248, ]
[246, ]
[244, ]
[242, ]
[240, ]
[238, ]
[236, ]
[234, ]
[232, ]
[230, ]
[228, ]
[226, ]
[224, ]
[222, ]
[220, ]
[218, ]
[216, ]
[214, ]
[212, ]
[210, ]
[208, ]
[206, ]
[204, ]
[202, ]
[200, ]
[201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223, 225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255, ]

@@ -0,0 +1,27 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[238, ]
[236, ]
[226, ]
[220, ]
[216, ]
[210, ]
[208, ]
[206, ]
[292, ]
[286, ]
[282, ]
[278, ]
[264, ]
[262, ]
[260, ]
[258, ]
[256, ]
[250, ]
[244, ]
[230, ]
[224, ]
[202, ]
[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ]

@@ -0,0 +1,27 @@
---
source: milli/src/search/facet/facet_sort_descending.rs
---
[238, ]
[236, ]
[226, ]
[220, ]
[216, ]
[210, ]
[208, ]
[206, ]
[292, ]
[286, ]
[282, ]
[278, ]
[264, ]
[262, ]
[260, ]
[258, ]
[256, ]
[250, ]
[244, ]
[230, ]
[224, ]
[202, ]
[201, 203, 205, 207, 209, 211, 215, 219, 223, 231, 233, 235, 237, 239, 241, 243, 247, 263, 267, 269, 273, 277, 279, 281, 289, 293, 295, 297, ]

187
crates/milli/src/search/fst_utils.rs
Normal file
@@ -0,0 +1,187 @@
/// This mod is necessary until https://github.com/BurntSushi/fst/pull/137 gets merged.
/// All credits for this code go to BurntSushi.
use fst::Automaton;

pub struct StartsWith<A>(pub A);

/// The `Automaton` state for `StartsWith<A>`.
pub struct StartsWithState<A: Automaton>(pub StartsWithStateKind<A>);

impl<A: Automaton> Clone for StartsWithState<A>
where
    A::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone())
    }
}

/// The inner state of a `StartsWithState<A>`.
pub enum StartsWithStateKind<A: Automaton> {
    /// Sink state that is reached when the automaton has matched the prefix.
    Done,
    /// State in which the automaton is while it hasn't matched the prefix.
    Running(A::State),
}

impl<A: Automaton> Clone for StartsWithStateKind<A>
where
    A::State: Clone,
{
    fn clone(&self) -> Self {
        match self {
            StartsWithStateKind::Done => StartsWithStateKind::Done,
            StartsWithStateKind::Running(inner) => StartsWithStateKind::Running(inner.clone()),
        }
    }
}

impl<A: Automaton> Automaton for StartsWith<A> {
    type State = StartsWithState<A>;

    fn start(&self) -> StartsWithState<A> {
        StartsWithState({
            let inner = self.0.start();
            if self.0.is_match(&inner) {
                StartsWithStateKind::Done
            } else {
                StartsWithStateKind::Running(inner)
            }
        })
    }
    fn is_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(_) => false,
        }
    }
    fn can_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
        }
    }
    fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
        match state.0 {
            StartsWithStateKind::Done => true,
            StartsWithStateKind::Running(_) => false,
        }
    }
    fn accept(&self, state: &StartsWithState<A>, byte: u8) -> StartsWithState<A> {
        StartsWithState(match state.0 {
            StartsWithStateKind::Done => StartsWithStateKind::Done,
            StartsWithStateKind::Running(ref inner) => {
                let next_inner = self.0.accept(inner, byte);
                if self.0.is_match(&next_inner) {
                    StartsWithStateKind::Done
                } else {
                    StartsWithStateKind::Running(next_inner)
                }
            }
        })
    }
}
/// An automaton that matches when one of its component automata match.
#[derive(Clone, Debug)]
pub struct Union<A, B>(pub A, pub B);

/// The `Automaton` state for `Union<A, B>`.
pub struct UnionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);

impl<A: Automaton, B: Automaton> Clone for UnionState<A, B>
where
    A::State: Clone,
    B::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone(), self.1.clone())
    }
}

impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
    type State = UnionState<A, B>;
    fn start(&self) -> UnionState<A, B> {
        UnionState(self.0.start(), self.1.start())
    }
    fn is_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.is_match(&state.0) || self.1.is_match(&state.1)
    }
    fn can_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.can_match(&state.0) || self.1.can_match(&state.1)
    }
    fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
        self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1)
    }
    fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
        UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
    }
}
/// An automaton that matches when both of its component automata match.
#[derive(Clone, Debug)]
pub struct Intersection<A, B>(pub A, pub B);

/// The `Automaton` state for `Intersection<A, B>`.
pub struct IntersectionState<A: Automaton, B: Automaton>(pub A::State, pub B::State);

impl<A: Automaton, B: Automaton> Clone for IntersectionState<A, B>
where
    A::State: Clone,
    B::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone(), self.1.clone())
    }
}

impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
    type State = IntersectionState<A, B>;
    fn start(&self) -> IntersectionState<A, B> {
        IntersectionState(self.0.start(), self.1.start())
    }
    fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.is_match(&state.0) && self.1.is_match(&state.1)
    }
    fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.can_match(&state.0) && self.1.can_match(&state.1)
    }
    fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
        self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1)
    }
    fn accept(&self, state: &IntersectionState<A, B>, byte: u8) -> IntersectionState<A, B> {
        IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
    }
}
/// An automaton that matches exactly when the automaton it wraps does not.
#[derive(Clone, Debug)]
pub struct Complement<A>(pub A);

/// The `Automaton` state for `Complement<A>`.
pub struct ComplementState<A: Automaton>(pub A::State);

impl<A: Automaton> Clone for ComplementState<A>
where
    A::State: Clone,
{
    fn clone(&self) -> Self {
        Self(self.0.clone())
    }
}

impl<A: Automaton> Automaton for Complement<A> {
    type State = ComplementState<A>;
    fn start(&self) -> ComplementState<A> {
        ComplementState(self.0.start())
    }
    fn is_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.is_match(&state.0)
    }
    fn can_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.will_always_match(&state.0)
    }
    fn will_always_match(&self, state: &ComplementState<A>) -> bool {
        !self.0.can_match(&state.0)
    }
    fn accept(&self, state: &ComplementState<A>, byte: u8) -> ComplementState<A> {
        ComplementState(self.0.accept(&state.0, byte))
    }
}
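A minimal standalone sketch of how these combinators compose (not part of the diff). It assumes the `Union`, `StartsWith`, and `Str` types were reachable from the caller, whereas `fst_utils` is actually a private module of milli; the set contents are made up for illustration.

```rust
use fst::automaton::Str;
use fst::{IntoStreamer, Set, Streamer};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Keys must be given to `from_iter` in lexicographic order.
    let set = Set::from_iter(["cat", "catalog", "dog"])?;

    // Match every key that starts with "cat" OR is exactly "dog".
    let automaton = Union(StartsWith(Str::new("cat")), Str::new("dog"));

    let mut stream = set.search(automaton).into_stream();
    let mut hits = Vec::new();
    while let Some(key) = stream.next() {
        hits.push(String::from_utf8(key.to_vec())?);
    }
    assert_eq!(hits, ["cat", "catalog", "dog"]);
    Ok(())
}
```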
289
crates/milli/src/search/hybrid.rs
Normal file
@@ -0,0 +1,289 @@
use std::cmp::Ordering;

use itertools::Itertools;
use roaring::RoaringBitmap;

use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
use crate::search::SemanticSearch;
use crate::{MatchingWords, Result, Search, SearchResult};

struct ScoreWithRatioResult {
    matching_words: MatchingWords,
    candidates: RoaringBitmap,
    document_scores: Vec<(u32, ScoreWithRatio)>,
    degraded: bool,
    used_negative_operator: bool,
}

type ScoreWithRatio = (Vec<ScoreDetails>, f32);

#[tracing::instrument(level = "trace", skip_all, target = "search::hybrid")]
fn compare_scores(
    &(ref left_scores, left_ratio): &ScoreWithRatio,
    &(ref right_scores, right_ratio): &ScoreWithRatio,
) -> Ordering {
    let mut left_it = ScoreDetails::score_values(left_scores.iter());
    let mut right_it = ScoreDetails::score_values(right_scores.iter());

    loop {
        let left = left_it.next();
        let right = right_it.next();

        match (left, right) {
            (None, None) => return Ordering::Equal,
            (None, Some(_)) => return Ordering::Less,
            (Some(_), None) => return Ordering::Greater,
            (Some(ScoreValue::Score(left)), Some(ScoreValue::Score(right))) => {
                let left = left * left_ratio as f64;
                let right = right * right_ratio as f64;
                if (left - right).abs() <= f64::EPSILON {
                    continue;
                }
                return left.partial_cmp(&right).unwrap();
            }
            (Some(ScoreValue::Sort(left)), Some(ScoreValue::Sort(right))) => {
                match left.partial_cmp(right).unwrap() {
                    Ordering::Equal => continue,
                    order => return order,
                }
            }
            (Some(ScoreValue::GeoSort(left)), Some(ScoreValue::GeoSort(right))) => {
                match left.partial_cmp(right).unwrap() {
                    Ordering::Equal => continue,
                    order => return order,
                }
            }
            (Some(ScoreValue::Score(x)), Some(_)) => {
                return if x == 0. { Ordering::Less } else { Ordering::Greater }
            }
            (Some(_), Some(ScoreValue::Score(x))) => {
                return if x == 0. { Ordering::Greater } else { Ordering::Less }
            }
            // if we have this, we're bad
            (Some(ScoreValue::GeoSort(_)), Some(ScoreValue::Sort(_)))
            | (Some(ScoreValue::Sort(_)), Some(ScoreValue::GeoSort(_))) => {
                unreachable!("Unexpected geo and sort comparison")
            }
        }
    }
}

impl ScoreWithRatioResult {
    fn new(results: SearchResult, ratio: f32) -> Self {
        let document_scores = results
            .documents_ids
            .into_iter()
            .zip(results.document_scores.into_iter().map(|scores| (scores, ratio)))
            .collect();

        Self {
            matching_words: results.matching_words,
            candidates: results.candidates,
            document_scores,
            degraded: results.degraded,
            used_negative_operator: results.used_negative_operator,
        }
    }

    #[tracing::instrument(level = "trace", skip_all, target = "search::hybrid")]
    fn merge(
        vector_results: Self,
        keyword_results: Self,
        from: usize,
        length: usize,
    ) -> (SearchResult, u32) {
        #[derive(Clone, Copy)]
        enum ResultSource {
            Semantic,
            Keyword,
        }
        let mut semantic_hit_count = 0;

        let mut documents_ids = Vec::with_capacity(
            vector_results.document_scores.len() + keyword_results.document_scores.len(),
        );
        let mut document_scores = Vec::with_capacity(
            vector_results.document_scores.len() + keyword_results.document_scores.len(),
        );

        let mut documents_seen = RoaringBitmap::new();
        for ((docid, (main_score, _sub_score)), source) in vector_results
            .document_scores
            .into_iter()
            .zip(std::iter::repeat(ResultSource::Semantic))
            .merge_by(
                keyword_results
                    .document_scores
                    .into_iter()
                    .zip(std::iter::repeat(ResultSource::Keyword)),
                |((_, left), _), ((_, right), _)| {
                    // the first value is the one with the greatest score
                    compare_scores(left, right).is_ge()
                },
            )
            // remove documents we already saw
            .filter(|((docid, _), _)| documents_seen.insert(*docid))
            // start skipping **after** the filter
            .skip(from)
            // take **after** skipping
            .take(length)
        {
            if let ResultSource::Semantic = source {
                semantic_hit_count += 1;
            }
            documents_ids.push(docid);
            // TODO: pass both scores to documents_score in some way?
            document_scores.push(main_score);
        }

        (
            SearchResult {
                matching_words: keyword_results.matching_words,
                candidates: vector_results.candidates | keyword_results.candidates,
                documents_ids,
                document_scores,
                degraded: vector_results.degraded | keyword_results.degraded,
                used_negative_operator: vector_results.used_negative_operator
                    | keyword_results.used_negative_operator,
            },
            semantic_hit_count,
        )
    }
}

impl<'a> Search<'a> {
    #[tracing::instrument(level = "trace", skip_all, target = "search::hybrid")]
    pub fn execute_hybrid(&self, semantic_ratio: f32) -> Result<(SearchResult, Option<u32>)> {
        // TODO: find classier way to achieve that than to reset vector and query params
        // create separate keyword and semantic searches
        let mut search = Search {
            query: self.query.clone(),
            filter: self.filter.clone(),
            offset: 0,
            limit: self.limit + self.offset,
            sort_criteria: self.sort_criteria.clone(),
            distinct: self.distinct.clone(),
            searchable_attributes: self.searchable_attributes,
            geo_strategy: self.geo_strategy,
            terms_matching_strategy: self.terms_matching_strategy,
            scoring_strategy: ScoringStrategy::Detailed,
            words_limit: self.words_limit,
            exhaustive_number_hits: self.exhaustive_number_hits,
            rtxn: self.rtxn,
            index: self.index,
            semantic: self.semantic.clone(),
            time_budget: self.time_budget.clone(),
            ranking_score_threshold: self.ranking_score_threshold,
            locales: self.locales.clone(),
        };

        let semantic = search.semantic.take();
        let keyword_results = search.execute()?;

        // completely skip semantic search if the results of the keyword search are good enough
        if self.results_good_enough(&keyword_results, semantic_ratio) {
            return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
        }

        // no vector search against placeholder search
        let Some(query) = search.query.take() else {
            return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
        };
        // no embedder, no semantic search
        let Some(SemanticSearch { vector, embedder_name, embedder, quantized }) = semantic else {
            return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
        };

        let vector_query = match vector {
            Some(vector_query) => vector_query,
            None => {
                // attempt to embed the vector
                let span = tracing::trace_span!(target: "search::hybrid", "embed_one");
                let _entered = span.enter();

                match embedder.embed_one(query) {
                    Ok(embedding) => embedding,
                    Err(error) => {
                        tracing::error!(error=%error, "Embedding failed");
                        return Ok((keyword_results, Some(0)));
                    }
                }
            }
        };

        search.semantic =
            Some(SemanticSearch { vector: Some(vector_query), embedder_name, embedder, quantized });

        // TODO: would be better to have two distinct functions at this point
        let vector_results = search.execute()?;

        let keyword_results = ScoreWithRatioResult::new(keyword_results, 1.0 - semantic_ratio);
        let vector_results = ScoreWithRatioResult::new(vector_results, semantic_ratio);

        let (merge_results, semantic_hit_count) =
            ScoreWithRatioResult::merge(vector_results, keyword_results, self.offset, self.limit);
        assert!(merge_results.documents_ids.len() <= self.limit);
        Ok((merge_results, Some(semantic_hit_count)))
    }

    fn results_good_enough(&self, keyword_results: &SearchResult, semantic_ratio: f32) -> bool {
        // A result is good enough if its keyword score is > 0.9 with a semantic ratio of 0.5 => 0.9 * 0.5
        const GOOD_ENOUGH_SCORE: f64 = 0.45;

        // 1. we check that we got a sufficient number of results
        if keyword_results.document_scores.len() < self.limit + self.offset {
            return false;
        }

        // 2. and that all results have a good enough score.
        // we need to check all results because due to sort like rules, they're not necessarily in relevancy order
        for score in &keyword_results.document_scores {
            let score = ScoreDetails::global_score(score.iter());
            if score * ((1.0 - semantic_ratio) as f64) < GOOD_ENOUGH_SCORE {
                return false;
            }
        }
        true
    }
}

fn return_keyword_results(
    limit: usize,
    offset: usize,
    SearchResult {
        matching_words,
        candidates,
        mut documents_ids,
        mut document_scores,
        degraded,
        used_negative_operator,
    }: SearchResult,
) -> (SearchResult, Option<u32>) {
    let (documents_ids, document_scores) = if offset >= documents_ids.len() ||
        // technically redundant because documents_ids.len() == document_scores.len(),
        // defensive programming
        offset >= document_scores.len()
    {
        (vec![], vec![])
    } else {
        // PANICS: offset < len
        documents_ids.rotate_left(offset);
        documents_ids.truncate(limit);

        // PANICS: offset < len
        document_scores.rotate_left(offset);
        document_scores.truncate(limit);
        (documents_ids, document_scores)
    };
    (
        SearchResult {
            matching_words,
            candidates,
            documents_ids,
            document_scores,
            degraded,
            used_negative_operator,
        },
        Some(0),
    )
}
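A standalone sketch (not part of the diff) of the ratio weighting that `compare_scores` applies to plain relevancy scores: each side's score is multiplied by its ratio before comparison, so the `semantic_ratio` directly trades keyword relevancy against semantic similarity. The function name and values below are illustrative only.

```rust
use std::cmp::Ordering;

// Mirrors the `ScoreValue::Score` arm of `compare_scores` above.
fn weighted_cmp(left: f64, left_ratio: f64, right: f64, right_ratio: f64) -> Ordering {
    let (l, r) = (left * left_ratio, right * right_ratio);
    if (l - r).abs() <= f64::EPSILON {
        Ordering::Equal
    } else {
        l.partial_cmp(&r).unwrap()
    }
}

fn main() {
    // With a semantic ratio of 0.8, a semantic score of 0.7 (0.56 weighted)
    // outranks a keyword score of 0.9 weighted by the remaining 0.2 (0.18).
    assert_eq!(weighted_cmp(0.7, 0.8, 0.9, 0.2), Ordering::Greater);
}
```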
413
crates/milli/src/search/mod.rs
Normal file
@@ -0,0 +1,413 @@
use std::fmt;
use std::sync::Arc;

use charabia::Language;
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;

pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords};
use self::new::{execute_vector_search, PartialSearchResult};
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::vector::Embedder;
use crate::{
    execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index,
    Result, SearchContext, TimeBudget, UserError,
};

// Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true));
static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));

pub mod facet;
mod fst_utils;
pub mod hybrid;
pub mod new;
pub mod similar;

#[derive(Debug, Clone)]
pub struct SemanticSearch {
    vector: Option<Vec<f32>>,
    embedder_name: String,
    embedder: Arc<Embedder>,
    quantized: bool,
}

pub struct Search<'a> {
    query: Option<String>,
    // this should be linked to the String in the query
    filter: Option<Filter<'a>>,
    offset: usize,
    limit: usize,
    sort_criteria: Option<Vec<AscDesc>>,
    distinct: Option<String>,
    searchable_attributes: Option<&'a [String]>,
    geo_strategy: new::GeoSortStrategy,
    terms_matching_strategy: TermsMatchingStrategy,
    scoring_strategy: ScoringStrategy,
    words_limit: usize,
    exhaustive_number_hits: bool,
    rtxn: &'a heed::RoTxn<'a>,
    index: &'a Index,
    semantic: Option<SemanticSearch>,
    time_budget: TimeBudget,
    ranking_score_threshold: Option<f64>,
    locales: Option<Vec<Language>>,
}

impl<'a> Search<'a> {
    pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Search<'a> {
        Search {
            query: None,
            filter: None,
            offset: 0,
            limit: 20,
            sort_criteria: None,
            distinct: None,
            searchable_attributes: None,
            geo_strategy: new::GeoSortStrategy::default(),
            terms_matching_strategy: TermsMatchingStrategy::default(),
            scoring_strategy: Default::default(),
            exhaustive_number_hits: false,
            words_limit: 10,
            rtxn,
            index,
            semantic: None,
            locales: None,
            time_budget: TimeBudget::max(),
            ranking_score_threshold: None,
        }
    }

    pub fn query(&mut self, query: impl Into<String>) -> &mut Search<'a> {
        self.query = Some(query.into());
        self
    }

    pub fn semantic(
        &mut self,
        embedder_name: String,
        embedder: Arc<Embedder>,
        quantized: bool,
        vector: Option<Vec<f32>>,
    ) -> &mut Search<'a> {
        self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector });
        self
    }

    pub fn offset(&mut self, offset: usize) -> &mut Search<'a> {
        self.offset = offset;
        self
    }

    pub fn limit(&mut self, limit: usize) -> &mut Search<'a> {
        self.limit = limit;
        self
    }

    pub fn sort_criteria(&mut self, criteria: Vec<AscDesc>) -> &mut Search<'a> {
        self.sort_criteria = Some(criteria);
        self
    }

    pub fn distinct(&mut self, distinct: String) -> &mut Search<'a> {
        self.distinct = Some(distinct);
        self
    }

    pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> {
        self.searchable_attributes = Some(searchable);
        self
    }

    pub fn terms_matching_strategy(&mut self, value: TermsMatchingStrategy) -> &mut Search<'a> {
        self.terms_matching_strategy = value;
        self
    }

    pub fn scoring_strategy(&mut self, value: ScoringStrategy) -> &mut Search<'a> {
        self.scoring_strategy = value;
        self
    }

    pub fn words_limit(&mut self, value: usize) -> &mut Search<'a> {
        self.words_limit = value;
        self
    }

    pub fn filter(&mut self, condition: Filter<'a>) -> &mut Search<'a> {
        self.filter = Some(condition);
        self
    }

    #[cfg(test)]
    pub fn geo_sort_strategy(&mut self, strategy: new::GeoSortStrategy) -> &mut Search<'a> {
        self.geo_strategy = strategy;
        self
    }

    /// Forces the search to exhaustively compute the number of candidates,
    /// this will increase the search time but allows finite pagination.
    pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> {
        self.exhaustive_number_hits = exhaustive_number_hits;
        self
    }

    pub fn time_budget(&mut self, time_budget: TimeBudget) -> &mut Search<'a> {
        self.time_budget = time_budget;
        self
    }

    pub fn ranking_score_threshold(&mut self, ranking_score_threshold: f64) -> &mut Search<'a> {
        self.ranking_score_threshold = Some(ranking_score_threshold);
        self
    }

    pub fn locales(&mut self, locales: Vec<Language>) -> &mut Search<'a> {
        self.locales = Some(locales);
        self
    }

    pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> {
        if has_vector_search {
            let ctx = SearchContext::new(self.index, self.rtxn)?;
            filtered_universe(ctx.index, ctx.txn, &self.filter)
        } else {
            Ok(self.execute()?.candidates)
        }
    }

    pub fn execute(&self) -> Result<SearchResult> {
        let mut ctx = SearchContext::new(self.index, self.rtxn)?;

        if let Some(searchable_attributes) = self.searchable_attributes {
            ctx.attributes_to_search_on(searchable_attributes)?;
        }

        if let Some(distinct) = &self.distinct {
            let filterable_fields = ctx.index.filterable_fields(ctx.txn)?;
            if !crate::is_faceted(distinct, &filterable_fields) {
                let (valid_fields, hidden_fields) =
                    ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?;
                return Err(Error::UserError(UserError::InvalidDistinctAttribute {
                    field: distinct.clone(),
                    valid_fields,
                    hidden_fields,
                }));
            }
        }

        let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;
        let PartialSearchResult {
            located_query_terms,
            candidates,
            documents_ids,
            document_scores,
            degraded,
            used_negative_operator,
        } = match self.semantic.as_ref() {
            Some(SemanticSearch { vector: Some(vector), embedder_name, embedder, quantized }) => {
                execute_vector_search(
                    &mut ctx,
                    vector,
                    self.scoring_strategy,
                    universe,
                    &self.sort_criteria,
                    &self.distinct,
                    self.geo_strategy,
                    self.offset,
                    self.limit,
                    embedder_name,
                    embedder,
                    *quantized,
                    self.time_budget.clone(),
                    self.ranking_score_threshold,
                )?
            }
            _ => execute_search(
                &mut ctx,
                self.query.as_deref(),
                self.terms_matching_strategy,
                self.scoring_strategy,
                self.exhaustive_number_hits,
                universe,
                &self.sort_criteria,
                &self.distinct,
                self.geo_strategy,
                self.offset,
                self.limit,
                Some(self.words_limit),
                &mut DefaultSearchLogger,
                &mut DefaultSearchLogger,
                self.time_budget.clone(),
                self.ranking_score_threshold,
                self.locales.as_ref(),
            )?,
        };

        // consume context and located_query_terms to build MatchingWords.
        let matching_words = match located_query_terms {
            Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
            None => MatchingWords::default(),
        };

        Ok(SearchResult {
            matching_words,
            candidates,
            document_scores,
            documents_ids,
            degraded,
            used_negative_operator,
        })
    }
}

impl fmt::Debug for Search<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Search {
            query,
            filter,
            offset,
            limit,
            sort_criteria,
            distinct,
            searchable_attributes,
            geo_strategy: _,
            terms_matching_strategy,
            scoring_strategy,
            words_limit,
            exhaustive_number_hits,
            rtxn: _,
            index: _,
            semantic,
            time_budget,
            ranking_score_threshold,
            locales,
        } = self;
        f.debug_struct("Search")
            .field("query", query)
            .field("vector", &"[...]")
            .field("filter", filter)
            .field("offset", offset)
            .field("limit", limit)
            .field("sort_criteria", sort_criteria)
            .field("distinct", distinct)
            .field("searchable_attributes", searchable_attributes)
            .field("terms_matching_strategy", terms_matching_strategy)
            .field("scoring_strategy", scoring_strategy)
            .field("exhaustive_number_hits", exhaustive_number_hits)
            .field("words_limit", words_limit)
            .field(
                "semantic.embedder_name",
                &semantic.as_ref().map(|semantic| &semantic.embedder_name),
            )
            .field("time_budget", time_budget)
            .field("ranking_score_threshold", ranking_score_threshold)
            .field("locales", locales)
            .finish()
    }
}

#[derive(Default, Debug)]
pub struct SearchResult {
    pub matching_words: MatchingWords,
    pub candidates: RoaringBitmap,
    pub documents_ids: Vec<DocumentId>,
    pub document_scores: Vec<Vec<ScoreDetails>>,
    pub degraded: bool,
    pub used_negative_operator: bool,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TermsMatchingStrategy {
    // remove last word first
    Last,
    // all words are mandatory
    All,
    // remove more frequent word first
    Frequency,
}

impl Default for TermsMatchingStrategy {
    fn default() -> Self {
        Self::Last
    }
}

fn get_first(s: &str) -> &str {
    match s.chars().next() {
        Some(c) => &s[..c.len_utf8()],
        None => panic!("unexpected empty query"),
    }
}

pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
    let lev = match typos {
        0 => &LEVDIST0,
        1 => &LEVDIST1,
        _ => &LEVDIST2,
    };

    if is_prefix {
        lev.build_prefix_dfa(word)
    } else {
        lev.build_dfa(word)
    }
}

#[cfg(test)]
mod test {
    #[allow(unused_imports)]
    use super::*;

    #[cfg(feature = "japanese")]
    #[cfg(not(feature = "chinese-pinyin"))]
    #[test]
    fn test_kanji_language_detection() {
        use crate::index::tests::TempIndex;

        let index = TempIndex::new();

        index
            .add_documents(documents!([
                { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
                { "id": 1, "title": "東京のお寿司。" },
                { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
            ]))
            .unwrap();

        let txn = index.write_txn().unwrap();
        let mut search = Search::new(&txn, &index);

        search.query("東京");
        let SearchResult { documents_ids, .. } = search.execute().unwrap();

        assert_eq!(documents_ids, vec![1]);
    }

    #[cfg(feature = "korean")]
    #[test]
    fn test_hangul_language_detection() {
        use crate::index::tests::TempIndex;

        let index = TempIndex::new();

        index
            .add_documents(documents!([
                { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
                { "id": 1, "title": "김밥먹을래。" },
                { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }
            ]))
            .unwrap();

        let txn = index.write_txn().unwrap();
        let mut search = Search::new(&txn, &index);

        search.query("김밥");
        let SearchResult { documents_ids, .. } = search.execute().unwrap();

        assert_eq!(documents_ids, vec![1]);
    }
}
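A hypothetical usage sketch (not part of the diff) of the builder-style API defined above, mirroring how the file's own tests drive it. It assumes an already-opened `Index`, a caller inside the milli crate, and a made-up query string.

```rust
use crate::search::{Search, SearchResult, TermsMatchingStrategy};
use crate::{Index, Result};

fn top_twenty(index: &Index) -> Result<Vec<u32>> {
    let rtxn = index.read_txn()?;
    let mut search = Search::new(&rtxn, index);
    // Each setter returns `&mut Search`, so the configuration chains.
    search
        .query("tokyo")
        .offset(0)
        .limit(20)
        .terms_matching_strategy(TermsMatchingStrategy::Last);
    let SearchResult { documents_ids, .. } = search.execute()?;
    Ok(documents_ids)
}
```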
366
crates/milli/src/search/new/bucket_sort.rs
Normal file
366
crates/milli/src/search/new/bucket_sort.rs
Normal file
@@ -0,0 +1,366 @@
use roaring::RoaringBitmap;

use super::logger::SearchLogger;
use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait};
use super::SearchContext;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput};
use crate::{Result, TimeBudget};

pub struct BucketSortOutput {
    pub docids: Vec<u32>,
    pub scores: Vec<Vec<ScoreDetails>>,
    pub all_candidates: RoaringBitmap,

    pub degraded: bool,
}

// TODO: would probably be good to regroup some of these inside of a struct?
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(level = "trace", skip_all, target = "search::bucket_sort")]
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
    ctx: &mut SearchContext<'ctx>,
    mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
    query: &Q,
    distinct: Option<&str>,
    universe: &RoaringBitmap,
    from: usize,
    length: usize,
    scoring_strategy: ScoringStrategy,
    logger: &mut dyn SearchLogger<Q>,
    time_budget: TimeBudget,
    ranking_score_threshold: Option<f64>,
) -> Result<BucketSortOutput> {
    logger.initial_query(query);
    logger.ranking_rules(&ranking_rules);
    logger.initial_universe(universe);

    let distinct_field = match distinct {
        Some(distinct) => Some(distinct),
        None => ctx.index.distinct_field(ctx.txn)?,
    };

    let distinct_fid = if let Some(field) = distinct_field {
        ctx.index.fields_ids_map(ctx.txn)?.id(field)
    } else {
        None
    };

    if universe.len() < from as u64 {
        return Ok(BucketSortOutput {
            docids: vec![],
            scores: vec![],
            all_candidates: universe.clone(),
            degraded: false,
        });
    }
    if ranking_rules.is_empty() {
        if let Some(distinct_fid) = distinct_fid {
            let mut excluded = RoaringBitmap::new();
            let mut results = vec![];
            for docid in universe.iter() {
                if results.len() >= from + length {
                    break;
                }
                if excluded.contains(docid) {
                    continue;
                }

                distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
                results.push(docid);
            }

            let mut all_candidates = universe - excluded;
            all_candidates.extend(results.iter().copied());
            // drain the results of the skipped elements
            // this **must** be done **after** writing the entire results in `all_candidates` to ensure
            // e.g. estimatedTotalHits is correct.
            if results.len() >= from {
                results.drain(..from);
            } else {
                results.clear();
            }

            return Ok(BucketSortOutput {
                scores: vec![Default::default(); results.len()],
                docids: results,
                all_candidates,
                degraded: false,
            });
        } else {
            let docids: Vec<u32> = universe.iter().skip(from).take(length).collect();
            return Ok(BucketSortOutput {
                scores: vec![Default::default(); docids.len()],
                docids,
                all_candidates: universe.clone(),
                degraded: false,
            });
        };
    }

    let ranking_rules_len = ranking_rules.len();

    logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe);

    ranking_rules[0].start_iteration(ctx, logger, universe, query)?;

    let mut ranking_rule_scores: Vec<ScoreDetails> = vec![];

    let mut ranking_rule_universes: Vec<RoaringBitmap> =
        vec![RoaringBitmap::default(); ranking_rules_len];
    ranking_rule_universes[0].clone_from(universe);
    let mut cur_ranking_rule_index = 0;

    /// Finish iterating over the current ranking rule, yielding
    /// control to the parent (or finishing the search if not possible).
    /// Update the universes accordingly and inform the logger.
    macro_rules! back {
        () => {
            // FIXME: temporarily disabled assert: see <https://github.com/meilisearch/meilisearch/pull/4013>
            // assert!(
            //     ranking_rule_universes[cur_ranking_rule_index].is_empty(),
            //     "The ranking rule {} did not sort its bucket exhaustively",
            //     ranking_rules[cur_ranking_rule_index].id()
            // );
            logger.end_iteration_ranking_rule(
                cur_ranking_rule_index,
                ranking_rules[cur_ranking_rule_index].as_ref(),
                &ranking_rule_universes[cur_ranking_rule_index],
            );
            ranking_rule_universes[cur_ranking_rule_index].clear();
            ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
            if cur_ranking_rule_index == 0 {
                break;
            } else {
                cur_ranking_rule_index -= 1;
            }
            if ranking_rule_scores.len() > cur_ranking_rule_index {
                ranking_rule_scores.pop();
            }
        };
    }

    let mut all_candidates = universe.clone();
    let mut valid_docids = vec![];
    let mut valid_scores = vec![];
    let mut cur_offset = 0usize;

    macro_rules! maybe_add_to_results {
        ($candidates:expr) => {
            maybe_add_to_results(
                ctx,
                from,
                length,
                logger,
                &mut valid_docids,
                &mut valid_scores,
                &mut all_candidates,
                &mut ranking_rule_universes,
                &mut ranking_rules,
                cur_ranking_rule_index,
                &mut cur_offset,
                distinct_fid,
                &ranking_rule_scores,
                $candidates,
            )?;
        };
    }

    while valid_docids.len() < length {
        if time_budget.exceeded() {
            loop {
                let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
                ranking_rule_scores.push(ScoreDetails::Skipped);

                // remove candidates from the universe without adding them to result if their score is below the threshold
                if let Some(ranking_score_threshold) = ranking_score_threshold {
                    let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
                    if current_score < ranking_score_threshold {
                        all_candidates -= bucket | &ranking_rule_universes[cur_ranking_rule_index];
                        back!();
                        continue;
                    }
                }

                maybe_add_to_results!(bucket);

                ranking_rule_scores.pop();

                if cur_ranking_rule_index == 0 {
                    break;
                }

                back!();
            }

            return Ok(BucketSortOutput {
                scores: valid_scores,
                docids: valid_docids,
                all_candidates,
                degraded: true,
            });
        }

        // The universe for this bucket is empty, so we don't need to sort
        // anything, just go back to the parent ranking rule.
        if ranking_rule_universes[cur_ranking_rule_index].is_empty()
            || (scoring_strategy == ScoringStrategy::Skip
                && ranking_rule_universes[cur_ranking_rule_index].len() == 1)
        {
            let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
            maybe_add_to_results!(bucket);
            back!();
            continue;
        }

        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(
            ctx,
            logger,
            &ranking_rule_universes[cur_ranking_rule_index],
        )?
        else {
            back!();
            continue;
        };

        ranking_rule_scores.push(next_bucket.score);

        logger.next_bucket_ranking_rule(
            cur_ranking_rule_index,
            ranking_rules[cur_ranking_rule_index].as_ref(),
            &ranking_rule_universes[cur_ranking_rule_index],
            &next_bucket.candidates,
        );

        debug_assert!(
            ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)
        );

        // remove candidates from the universe without adding them to result if their score is below the threshold
        if let Some(ranking_score_threshold) = ranking_score_threshold {
            let current_score = ScoreDetails::global_score(ranking_rule_scores.iter());
            if current_score < ranking_score_threshold {
                all_candidates -=
                    next_bucket.candidates | &ranking_rule_universes[cur_ranking_rule_index];
                back!();
                continue;
            }
        }

        ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;

        if cur_ranking_rule_index == ranking_rules_len - 1
            || (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1)
            || cur_offset + (next_bucket.candidates.len() as usize) < from
        {
            maybe_add_to_results!(next_bucket.candidates);
            ranking_rule_scores.pop();
            continue;
        }

        cur_ranking_rule_index += 1;
        ranking_rule_universes[cur_ranking_rule_index].clone_from(&next_bucket.candidates);
        logger.start_iteration_ranking_rule(
            cur_ranking_rule_index,
            ranking_rules[cur_ranking_rule_index].as_ref(),
            &next_bucket.query,
            &ranking_rule_universes[cur_ranking_rule_index],
        );
        ranking_rules[cur_ranking_rule_index].start_iteration(
            ctx,
            logger,
            &next_bucket.candidates,
            &next_bucket.query,
        )?;
    }

    Ok(BucketSortOutput {
        docids: valid_docids,
        scores: valid_scores,
        all_candidates,
        degraded: false,
    })
}
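
The `from`/`length` bookkeeping above reduces to a simple rule: skip whole buckets that end before `from`, split the bucket that straddles it, and stop once `length` documents are collected. A standalone sketch of that arithmetic (illustrative only; it mirrors what `maybe_add_to_results` below does with bitmaps and logging):

/// Toy version of the offset/limit logic: `buckets` arrive sorted by rank.
fn paginate(buckets: Vec<Vec<u32>>, from: usize, length: usize) -> Vec<u32> {
    let mut results = Vec::new();
    let mut cur_offset = 0usize;
    for bucket in buckets {
        if results.len() >= length {
            break;
        }
        if cur_offset + bucket.len() <= from {
            // The whole bucket lies before the requested offset: skip it entirely.
            cur_offset += bucket.len();
            continue;
        }
        // Skip the part of the bucket that is before `from`, take up to the limit.
        let skip = from.saturating_sub(cur_offset);
        let take = length - results.len();
        cur_offset += bucket.len();
        results.extend(bucket.into_iter().skip(skip).take(take));
    }
    results
}

fn main() {
    let buckets = vec![vec![10, 11], vec![20], vec![30, 31, 32]];
    // from = 2, length = 3: skips [10, 11], returns [20, 30, 31].
    assert_eq!(paginate(buckets, 2, 3), vec![20, 30, 31]);
}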

/// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
/// into account and inform the logger.
#[allow(clippy::too_many_arguments)]
fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
    ctx: &mut SearchContext<'ctx>,
    from: usize,
    length: usize,
    logger: &mut dyn SearchLogger<Q>,

    valid_docids: &mut Vec<u32>,
    valid_scores: &mut Vec<Vec<ScoreDetails>>,
    all_candidates: &mut RoaringBitmap,

    ranking_rule_universes: &mut [RoaringBitmap],
    ranking_rules: &mut [BoxRankingRule<'ctx, Q>],

    cur_ranking_rule_index: usize,

    cur_offset: &mut usize,

    distinct_fid: Option<u16>,
    ranking_rule_scores: &[ScoreDetails],
    candidates: RoaringBitmap,
) -> Result<()> {
    // First apply the distinct rule on the candidates, reducing the universes if necessary
    let candidates = if let Some(distinct_fid) = distinct_fid {
        let DistinctOutput { remaining, excluded } =
            apply_distinct_rule(ctx, distinct_fid, &candidates)?;
        for universe in ranking_rule_universes.iter_mut() {
            *universe -= &excluded;
            *all_candidates -= &excluded;
        }
        remaining
    } else {
        candidates.clone()
    };
    *all_candidates |= &candidates;

    // if the candidates are empty, there is nothing to do
    if candidates.is_empty() {
        return Ok(());
    }

    // if we still haven't reached the first document to return
    if *cur_offset < from {
        // and if no document from this bucket can be returned
        if *cur_offset + (candidates.len() as usize) < from {
            // then just skip the bucket
            logger.skip_bucket_ranking_rule(
                cur_ranking_rule_index,
                ranking_rules[cur_ranking_rule_index].as_ref(),
                &candidates,
            );
        } else {
            // otherwise, skip some of the documents and add some of the rest, in order of ids
            let candidates_vec = candidates.iter().collect::<Vec<_>>();
            let (skipped_candidates, candidates) = candidates_vec.split_at(from - *cur_offset);

            logger.skip_bucket_ranking_rule(
                cur_ranking_rule_index,
                ranking_rules[cur_ranking_rule_index].as_ref(),
                &skipped_candidates.iter().collect(),
            );
            let candidates =
                candidates.iter().take(length - valid_docids.len()).copied().collect::<Vec<_>>();
            logger.add_to_results(&candidates);
            valid_docids.extend_from_slice(&candidates);
            valid_scores
                .extend(std::iter::repeat(ranking_rule_scores.to_owned()).take(candidates.len()));
        }
    } else {
        // if we have passed the offset already, add some of the documents (up to the limit)
        let candidates = candidates.iter().take(length - valid_docids.len()).collect::<Vec<u32>>();
        logger.add_to_results(&candidates);
        valid_docids.extend_from_slice(&candidates);
        valid_scores
            .extend(std::iter::repeat(ranking_rule_scores.to_owned()).take(candidates.len()));
    }

    *cur_offset += candidates.len() as usize;
    Ok(())
}
719
crates/milli/src/search/new/db_cache.rs
Normal file
@@ -0,0 +1,719 @@

use std::borrow::Cow;
use std::collections::hash_map::Entry;
use std::hash::Hash;

use fxhash::FxHashMap;
use grenad::MergeFunction;
use heed::types::Bytes;
use heed::{BytesEncode, Database, RoTxn};
use roaring::RoaringBitmap;

use super::interner::Interned;
use super::Word;
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
use crate::proximity::ProximityPrecision;
use crate::update::MergeCboRoaringBitmaps;
use crate::{
    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
};

/// A cache storing pointers to values in the LMDB databases.
///
/// Used for performance reasons only. By using this cache, we avoid performing a
/// database lookup and instead get a direct reference to the value using a fast
/// local HashMap lookup.
#[derive(Default)]
pub struct DatabaseCache<'ctx> {
    pub word_pair_proximity_docids:
        FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
    pub word_prefix_pair_proximity_docids:
        FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>,
    pub prefix_word_pair_proximity_docids:
        FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
    pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
    pub exact_word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
    pub word_prefix_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
    pub exact_word_prefix_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,

    pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
    pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
    pub word_prefix_position_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
    pub word_positions: FxHashMap<Interned<String>, Vec<u16>>,
    pub word_prefix_positions: FxHashMap<Interned<String>, Vec<u16>>,

    pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
    pub word_prefix_fid_docids: FxHashMap<(Interned<String>, u16), Option<Cow<'ctx, [u8]>>>,
    pub word_fids: FxHashMap<Interned<String>, Vec<u16>>,
    pub word_prefix_fids: FxHashMap<Interned<String>, Vec<u16>>,
}
impl<'ctx> DatabaseCache<'ctx> {
    fn get_value<'v, K1, KC>(
        txn: &'ctx RoTxn<'_>,
        cache_key: K1,
        db_key: &'v KC::EItem,
        cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
        universe: Option<&RoaringBitmap>,
        db: Database<KC, Bytes>,
    ) -> Result<Option<RoaringBitmap>>
    where
        K1: Copy + Eq + Hash,
        KC: BytesEncode<'v>,
    {
        if let Entry::Vacant(entry) = cache.entry(cache_key) {
            let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
            entry.insert(bitmap_ptr);
        }

        let bitmap_bytes = match cache.get(&cache_key).unwrap() {
            Some(Cow::Borrowed(bytes)) => bytes,
            Some(Cow::Owned(bytes)) => bytes.as_slice(),
            None => return Ok(None),
        };

        match (bitmap_bytes, universe) {
            (bytes, Some(universe)) => {
                CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
                    .map(Some)
                    .map_err(Into::into)
            }
            (bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
                .map(Some)
                .map_err(heed::Error::Decoding)
                .map_err(Into::into),
        }
    }
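
The `universe` fast path above intersects a still-serialized bitmap with an in-memory one. A naive sketch of the same semantics using only the public `roaring` crate and its standard serialization (the CBO codec used here is a different on-disk format, and `intersection_with_serialized` avoids materializing the stored bitmap first; this sketch does not):

use roaring::RoaringBitmap;

fn intersection_with_serialized_naive(
    bytes: &[u8],
    universe: &RoaringBitmap,
) -> std::io::Result<RoaringBitmap> {
    // Reference behaviour: fully decode the stored bitmap, then intersect.
    let decoded = RoaringBitmap::deserialize_from(bytes)?;
    Ok(decoded & universe)
}

fn main() -> std::io::Result<()> {
    let stored: RoaringBitmap = (0..100).collect();
    let mut bytes = Vec::new();
    stored.serialize_into(&mut bytes)?;

    let universe: RoaringBitmap = (50..60).collect();
    let expected: RoaringBitmap = (50..60).collect();
    assert_eq!(intersection_with_serialized_naive(&bytes, &universe)?, expected);
    Ok(())
}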

    fn get_value_length<'v, K1, KC>(
        txn: &'ctx RoTxn<'_>,
        cache_key: K1,
        db_key: &'v KC::EItem,
        cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
        db: Database<KC, Bytes>,
    ) -> Result<Option<u64>>
    where
        K1: Copy + Eq + Hash,
        KC: BytesEncode<'v>,
    {
        if let Entry::Vacant(entry) = cache.entry(cache_key) {
            let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed);
            entry.insert(bitmap_ptr);
        }

        let bitmap_bytes = match cache.get(&cache_key).unwrap() {
            Some(Cow::Borrowed(bytes)) => bytes,
            Some(Cow::Owned(bytes)) => bytes.as_slice(),
            None => return Ok(None),
        };

        CboRoaringBitmapLenCodec::bytes_decode_owned(bitmap_bytes)
            .map(Some)
            .map_err(heed::Error::Decoding)
            .map_err(Into::into)
    }

    fn get_value_from_keys<'v, K1, KC, MF>(
        txn: &'ctx RoTxn<'_>,
        cache_key: K1,
        db_keys: &'v [KC::EItem],
        cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
        db: Database<KC, Bytes>,
        universe: Option<&RoaringBitmap>,
        merger: MF,
    ) -> Result<Option<RoaringBitmap>>
    where
        K1: Copy + Eq + Hash,
        KC: BytesEncode<'v>,
        KC::EItem: Sized,
        MF: MergeFunction,
        crate::Error: From<MF::Error>,
    {
        if let Entry::Vacant(entry) = cache.entry(cache_key) {
            let bitmap_ptr: Option<Cow<'ctx, [u8]>> = match db_keys {
                [] => None,
                [key] => db.get(txn, key)?.map(Cow::Borrowed),
                keys => {
                    let bitmaps = keys
                        .iter()
                        .filter_map(|key| db.get(txn, key).transpose())
                        .map(|v| v.map(Cow::Borrowed))
                        .collect::<std::result::Result<Vec<Cow<'_, [u8]>>, _>>()?;

                    if bitmaps.is_empty() {
                        None
                    } else {
                        Some(merger.merge(&[], &bitmaps[..])?)
                    }
                }
            };

            entry.insert(bitmap_ptr);
        }

        let bitmap_bytes = match cache.get(&cache_key).unwrap() {
            Some(Cow::Borrowed(bytes)) => bytes,
            Some(Cow::Owned(bytes)) => bytes.as_slice(),
            None => return Ok(None),
        };

        match (bitmap_bytes, universe) {
            (bytes, Some(universe)) => {
                CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
                    .map(Some)
                    .map_err(Into::into)
            }
            (bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
                .map(Some)
                .map_err(heed::Error::Decoding)
                .map_err(Into::into),
        }
    }
}
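
With restricted fids, `get_value_from_keys` merges one serialized bitmap per allowed field. The merge is a plain union, as in this small `roaring` sketch (the field bitmaps are invented for the example; `MergeCboRoaringBitmaps` performs the same union over the CBO serialization):

use roaring::{MultiOps, RoaringBitmap};

fn main() {
    // One docids bitmap per searchable field the word appears in.
    let per_field = vec![
        RoaringBitmap::from_iter([1u32, 2, 3]),
        RoaringBitmap::from_iter([3u32, 4]),
        RoaringBitmap::from_iter([9u32]),
    ];
    // The word's docids across the allowed fields is the union of the per-field bitmaps.
    let merged: RoaringBitmap = per_field.into_iter().union();
    assert_eq!(merged, RoaringBitmap::from_iter([1u32, 2, 3, 4, 9]));
}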

impl<'ctx> SearchContext<'ctx> {
    pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
        if let Some(fst) = self.db_cache.words_fst.clone() {
            Ok(fst)
        } else {
            let fst = self.index.words_fst(self.txn)?;
            self.db_cache.words_fst = Some(fst.clone());
            Ok(fst)
        }
    }

    pub fn word_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        word: Word,
    ) -> Result<Option<RoaringBitmap>> {
        match word {
            Word::Original(word) => {
                let exact = self.get_db_exact_word_docids(universe, word)?;
                let tolerant = self.get_db_word_docids(universe, word)?;
                Ok(match (exact, tolerant) {
                    (None, None) => None,
                    (None, Some(tolerant)) => Some(tolerant),
                    (Some(exact), None) => Some(exact),
                    (Some(exact), Some(tolerant)) => {
                        let mut both = exact;
                        both |= tolerant;
                        Some(both)
                    }
                })
            }
            Word::Derived(word) => self.get_db_word_docids(universe, word),
        }
    }

    /// Retrieve or insert the given value in the `word_docids` database.
    fn get_db_word_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        word: Interned<String>,
    ) -> Result<Option<RoaringBitmap>> {
        match &self.restricted_fids {
            Some(restricted_fids) => {
                let interned = self.word_interner.get(word).as_str();
                let keys: Vec<_> =
                    restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();

                DatabaseCache::get_value_from_keys(
                    self.txn,
                    word,
                    &keys[..],
                    &mut self.db_cache.word_docids,
                    self.index.word_fid_docids.remap_data_type::<Bytes>(),
                    universe,
                    MergeCboRoaringBitmaps,
                )
            }
            None => DatabaseCache::get_value(
                self.txn,
                word,
                self.word_interner.get(word).as_str(),
                &mut self.db_cache.word_docids,
                universe,
                self.index.word_docids.remap_data_type::<Bytes>(),
            ),
        }
    }

    fn get_db_exact_word_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        word: Interned<String>,
    ) -> Result<Option<RoaringBitmap>> {
        match &self.restricted_fids {
            Some(restricted_fids) => {
                let interned = self.word_interner.get(word).as_str();
                let keys: Vec<_> =
                    restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();

                DatabaseCache::get_value_from_keys(
                    self.txn,
                    word,
                    &keys[..],
                    &mut self.db_cache.exact_word_docids,
                    self.index.word_fid_docids.remap_data_type::<Bytes>(),
                    universe,
                    MergeCboRoaringBitmaps,
                )
            }
            None => DatabaseCache::get_value(
                self.txn,
                word,
                self.word_interner.get(word).as_str(),
                &mut self.db_cache.exact_word_docids,
                universe,
                self.index.exact_word_docids.remap_data_type::<Bytes>(),
            ),
        }
    }

    pub fn word_prefix_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        prefix: Word,
    ) -> Result<Option<RoaringBitmap>> {
        match prefix {
            Word::Original(prefix) => {
                let exact = self.get_db_exact_word_prefix_docids(universe, prefix)?;
                let tolerant = self.get_db_word_prefix_docids(universe, prefix)?;
                Ok(match (exact, tolerant) {
                    (None, None) => None,
                    (None, Some(tolerant)) => Some(tolerant),
                    (Some(exact), None) => Some(exact),
                    (Some(exact), Some(tolerant)) => {
                        let mut both = exact;
                        both |= tolerant;
                        Some(both)
                    }
                })
            }
            Word::Derived(prefix) => self.get_db_word_prefix_docids(universe, prefix),
        }
    }

    /// Retrieve or insert the given value in the `word_prefix_docids` database.
    fn get_db_word_prefix_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        prefix: Interned<String>,
    ) -> Result<Option<RoaringBitmap>> {
        match &self.restricted_fids {
            Some(restricted_fids) => {
                let interned = self.word_interner.get(prefix).as_str();
                let keys: Vec<_> =
                    restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();

                DatabaseCache::get_value_from_keys(
                    self.txn,
                    prefix,
                    &keys[..],
                    &mut self.db_cache.word_prefix_docids,
                    self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
                    universe,
                    MergeCboRoaringBitmaps,
                )
            }
            None => DatabaseCache::get_value(
                self.txn,
                prefix,
                self.word_interner.get(prefix).as_str(),
                &mut self.db_cache.word_prefix_docids,
                universe,
                self.index.word_prefix_docids.remap_data_type::<Bytes>(),
            ),
        }
    }

    fn get_db_exact_word_prefix_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        prefix: Interned<String>,
    ) -> Result<Option<RoaringBitmap>> {
        match &self.restricted_fids {
            Some(restricted_fids) => {
                let interned = self.word_interner.get(prefix).as_str();
                let keys: Vec<_> =
                    restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();

                DatabaseCache::get_value_from_keys(
                    self.txn,
                    prefix,
                    &keys[..],
                    &mut self.db_cache.exact_word_prefix_docids,
                    self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
                    universe,
                    MergeCboRoaringBitmaps,
                )
            }
            None => DatabaseCache::get_value(
                self.txn,
                prefix,
                self.word_interner.get(prefix).as_str(),
                &mut self.db_cache.exact_word_prefix_docids,
                universe,
                self.index.exact_word_prefix_docids.remap_data_type::<Bytes>(),
            ),
        }
    }

    pub fn get_db_word_pair_proximity_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        word1: Interned<String>,
        word2: Interned<String>,
        proximity: u8,
    ) -> Result<Option<RoaringBitmap>> {
        match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
            ProximityPrecision::ByAttribute => {
                // Force proximity to 0 because:
                // in ByAttribute, there are only 2 possible distances:
                // 1. words in the same attribute: in that case the DB contains (0, word1, word2)
                // 2. words in different attributes: no DB entry for these two words.
                let proximity = 0;
                let docids = if let Some(docids) =
                    self.db_cache.word_pair_proximity_docids.get(&(proximity, word1, word2))
                {
                    docids
                        .as_ref()
                        .map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d))
                        .transpose()
                        .map_err(heed::Error::Decoding)?
                } else {
                    // Compute the distance at the attribute level and store it in the cache.
                    let fids = self.index.searchable_fields_ids(self.txn)?;
                    let mut docids = RoaringBitmap::new();
                    for fid in fids {
                        // for each field, intersect left word bitmap and right word bitmap,
                        // then merge the result in a global bitmap before storing it in the cache.
                        let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?;
                        let word2_docids = self.get_db_word_fid_docids(universe, word2, fid)?;
                        if let (Some(word1_docids), Some(word2_docids)) =
                            (word1_docids, word2_docids)
                        {
                            docids |= word1_docids & word2_docids;
                        }
                    }
                    let encoded = CboRoaringBitmapCodec::bytes_encode(&docids)
                        .map(Cow::into_owned)
                        .map(Cow::Owned)
                        .map(Some)
                        .map_err(heed::Error::Decoding)?;
                    self.db_cache
                        .word_pair_proximity_docids
                        .insert((proximity, word1, word2), encoded);
                    Some(docids)
                };

                Ok(docids)
            }
            ProximityPrecision::ByWord => DatabaseCache::get_value(
                self.txn,
                (proximity, word1, word2),
                &(
                    proximity,
                    self.word_interner.get(word1).as_str(),
                    self.word_interner.get(word2).as_str(),
                ),
                &mut self.db_cache.word_pair_proximity_docids,
                universe,
                self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
            ),
        }
    }

    pub fn get_db_word_pair_proximity_docids_len(
        &mut self,
        universe: Option<&RoaringBitmap>,
        word1: Interned<String>,
        word2: Interned<String>,
        proximity: u8,
    ) -> Result<Option<u64>> {
        match self.index.proximity_precision(self.txn)?.unwrap_or_default() {
            ProximityPrecision::ByAttribute => Ok(self
                .get_db_word_pair_proximity_docids(universe, word1, word2, proximity)?
                .map(|d| d.len())),
            ProximityPrecision::ByWord => DatabaseCache::get_value_length::<_, _>(
                self.txn,
                (proximity, word1, word2),
                &(
                    proximity,
                    self.word_interner.get(word1).as_str(),
                    self.word_interner.get(word2).as_str(),
                ),
                &mut self.db_cache.word_pair_proximity_docids,
                self.index.word_pair_proximity_docids.remap_data_type::<Bytes>(),
            ),
        }
    }

    pub fn get_db_word_prefix_pair_proximity_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        word1: Interned<String>,
        prefix2: Interned<String>,
        mut proximity: u8,
    ) -> Result<Option<RoaringBitmap>> {
        let proximity_precision = self.index.proximity_precision(self.txn)?.unwrap_or_default();
        if proximity_precision == ProximityPrecision::ByAttribute {
            // Force proximity to 0 because:
            // in ByAttribute, there are only 2 possible distances:
            // 1. words in the same attribute: in that case the DB contains (0, word1, word2)
            // 2. words in different attributes: no DB entry for these two words.
            proximity = 0;
        }

        let docids = if let Some(docids) =
            self.db_cache.word_prefix_pair_proximity_docids.get(&(proximity, word1, prefix2))
        {
            docids.clone()
        } else {
            let prefix_docids = match proximity_precision {
                ProximityPrecision::ByAttribute => {
                    // Compute the distance at the attribute level and store it in the cache.
                    let fids = self.index.searchable_fields_ids(self.txn)?;
                    let mut prefix_docids = RoaringBitmap::new();
                    // for each field, intersect left word bitmap and right word bitmap,
                    // then merge the result in a global bitmap before storing it in the cache.
                    for fid in fids {
                        let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?;
                        let prefix2_docids =
                            self.get_db_word_prefix_fid_docids(universe, prefix2, fid)?;
                        if let (Some(word1_docids), Some(prefix2_docids)) =
                            (word1_docids, prefix2_docids)
                        {
                            prefix_docids |= word1_docids & prefix2_docids;
                        }
                    }
                    prefix_docids
                }
                ProximityPrecision::ByWord => {
                    // compute docids using prefix iter and store the result in the cache.
                    let key = U8StrStrCodec::bytes_encode(&(
                        proximity,
                        self.word_interner.get(word1).as_str(),
                        self.word_interner.get(prefix2).as_str(),
                    ))
                    .unwrap()
                    .into_owned();
                    let mut prefix_docids = RoaringBitmap::new();
                    let remap_key_type = self
                        .index
                        .word_pair_proximity_docids
                        .remap_key_type::<Bytes>()
                        .prefix_iter(self.txn, &key)?;
                    for result in remap_key_type {
                        let (_, docids) = result?;

                        prefix_docids |= docids;
                    }
                    prefix_docids
                }
            };
            self.db_cache
                .word_prefix_pair_proximity_docids
                .insert((proximity, word1, prefix2), Some(prefix_docids.clone()));
            Some(prefix_docids)
        };
        Ok(docids)
    }

    pub fn get_db_prefix_word_pair_proximity_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        left_prefix: Interned<String>,
        right: Interned<String>,
        proximity: u8,
    ) -> Result<Option<RoaringBitmap>> {
        // only accept exact matches on reverted positions
        self.get_db_word_pair_proximity_docids(universe, left_prefix, right, proximity)
    }

    pub fn get_db_word_fid_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        word: Interned<String>,
        fid: u16,
    ) -> Result<Option<RoaringBitmap>> {
        // if the requested fid isn't in the restricted list, return None.
        if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) {
            return Ok(None);
        }

        DatabaseCache::get_value(
            self.txn,
            (word, fid),
            &(self.word_interner.get(word).as_str(), fid),
            &mut self.db_cache.word_fid_docids,
            universe,
            self.index.word_fid_docids.remap_data_type::<Bytes>(),
        )
    }

    pub fn get_db_word_prefix_fid_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        word_prefix: Interned<String>,
        fid: u16,
    ) -> Result<Option<RoaringBitmap>> {
        // if the requested fid isn't in the restricted list, return None.
        if self.restricted_fids.as_ref().map_or(false, |fids| !fids.contains(&fid)) {
            return Ok(None);
        }

        DatabaseCache::get_value(
            self.txn,
            (word_prefix, fid),
            &(self.word_interner.get(word_prefix).as_str(), fid),
            &mut self.db_cache.word_prefix_fid_docids,
            universe,
            self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
        )
    }

    pub fn get_db_word_fids(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
        let fids = match self.db_cache.word_fids.entry(word) {
            Entry::Occupied(fids) => fids.get().clone(),
            Entry::Vacant(entry) => {
                let mut key = self.word_interner.get(word).as_bytes().to_owned();
                key.push(0);
                let mut fids = vec![];
                let remap_key_type = self
                    .index
                    .word_fid_docids
                    .remap_types::<Bytes, Bytes>()
                    .prefix_iter(self.txn, &key)?
                    .remap_key_type::<StrBEU16Codec>();
                for result in remap_key_type {
                    let ((_, fid), value) = result?;
                    // filling other caches to avoid searching for them again
                    self.db_cache.word_fid_docids.insert((word, fid), Some(Cow::Borrowed(value)));
                    fids.push(fid);
                }
                entry.insert(fids.clone());
                fids
            }
        };
        Ok(fids)
    }

    pub fn get_db_word_prefix_fids(&mut self, word_prefix: Interned<String>) -> Result<Vec<u16>> {
        let fids = match self.db_cache.word_prefix_fids.entry(word_prefix) {
            Entry::Occupied(fids) => fids.get().clone(),
            Entry::Vacant(entry) => {
                let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
                key.push(0);
                let mut fids = vec![];
                let remap_key_type = self
                    .index
                    .word_prefix_fid_docids
                    .remap_types::<Bytes, Bytes>()
                    .prefix_iter(self.txn, &key)?
                    .remap_key_type::<StrBEU16Codec>();
                for result in remap_key_type {
                    let ((_, fid), value) = result?;
                    // filling other caches to avoid searching for them again
                    self.db_cache
                        .word_prefix_fid_docids
                        .insert((word_prefix, fid), Some(Cow::Borrowed(value)));
                    fids.push(fid);
                }
                entry.insert(fids.clone());
                fids
            }
        };
        Ok(fids)
    }

    pub fn get_db_word_position_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        word: Interned<String>,
        position: u16,
    ) -> Result<Option<RoaringBitmap>> {
        DatabaseCache::get_value(
            self.txn,
            (word, position),
            &(self.word_interner.get(word).as_str(), position),
            &mut self.db_cache.word_position_docids,
            universe,
            self.index.word_position_docids.remap_data_type::<Bytes>(),
        )
    }

    pub fn get_db_word_prefix_position_docids(
        &mut self,
        universe: Option<&RoaringBitmap>,
        word_prefix: Interned<String>,
        position: u16,
    ) -> Result<Option<RoaringBitmap>> {
        DatabaseCache::get_value(
            self.txn,
            (word_prefix, position),
            &(self.word_interner.get(word_prefix).as_str(), position),
            &mut self.db_cache.word_prefix_position_docids,
            universe,
            self.index.word_prefix_position_docids.remap_data_type::<Bytes>(),
        )
    }

    pub fn get_db_word_positions(&mut self, word: Interned<String>) -> Result<Vec<u16>> {
        let positions = match self.db_cache.word_positions.entry(word) {
            Entry::Occupied(positions) => positions.get().clone(),
            Entry::Vacant(entry) => {
                let mut key = self.word_interner.get(word).as_bytes().to_owned();
                key.push(0);
                let mut positions = vec![];
                let remap_key_type = self
                    .index
                    .word_position_docids
                    .remap_types::<Bytes, Bytes>()
                    .prefix_iter(self.txn, &key)?
                    .remap_key_type::<StrBEU16Codec>();
                for result in remap_key_type {
                    let ((_, position), value) = result?;
                    // filling other caches to avoid searching for them again
                    self.db_cache
                        .word_position_docids
                        .insert((word, position), Some(Cow::Borrowed(value)));
                    positions.push(position);
                }
                entry.insert(positions.clone());
                positions
            }
        };
        Ok(positions)
    }

    pub fn get_db_word_prefix_positions(
        &mut self,
        word_prefix: Interned<String>,
    ) -> Result<Vec<u16>> {
        let positions = match self.db_cache.word_prefix_positions.entry(word_prefix) {
            Entry::Occupied(positions) => positions.get().clone(),
            Entry::Vacant(entry) => {
                let mut key = self.word_interner.get(word_prefix).as_bytes().to_owned();
                key.push(0);
                let mut positions = vec![];
                let remap_key_type = self
                    .index
                    .word_prefix_position_docids
                    .remap_types::<Bytes, Bytes>()
                    .prefix_iter(self.txn, &key)?
                    .remap_key_type::<StrBEU16Codec>();
                for result in remap_key_type {
                    let ((_, position), value) = result?;
                    // filling other caches to avoid searching for them again
                    self.db_cache
                        .word_prefix_position_docids
                        .insert((word_prefix, position), Some(Cow::Borrowed(value)));
                    positions.push(position);
                }
                entry.insert(positions.clone());
                positions
            }
        };
        Ok(positions)
    }
}
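
The `key.push(0)` in the prefix scans above is what scopes the iteration to a single word: keys are laid out as the word bytes, a 0 separator, then a big-endian u16, so without the separator a scan for "car" would also sweep up "cart". A small sketch of that layout (an illustration of the `StrBEU16Codec` convention, not the codec itself):

/// Encode a (word, fid) key as: word bytes, a 0 separator, then the fid in big-endian.
fn encode_word_u16_key(word: &str, fid: u16) -> Vec<u8> {
    let mut key = word.as_bytes().to_owned();
    key.push(0);
    key.extend_from_slice(&fid.to_be_bytes());
    key
}

fn main() {
    let car0 = encode_word_u16_key("car", 0);
    let cart0 = encode_word_u16_key("cart", 0);
    // The prefix used for scanning all fids of "car": word bytes + 0 separator.
    let prefix = {
        let mut p = b"car".to_vec();
        p.push(0);
        p
    };
    assert!(car0.starts_with(&prefix));
    // Without the separator, "cart" keys would be swept up by a scan on "car".
    assert!(!cart0.starts_with(&prefix));
    assert!(cart0.starts_with(b"car"));
}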
123
crates/milli/src/search/new/distinct.rs
Normal file
@@ -0,0 +1,123 @@

use heed::types::{Bytes, Str, Unit};
use heed::{Database, RoPrefix, RoTxn};
use roaring::RoaringBitmap;

const FID_SIZE: usize = 2;
const DOCID_SIZE: usize = 4;

use crate::heed_codec::facet::{
    FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::{Index, Result, SearchContext};

pub struct DistinctOutput {
    pub remaining: RoaringBitmap,
    pub excluded: RoaringBitmap,
}

/// Return a [`DistinctOutput`] containing:
/// - `remaining`: a set of docids built such that exactly one element from `candidates`
///   is kept for each distinct value inside the given field. If the field does not exist, it
///   is considered unique.
/// - `excluded`: the set of document ids that contain a value for the given field that occurs
///   in the given candidates.
pub fn apply_distinct_rule(
    ctx: &mut SearchContext<'_>,
    field_id: u16,
    candidates: &RoaringBitmap,
) -> Result<DistinctOutput> {
    let mut excluded = RoaringBitmap::new();
    let mut remaining = RoaringBitmap::new();
    for docid in candidates {
        if excluded.contains(docid) {
            continue;
        }
        distinct_single_docid(ctx.index, ctx.txn, field_id, docid, &mut excluded)?;
        remaining.push(docid);
    }
    Ok(DistinctOutput { remaining, excluded })
}

/// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id.
pub fn distinct_single_docid(
    index: &Index,
    txn: &RoTxn<'_>,
    field_id: u16,
    docid: u32,
    excluded: &mut RoaringBitmap,
) -> Result<()> {
    for item in facet_string_values(docid, field_id, index, txn)? {
        let ((_, _, facet_value), _) = item?;
        if let Some(facet_docids) = facet_value_docids(
            index.facet_id_string_docids.remap_types(),
            txn,
            field_id,
            facet_value,
        )? {
            *excluded |= facet_docids;
        }
    }
    for item in facet_number_values(docid, field_id, index, txn)? {
        let ((_, _, facet_value), _) = item?;
        if let Some(facet_docids) =
            facet_value_docids(index.facet_id_f64_docids.remap_types(), txn, field_id, facet_value)?
        {
            *excluded |= facet_docids;
        }
    }
    Ok(())
}

/// Return all the docids containing the given value in the given field
fn facet_value_docids(
    database: Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    txn: &RoTxn<'_>,
    field_id: u16,
    facet_value: &[u8],
) -> heed::Result<Option<RoaringBitmap>> {
    database
        .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })
        .map(|opt| opt.map(|v| v.bitmap))
}

/// Return an iterator over each number value in the given field of the given document.
fn facet_number_values<'a>(
    docid: u32,
    field_id: u16,
    index: &Index,
    txn: &'a RoTxn<'a>,
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<BytesRefCodec>, Unit>> {
    let key = facet_values_prefix_key(field_id, docid);

    let iter = index
        .field_id_docid_facet_f64s
        .remap_key_type::<Bytes>()
        .prefix_iter(txn, &key)?
        .remap_key_type();

    Ok(iter)
}

/// Return an iterator over each string value in the given field of the given document.
pub fn facet_string_values<'a>(
    docid: u32,
    field_id: u16,
    index: &Index,
    txn: &'a RoTxn<'a>,
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<BytesRefCodec>, Str>> {
    let key = facet_values_prefix_key(field_id, docid);

    let iter = index
        .field_id_docid_facet_strings
        .remap_key_type::<Bytes>()
        .prefix_iter(txn, &key)?
        .remap_types();

    Ok(iter)
}

#[allow(clippy::drop_non_drop)]
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
    concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
}
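
The contract of `apply_distinct_rule` boils down to: keep one representative docid per distinct value, and exclude every other holder of an already-seen value. A standalone sketch on plain data (the real rule resolves values through the facet databases above):

use std::collections::{HashMap, HashSet};

/// Keep the first docid seen for each distinct value; exclude the rest.
fn apply_distinct(candidates: &[(u32, &str)]) -> (Vec<u32>, HashSet<u32>) {
    let mut first_for_value: HashMap<&str, u32> = HashMap::new();
    let mut remaining = Vec::new();
    let mut excluded = HashSet::new();
    for &(docid, value) in candidates {
        if first_for_value.contains_key(value) {
            excluded.insert(docid);
        } else {
            first_for_value.insert(value, docid);
            remaining.push(docid);
        }
    }
    (remaining, excluded)
}

fn main() {
    // Two documents share the value "blue"; only the first is kept.
    let candidates = [(1, "blue"), (2, "red"), (3, "blue")];
    let (remaining, excluded) = apply_distinct(&candidates);
    assert_eq!(remaining, vec![1, 2]);
    assert!(excluded.contains(&3));
}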
299
crates/milli/src/search/new/exact_attribute.rs
Normal file
@@ -0,0 +1,299 @@

use heed::types::Bytes;
use roaring::{MultiOps, RoaringBitmap};

use super::query_graph::QueryGraph;
use super::ranking_rules::{RankingRule, RankingRuleOutput};
use crate::score_details::{self, ScoreDetails};
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::ExactTerm;
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};

/// A ranking rule that produces 3 disjoint buckets:
///
/// 1. Documents from the universe whose value is exactly the query.
/// 2. Documents from the universe not in (1) whose value starts with the query.
/// 3. Documents from the universe not in (1) or (2).
pub struct ExactAttribute {
    state: State,
}

impl ExactAttribute {
    pub fn new() -> Self {
        Self { state: Default::default() }
    }
}

impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
    fn id(&self) -> String {
        "exact_attribute".to_owned()
    }

    #[tracing::instrument(level = "trace", skip_all, target = "search::exact_attribute")]
    fn start_iteration(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
        universe: &roaring::RoaringBitmap,
        query: &QueryGraph,
    ) -> Result<()> {
        self.state = State::start_iteration(ctx, universe, query)?;
        Ok(())
    }

    #[tracing::instrument(level = "trace", skip_all, target = "search::exact_attribute")]
    fn next_bucket(
        &mut self,
        _ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
        universe: &roaring::RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
        let state = std::mem::take(&mut self.state);
        let (state, output) = State::next(state, universe);
        self.state = state;

        Ok(output)
    }

    #[tracing::instrument(level = "trace", skip_all, target = "search::exact_attribute")]
    fn end_iteration(
        &mut self,
        _ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
    ) {
        self.state = Default::default();
    }
}

/// Inner state of the ranking rule.
#[derive(Default)]
enum State {
    /// State between two iterations
    #[default]
    Uninitialized,
    /// The next call to `next` will output the documents in the universe that have an attribute that is the exact query
    ExactAttribute(QueryGraph, Vec<FieldCandidates>),
    /// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query,
    /// but isn't the exact query.
    AttributeStarts(QueryGraph, Vec<FieldCandidates>),
    /// The next calls to `next` will output the input universe.
    Empty(QueryGraph),
}

/// The candidates sorted by attributes
///
/// Each of the bitmaps in a single `FieldCandidates` struct applies to the same field.
struct FieldCandidates {
    /// The candidates that start with all the words of the query in the field
    start_with_exact: RoaringBitmap,
    /// The candidates that have the same number of words as the query in the field
    exact_word_count: RoaringBitmap,
}

impl State {
    fn start_iteration(
        ctx: &mut SearchContext<'_>,
        universe: &RoaringBitmap,
        query_graph: &QueryGraph,
    ) -> Result<Self> {
        struct ExactTermInfo {
            exact_term: ExactTerm,
            start_position: u16,
            start_term_id: u8,
            position_count: usize,
        }

        let mut exact_terms: Vec<ExactTermInfo> =
            Vec::with_capacity(query_graph.nodes.len() as usize);
        for (_, node) in query_graph.nodes.iter() {
            match &node.data {
                QueryNodeData::Term(term) => {
                    let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
                        exact_term
                    } else {
                        continue;
                    };
                    exact_terms.push(ExactTermInfo {
                        exact_term,
                        start_position: *term.positions.start(),
                        start_term_id: *term.term_ids.start(),
                        position_count: term.positions.len(),
                    });
                }
                QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
            }
        }

        exact_terms.sort_by_key(|x| x.start_term_id);
        exact_terms.dedup_by_key(|x| x.start_term_id);
        let count_all_positions = exact_terms.iter().fold(0, |acc, x| acc + x.position_count);

        // bail if there is a "hole" (missing word) in remaining query graph
        if let Some(e) = exact_terms.first() {
            if e.start_term_id != 0 {
                return Ok(State::Empty(query_graph.clone()));
            }
        } else {
            return Ok(State::Empty(query_graph.clone()));
        }
        let mut previous_id = 0;
        for e in exact_terms.iter() {
            if e.start_term_id < previous_id || e.start_term_id - previous_id > 1 {
                return Ok(State::Empty(query_graph.clone()));
            } else {
                previous_id = e.start_term_id;
            }
        }

        // sample query: "sunflower are pretty"
        // sunflower at pos 0 in attr A
        // are at pos 1 in attr B
        // pretty at pos 2 in attr C
        // We want to eliminate such documents

        // first check that for each term, there exists some attribute that has this term at the correct position
        //"word-position-docids";
        let mut candidates = universe.clone();
        let words_positions: Vec<(Vec<_>, _)> = exact_terms
            .iter()
            .map(|e| (e.exact_term.interned_words(ctx).collect(), e.start_position))
            .collect();
        for (words, position) in &words_positions {
            if candidates.is_empty() {
                return Ok(State::Empty(query_graph.clone()));
            }

            'words: for (offset, word) in words.iter().enumerate() {
                let offset = offset as u16;
                let word = if let Some(word) = word {
                    word
                } else {
                    continue 'words;
                };
                // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of
                // longer phrases we'll be losing on precision here.
                let bucketed_position = crate::bucketed_position(position + offset);
                let word_position_docids = ctx
                    .get_db_word_position_docids(Some(universe), *word, bucketed_position)?
                    .unwrap_or_default();
                candidates &= word_position_docids;
                if candidates.is_empty() {
                    return Ok(State::Empty(query_graph.clone()));
                }
            }
        }

        let candidates = candidates;

        if candidates.is_empty() {
            return Ok(State::Empty(query_graph.clone()));
        }

        let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?;

        let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
        // then check that there exists at least one attribute that has all of the terms
        for fid in searchable_fields_ids {
            let intersection = MultiOps::intersection(
                words_positions
                    .iter()
                    .flat_map(|(words, ..)| words.iter())
                    // ignore stop words in phrases
                    .flatten()
                    .map(|word| -> Result<_> {
                        Ok(ctx
                            .get_db_word_fid_docids(Some(&candidates), *word, fid)?
                            .unwrap_or_default())
                    }),
            )?;
            if !intersection.is_empty() {
                // Although not really worth it in terms of performance,
                // it would be good to put this in cache for the sake of consistency
                let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
                    let bitmap_bytes = ctx
                        .index
                        .field_id_word_count_docids
                        .remap_data_type::<Bytes>()
                        .get(ctx.txn, &(fid, count_all_positions as u8))?;

                    match bitmap_bytes {
                        Some(bytes) => {
                            CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)?
                        }
                        None => RoaringBitmap::default(),
                    }
                } else {
                    RoaringBitmap::default()
                };
                candidates_per_attribute.push(FieldCandidates {
                    start_with_exact: intersection,
                    exact_word_count: candidates_with_exact_word_count,
                });
            }
        }
        // note we could have "false positives" where there exist both different attributes that collectively
        // have the terms in the correct order and a single attribute that has all the terms, but in the incorrect order.

        Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute))
    }

    fn next(
        state: State,
        universe: &RoaringBitmap,
    ) -> (State, Option<RankingRuleOutput<QueryGraph>>) {
        let (state, output) = match state {
            State::Uninitialized => (state, None),
            State::ExactAttribute(query_graph, candidates_per_attribute) => {
                // TODO it can be much faster to do the intersections before the unions...
                // or maybe the candidates_per_attribute are not containing anything outside universe
                let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
                    |FieldCandidates { start_with_exact, exact_word_count }| {
                        start_with_exact & exact_word_count
                    },
                ));
                candidates &= universe;
                (
                    State::AttributeStarts(query_graph.clone(), candidates_per_attribute),
                    Some(RankingRuleOutput {
                        query: query_graph,
                        candidates,
                        score: ScoreDetails::ExactAttribute(
                            score_details::ExactAttribute::ExactMatch,
                        ),
                    }),
                )
            }
            State::AttributeStarts(query_graph, candidates_per_attribute) => {
                // TODO it can be much faster to do the intersections before the unions...
                // or maybe the candidates_per_attribute are not containing anything outside universe
                let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
                    |FieldCandidates { mut start_with_exact, exact_word_count }| {
                        start_with_exact -= exact_word_count;
                        start_with_exact
                    },
                ));
                candidates &= universe;
                (
                    State::Empty(query_graph.clone()),
                    Some(RankingRuleOutput {
                        query: query_graph,
                        candidates,
                        score: ScoreDetails::ExactAttribute(
                            score_details::ExactAttribute::MatchesStart,
                        ),
                    }),
                )
            }
            State::Empty(query_graph) => (
                State::Empty(query_graph.clone()),
                Some(RankingRuleOutput {
                    query: query_graph,
                    candidates: universe.clone(),
                    score: ScoreDetails::ExactAttribute(
                        score_details::ExactAttribute::NoExactMatch,
                    ),
                }),
            ),
        };
        (state, output)
    }
}
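
The three buckets emitted by `State::next` partition the universe with plain set operations. A small `roaring` sketch of that partition (candidate bitmaps invented for the example, collapsed to a single already-merged field):

use roaring::RoaringBitmap;

fn main() {
    let universe = RoaringBitmap::from_iter(0u32..10);
    let start_with_exact = RoaringBitmap::from_iter([1u32, 2, 3, 4]);
    let exact_word_count = RoaringBitmap::from_iter([2u32, 3, 7]);

    // Bucket 1: attribute is exactly the query (starts with it AND has the same word count).
    let exact = &start_with_exact & &exact_word_count;
    // Bucket 2: attribute starts with the query but is longer.
    let starts = &start_with_exact - &exact_word_count;
    // Bucket 3: everything else in the universe.
    let rest = &universe - &exact - &starts;

    assert_eq!(exact, RoaringBitmap::from_iter([2u32, 3]));
    assert_eq!(starts, RoaringBitmap::from_iter([1u32, 4]));
    // The buckets are disjoint and cover the universe.
    assert_eq!(&exact | &starts | &rest, universe);
}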
309
crates/milli/src/search/new/geo_sort.rs
Normal file
@@ -0,0 +1,309 @@

use std::collections::VecDeque;
use std::iter::FromIterator;

use heed::types::{Bytes, Unit};
use heed::{RoPrefix, RoTxn};
use roaring::RoaringBitmap;
use rstar::RTree;

use super::facet_string_values;
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec};
use crate::score_details::{self, ScoreDetails};
use crate::{
    distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext,
    SearchLogger,
};

const FID_SIZE: usize = 2;
const DOCID_SIZE: usize = 4;

#[allow(clippy::drop_non_drop)]
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
    concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
}

/// Return an iterator over each number value in the given field of the given document.
fn facet_number_values<'a>(
    docid: u32,
    field_id: u16,
    index: &Index,
    txn: &'a RoTxn<'a>,
) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<OrderedF64Codec>, Unit>> {
    let key = facet_values_prefix_key(field_id, docid);

    let iter = index
        .field_id_docid_facet_f64s
        .remap_key_type::<Bytes>()
        .prefix_iter(txn, &key)?
        .remap_key_type();

    Ok(iter)
}

/// Define the strategy used by the geo sort.
/// The parameter represents the cache size, and, in the case of the Dynamic strategy,
/// the point where we move from using the iterative strategy to the rtree.
#[derive(Debug, Clone, Copy)]
pub enum Strategy {
    AlwaysIterative(usize),
    AlwaysRtree(usize),
    Dynamic(usize),
}

impl Default for Strategy {
    fn default() -> Self {
        Strategy::Dynamic(1000)
    }
}

impl Strategy {
    pub fn use_rtree(&self, candidates: usize) -> bool {
        match self {
            Strategy::AlwaysIterative(_) => false,
            Strategy::AlwaysRtree(_) => true,
            Strategy::Dynamic(i) => candidates >= *i,
        }
    }

    pub fn cache_size(&self) -> usize {
        match self {
            Strategy::AlwaysIterative(i) | Strategy::AlwaysRtree(i) | Strategy::Dynamic(i) => *i,
        }
    }
}
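
A short usage sketch of the strategy switch, assuming the `Strategy` type above is in scope (illustrative only):

fn choose(strategy: Strategy, candidates: usize) -> &'static str {
    // Mirrors fill_buffer's first decision: rtree walk or brute-force iteration.
    if strategy.use_rtree(candidates) {
        "rtree nearest-neighbour walk"
    } else {
        "iterative scan + sort_by_cached_key"
    }
}

// With the default Dynamic(1000):
// choose(Strategy::default(), 999)   == "iterative scan + sort_by_cached_key"
// choose(Strategy::default(), 1_000) == "rtree nearest-neighbour walk"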
|
||||
pub struct GeoSort<Q: RankingRuleQueryTrait> {
|
||||
query: Option<Q>,
|
||||
|
||||
strategy: Strategy,
|
||||
ascending: bool,
|
||||
point: [f64; 2],
|
||||
field_ids: Option<[u16; 2]>,
|
||||
rtree: Option<RTree<GeoPoint>>,
|
||||
|
||||
cached_sorted_docids: VecDeque<(u32, [f64; 2])>,
|
||||
geo_candidates: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl<Q: RankingRuleQueryTrait> GeoSort<Q> {
|
||||
pub fn new(
|
||||
strategy: Strategy,
|
||||
geo_faceted_docids: RoaringBitmap,
|
||||
point: [f64; 2],
|
||||
ascending: bool,
|
||||
) -> Result<Self> {
|
||||
Ok(Self {
|
||||
query: None,
|
||||
strategy,
|
||||
ascending,
|
||||
point,
|
||||
geo_candidates: geo_faceted_docids,
|
||||
field_ids: None,
|
||||
rtree: None,
|
||||
cached_sorted_docids: VecDeque::new(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Refill the internal buffer of cached docids based on the strategy.
|
||||
/// Drop the rtree if we don't need it anymore.
|
||||
fn fill_buffer(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'_>,
|
||||
geo_candidates: &RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng");
|
||||
debug_assert!(self.cached_sorted_docids.is_empty());
|
||||
|
||||
// lazily initialize the rtree if needed by the strategy, and cache it in `self.rtree`
|
||||
let rtree = if self.strategy.use_rtree(geo_candidates.len() as usize) {
|
||||
if let Some(rtree) = self.rtree.as_ref() {
|
||||
// get rtree from cache
|
||||
Some(rtree)
|
||||
} else {
|
||||
let rtree = ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree");
|
||||
// insert rtree in cache and returns it.
|
||||
// Can't use `get_or_insert_with` because getting the rtree from the DB is a fallible operation.
|
||||
Some(&*self.rtree.insert(rtree))
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let cache_size = self.strategy.cache_size();
|
||||
if let Some(rtree) = rtree {
|
||||
if self.ascending {
|
||||
let point = lat_lng_to_xyz(&self.point);
|
||||
for point in rtree.nearest_neighbor_iter(&point) {
|
||||
if geo_candidates.contains(point.data.0) {
|
||||
self.cached_sorted_docids.push_back(point.data);
|
||||
if self.cached_sorted_docids.len() >= cache_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// in the case of the desc geo sort we look for the closest point to the opposite of the queried point
|
||||
// and we insert the points in reverse order they get reversed when emptying the cache later on
|
||||
let point = lat_lng_to_xyz(&opposite_of(self.point));
|
||||
for point in rtree.nearest_neighbor_iter(&point) {
|
||||
if geo_candidates.contains(point.data.0) {
|
||||
self.cached_sorted_docids.push_front(point.data);
|
||||
if self.cached_sorted_docids.len() >= cache_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// the iterative version
|
||||
let [lat, lng] = self.field_ids.unwrap();
|
||||
|
||||
let mut documents = geo_candidates
|
||||
.iter()
|
||||
.map(|id| -> Result<_> { Ok((id, geo_value(id, lat, lng, ctx.index, ctx.txn)?)) })
|
||||
.collect::<Result<Vec<(u32, [f64; 2])>>>()?;
|
||||
// computing the distance between two points is expensive thus we cache the result
|
||||
documents
|
||||
.sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize);
|
||||
self.cached_sorted_docids.extend(documents);
|
||||
};
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
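
// Note on the two refill paths above: with the default `Strategy::Dynamic(1000)`,
// 5000 geo candidates take the rtree path and only the 1000 nearest matches are
// buffered, while 500 candidates are all fetched, sorted by distance, and cached.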

/// Extracts the lat and lng values from a single document.
///
/// If the value cannot be found in the facet number index, it is extracted from
/// the facet string index and parsed as an f64 (mirroring how the geo extraction behaves).
fn geo_value(
    docid: u32,
    field_lat: u16,
    field_lng: u16,
    index: &Index,
    rtxn: &RoTxn<'_>,
) -> Result<[f64; 2]> {
    let extract_geo = |geo_field: u16| -> Result<f64> {
        match facet_number_values(docid, geo_field, index, rtxn)?.next() {
            Some(Ok(((_, _, geo), ()))) => Ok(geo),
            Some(Err(e)) => Err(e.into()),
            None => match facet_string_values(docid, geo_field, index, rtxn)?.next() {
                Some(Ok((_, geo))) => {
                    Ok(geo.parse::<f64>().expect("cannot parse geo field as f64"))
                }
                Some(Err(e)) => Err(e.into()),
                None => panic!("A geo faceted document doesn't contain any lat or lng"),
            },
        }
    };

    let lat = extract_geo(field_lat)?;
    let lng = extract_geo(field_lng)?;

    Ok([lat, lng])
}

impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort<Q> {
    fn id(&self) -> String {
        "geo_sort".to_owned()
    }

    #[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")]
    fn start_iteration(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<Q>,
        universe: &RoaringBitmap,
        query: &Q,
    ) -> Result<()> {
        assert!(self.query.is_none());

        self.query = Some(query.clone());

        let geo_candidates = &self.geo_candidates & universe;

        if geo_candidates.is_empty() {
            return Ok(());
        }

        let fid_map = ctx.index.fields_ids_map(ctx.txn)?;
        let lat = fid_map.id("_geo.lat").expect("geo candidates but no fid for lat");
        let lng = fid_map.id("_geo.lng").expect("geo candidates but no fid for lng");
        self.field_ids = Some([lat, lng]);
        self.fill_buffer(ctx, &geo_candidates)?;
        Ok(())
    }

    #[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")]
    #[allow(clippy::only_used_in_recursion)]
    fn next_bucket(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        logger: &mut dyn SearchLogger<Q>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<Q>>> {
        let query = self.query.as_ref().unwrap().clone();

        let geo_candidates = &self.geo_candidates & universe;

        if geo_candidates.is_empty() {
            return Ok(Some(RankingRuleOutput {
                query,
                candidates: universe.clone(),
                score: ScoreDetails::GeoSort(score_details::GeoSort {
                    target_point: self.point,
                    ascending: self.ascending,
                    value: None,
                }),
            }));
        }

        let ascending = self.ascending;
        let next = |cache: &mut VecDeque<_>| {
            if ascending {
                cache.pop_front()
            } else {
                cache.pop_back()
            }
        };
        while let Some((id, point)) = next(&mut self.cached_sorted_docids) {
            if geo_candidates.contains(id) {
                return Ok(Some(RankingRuleOutput {
                    query,
                    candidates: RoaringBitmap::from_iter([id]),
                    score: ScoreDetails::GeoSort(score_details::GeoSort {
                        target_point: self.point,
                        ascending: self.ascending,
                        value: Some(point),
                    }),
                }));
            }
        }

        // If we get out of this loop, it means we've exhausted our cache.
        // We need to refill it and run the function again.
        self.fill_buffer(ctx, &geo_candidates)?;
        self.next_bucket(ctx, logger, universe)
    }

    #[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")]
    fn end_iteration(&mut self, _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger<Q>) {
        // we do not reset the rtree here, as it could be used in a next iteration
        self.query = None;
        self.cached_sorted_docids.clear();
    }
}

/// Compute the antipodal coordinate of `coord`
fn opposite_of(mut coord: [f64; 2]) -> [f64; 2] {
    coord[0] *= -1.;
    // in the case of (x, 0) we want to return (-x, 180)
    if coord[1] > 0. {
        coord[1] -= 180.;
    } else {
        coord[1] += 180.;
    }

    coord
}
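
// Illustrative sketch (hypothetical test, not in the original file): the antipode
// negates the latitude and shifts the longitude by 180 degrees.
fn opposite_of_sketch() {
    assert_eq!(opposite_of([45.0, 90.0]), [-45.0, -90.0]);
    assert_eq!(opposite_of([20.0, 0.0]), [-20.0, 180.0]); // a longitude of 0 maps to 180, not -180
}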
431
crates/milli/src/search/new/graph_based_ranking_rule.rs
Normal file
@@ -0,0 +1,431 @@
/*! Implementation of a generic graph-based ranking rule.

A graph-based ranking rule is a ranking rule that works by representing
its possible operations and their relevancy cost as a directed acyclic multi-graph
built on top of the query graph. It then computes its buckets by finding the
cheapest paths from the start node to the end node and computing the document ids
that satisfy those paths.

For example, the proximity ranking rule builds a graph where the edges between two
nodes represent a condition that the term of the source node is in a certain proximity
to the term of the destination node. With the query "pretty house by" where the term
"pretty" has three possible proximities to the term "house" and "house" has two
proximities to "by", the graph will look like this:

```txt
┌───────┐     ┌───────┐─────1────▶┌───────┐──1──▶┌─────┐    ┌───────┐
│ START │──0─▶│pretty │─────2────▶│ house │      │ by  │─0─▶│  END  │
└───────┘     └───────┘─────3────▶└───────┘──2──▶└─────┘    └───────┘
```

The proximity ranking rule's first bucket will be determined by the union of all
the shortest paths from START to END, which in this case is:
```txt
START --0-> pretty --1--> house --1--> by --0--> end
```
The path's corresponding document ids are found by taking the intersection of the
document ids of each edge. That is, we find the documents where both `pretty` is
1-close to `house` AND `house` is 1-close to `by`.

For the second bucket, we get the union of the second-cheapest paths, which are:
```txt
START --0-> pretty --1--> house --2--> by --0--> end
START --0-> pretty --2--> house --1--> by --0--> end
```
That is, we find the documents where either:
- `pretty` is 1-close to `house` AND `house` is 2-close to `by`
- OR: `pretty` is 2-close to `house` AND `house` is 1-close to `by`
*/
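
// Illustrative sketch, not part of the original file: how a bucket is derived from
// paths. The docids of a path are the intersection of the docids of its conditions,
// and a bucket is the union of the docids of all paths of equal cost. The two
// bitmaps below are hypothetical stand-ins for real condition docids.
fn bucket_from_cheapest_path_sketch() {
    use roaring::RoaringBitmap;
    let pretty_1_close_to_house = RoaringBitmap::from_iter([1u32, 2, 3]);
    let house_1_close_to_by = RoaringBitmap::from_iter([2u32, 3, 4]);
    // docids of the single cheapest path `pretty --1--> house --1--> by`
    let path_docids = &pretty_1_close_to_house & &house_1_close_to_by;
    // with one cheapest path, the first bucket is exactly its docids: {2, 3}
    assert_eq!(path_docids.iter().collect::<Vec<_>>(), vec![2, 3]);
}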

use std::collections::BTreeSet;
use std::ops::ControlFlow;

use roaring::RoaringBitmap;

use super::interner::{Interned, MappedInterner};
use super::logger::SearchLogger;
use super::query_graph::QueryNode;
use super::ranking_rule_graph::{
    ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, FidGraph, PositionGraph, ProximityGraph,
    RankingRuleGraph, RankingRuleGraphTrait, TypoGraph, WordsGraph,
};
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
use crate::score_details::Rank;
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::ranking_rule_graph::PathVisitor;
use crate::{Result, TermsMatchingStrategy};

pub type Words = GraphBasedRankingRule<WordsGraph>;
impl GraphBasedRankingRule<WordsGraph> {
    pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
        Self::new_with_id("words".to_owned(), Some(terms_matching_strategy))
    }
}
pub type Proximity = GraphBasedRankingRule<ProximityGraph>;
impl GraphBasedRankingRule<ProximityGraph> {
    pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
        Self::new_with_id("proximity".to_owned(), terms_matching_strategy)
    }
}
pub type Fid = GraphBasedRankingRule<FidGraph>;
impl GraphBasedRankingRule<FidGraph> {
    pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
        Self::new_with_id("fid".to_owned(), terms_matching_strategy)
    }
}
pub type Position = GraphBasedRankingRule<PositionGraph>;
impl GraphBasedRankingRule<PositionGraph> {
    pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
        Self::new_with_id("position".to_owned(), terms_matching_strategy)
    }
}
pub type Typo = GraphBasedRankingRule<TypoGraph>;
impl GraphBasedRankingRule<TypoGraph> {
    pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
        Self::new_with_id("typo".to_owned(), terms_matching_strategy)
    }
}
pub type Exactness = GraphBasedRankingRule<ExactnessGraph>;
impl GraphBasedRankingRule<ExactnessGraph> {
    pub fn new() -> Self {
        Self::new_with_id("exactness".to_owned(), None)
    }
}

/// A generic graph-based ranking rule
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
    id: String,
    terms_matching_strategy: Option<TermsMatchingStrategy>,
    // When the ranking rule is not iterating over its buckets,
    // its state is `None`.
    state: Option<GraphBasedRankingRuleState<G>>,
}
impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
    /// Creates the ranking rule with the given identifier
    pub fn new_with_id(id: String, terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
        Self { id, terms_matching_strategy, state: None }
    }
}
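
// Illustrative sketch, not part of the original file: every concrete rule above is
// the same generic rule instantiated with a different graph.
fn build_rules_sketch() {
    let _words = Words::new(TermsMatchingStrategy::Last);
    let _proximity = Proximity::new(None);
    let _typo = GraphBasedRankingRule::<TypoGraph>::new_with_id("typo".to_owned(), None);
}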

/// The internal state of a graph-based ranking rule during iteration
pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
    /// The current graph
    graph: RankingRuleGraph<G>,
    /// Cache to retrieve the docids associated with each edge
    conditions_cache: ConditionDocIdsCache<G>,
    /// Cache used to optimistically discard paths that resolve to no documents.
    dead_ends_cache: DeadEndsCache<G::Condition>,
    /// A structure giving the list of possible costs from each node to the end node
    all_costs: MappedInterner<QueryNode, Vec<u64>>,
    /// An index in the first element of `all_costs`, giving the cost of the next bucket
    cur_cost: u64,
    /// One above the highest possible cost for this rule
    next_max_cost: u64,
}

impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBasedRankingRule<G> {
    fn id(&self) -> String {
        self.id.clone()
    }

    #[tracing::instrument(level = "trace", skip_all, target = "search::graph_based")]
    fn start_iteration(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
        _universe: &RoaringBitmap,
        query_graph: &QueryGraph,
    ) -> Result<()> {
        // the `next_max_cost` is the successor integer to the maximum cost of the paths in the graph.
        //
        // When there is a matching strategy, it also factors the additional costs of:
        // 1. The words that are matched in phrases
        // 2. Skipping words (by adding them to the paths with a cost)
        let mut next_max_cost = 1;
        let removal_cost = if let Some(terms_matching_strategy) = self.terms_matching_strategy {
            // add the cost of the phrase to the next_max_cost
            next_max_cost += query_graph
                .words_in_phrases_count(ctx)
                // remove 1 from the words in phrases count, because when there is a phrase we can now have a document
                // where only the phrase is matching, and none of the non-phrase words.
                // With the `1` that `next_max_cost` is initialized with, this gets counted twice.
                .saturating_sub(1) as u64;
            match terms_matching_strategy {
                TermsMatchingStrategy::Last => {
                    let removal_order =
                        query_graph.removal_order_for_terms_matching_strategy_last(ctx);
                    let mut forbidden_nodes =
                        SmallBitmap::for_interned_values_in(&query_graph.nodes);
                    let mut costs = query_graph.nodes.map(|_| None);
                    // FIXME: this works because only `words` uses the terms matching strategy at the moment.
                    for ns in removal_order {
                        for n in ns.iter() {
                            *costs.get_mut(n) = Some((1, forbidden_nodes.clone()));
                        }
                        forbidden_nodes.union(&ns);
                    }
                    costs
                }
                TermsMatchingStrategy::Frequency => {
                    let removal_order =
                        query_graph.removal_order_for_terms_matching_strategy_frequency(ctx)?;
                    let mut forbidden_nodes =
                        SmallBitmap::for_interned_values_in(&query_graph.nodes);
                    let mut costs = query_graph.nodes.map(|_| None);
                    // FIXME: this works because only `words` uses the terms matching strategy at the moment.
                    for ns in removal_order {
                        for n in ns.iter() {
                            *costs.get_mut(n) = Some((1, forbidden_nodes.clone()));
                        }
                        forbidden_nodes.union(&ns);
                    }
                    costs
                }
                TermsMatchingStrategy::All => query_graph.nodes.map(|_| None),
            }
        } else {
            query_graph.nodes.map(|_| None)
        };

        let graph = RankingRuleGraph::build(ctx, query_graph.clone(), removal_cost)?;
        let condition_docids_cache = ConditionDocIdsCache::default();
        let dead_ends_cache = DeadEndsCache::new(&graph.conditions_interner);

        // Then pre-compute the cost of all paths from each node to the end node
        let all_costs = graph.find_all_costs_to_end();

        next_max_cost +=
            all_costs.get(graph.query_graph.root_node).iter().copied().max().unwrap_or(0);

        let state = GraphBasedRankingRuleState {
            graph,
            conditions_cache: condition_docids_cache,
            dead_ends_cache,
            all_costs,
            cur_cost: 0,
            next_max_cost,
        };

        self.state = Some(state);

        Ok(())
    }

    #[tracing::instrument(level = "trace", skip_all, target = "search::graph_based")]
    fn next_bucket(
        &mut self,
        ctx: &mut SearchContext<'ctx>,
        logger: &mut dyn SearchLogger<QueryGraph>,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
        // Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
        // should never happen
        let mut state = self.state.take().unwrap();

        let all_costs = state.all_costs.get(state.graph.query_graph.root_node);
        // Retrieve the cost of the paths to compute
        let Some(&cost) = all_costs.iter().find(|c| **c >= state.cur_cost) else {
            self.state = None;
            return Ok(None);
        };
        state.cur_cost = cost + 1;

        let mut bucket = RoaringBitmap::new();

        let GraphBasedRankingRuleState {
            graph,
            conditions_cache: condition_docids_cache,
            dead_ends_cache,
            all_costs,
            cur_cost: _,
            next_max_cost,
        } = &mut state;

        let rank = *next_max_cost - cost;
        let score = G::rank_to_score(Rank { rank: rank as u32, max_rank: *next_max_cost as u32 });

        let mut universe = universe.clone();

        let mut used_conditions = SmallBitmap::for_interned_values_in(&graph.conditions_interner);
        let mut good_paths = vec![];
        let mut considered_paths = vec![];

        // For each path of the given cost, we will compute its associated
        // document ids.
        // In case the path does not resolve to any document id, we try to figure out why
        // and update the `dead_ends_cache` accordingly.
        // Updating the dead_ends_cache helps speed up the execution of `visit_paths_of_cost` and reduces
        // the number of future candidate paths given by that same function.

        let mut subpaths_docids: Vec<(Interned<G::Condition>, RoaringBitmap)> = vec![];

        let mut nodes_with_removed_outgoing_conditions = BTreeSet::new();
        let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache);

        visitor.visit_paths(&mut |path, graph, dead_ends_cache| {
            considered_paths.push(path.to_vec());
            // If the universe is empty, stop exploring the graph, since no docids will ever be found anymore.
            if universe.is_empty() {
                return Ok(ControlFlow::Break(()));
            }
            // `visit_paths` performs a depth-first search, so the previously visited path
            // is likely to share a prefix with the current one.
            // We stored the previous path and the docids associated to each of its prefixes in `subpaths_docids`.
            // We take advantage of this to avoid computing the docids associated with the common prefix between
            // the old and current path.
            let idx_of_first_different_condition = {
                let mut idx = 0;
                for (&last_c, cur_c) in path.iter().zip(subpaths_docids.iter().map(|x| x.0)) {
                    if last_c == cur_c {
                        idx += 1;
                    } else {
                        break;
                    }
                }
                subpaths_docids.truncate(idx);
                idx
            };
            // Then, for the remainder of the path, we continue computing docids.
            for latest_condition in path[idx_of_first_different_condition..].iter().copied() {
                let success = visit_path_condition(
                    ctx,
                    graph,
                    &universe,
                    dead_ends_cache,
                    condition_docids_cache,
                    &mut subpaths_docids,
                    &mut nodes_with_removed_outgoing_conditions,
                    latest_condition,
                )?;
                if !success {
                    return Ok(ControlFlow::Continue(()));
                }
            }
            assert!(subpaths_docids.iter().map(|x| x.0).eq(path.iter().copied()));

            let path_docids =
                subpaths_docids.pop().map(|x| x.1).unwrap_or_else(|| universe.clone());
            assert!(!path_docids.is_empty());

            // Accumulate the path for logging purposes only
            good_paths.push(path.to_vec());
            for &condition in path {
                used_conditions.insert(condition);
            }
            bucket |= &path_docids;
            // Reduce the size of the universe so that we can more optimistically discard candidate paths
            universe -= &path_docids;
            for (_, docids) in subpaths_docids.iter_mut() {
                *docids -= &path_docids;
            }

            if universe.is_empty() {
                Ok(ControlFlow::Break(()))
            } else {
                Ok(ControlFlow::Continue(()))
            }
        })?;
        logger.log_internal_state(graph);
        logger.log_internal_state(&good_paths);

        // We modify the next query graph so that it only contains the subgraph
        // that was used to compute this bucket

        let paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>> = good_paths
            .into_iter()
            .map(|path| {
                path.into_iter()
                    .map(|condition| {
                        let (a, b) =
                            condition_docids_cache.get_subsets_used_by_condition(condition);
                        (a.clone(), b.clone())
                    })
                    .collect()
            })
            .collect();

        let next_query_graph = QueryGraph::build_from_paths(paths);

        #[allow(clippy::comparison_chain)]
        if nodes_with_removed_outgoing_conditions.len() == 1 {
            graph.update_all_costs_before_node(
                *nodes_with_removed_outgoing_conditions.first().unwrap(),
                all_costs,
            );
        } else if nodes_with_removed_outgoing_conditions.len() > 1 {
            *all_costs = graph.find_all_costs_to_end();
        }

        self.state = Some(state);

        Ok(Some(RankingRuleOutput { query: next_query_graph, candidates: bucket, score }))
    }

    #[tracing::instrument(level = "trace", skip_all, target = "search::graph_based")]
    fn end_iteration(
        &mut self,
        _ctx: &mut SearchContext<'ctx>,
        _logger: &mut dyn SearchLogger<QueryGraph>,
    ) {
        self.state = None;
    }
}

/// Returns false if the intersection between the condition
/// docids and the previous path docids is empty.
#[allow(clippy::too_many_arguments)]
fn visit_path_condition<G: RankingRuleGraphTrait>(
    ctx: &mut SearchContext<'_>,
    graph: &mut RankingRuleGraph<G>,
    universe: &RoaringBitmap,
    dead_ends_cache: &mut DeadEndsCache<G::Condition>,
    condition_docids_cache: &mut ConditionDocIdsCache<G>,
    subpath: &mut Vec<(Interned<G::Condition>, RoaringBitmap)>,
    nodes_with_removed_outgoing_conditions: &mut BTreeSet<Interned<QueryNode>>,
    latest_condition: Interned<G::Condition>,
) -> Result<bool> {
    let condition_docids = &condition_docids_cache
        .get_computed_condition(ctx, latest_condition, graph, universe)?
        .docids;
    if condition_docids.is_empty() {
        // 1. Store in the cache that this edge is empty for this universe
        dead_ends_cache.forbid_condition(latest_condition);
        // 2. Remove all the edges with this condition from the ranking rule graph
        let source_nodes = graph.remove_edges_with_condition(latest_condition);
        nodes_with_removed_outgoing_conditions.extend(source_nodes);
        return Ok(false);
    }

    let latest_path_docids = if let Some((_, prev_docids)) = subpath.last() {
        prev_docids & condition_docids
    } else {
        condition_docids.clone()
    };
    if !latest_path_docids.is_empty() {
        subpath.push((latest_condition, latest_path_docids));
        return Ok(true);
    }
    // If the (sub)path is empty, we try to figure out why and update the caches accordingly.

    // First, we know that this path is empty, and thus any path
    // that is a superset of it will also be empty.
    dead_ends_cache.forbid_condition_after_prefix(subpath.iter().map(|x| x.0), latest_condition);

    if subpath.len() <= 1 {
        return Ok(false);
    }
    let mut subprefix = vec![];
    // It's a dead end if the intersection between this edge and any
    // previous prefix is disjoint with the universe.
    // We already know that the intersection with the last one is empty.
    for (past_condition, sp_docids) in subpath[..subpath.len() - 1].iter() {
        subprefix.push(*past_condition);
        if condition_docids.is_disjoint(sp_docids) {
            dead_ends_cache
                .forbid_condition_after_prefix(subprefix.iter().copied(), latest_condition);
        }
    }

    Ok(false)
}
259
crates/milli/src/search/new/interner.rs
Normal file
@@ -0,0 +1,259 @@
use std::fmt;
use std::hash::Hash;
use std::marker::PhantomData;

use fxhash::FxHashMap;

use super::small_bitmap::SmallBitmap;

/// An index within an interner ([`FixedSizeInterner`], [`DedupInterner`], or [`MappedInterner`]).
pub struct Interned<T> {
    idx: u16,
    _phantom: PhantomData<T>,
}
impl<T> Interned<T> {
    /// Create an interned value manually from its raw index within the interner.
    pub fn from_raw(idx: u16) -> Self {
        Self { idx, _phantom: PhantomData }
    }
    /// Get the raw index from the interned value
    pub fn into_raw(self) -> u16 {
        self.idx
    }
}

/// A [`DedupInterner`] is used to store a unique copy of a value of type `T`. This value
/// is then identified by a lightweight index of type [`Interned<T>`], which can
/// be copied, compared, and hashed efficiently. An immutable reference to the original value
/// can be retrieved using `self.get(interned)`. A set of values within the interner can be
/// efficiently managed using [`SmallBitmap<T>`](super::small_bitmap::SmallBitmap).
///
/// A dedup-interner can contain a maximum of `u16::MAX` values.
#[derive(Clone)]
pub struct DedupInterner<T> {
    stable_store: Vec<T>,
    lookup: FxHashMap<T, Interned<T>>,
}
impl<T> Default for DedupInterner<T> {
    fn default() -> Self {
        Self { stable_store: Default::default(), lookup: Default::default() }
    }
}
impl<T> DedupInterner<T> {
    /// Convert the dedup-interner into a fixed-size interner, such that new
    /// elements cannot be added to it anymore.
    pub fn freeze(self) -> FixedSizeInterner<T> {
        FixedSizeInterner { stable_store: self.stable_store }
    }
}

impl<T> DedupInterner<T>
where
    T: Clone + Eq + Hash,
{
    /// Insert the given value into the dedup-interner, and return
    /// its index.
    pub fn insert(&mut self, s: T) -> Interned<T> {
        if let Some(interned) = self.lookup.get(&s) {
            *interned
        } else {
            assert!(self.stable_store.len() < u16::MAX as usize);
            self.stable_store.push(s.clone());
            let interned = Interned::from_raw(self.stable_store.len() as u16 - 1);
            self.lookup.insert(s, interned);
            interned
        }
    }
    /// Get a reference to the interned value.
    pub fn get(&self, interned: Interned<T>) -> &T {
        &self.stable_store[interned.idx as usize]
    }
}
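
// Illustrative sketch (hypothetical test, not in the original file): inserting the
// same value twice yields the same handle, and `get` returns the stored value.
fn dedup_interner_sketch() {
    let mut interner = DedupInterner::<String>::default();
    let a = interner.insert("house".to_string());
    let b = interner.insert("house".to_string());
    let c = interner.insert("pretty".to_string());
    assert_eq!(a, b); // deduplicated: only one copy of "house" is stored
    assert_ne!(a, c);
    assert_eq!(interner.get(a), "house");
}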

/// A fixed-length store for values of type `T`, where each value is identified
/// by an index of type [`Interned<T>`].
#[derive(Clone)]
pub struct FixedSizeInterner<T> {
    stable_store: Vec<T>,
}
impl<T: Clone> FixedSizeInterner<T> {
    /// Create a fixed-size interner of the given length containing
    /// clones of the given value.
    pub fn new(length: u16, value: T) -> Self {
        Self { stable_store: vec![value; length as usize] }
    }
}

impl<T> FixedSizeInterner<T> {
    pub fn from_vec(store: Vec<T>) -> Self {
        Self { stable_store: store }
    }
    pub fn all_interned_values(&self) -> SmallBitmap<T> {
        let mut b = SmallBitmap::for_interned_values_in(self);
        for i in self.indexes() {
            b.insert(i);
        }
        b
    }
    pub fn get(&self, interned: Interned<T>) -> &T {
        &self.stable_store[interned.idx as usize]
    }
    pub fn get_mut(&mut self, interned: Interned<T>) -> &mut T {
        &mut self.stable_store[interned.idx as usize]
    }

    pub fn len(&self) -> u16 {
        self.stable_store.len() as u16
    }
    pub fn map_move<U>(self, map_f: impl Fn(T) -> U) -> FixedSizeInterner<U> {
        FixedSizeInterner { stable_store: self.stable_store.into_iter().map(map_f).collect() }
    }
    pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<T, U> {
        MappedInterner {
            stable_store: self.stable_store.iter().map(map_f).collect(),
            _phantom: PhantomData,
        }
    }
    pub fn map_indexes<U>(&self, map_f: impl Fn(Interned<T>) -> U) -> MappedInterner<T, U> {
        MappedInterner { stable_store: self.indexes().map(map_f).collect(), _phantom: PhantomData }
    }
    pub fn indexes(&self) -> impl Iterator<Item = Interned<T>> {
        (0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16))
    }
    pub fn iter(&self) -> impl Iterator<Item = (Interned<T>, &T)> {
        self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
    }
    pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
        self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
    }
}
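
// Illustrative sketch (hypothetical test, not in the original file): `map` builds a
// parallel store that is indexed by the same `Interned<T>` handles as the original.
fn fixed_size_interner_map_sketch() {
    let interner = FixedSizeInterner::from_vec(vec!["a", "bb", "ccc"]);
    let lengths = interner.map(|s| s.len());
    for (handle, s) in interner.iter() {
        assert_eq!(*lengths.get(handle), s.len());
    }
}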

/// A growable store for values of type `T`, where each value is identified
/// by an index of type [`Interned<T>`].
#[derive(Clone)]
pub struct Interner<T> {
    stable_store: Vec<T>,
}
impl<T> Default for Interner<T> {
    fn default() -> Self {
        Self { stable_store: vec![] }
    }
}

impl<T> Interner<T> {
    pub fn from_vec(v: Vec<T>) -> Self {
        Self { stable_store: v }
    }
    pub fn get(&self, interned: Interned<T>) -> &T {
        &self.stable_store[interned.idx as usize]
    }
    pub fn get_mut(&mut self, interned: Interned<T>) -> &mut T {
        &mut self.stable_store[interned.idx as usize]
    }
    pub fn push(&mut self, value: T) -> Interned<T> {
        assert!(self.stable_store.len() < u16::MAX as usize);
        self.stable_store.push(value);
        Interned::from_raw(self.stable_store.len() as u16 - 1)
    }
    pub fn len(&self) -> u16 {
        self.stable_store.len() as u16
    }
    pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<T, U> {
        MappedInterner {
            stable_store: self.stable_store.iter().map(map_f).collect(),
            _phantom: PhantomData,
        }
    }
    pub fn map_indexes<U>(&self, map_f: impl Fn(Interned<T>) -> U) -> MappedInterner<T, U> {
        MappedInterner { stable_store: self.indexes().map(map_f).collect(), _phantom: PhantomData }
    }
    pub fn indexes(&self) -> impl Iterator<Item = Interned<T>> {
        (0..self.stable_store.len()).map(|i| Interned::from_raw(i as u16))
    }
    pub fn iter(&self) -> impl Iterator<Item = (Interned<T>, &T)> {
        self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
    }
    pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
        self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
    }
    pub fn freeze(self) -> FixedSizeInterner<T> {
        FixedSizeInterner { stable_store: self.stable_store }
    }
}

/// A store of values of type `T`, each linked to a value of type `From`
/// stored in another interner. To create a mapped interner, use the
/// `map` method on [`FixedSizeInterner`] or [`MappedInterner`].
///
/// Values in this interner are indexed with [`Interned<From>`].
#[derive(Clone)]
pub struct MappedInterner<From, T> {
    stable_store: Vec<T>,
    _phantom: PhantomData<From>,
}

impl<From, T> MappedInterner<From, T> {
    pub fn get(&self, interned: Interned<From>) -> &T {
        &self.stable_store[interned.idx as usize]
    }
    pub fn get_mut(&mut self, interned: Interned<From>) -> &mut T {
        &mut self.stable_store[interned.idx as usize]
    }
    pub fn map<U>(&self, map_f: impl Fn(&T) -> U) -> MappedInterner<From, U> {
        MappedInterner {
            stable_store: self.stable_store.iter().map(map_f).collect(),
            _phantom: PhantomData,
        }
    }
    pub fn iter(&self) -> impl Iterator<Item = (Interned<From>, &T)> {
        self.stable_store.iter().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
    }
    pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<From>, &mut T)> {
        self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
    }
}
// Interned<T> boilerplate implementations

impl<T> Hash for Interned<T> {
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.idx.hash(state);
    }
}

impl<T> Ord for Interned<T> {
    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
        self.idx.cmp(&other.idx)
    }
}

impl<T> PartialOrd for Interned<T> {
    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
        Some(self.cmp(other))
    }
}

impl<T> Eq for Interned<T> {}

impl<T> PartialEq for Interned<T> {
    fn eq(&self, other: &Self) -> bool {
        self.idx == other.idx
    }
}
impl<T> Clone for Interned<T> {
    fn clone(&self) -> Self {
        *self
    }
}

impl<T> Copy for Interned<T> {}

impl<T> fmt::Display for Interned<T> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(&self.idx, f)
    }
}
impl<T> fmt::Debug for Interned<T> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Debug::fmt(&self.idx, f)
    }
}
17
crates/milli/src/search/new/limits.rs
Normal file
@@ -0,0 +1,17 @@
/// Maximum number of tokens we consider in a single search.
pub const MAX_TOKEN_COUNT: usize = 1_000;

/// Maximum number of prefixes that can be derived from a single word.
pub const MAX_PREFIX_COUNT: usize = 1_000;
/// Maximum number of words that can be derived from a single word with a distance of one to that word.
pub const MAX_ONE_TYPO_COUNT: usize = 150;
/// Maximum number of words that can be derived from a single word with a distance of two to that word.
pub const MAX_TWO_TYPOS_COUNT: usize = 50;

/// Maximum number of synonym phrases that can be derived from a single word.
pub const MAX_SYNONYM_PHRASE_COUNT: usize = 50;

/// Maximum number of words across all the synonym phrases that can be derived from a single word.
///
/// This limit is meant to gracefully handle the case where a word would have very long phrases as synonyms.
pub const MAX_SYNONYM_WORD_COUNT: usize = 100;
81
crates/milli/src/search/new/logger/mod.rs
Normal file
@@ -0,0 +1,81 @@
// #[cfg(test)]
pub mod visual;

use std::any::Any;

use roaring::RoaringBitmap;

use super::ranking_rules::BoxRankingRule;
use super::{RankingRule, RankingRuleQueryTrait};

/// Trait for structures logging the execution of a search query.
pub trait SearchLogger<Q: RankingRuleQueryTrait> {
    /// Logs the initial query
    fn initial_query(&mut self, _query: &Q);

    /// Logs the value of the initial set of all candidates
    fn initial_universe(&mut self, _universe: &RoaringBitmap);

    /// Logs the query that was used to compute the set of all candidates
    fn query_for_initial_universe(&mut self, _query: &Q);

    /// Logs the ranking rules used to perform the search query
    fn ranking_rules(&mut self, _rr: &[BoxRankingRule<'_, Q>]);

    /// Logs the start of a ranking rule's iteration.
    fn start_iteration_ranking_rule(
        &mut self,
        _ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'_, Q>,
        _query: &Q,
        _universe: &RoaringBitmap,
    ) {
    }
    /// Logs the end of the computation of a ranking rule bucket
    fn next_bucket_ranking_rule(
        &mut self,
        _ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'_, Q>,
        _universe: &RoaringBitmap,
        _candidates: &RoaringBitmap,
    ) {
    }
    /// Logs the skipping of a ranking rule bucket
    fn skip_bucket_ranking_rule(
        &mut self,
        _ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'_, Q>,
        _candidates: &RoaringBitmap,
    ) {
    }
    /// Logs the end of a ranking rule's iteration.
    fn end_iteration_ranking_rule(
        &mut self,
        _ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'_, Q>,
        _universe: &RoaringBitmap,
    ) {
    }
    /// Logs the addition of document ids to the final results
    fn add_to_results(&mut self, _docids: &[u32]);

    /// Logs an internal state in the search algorithms
    fn log_internal_state(&mut self, _rr: &dyn Any);
}

/// A dummy [`SearchLogger`] which does nothing.
pub struct DefaultSearchLogger;

impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for DefaultSearchLogger {
    fn initial_query(&mut self, _query: &Q) {}

    fn initial_universe(&mut self, _universe: &RoaringBitmap) {}

    fn query_for_initial_universe(&mut self, _query: &Q) {}

    fn ranking_rules(&mut self, _rr: &[BoxRankingRule<'_, Q>]) {}

    fn add_to_results(&mut self, _docids: &[u32]) {}

    fn log_internal_state(&mut self, _rr: &dyn Any) {}
}
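
// Illustrative sketch, not part of the original file: a minimal custom logger that
// only counts how many docids reached the final results. The methods with default
// bodies in the trait can simply be left out.
struct CountingLogger {
    returned: usize,
}

impl<Q: RankingRuleQueryTrait> SearchLogger<Q> for CountingLogger {
    fn initial_query(&mut self, _query: &Q) {}

    fn initial_universe(&mut self, _universe: &RoaringBitmap) {}

    fn query_for_initial_universe(&mut self, _query: &Q) {}

    fn ranking_rules(&mut self, _rr: &[BoxRankingRule<'_, Q>]) {}

    fn add_to_results(&mut self, docids: &[u32]) {
        self.returned += docids.len(); // the only event this logger cares about
    }

    fn log_internal_state(&mut self, _rr: &dyn Any) {}
}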
554
crates/milli/src/search/new/logger/visual.rs
Normal file
@@ -0,0 +1,554 @@
use std::any::Any;
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::{Path, PathBuf};
use std::time::Instant;

use roaring::RoaringBitmap;

use crate::search::new::interner::Interned;
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::ranking_rule_graph::{
    Edge, FidCondition, FidGraph, PositionCondition, PositionGraph, ProximityCondition,
    ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph,
    WordsCondition, WordsGraph,
};
use crate::search::new::ranking_rules::BoxRankingRule;
use crate::search::new::{QueryGraph, QueryNode, RankingRule, SearchContext, SearchLogger};
use crate::Result;

pub enum SearchEvents {
    RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
    RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
    RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
    RankingRuleEndIteration { ranking_rule_idx: usize },
    ExtendResults { new: Vec<u32> },
    ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
    ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
    TypoGraph { graph: RankingRuleGraph<TypoGraph> },
    TypoPaths { paths: Vec<Vec<Interned<TypoCondition>>> },
    WordsGraph { graph: RankingRuleGraph<WordsGraph> },
    WordsPaths { paths: Vec<Vec<Interned<WordsCondition>>> },
    FidGraph { graph: RankingRuleGraph<FidGraph> },
    FidPaths { paths: Vec<Vec<Interned<FidCondition>>> },
    PositionGraph { graph: RankingRuleGraph<PositionGraph> },
    PositionPaths { paths: Vec<Vec<Interned<PositionCondition>>> },
}

enum Location {
    Words,
    Typo,
    Proximity,
    Fid,
    Position,
    Other,
}

#[derive(Default)]
pub struct VisualSearchLogger {
    initial_query: Option<QueryGraph>,
    initial_query_time: Option<Instant>,
    query_for_universe: Option<QueryGraph>,
    initial_universe: Option<RoaringBitmap>,
    ranking_rules_ids: Option<Vec<String>>,
    events: Vec<SearchEvents>,
    location: Vec<Location>,
}

impl SearchLogger<QueryGraph> for VisualSearchLogger {
    fn initial_query(&mut self, query: &QueryGraph) {
        self.initial_query = Some(query.clone());
        self.initial_query_time = Some(Instant::now());
    }

    fn query_for_initial_universe(&mut self, query: &QueryGraph) {
        self.query_for_universe = Some(query.clone());
    }

    fn initial_universe(&mut self, universe: &RoaringBitmap) {
        self.initial_universe = Some(universe.clone());
    }
    fn ranking_rules(&mut self, rr: &[BoxRankingRule<'_, QueryGraph>]) {
        self.ranking_rules_ids = Some(rr.iter().map(|rr| rr.id()).collect());
    }

    fn start_iteration_ranking_rule(
        &mut self,
        ranking_rule_idx: usize,
        ranking_rule: &dyn RankingRule<'_, QueryGraph>,
        _query: &QueryGraph,
        universe: &RoaringBitmap,
    ) {
        self.events.push(SearchEvents::RankingRuleStartIteration {
            ranking_rule_idx,
            universe_len: universe.len(),
        });
        self.location.push(match ranking_rule.id().as_str() {
            "words" => Location::Words,
            "typo" => Location::Typo,
            "proximity" => Location::Proximity,
            "fid" => Location::Fid,
            "position" => Location::Position,
            _ => Location::Other,
        });
    }

    fn next_bucket_ranking_rule(
        &mut self,
        ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'_, QueryGraph>,
        universe: &RoaringBitmap,
        bucket: &RoaringBitmap,
    ) {
        self.events.push(SearchEvents::RankingRuleNextBucket {
            ranking_rule_idx,
            universe_len: universe.len(),
            bucket_len: bucket.len(),
        });
    }
    fn skip_bucket_ranking_rule(
        &mut self,
        ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'_, QueryGraph>,
        bucket: &RoaringBitmap,
    ) {
        self.events.push(SearchEvents::RankingRuleSkipBucket {
            ranking_rule_idx,
            bucket_len: bucket.len(),
        })
    }

    fn end_iteration_ranking_rule(
        &mut self,
        ranking_rule_idx: usize,
        _ranking_rule: &dyn RankingRule<'_, QueryGraph>,
        _universe: &RoaringBitmap,
    ) {
        self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx });
        self.location.pop();
    }
    fn add_to_results(&mut self, docids: &[u32]) {
        self.events.push(SearchEvents::ExtendResults { new: docids.to_vec() });
    }

    /// Logs the internal state of the ranking rule
    fn log_internal_state(&mut self, state: &dyn Any) {
        let Some(location) = self.location.last() else { return };
        match location {
            Location::Words => {
                if let Some(graph) = state.downcast_ref::<RankingRuleGraph<WordsGraph>>() {
                    self.events.push(SearchEvents::WordsGraph { graph: graph.clone() });
                }
                if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<WordsCondition>>>>() {
                    self.events.push(SearchEvents::WordsPaths { paths: paths.clone() });
                }
            }
            Location::Typo => {
                if let Some(graph) = state.downcast_ref::<RankingRuleGraph<TypoGraph>>() {
                    self.events.push(SearchEvents::TypoGraph { graph: graph.clone() });
                }
                if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<TypoCondition>>>>() {
                    self.events.push(SearchEvents::TypoPaths { paths: paths.clone() });
                }
            }
            Location::Proximity => {
                if let Some(graph) = state.downcast_ref::<RankingRuleGraph<ProximityGraph>>() {
                    self.events.push(SearchEvents::ProximityGraph { graph: graph.clone() });
                }
                if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<ProximityCondition>>>>()
                {
                    self.events.push(SearchEvents::ProximityPaths { paths: paths.clone() });
                }
            }
            Location::Fid => {
                if let Some(graph) = state.downcast_ref::<RankingRuleGraph<FidGraph>>() {
                    self.events.push(SearchEvents::FidGraph { graph: graph.clone() });
                }
                if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<FidCondition>>>>() {
                    self.events.push(SearchEvents::FidPaths { paths: paths.clone() });
                }
            }
            Location::Position => {
                if let Some(graph) = state.downcast_ref::<RankingRuleGraph<PositionGraph>>() {
                    self.events.push(SearchEvents::PositionGraph { graph: graph.clone() });
                }
                if let Some(paths) = state.downcast_ref::<Vec<Vec<Interned<PositionCondition>>>>() {
                    self.events.push(SearchEvents::PositionPaths { paths: paths.clone() });
                }
            }
            Location::Other => {}
        }
    }
}

impl VisualSearchLogger {
    pub fn finish<'ctx>(self, ctx: &'ctx mut SearchContext<'ctx>, folder: &Path) -> Result<()> {
        let mut f = DetailedLoggerFinish::new(ctx, folder)?;
        f.finish(self)?;
        Ok(())
    }
}

struct DetailedLoggerFinish<'ctx> {
    ctx: &'ctx mut SearchContext<'ctx>,
    /// The folder where all the files should be printed
    folder_path: PathBuf,
    /// The main file visualising the search request
    index_file: BufWriter<File>,
    /// A vector of counters where each counter at index i represents the number of times
    /// that the ranking rule at index i-1 was called since its last call to `start_iteration`.
    /// This is used to uniquely identify a point in the sequence diagram.
    rr_action_counter: Vec<usize>,
    /// The file storing information about the internal state of the latest active ranking rule
    file_for_internal_state: Option<BufWriter<File>>,
}

impl<'ctx> DetailedLoggerFinish<'ctx> {
    fn cur_file(&mut self) -> &mut BufWriter<File> {
        if let Some(file) = self.file_for_internal_state.as_mut() {
            file
        } else {
            &mut self.index_file
        }
    }
    fn pop_rr_action(&mut self) {
        self.file_for_internal_state = None;
        self.rr_action_counter.pop();
    }
    fn push_new_rr_action(&mut self) {
        self.file_for_internal_state = None;
        self.rr_action_counter.push(0);
    }
    fn increment_cur_rr_action(&mut self) {
        self.file_for_internal_state = None;
        if let Some(c) = self.rr_action_counter.last_mut() {
            *c += 1;
        }
    }
    fn id_of_timestamp(&self) -> String {
        let mut s = String::new();
        for t in self.rr_action_counter.iter() {
            s.push_str(&format!("{t}_"));
        }
        s
    }
    fn id_of_extend_results(&self) -> String {
        let mut s = String::new();
        s.push_str("results.\"");
        s.push_str(&self.id_of_timestamp());
        s.push('"');
        s
    }
    fn id_of_last_rr_action(&self) -> String {
        let mut s = String::new();
        let rr_id = if self.rr_action_counter.is_empty() {
            "start.\"".to_owned()
        } else {
            format!("{}.\"", self.rr_action_counter.len() - 1)
        };
        s.push_str(&rr_id);
        s.push_str(&self.id_of_timestamp());
        s.push('"');
        s
    }
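    // Illustrative example of the ids built above (not in the original file): with
    // `rr_action_counter == vec![0, 2]`, `id_of_timestamp()` returns "0_2_" and
    // `id_of_last_rr_action()` returns `1."0_2_"`, i.e. the sequence-diagram actor
    // of ranking rule 1 at that point in time.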
    fn make_new_file_for_internal_state_if_needed(&mut self) -> Result<()> {
        if self.file_for_internal_state.is_some() {
            return Ok(());
        }
        let timestamp = self.id_of_timestamp();
        let id = self.id_of_last_rr_action();
        let new_file_path = self.folder_path.join(format!("{timestamp}.d2"));
        self.file_for_internal_state = Some(BufWriter::new(File::create(new_file_path)?));

        writeln!(
            &mut self.index_file,
            "{id} {{
    link: \"{timestamp}.d2.svg\"
}}"
        )?;
        Ok(())
    }
    fn new(ctx: &'ctx mut SearchContext<'ctx>, folder_path: &Path) -> Result<Self> {
        let index_path = folder_path.join("index.d2");
        let index_file = BufWriter::new(File::create(index_path)?);

        Ok(Self {
            ctx,
            folder_path: folder_path.to_owned(),
            index_file,
            rr_action_counter: vec![],
            file_for_internal_state: None,
        })
    }

    fn finish(&mut self, logger: VisualSearchLogger) -> Result<()> {
        writeln!(&mut self.index_file, "direction: right")?;
        if let Some(qg) = logger.initial_query {
            writeln!(&mut self.index_file, "Initial Query Graph: {{")?;
            self.write_query_graph(&qg)?;
            writeln!(&mut self.index_file, "}}")?;
        }
        if let Some(qg) = logger.query_for_universe {
            writeln!(&mut self.index_file, "Query Graph Used To Compute Universe: {{")?;
            self.write_query_graph(&qg)?;
            writeln!(&mut self.index_file, "}}")?;
        }
        let Some(ranking_rules_ids) = logger.ranking_rules_ids else { return Ok(()) };
        writeln!(&mut self.index_file, "Control Flow Between Ranking Rules: {{")?;
        writeln!(&mut self.index_file, "shape: sequence_diagram")?;
        writeln!(&mut self.index_file, "start")?;
        for (idx, rr_id) in ranking_rules_ids.iter().enumerate() {
            writeln!(&mut self.index_file, "{idx}: {rr_id}")?;
        }
        writeln!(&mut self.index_file, "results")?;
        for event in logger.events {
            self.write_event(event)?;
        }
        writeln!(&mut self.index_file, "}}")?;
        Ok(())
    }

    fn write_event(&mut self, e: SearchEvents) -> Result<()> {
        match e {
            SearchEvents::RankingRuleStartIteration { ranking_rule_idx, universe_len } => {
                assert!(ranking_rule_idx == self.rr_action_counter.len());
                self.write_start_iteration(universe_len)?;
            }
            SearchEvents::RankingRuleNextBucket { ranking_rule_idx, universe_len, bucket_len } => {
                assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
                self.write_next_bucket(bucket_len, universe_len)?;
            }
            SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, bucket_len } => {
                assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
                self.write_skip_bucket(bucket_len)?;
            }
            SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => {
                assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
                self.write_end_iteration()?;
            }
            SearchEvents::ExtendResults { new } => {
                self.write_extend_results(new)?;
            }
            SearchEvents::ProximityGraph { graph } => self.write_rr_graph(&graph)?,
            SearchEvents::ProximityPaths { paths } => {
                self.write_rr_graph_paths::<ProximityGraph>(paths)?;
            }
            SearchEvents::TypoGraph { graph } => self.write_rr_graph(&graph)?,
            SearchEvents::TypoPaths { paths } => {
                self.write_rr_graph_paths::<TypoGraph>(paths)?;
            }
            SearchEvents::WordsGraph { graph } => self.write_rr_graph(&graph)?,
            SearchEvents::WordsPaths { paths } => {
                self.write_rr_graph_paths::<WordsGraph>(paths)?;
            }
            SearchEvents::FidGraph { graph } => self.write_rr_graph(&graph)?,
            SearchEvents::FidPaths { paths } => {
                self.write_rr_graph_paths::<FidGraph>(paths)?;
            }
            SearchEvents::PositionGraph { graph } => self.write_rr_graph(&graph)?,
            SearchEvents::PositionPaths { paths } => {
                self.write_rr_graph_paths::<PositionGraph>(paths)?;
            }
        }
        Ok(())
    }
    fn write_query_graph(&mut self, qg: &QueryGraph) -> Result<()> {
        writeln!(self.cur_file(), "direction: right")?;
        for (node_id, node) in qg.nodes.iter() {
            if matches!(node.data, QueryNodeData::Deleted) {
                continue;
            }
            self.write_query_node(node_id, node)?;

            for edge in node.successors.iter() {
                writeln!(self.cur_file(), "{node_id} -> {edge};\n").unwrap();
            }
        }
        Ok(())
    }

    fn write_start_iteration(&mut self, _universe_len: u64) -> Result<()> {
        let parent_action_id = self.id_of_last_rr_action();
        self.push_new_rr_action();
        let self_action_id = self.id_of_last_rr_action();
        writeln!(&mut self.index_file, "{parent_action_id} -> {self_action_id} : start iteration")?;
        writeln!(
            &mut self.index_file,
            "{self_action_id} {{
    style {{
        fill: \"#D8A7B1\"
    }}
}}"
        )?;

        Ok(())
    }
    fn write_next_bucket(&mut self, bucket_len: u64, universe_len: u64) -> Result<()> {
        let cur_action_id = self.id_of_last_rr_action();
        self.increment_cur_rr_action();
        let next_action_id = self.id_of_last_rr_action();
        writeln!(
            &mut self.index_file,
            "{cur_action_id} -> {next_action_id} : next bucket {bucket_len}/{universe_len}"
        )?;

        Ok(())
    }
    fn write_skip_bucket(&mut self, bucket_len: u64) -> Result<()> {
        let cur_action_id = self.id_of_last_rr_action();
        self.increment_cur_rr_action();
        let next_action_id = self.id_of_last_rr_action();
        writeln!(
            &mut self.index_file,
            "{cur_action_id} -> {next_action_id} : skip bucket ({bucket_len})"
        )?;

        Ok(())
    }
    fn write_end_iteration(&mut self) -> Result<()> {
        let cur_action_id = self.id_of_last_rr_action();
        self.pop_rr_action();
        let parent_action_id = self.id_of_last_rr_action();

        writeln!(&mut self.index_file, "{cur_action_id} -> {parent_action_id} : end iteration",)?;
        Ok(())
    }
    fn write_extend_results(&mut self, new: Vec<u32>) -> Result<()> {
        if new.is_empty() {
            return Ok(());
        }

        let cur_action_id = self.id_of_last_rr_action();
        let results_id = self.id_of_extend_results();
        let docids = new.iter().collect::<Vec<_>>();
        let len = new.len();

        writeln!(
            &mut self.index_file,
            "{cur_action_id} -> {results_id} : \"add {len}\"
{results_id} {{
    tooltip: \"{docids:?}\"
    style {{
        fill: \"#B6E2D3\"
    }}
}}
"
        )?;
        Ok(())
    }

    fn write_query_node(&mut self, node_idx: Interned<QueryNode>, node: &QueryNode) -> Result<()> {
        let Self {
            ctx, index_file, file_for_internal_state: active_ranking_rule_state_file, ..
        } = self;
        let file = if let Some(file) = active_ranking_rule_state_file.as_mut() {
            file
        } else {
            index_file
        };
        match &node.data {
            QueryNodeData::Term(LocatedQueryTermSubset {
                term_subset,
                positions: _,
                term_ids: _,
            }) => {
                writeln!(
                    file,
                    "{node_idx} : \"{}\" {{
shape: class
max_nbr_typo: {}",
                    term_subset.description(ctx),
                    term_subset.max_typo_cost(ctx)
                )?;

                for w in term_subset.all_single_words_except_prefix_db(ctx)? {
                    let w = ctx.word_interner.get(w.interned());
                    writeln!(file, "{w}: word")?;
                }
                for p in term_subset.all_phrases(ctx)? {
                    writeln!(file, "{}: phrase", p.description(ctx))?;
                }
                if let Some(w) = term_subset.use_prefix_db(ctx) {
                    let w = ctx.word_interner.get(w.interned());
                    writeln!(file, "{w}: prefix db")?;
                }

                writeln!(file, "}}")?;
            }
            QueryNodeData::Deleted => panic!(),
            QueryNodeData::Start => {
                writeln!(file, "{node_idx} : START")?;
            }
            QueryNodeData::End => {
                writeln!(file, "{node_idx} : END")?;
            }
        }
        Ok(())
    }
    fn write_rr_graph<R: RankingRuleGraphTrait>(
        &mut self,
        graph: &RankingRuleGraph<R>,
    ) -> Result<()> {
        self.make_new_file_for_internal_state_if_needed()?;

        writeln!(self.cur_file(), "direction: right")?;

        writeln!(self.cur_file(), "Graph {{")?;
        for (node_idx, node) in graph.query_graph.nodes.iter() {
            if matches!(&node.data, QueryNodeData::Deleted) {
                continue;
            }
            self.write_query_node(node_idx, node)?;
        }
        for (_edge_id, edge) in graph.edges_store.iter() {
            let Some(edge) = edge else { continue };
            let Edge { source_node, dest_node, condition: details, cost, nodes_to_skip: _ } = edge;

            match &details {
                None => {
                    writeln!(
                        self.cur_file(),
                        "{source_node} -> {dest_node} : \"always cost {cost}\"",
                    )?;
                }
                Some(condition) => {
                    writeln!(
                        self.cur_file(),
                        "{source_node} -> {dest_node} : \"{condition} cost {cost}\"",
                        cost = edge.cost,
                    )?;
                }
            }
        }
        writeln!(self.cur_file(), "}}")?;

        Ok(())
    }

    fn write_rr_graph_paths<R: RankingRuleGraphTrait>(
        &mut self,
        paths: Vec<Vec<Interned<R::Condition>>>,
    ) -> Result<()> {
        self.make_new_file_for_internal_state_if_needed()?;
        let file = if let Some(file) = self.file_for_internal_state.as_mut() {
            file
        } else {
            &mut self.index_file
        };
        writeln!(file, "Path {{")?;
        for (path_idx, condition_indexes) in paths.iter().enumerate() {
            writeln!(file, "{path_idx} {{")?;
            for condition in condition_indexes.iter() {
                writeln!(file, "{condition}")?;
            }
            for couple_edges in condition_indexes.windows(2) {
                let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() };
                writeln!(file, "{src_edge_idx} -> {dest_edge_idx}")?;
            }
            writeln!(file, "}}")?;
        }
        writeln!(file, "}}")?;
        Ok(())
    }
}
|
||||
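For orientation, a hedged sketch of the d2 diagram text these helpers emit for one ranking-rule step; the action ids (3, 4, 5), the `results` node name, and the counts are hypothetical, only the line shapes come from the format strings above:

3 -> 4 : next bucket 150/1000
4 -> 5 : skip bucket (42)
5 -> results : "add 12"
results {
    tooltip: "[0, 1, 2]"
    style {
        fill: "#B6E2D3"
    }
}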
139
crates/milli/src/search/new/matches/best_match_interval.rs
Normal file
139
crates/milli/src/search/new/matches/best_match_interval.rs
Normal file
@@ -0,0 +1,139 @@
use super::matching_words::WordId;
use super::{Match, MatchPosition};

struct MatchIntervalWithScore {
    interval: [usize; 2],
    score: [i16; 3],
}

// count the score contributed by the words inside a phrase
fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
    let words_in_phrase_minus_one = (lwp - fwp) as i16;
    // will always be ordered, so +1 for each space between words
    *order_score += words_in_phrase_minus_one;
    // distance will always be 1, so -1 for each space between words
    *distance_score -= words_in_phrase_minus_one;
}

/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate the distance between matches
/// 3) count ordered matches
fn get_interval_score(matches: &[Match]) -> [i16; 3] {
    let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
    let mut order_score = 0;
    let mut distance_score = 0;

    let mut iter = matches.iter().peekable();
    while let Some(m) = iter.next() {
        if let Some(next_match) = iter.peek() {
            // if matches are ordered
            if next_match.ids.iter().min() > m.ids.iter().min() {
                order_score += 1;
            }

            let m_last_word_pos = match m.position {
                MatchPosition::Word { word_position, .. } => word_position,
                MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
                    tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
                    lwp
                }
            };
            let next_match_first_word_pos = next_match.get_first_word_pos();

            // compute the distance between matches
            distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
        } else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
            // in case the last match is a phrase, count the score for its words
            tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
        }

        ids.extend(m.ids.iter());
    }

    ids.sort_unstable();
    ids.dedup();
    let uniq_score = ids.len() as i16;

    // rank by unique match count, then by distance between matches, then by ordered match count.
    [uniq_score, distance_score, order_score]
}

/// Returns the first and last match where the score computed by `get_interval_score` is the best.
pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
    if matches.is_empty() {
        panic!("`matches` should not be empty at this point");
    }

    // positions of the first and the last match of the best matches interval in `matches`.
    let mut best_interval: Option<MatchIntervalWithScore> = None;

    let mut save_best_interval = |interval_first, interval_last| {
        let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
        let is_interval_score_better = &best_interval
            .as_ref()
            .map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score);

        if *is_interval_score_better {
            best_interval = Some(MatchIntervalWithScore {
                interval: [interval_first, interval_last],
                score: interval_score,
            });
        }
    };

    // we compute the matches interval if we have at least 2 matches.
    // current interval positions.
    let mut interval_first = 0;
    let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();

    for (index, next_match) in matches.iter().enumerate() {
        // if the next match would make the interval grow beyond crop_size,
        // we compare the current interval with the best one,
        // then we increase `interval_first` until the next match can be added.
        let next_match_last_word_pos = next_match.get_last_word_pos();

        // if the next match would mean that we pass the crop size window,
        // we take the last valid match that didn't pass this boundary, which is `index` - 1,
        // calculate a score for it, and check whether it's better than our best so far
        if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
            // if index is 0 there is no last viable match
            if index != 0 {
                let interval_last = index - 1;
                // keep the interval if it's the best
                save_best_interval(interval_first, interval_last);
            }

            // advance the start of the interval while the interval is longer than crop_size.
            loop {
                interval_first += 1;
                if interval_first == matches.len() {
                    interval_first -= 1;
                    break;
                }

                interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();

                if interval_first_match_first_word_pos > next_match_last_word_pos
                    || next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
                {
                    break;
                }
            }
        }
    }

    // compute the last interval score and compare it to the best one.
    let interval_last = matches.len() - 1;
    // if the interval is the last match with itself, we need to make sure it's
    // not a phrase longer than the crop window
    if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
        save_best_interval(interval_first, interval_last);
    }

    // if none of the matches fit the criteria above, default to the first one
    best_interval.map_or(
        [&matches[0], &matches[0]],
        |MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
    )
}
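A minimal sketch of how `find_best_match_interval` behaves (usable only from within the `matches` module, since `Match` is private to it; the positions and ids below are made up):

// Two single-word matches at word positions 3 and 5, with ordered query ids.
let matches = vec![
    Match {
        char_count: 5,
        ids: vec![0],
        position: MatchPosition::Word { word_position: 3, token_position: 3 },
    },
    Match {
        char_count: 5,
        ids: vec![1],
        position: MatchPosition::Word { word_position: 5, token_position: 5 },
    },
];
// With a crop window of 10 words, both matches fit, so the best interval
// spans them; its score is [2 unique ids, -2 distance, +1 ordered pair].
let [first, last] = find_best_match_interval(&matches, 10);
assert_eq!(first.get_first_word_pos(), 3);
assert_eq!(last.get_last_word_pos(), 5);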
62
crates/milli/src/search/new/matches/match.rs
Normal file
62
crates/milli/src/search/new/matches/match.rs
Normal file
@@ -0,0 +1,62 @@
use super::matching_words::WordId;

#[derive(Clone, Debug)]
pub enum MatchPosition {
    Word {
        // position of the word in the whole text.
        word_position: usize,
        // position of the token in the whole text.
        token_position: usize,
    },
    Phrase {
        // positions of the first and last words of the phrase in the whole text.
        word_positions: [usize; 2],
        // positions of the first and last tokens of the phrase in the whole text.
        token_positions: [usize; 2],
    },
}

#[derive(Clone, Debug)]
pub struct Match {
    pub char_count: usize,
    // ids of the query words that match.
    pub ids: Vec<WordId>,
    pub position: MatchPosition,
}

impl Match {
    pub(super) fn get_first_word_pos(&self) -> usize {
        match self.position {
            MatchPosition::Word { word_position, .. } => word_position,
            MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
        }
    }

    pub(super) fn get_last_word_pos(&self) -> usize {
        match self.position {
            MatchPosition::Word { word_position, .. } => word_position,
            MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
        }
    }

    pub(super) fn get_first_token_pos(&self) -> usize {
        match self.position {
            MatchPosition::Word { token_position, .. } => token_position,
            MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
        }
    }

    pub(super) fn get_last_token_pos(&self) -> usize {
        match self.position {
            MatchPosition::Word { token_position, .. } => token_position,
            MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
        }
    }

    pub(super) fn get_word_count(&self) -> usize {
        match self.position {
            MatchPosition::Word { .. } => 1,
            MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
        }
    }
}
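A quick sketch of the phrase accessors (module-internal; the positions are made up): a phrase covering word positions 2 through 4 counts three words.

let m = Match {
    char_count: 11,
    ids: vec![0, 1, 2],
    position: MatchPosition::Phrase { word_positions: [2, 4], token_positions: [4, 8] },
};
assert_eq!(m.get_first_word_pos(), 2);
assert_eq!(m.get_last_word_pos(), 4);
assert_eq!(m.get_word_count(), 3); // 4 - 2 + 1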
331
crates/milli/src/search/new/matches/matching_words.rs
Normal file
331
crates/milli/src/search/new/matches/matching_words.rs
Normal file
@@ -0,0 +1,331 @@
use std::cmp::Reverse;
use std::fmt;
use std::ops::RangeInclusive;

use charabia::Token;

use super::super::interner::Interned;
use super::super::query_term::LocatedQueryTerm;
use super::super::{DedupInterner, Phrase};
use crate::SearchContext;

pub struct LocatedMatchingPhrase {
    pub value: Interned<Phrase>,
    pub positions: RangeInclusive<WordId>,
}

pub struct LocatedMatchingWords {
    pub value: Vec<Interned<String>>,
    pub positions: RangeInclusive<WordId>,
    pub is_prefix: bool,
    pub original_char_count: usize,
}

/// Structure created from a query tree,
/// referencing the words that match that query tree.
#[derive(Default)]
pub struct MatchingWords {
    word_interner: DedupInterner<String>,
    phrase_interner: DedupInterner<Phrase>,
    phrases: Vec<LocatedMatchingPhrase>,
    words: Vec<LocatedMatchingWords>,
}

impl MatchingWords {
    pub fn new(ctx: SearchContext<'_>, located_terms: Vec<LocatedQueryTerm>) -> Self {
        let mut phrases = Vec::new();
        let mut words = Vec::new();

        // Extract and centralize the different phrases and words to match stored in a QueryTerm,
        // and wrap them in dedicated structures.
        for located_term in located_terms {
            let term = ctx.term_interner.get(located_term.value);
            let (matching_words, matching_phrases) = term.all_computed_derivations();

            for matching_phrase in matching_phrases {
                phrases.push(LocatedMatchingPhrase {
                    value: matching_phrase,
                    positions: located_term.positions.clone(),
                });
            }

            words.push(LocatedMatchingWords {
                value: matching_words,
                positions: located_term.positions.clone(),
                is_prefix: term.is_prefix(),
                original_char_count: term.original_word(&ctx).chars().count(),
            });
        }

        // Sort words to put prefixes at the bottom, prioritizing exact matches.
        words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));

        Self {
            phrases,
            words,
            word_interner: ctx.word_interner,
            phrase_interner: ctx.phrase_interner,
        }
    }

    /// Returns an iterator over terms that match or partially match the given token.
    pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
        MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
    }

    /// Try to match the token with one of the located words.
    fn match_unique_words<'a>(&'a self, token: &Token<'_>) -> Option<MatchType<'a>> {
        for located_words in &self.words {
            for word in &located_words.value {
                let word = self.word_interner.get(*word);
                // if the word is a prefix we match using starts_with.
                if located_words.is_prefix && token.lemma().starts_with(word) {
                    let Some((char_index, c)) =
                        word.char_indices().take(located_words.original_char_count).last()
                    else {
                        continue;
                    };
                    let prefix_length = char_index + c.len_utf8();
                    let (char_count, byte_len) = token.original_lengths(prefix_length);
                    let ids = &located_words.positions;
                    return Some(MatchType::Full { ids, char_count, byte_len });
                // otherwise we exact-match the token.
                } else if token.lemma() == word {
                    let ids = &located_words.positions;
                    return Some(MatchType::Full {
                        char_count: token.char_end - token.char_start,
                        byte_len: token.byte_end - token.byte_start,
                        ids,
                    });
                }
            }
        }

        None
    }
}

/// Iterator over terms that match the given token.
/// This allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
    matching_words: &'a MatchingWords,
    phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
    token: &'b Token<'b>,
}

impl<'a> Iterator for MatchesIter<'a, '_> {
    type Item = MatchType<'a>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.phrases.next() {
            // Try to match all the phrases first.
            Some(located_phrase) => {
                let phrase = self.matching_words.phrase_interner.get(located_phrase.value);

                // create a PartialMatch struct to make it compute the first match
                // instead of duplicating the code.
                let ids = &located_phrase.positions;
                // collect the references of words from the interner.
                let words = phrase
                    .words
                    .iter()
                    .map(|word| {
                        word.map(|word| self.matching_words.word_interner.get(word).as_str())
                    })
                    .collect();
                let partial = PartialMatch { matching_words: words, ids };

                partial.match_token(self.token).or_else(|| self.next())
            }
            // If no phrase matches, try to match unique words.
            None => self.matching_words.match_unique_words(self.token),
        }
    }
}

/// Id of a matching term corresponding to a word written by the end user.
pub type WordId = u16;

/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases we need to match several consecutive tokens to consider the match full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
    Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
    Partial(PartialMatch<'a>),
}

/// Helper structure used to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
    matching_words: Vec<Option<&'a str>>,
    ids: &'a RangeInclusive<WordId>,
}

impl<'a> PartialMatch<'a> {
    /// Returns:
    /// - None if the given token breaks the partial match
    /// - Partial if the given token matches the partial match but doesn't complete it
    /// - Full if the given token completes the partial match
    pub fn match_token(self, token: &Token<'_>) -> Option<MatchType<'a>> {
        let Self { mut matching_words, ids, .. } = self;

        let is_matching = match matching_words.first()? {
            Some(word) => &token.lemma() == word,
            // a None value in the phrase corresponds to a stop word;
            // the value is considered a match if the current token is categorized as a stop word.
            None => token.is_stopword(),
        };

        // if there are remaining words to match in the phrase and the current token is matching,
        // return a new Partial match allowing the highlighter to continue.
        if is_matching && matching_words.len() > 1 {
            matching_words.remove(0);
            Some(MatchType::Partial(Self { matching_words, ids }))
        // if there is no remaining word to match in the phrase and the current token is matching,
        // return a Full match.
        } else if is_matching {
            Some(MatchType::Full {
                char_count: token.char_end - token.char_start,
                byte_len: token.byte_end - token.byte_start,
                ids,
            })
        // if the current token doesn't match, return None to break the match sequence.
        } else {
            None
        }
    }
}

impl fmt::Debug for MatchingWords {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let MatchingWords { word_interner, phrase_interner, phrases, words } = self;

        let phrases: Vec<_> = phrases
            .iter()
            .map(|p| {
                (
                    phrase_interner
                        .get(p.value)
                        .words
                        .iter()
                        .map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
                        .collect::<Vec<_>>()
                        .join(" "),
                    p.positions.clone(),
                )
            })
            .collect();

        let words: Vec<_> = words
            .iter()
            .flat_map(|w| {
                w.value
                    .iter()
                    .map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
                    .collect::<Vec<_>>()
            })
            .collect();

        f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
    }
}

#[cfg(test)]
pub(crate) mod tests {
    use std::borrow::Cow;

    use charabia::{TokenKind, TokenizerBuilder};

    use super::super::super::located_query_terms_from_tokens;
    use super::*;
    use crate::index::tests::TempIndex;
    use crate::search::new::query_term::ExtractedTokens;

    pub(crate) fn temp_index_with_documents() -> TempIndex {
        let temp_index = TempIndex::new();
        temp_index
            .add_documents(documents!([
                { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
                { "id": 2, "name": "Westfália" },
                { "id": 3, "name": "Ŵôřlḑôle" },
            ]))
            .unwrap();
        temp_index
    }

    #[test]
    fn matching_words() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
        let mut builder = TokenizerBuilder::default();
        let tokenizer = builder.build();
        let tokens = tokenizer.tokenize("split this world");
        let ExtractedTokens { query_terms, .. } =
            located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
        let matching_words = MatchingWords::new(ctx, query_terms);

        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("split"),
                    char_end: "split".chars().count(),
                    byte_end: "split".len(),
                    ..Default::default()
                })
                .next(),
            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) })
        );
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("nyc"),
                    char_end: "nyc".chars().count(),
                    byte_end: "nyc".len(),
                    ..Default::default()
                })
                .next(),
            None
        );
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("world"),
                    char_end: "world".chars().count(),
                    byte_end: "world".len(),
                    ..Default::default()
                })
                .next(),
            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
        );
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("worlded"),
                    char_end: "worlded".chars().count(),
                    byte_end: "worlded".len(),
                    ..Default::default()
                })
                .next(),
            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
        );
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("thisnew"),
                    char_end: "thisnew".chars().count(),
                    byte_end: "thisnew".len(),
                    ..Default::default()
                })
                .next(),
            None
        );
    }
}
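To make the `PartialMatch` state machine concrete, here is a sketch of matching the phrase "do or" token by token (module-internal, since the fields are private; `do_token` and `or_token` stand for hypothetical charabia tokens whose lemmas are "do" and "or"):

let partial = PartialMatch { matching_words: vec![Some("do"), Some("or")], ids: &(0..=1) };
// "do" consumes the first word of the phrase but doesn't complete it:
let Some(MatchType::Partial(partial)) = partial.match_token(&do_token) else { panic!() };
// "or" completes the phrase and reports the span of the closing token:
assert!(matches!(partial.match_token(&or_token), Some(MatchType::Full { .. })));
// Any non-matching token instead returns None, breaking the sequence.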
929
crates/milli/src/search/new/matches/mod.rs
Normal file
929
crates/milli/src/search/new/matches/mod.rs
Normal file
@@ -0,0 +1,929 @@
mod best_match_interval;
mod r#match;
mod matching_words;
mod simple_token_kind;

use charabia::{Language, SeparatorKind, Token, Tokenizer};
use either::Either;
pub use matching_words::MatchingWords;
use matching_words::{MatchType, PartialMatch};
use r#match::{Match, MatchPosition};
use serde::Serialize;
use simple_token_kind::SimpleTokenKind;
use std::{
    borrow::Cow,
    cmp::{max, min},
};

const DEFAULT_CROP_MARKER: &str = "…";
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";

/// Structure used to build a Matcher, allowing the formatting tags to be customized.
pub struct MatcherBuilder<'m> {
    matching_words: MatchingWords,
    tokenizer: Tokenizer<'m>,
    crop_marker: Option<String>,
    highlight_prefix: Option<String>,
    highlight_suffix: Option<String>,
}

impl<'m> MatcherBuilder<'m> {
    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self {
        Self {
            matching_words,
            tokenizer,
            crop_marker: None,
            highlight_prefix: None,
            highlight_suffix: None,
        }
    }

    pub fn crop_marker(&mut self, marker: String) -> &Self {
        self.crop_marker = Some(marker);
        self
    }

    pub fn highlight_prefix(&mut self, prefix: String) -> &Self {
        self.highlight_prefix = Some(prefix);
        self
    }

    pub fn highlight_suffix(&mut self, suffix: String) -> &Self {
        self.highlight_suffix = Some(suffix);
        self
    }

    pub fn build<'t, 'lang>(
        &self,
        text: &'t str,
        locales: Option<&'lang [Language]>,
    ) -> Matcher<'t, 'm, '_, 'lang> {
        let crop_marker = match &self.crop_marker {
            Some(marker) => marker.as_str(),
            None => DEFAULT_CROP_MARKER,
        };

        let highlight_prefix = match &self.highlight_prefix {
            Some(marker) => marker.as_str(),
            None => DEFAULT_HIGHLIGHT_PREFIX,
        };
        let highlight_suffix = match &self.highlight_suffix {
            Some(marker) => marker.as_str(),
            None => DEFAULT_HIGHLIGHT_SUFFIX,
        };
        Matcher {
            text,
            matching_words: &self.matching_words,
            tokenizer: &self.tokenizer,
            crop_marker,
            highlight_prefix,
            highlight_suffix,
            matches: None,
            locales,
        }
    }
}
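A minimal sketch of the builder flow, assuming `matching_words` and `tokenizer` were produced elsewhere (for instance from an executed search, as in the tests further down):

let mut builder = MatcherBuilder::new(matching_words, tokenizer);
builder.highlight_prefix("<mark>".to_string());
builder.highlight_suffix("</mark>".to_string());
let mut matcher = builder.build("The Love That Split The World", None);
let formatted = matcher.format(FormatOptions { highlight: true, crop: Some(10) });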
#[derive(Copy, Clone, Default, Debug)]
pub struct FormatOptions {
    pub highlight: bool,
    pub crop: Option<usize>,
}

impl FormatOptions {
    pub fn merge(self, other: Self) -> Self {
        Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) }
    }

    pub fn should_format(&self) -> bool {
        self.highlight || self.crop.is_some()
    }
}
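For example, merging two sets of options keeps the strongest of each (a sketch):

let doc_level = FormatOptions { highlight: true, crop: None };
let query_level = FormatOptions { highlight: false, crop: Some(10) };
let merged = doc_level.merge(query_level);
assert!(merged.highlight);
assert_eq!(merged.crop, Some(10));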
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct MatchBounds {
    pub start: usize,
    pub length: usize,
}

/// Structure used to analyze a string, compute the words that match,
/// and format the source string, returning a highlighted and cropped sub-string.
pub struct Matcher<'t, 'tokenizer, 'b, 'lang> {
    text: &'t str,
    matching_words: &'b MatchingWords,
    tokenizer: &'b Tokenizer<'tokenizer>,
    locales: Option<&'lang [Language]>,
    crop_marker: &'b str,
    highlight_prefix: &'b str,
    highlight_suffix: &'b str,
    matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
}

impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
    /// Iterates over tokens and saves any that match the query.
    fn compute_matches(&mut self) -> &mut Self {
        /// some words are counted as matches only if they are close together and in the right order;
        /// compute_partial_match peeks into the next words to validate that the match is complete.
        fn compute_partial_match<'a>(
            mut partial: PartialMatch<'a>,
            first_token_position: usize,
            first_word_position: usize,
            first_word_char_start: &usize,
            words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
            matches: &mut Vec<Match>,
        ) -> bool {
            for (token_position, word_position, word) in words_positions {
                partial = match partial.match_token(word) {
                    // the token matches the partial match, but the match is not full:
                    // we temporarily save the current token, then try to match the next one.
                    Some(MatchType::Partial(partial)) => partial,
                    // the partial match is now full: we keep the match and advance the positions
                    Some(MatchType::Full { ids, .. }) => {
                        // save the token that closes the partial match as a match.
                        matches.push(Match {
                            char_count: word.char_end - *first_word_char_start,
                            ids: ids.clone().collect(),
                            position: MatchPosition::Phrase {
                                word_positions: [first_word_position, word_position],
                                token_positions: [first_token_position, token_position],
                            },
                        });

                        // the match is complete, we return true.
                        return true;
                    }
                    // the token doesn't match: the partial match is broken.
                    None => break,
                };
            }

            // the match is not complete, we return false.
            false
        }

        let tokens: Vec<_> =
            self.tokenizer.tokenize_with_allow_list(self.text, self.locales).collect();
        let mut matches = Vec::new();

        let mut words_positions = tokens
            .iter()
            .scan((0, 0), |(token_position, word_position), token| {
                let current_token_position = *token_position;
                let current_word_position = *word_position;
                *token_position += 1;
                if !token.is_separator() {
                    *word_position += 1;
                }

                Some((current_token_position, current_word_position, token))
            })
            .filter(|(_, _, token)| !token.is_separator());

        while let Some((token_position, word_position, word)) = words_positions.next() {
            for match_type in self.matching_words.match_token(word) {
                match match_type {
                    // we match: we save the current token as a match,
                    // then we continue with the rest of the tokens.
                    MatchType::Full { ids, char_count, .. } => {
                        let ids: Vec<_> = ids.clone().collect();
                        matches.push(Match {
                            char_count,
                            ids,
                            position: MatchPosition::Word { word_position, token_position },
                        });
                        break;
                    }
                    // we match partially: iterate over the next tokens to check whether we can complete the match.
                    MatchType::Partial(partial) => {
                        // if the match is completed, we break the matching loop over the current token,
                        // then we continue with the rest of the tokens.
                        let mut wp = words_positions.clone();
                        if compute_partial_match(
                            partial,
                            token_position,
                            word_position,
                            &word.char_start,
                            &mut wp,
                            &mut matches,
                        ) {
                            words_positions = wp;
                            break;
                        }
                    }
                }
            }
        }

        self.matches = Some((tokens, matches));
        self
    }

    /// Returns the boundaries of the words that match the query.
    pub fn matches(&mut self) -> Vec<MatchBounds> {
        match &self.matches {
            None => self.compute_matches().matches(),
            Some((tokens, matches)) => matches
                .iter()
                .map(|m| MatchBounds {
                    start: tokens[m.get_first_token_pos()].byte_start,
                    // TODO: Why is this in chars, while start is in bytes?
                    length: m.char_count,
                })
                .collect(),
        }
    }

    /// Returns the bounds, as byte indexes, of the crop window.
    fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
        let (
            mut remaining_words,
            is_iterating_forward,
            before_tokens_starting_index,
            after_tokens_starting_index,
        ) = if !matches.is_empty() {
            let [matches_first, matches_last] =
                best_match_interval::find_best_match_interval(matches, crop_size);

            let matches_size =
                matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1;

            let is_crop_size_gte_match_size = crop_size >= matches_size;
            let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size;

            let remaining_words = if is_crop_size_gte_match_size {
                crop_size - matches_size
            } else {
                // in case the matches size is greater than the crop size, which implies there's only one match,
                // we count words backwards, because we have to remove words, as they're extra words outside of
                // the crop window
                matches_size - crop_size
            };

            let after_tokens_starting_index = if matches_size == 0 {
                0
            } else {
                let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1;
                if last_match_last_token_position_plus_one < tokens.len() {
                    last_match_last_token_position_plus_one
                } else {
                    // we have matched the end of possible tokens, there's nothing to advance
                    tokens.len() - 1
                }
            };

            (
                remaining_words,
                is_iterating_forward,
                if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 },
                after_tokens_starting_index,
            )
        } else {
            (crop_size, true, 0, 0)
        };

        // create the initial state of the crop window: 2 iterators starting from the match positions,
        // a reverse iterator starting from the first match token position and going towards the beginning of the text,
        let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable();
        // an iterator ...
        let mut after_tokens = if is_iterating_forward {
            // ... starting from the last match token position and going towards the end of the text.
            Either::Left(tokens[after_tokens_starting_index..].iter().peekable())
        } else {
            // ... starting from the last match token position and going towards the start of the text.
            Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable())
        };

        // grow the crop window, peeking in both directions,
        // until the window contains the right number of words:
        while remaining_words > 0 {
            let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
            let after_token_kind =
                after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new);

            match (before_token_kind, after_token_kind) {
                // we can expand on both sides.
                (Some(before_token_kind), Some(after_token_kind)) => {
                    match (before_token_kind, after_token_kind) {
                        // if they are both separators of the same kind, advance both,
                        // otherwise expand on the soft separator side.
                        (
                            SimpleTokenKind::Separator(before_token_separator_kind),
                            SimpleTokenKind::Separator(after_token_separator_kind),
                        ) => {
                            if before_token_separator_kind == after_token_separator_kind {
                                before_tokens.next();

                                // this avoids having a trailing separator before the crop marker.
                                if remaining_words > 1 {
                                    after_tokens.next();
                                }
                            } else if matches!(before_token_separator_kind, SeparatorKind::Hard) {
                                after_tokens.next();
                            } else {
                                before_tokens.next();
                            }
                        }
                        // if one of the tokens is a word, we expand on the side of the word.
                        // left is a word, advance left.
                        (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
                            before_tokens.next();
                            remaining_words -= 1;
                        }
                        // right is a word, advance right.
                        (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
                            after_tokens.next();
                            remaining_words -= 1;
                        }
                        // both are words, advance left, then right if remaining_words > 0.
                        (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
                            before_tokens.next();
                            remaining_words -= 1;

                            if remaining_words > 0 {
                                after_tokens.next();
                                remaining_words -= 1;
                            }
                        }
                    }
                }
                // the end of the text is reached, advance left.
                (Some(before_token_kind), None) => {
                    before_tokens.next();
                    if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
                        remaining_words -= 1;
                    }
                }
                // the start of the text is reached, advance right.
                (None, Some(after_token_kind)) => {
                    after_tokens.next();
                    if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
                        remaining_words -= 1;
                    }
                }
                // no more tokens to add.
                (None, None) => break,
            }
        }

        // finally, keep the byte index of each bound of the crop window.
        let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
        let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);

        [crop_byte_start, crop_byte_end]
    }

    // Returns the formatted version of the original text.
    pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> {
        if !format_options.highlight && format_options.crop.is_none() {
            // computing matches is not needed if neither highlight nor crop is requested.
            Cow::Borrowed(self.text)
        } else {
            match &self.matches {
                Some((tokens, matches)) => {
                    // If the text has to be cropped, crop around the best interval.
                    let [crop_byte_start, crop_byte_end] = match format_options.crop {
                        Some(crop_size) if crop_size > 0 => {
                            self.crop_bounds(tokens, matches, crop_size)
                        }
                        _ => [0, self.text.len()],
                    };

                    let mut formatted = Vec::new();

                    // push the crop marker if it's not the start of the text.
                    if crop_byte_start > 0 && !self.crop_marker.is_empty() {
                        formatted.push(self.crop_marker);
                    }

                    let mut byte_index = crop_byte_start;

                    if format_options.highlight {
                        // insert highlight markers around matches.
                        for m in matches {
                            let [m_byte_start, m_byte_end] = match m.position {
                                MatchPosition::Word { token_position, .. } => {
                                    let token = &tokens[token_position];
                                    [&token.byte_start, &token.byte_end]
                                }
                                MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
                                    [&tokens[ftp].byte_start, &tokens[ltp].byte_end]
                                }
                            };

                            // skip matches out of the crop window
                            if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end {
                                continue;
                            }

                            // adjust the start and the end to the crop window size
                            let [m_byte_start, m_byte_end] = [
                                max(m_byte_start, &crop_byte_start),
                                min(m_byte_end, &crop_byte_end),
                            ];

                            // push the text positioned before our matches
                            if byte_index < *m_byte_start {
                                formatted.push(&self.text[byte_index..*m_byte_start]);
                            }

                            formatted.push(self.highlight_prefix);

                            // TODO: This is additional work; charabia::token::Token byte_len
                            // should already get us the original byte length, however, that doesn't work as
                            // it's supposed to, investigate why
                            let highlight_byte_index = self.text[*m_byte_start..]
                                .char_indices()
                                .nth(m.char_count)
                                .map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end));
                            formatted.push(&self.text[*m_byte_start..highlight_byte_index]);

                            formatted.push(self.highlight_suffix);

                            // if it's a prefix highlight, we put the end of the word after the highlight marker.
                            if highlight_byte_index < *m_byte_end {
                                formatted.push(&self.text[highlight_byte_index..*m_byte_end]);
                            }

                            byte_index = *m_byte_end;
                        }
                    }

                    // push the rest of the text between the last match and the end of the crop.
                    if byte_index < crop_byte_end {
                        formatted.push(&self.text[byte_index..crop_byte_end]);
                    }

                    // push the crop marker if it's not the end of the text.
                    if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() {
                        formatted.push(self.crop_marker);
                    }

                    if formatted.len() == 1 {
                        // avoid concatenating if there is already a single slice.
                        Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end])
                    } else {
                        Cow::Owned(formatted.concat())
                    }
                }
                None => self.compute_matches().format(format_options),
            }
        }
    }
}
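Continuing the hedged builder sketch above, `matches()` reports where each match sits in the original text; note that `start` is a byte offset while `length` is a char count, per the TODO above:

let mut matcher = builder.build("the world", None);
for MatchBounds { start, length } in matcher.matches() {
    // e.g. a match on "world" would report start == 4 (bytes), length == 5 (chars).
    println!("match at byte {start}, {length} chars long");
}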
#[cfg(test)]
mod tests {
    use charabia::TokenizerBuilder;
    use matching_words::tests::temp_index_with_documents;

    use super::*;
    use crate::index::tests::TempIndex;
    use crate::{execute_search, filtered_universe, SearchContext, TimeBudget};

    impl<'a> MatcherBuilder<'a> {
        fn new_test(rtxn: &'a heed::RoTxn<'a>, index: &'a TempIndex, query: &str) -> Self {
            let mut ctx = SearchContext::new(index, rtxn).unwrap();
            let universe = filtered_universe(ctx.index, ctx.txn, &None).unwrap();
            let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
                &mut ctx,
                Some(query),
                crate::TermsMatchingStrategy::default(),
                crate::score_details::ScoringStrategy::Skip,
                false,
                universe,
                &None,
                &None,
                crate::search::new::GeoSortStrategy::default(),
                0,
                100,
                Some(10),
                &mut crate::DefaultSearchLogger,
                &mut crate::DefaultSearchLogger,
                TimeBudget::max(),
                None,
                None,
            )
            .unwrap();

            // consume the context and located_query_terms to build MatchingWords.
            let matching_words = match located_query_terms {
                Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
                None => MatchingWords::default(),
            };

            MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer())
        }
    }

    #[test]
    fn format_identity() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");

        let format_options = FormatOptions { highlight: false, crop: None };

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text, None);
        // no crop and no highlight should return the complete text.
        assert_eq!(&matcher.format(format_options), &text);

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text, None);
        // no crop and no highlight should return the complete text.
        assert_eq!(&matcher.format(format_options), &text);

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text, None);
        // no crop and no highlight should return the complete text.
        assert_eq!(&matcher.format(format_options), &text);
    }

    #[test]
    fn format_highlight() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");

        let format_options = FormatOptions { highlight: true, crop: None };

        // empty text.
        let text = "";
        let mut matcher = builder.build(text, None);
        assert_eq!(&matcher.format(format_options), "");

        // text containing only separators.
        let text = ":-)";
        let mut matcher = builder.build(text, None);
        assert_eq!(&matcher.format(format_options), ":-)");

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text, None);
        // no crop should return the complete text, because there are no matches.
        assert_eq!(&matcher.format(format_options), &text);

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text, None);
        // no crop should return the complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
        );

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text, None);
        // no crop should return the complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
        );
    }

    #[test]
    fn highlight_unicode() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "world");
        let format_options = FormatOptions { highlight: true, crop: None };

        // Text containing a prefix match.
        let text = "Ŵôřlḑôle";
        let mut matcher = builder.build(text, None);
        // no crop should return the complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>Ŵôřlḑ</em>ôle"
        );

        // Text containing a unicode match.
        let text = "Ŵôřlḑ";
        let mut matcher = builder.build(text, None);
        // no crop should return the complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>Ŵôřlḑ</em>"
        );

        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "westfali");
        let format_options = FormatOptions { highlight: true, crop: None };

        // Text containing a unicode match.
        let text = "Westfália";
        let mut matcher = builder.build(text, None);
        // no crop should return the complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>Westfáli</em>a"
        );
    }

    #[test]
    fn format_crop() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");

        let format_options = FormatOptions { highlight: false, crop: Some(10) };

        // empty text.
        let text = "";
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @""
        );

        // text containing only separators.
        let text = ":-)";
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @":-)"
        );

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text, None);
        // no highlight should return the first 10 words with a marker at the end.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"A quick brown fox can not jump 32 feet, right…"
        );

        // Text without any match, starting with a separator.
        let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
        let mut matcher = builder.build(text, None);
        // no highlight should return the first 10 words with a marker at the end.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"(A quick brown fox can not jump 32 feet, right…"
        );

        // Test phrase propagation
        let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
        let mut matcher = builder.build(text, None);
        // should crop the phrase instead of cropping around the match.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…Split The World is a book written by Emily Henry…"
        );

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text, None);
        // no highlight should return the last 10 words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…future to build a world with the boy she loves…"
        );

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text, None);
        // no highlight should return the last 10 words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…she loves. Emily Henry: The Love That Split The World."
        );

        // Text containing an unordered match and an ordered match.
        let text = "The world split void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text, None);
        // crop should return the last 10 words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void split the world void void"
        );

        // Text containing matches with different densities.
        let text = "split void the void void world void void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text, None);
        // crop should return the last 10 words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void split the world void void"
        );

        // Text containing matches with the same word.
        let text = "split split split split split split void void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text, None);
        // crop should return the last 10 words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void split the world void void"
        );
    }

    #[test]
    fn format_highlight_crop() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");

        let format_options = FormatOptions { highlight: true, crop: Some(10) };

        // empty text.
        let text = "";
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @""
        );

        // text containing only separators.
        let text = ":-)";
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @":-)"
        );

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text, None);
        // both should return the first 10 words with a marker at the end.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"A quick brown fox can not jump 32 feet, right…"
        );

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text, None);
        // both should return the last 10 words with a marker at the start and highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…future to build a <em>world</em> with <em>the</em> boy she loves…"
        );

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text, None);
        // both should return the last 10 words with a marker at the start and highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
        );

        // Text containing an unordered match and an ordered match.
        let text = "The world split void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text, None);
        // crop should return the last 10 words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
        );
    }

    #[test]
    fn format_highlight_crop_phrase_query() {
        //! testing: https://github.com/meilisearch/meilisearch/issues/3975
        let temp_index = TempIndex::new();

        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
        temp_index
            .add_documents(documents!([
                { "id": 1, "text": text }
            ]))
            .unwrap();

        let rtxn = temp_index.read_txn().unwrap();

        let format_options = FormatOptions { highlight: true, crop: Some(10) };

        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
        let mut matcher = builder.build(text, None);
        // should return 10 words with a marker at the start as well as the end, and the highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…the power to split <em>the world</em> between those who embraced…"
        );

        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
        let mut matcher = builder.build(text, None);
        // should highlight "those" and the phrase "and those".
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
        );

        let builder = MatcherBuilder::new_test(
            &rtxn,
            &temp_index,
            "\"The groundbreaking invention had the power to split the world\"",
        );
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>The groundbreaking invention had the power to split the world</em>…"
        );

        let builder = MatcherBuilder::new_test(
            &rtxn,
            &temp_index,
            "\"The groundbreaking invention had the power to split the world between those\"",
        );
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>The groundbreaking invention had the power to split the world</em>…"
        );

        let builder = MatcherBuilder::new_test(
            &rtxn,
            &temp_index,
            "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
        );
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            // TODO: Should include the exclamation mark without crop markers
            @"…between those who <em>embraced progress and those who resisted change</em>…"
        );

        let builder = MatcherBuilder::new_test(
            &rtxn,
            &temp_index,
            "\"groundbreaking invention\" \"split the world between\"",
        );
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
        );

        let builder = MatcherBuilder::new_test(
            &rtxn,
            &temp_index,
            "\"groundbreaking invention\" \"had the power to split the world between those\"",
        );
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…<em>invention</em> <em>had the power to split the world between those</em>…"
        );
    }

    #[test]
    fn smaller_crop_size() {
        //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");

        let text = "void void split the world void void.";

        // set a smaller crop size
        let format_options = FormatOptions { highlight: false, crop: Some(2) };
        let mut matcher = builder.build(text, None);
        // because crop size < query size, partially format the matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…split the…"
        );

        // set a smaller crop size
        let format_options = FormatOptions { highlight: false, crop: Some(1) };
        let mut matcher = builder.build(text, None);
        // because crop size < query size, partially format the matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…split…"
        );

        // set the crop size to 0
        let format_options = FormatOptions { highlight: false, crop: Some(0) };
        let mut matcher = builder.build(text, None);
        // because the crop size is 0, crop is ignored.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"void void split the world void void."
        );
    }

    #[test]
    fn partial_matches() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut builder =
            MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\"");
        builder.highlight_prefix("_".to_string());
        builder.highlight_suffix("_".to_string());

        let format_options = FormatOptions { highlight: true, crop: None };

        let text = "the do or die can't be he do and or isn't he";
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"_the_ _do or_ die can't be he do and or isn'_t he_"
        );
    }
}
15
crates/milli/src/search/new/matches/simple_token_kind.rs
Normal file
@@ -0,0 +1,15 @@
use charabia::{SeparatorKind, Token, TokenKind};

pub enum SimpleTokenKind {
    Separator(SeparatorKind),
    NotSeparator,
}

impl SimpleTokenKind {
    pub fn new(token: &&Token<'_>) -> Self {
        match token.kind {
            TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
            _ => Self::NotSeparator,
        }
    }
}
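
// A minimal usage sketch (added for illustration, not part of the original
// file): counting separator tokens in a tokenized text. The `count_separators`
// helper and its `tokens` iterator are hypothetical.
#[allow(dead_code)]
fn count_separators<'a>(tokens: impl Iterator<Item = &'a Token<'a>>) -> usize {
    tokens
        .filter(|token| matches!(SimpleTokenKind::new(token), SimpleTokenKind::Separator(_)))
        .count()
}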
883
crates/milli/src/search/new/mod.rs
Normal file
@@ -0,0 +1,883 @@
mod bucket_sort;
mod db_cache;
mod distinct;
mod geo_sort;
mod graph_based_ranking_rule;
mod interner;
mod limits;
mod logger;
pub mod matches;
mod query_graph;
mod query_term;
mod ranking_rule_graph;
mod ranking_rules;
mod resolve_query_graph;
mod small_bitmap;

mod exact_attribute;
mod sort;
mod vector_sort;

#[cfg(test)]
mod tests;

use std::collections::HashSet;

use bucket_sort::{bucket_sort, BucketSortOutput};
use charabia::{Language, TokenizerBuilder};
use db_cache::DatabaseCache;
use exact_attribute::ExactAttribute;
use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo};
use heed::RoTxn;
use interner::{DedupInterner, Interner};
pub use logger::visual::VisualSearchLogger;
pub use logger::{DefaultSearchLogger, SearchLogger};
use query_graph::{QueryGraph, QueryNode};
use query_term::{
    located_query_terms_from_tokens, ExtractedTokens, LocatedQueryTerm, Phrase, QueryTerm,
};
use ranking_rules::{
    BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait,
};
use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache};
use roaring::RoaringBitmap;
use sort::Sort;

use self::distinct::facet_string_values;
use self::geo_sort::GeoSort;
pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words;
use self::interner::Interned;
use self::vector_sort::VectorSort;
use crate::localized_attributes_rules::LocalizedFieldIds;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::apply_distinct_rule;
use crate::vector::Embedder;
use crate::{
    AscDesc, DocumentId, FieldId, Filter, Index, Member, Result, TermsMatchingStrategy, TimeBudget,
    UserError, Weight,
};

/// A structure used throughout the execution of a search query.
pub struct SearchContext<'ctx> {
    pub index: &'ctx Index,
    pub txn: &'ctx RoTxn<'ctx>,
    pub db_cache: DatabaseCache<'ctx>,
    pub word_interner: DedupInterner<String>,
    pub phrase_interner: DedupInterner<Phrase>,
    pub term_interner: Interner<QueryTerm>,
    pub phrase_docids: PhraseDocIdsCache,
    pub restricted_fids: Option<RestrictedFids>,
}

impl<'ctx> SearchContext<'ctx> {
    pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Result<Self> {
        let searchable_fids = index.searchable_fields_and_weights(txn)?;
        let exact_attributes_ids = index.exact_attributes_ids(txn)?;

        let mut exact = Vec::new();
        let mut tolerant = Vec::new();
        for (_name, fid, weight) in searchable_fids {
            if exact_attributes_ids.contains(&fid) {
                exact.push((fid, weight));
            } else {
                tolerant.push((fid, weight));
            }
        }

        Ok(Self {
            index,
            txn,
            db_cache: <_>::default(),
            word_interner: <_>::default(),
            phrase_interner: <_>::default(),
            term_interner: <_>::default(),
            phrase_docids: <_>::default(),
            restricted_fids: None,
        })
    }

    pub fn attributes_to_search_on(
        &mut self,
        attributes_to_search_on: &'ctx [String],
    ) -> Result<()> {
        let user_defined_searchable = self.index.user_defined_searchable_fields(self.txn)?;
        let searchable_fields_weights = self.index.searchable_fields_and_weights(self.txn)?;
        let exact_attributes_ids = self.index.exact_attributes_ids(self.txn)?;

        let mut wildcard = false;

        let mut restricted_fids = RestrictedFids::default();
        for field_name in attributes_to_search_on {
            if field_name == "*" {
                wildcard = true;
                // we cannot early exit as we want to return an error in case of unknown fields
                continue;
            }
            let searchable_weight =
                searchable_fields_weights.iter().find(|(name, _, _)| name == field_name);
            let (fid, weight) = match searchable_weight {
                // The field id exists and the field is searchable
                Some((_name, fid, weight)) => (*fid, *weight),
                // The field is not searchable but the user didn't define any searchable attributes
                None if user_defined_searchable.is_none() => continue,
                // The field is not searchable => User error
                None => {
                    let (valid_fields, hidden_fields) = self.index.remove_hidden_fields(
                        self.txn,
                        searchable_fields_weights.iter().map(|(name, _, _)| name),
                    )?;

                    let field = field_name.to_string();
                    return Err(UserError::InvalidSearchableAttribute {
                        field,
                        valid_fields,
                        hidden_fields,
                    }
                    .into());
                }
            };

            if exact_attributes_ids.contains(&fid) {
                restricted_fids.exact.push((fid, weight));
            } else {
                restricted_fids.tolerant.push((fid, weight));
            };
        }

        if wildcard {
            self.restricted_fids = None;
        } else {
            self.restricted_fids = Some(restricted_fids);
        }

        Ok(())
    }
}
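
// An illustrative usage sketch (added for clarity, not part of the original
// file): building a search context and restricting the searchable attributes.
// The caller is assumed to hold an `Index` and a read transaction.
#[allow(dead_code)]
fn search_context_sketch<'ctx>(
    index: &'ctx Index,
    rtxn: &'ctx RoTxn<'ctx>,
    attributes: &'ctx [String],
) -> Result<SearchContext<'ctx>> {
    let mut ctx = SearchContext::new(index, rtxn)?;
    // Restrict the search to the given fields; a "*" entry lifts the restriction.
    ctx.attributes_to_search_on(attributes)?;
    Ok(ctx)
}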

#[derive(Clone, Copy, PartialEq, PartialOrd, Ord, Eq)]
pub enum Word {
    Original(Interned<String>),
    Derived(Interned<String>),
}

impl Word {
    pub fn interned(&self) -> Interned<String> {
        match self {
            Word::Original(word) => *word,
            Word::Derived(word) => *word,
        }
    }
}

#[derive(Debug, Clone, Default)]
pub struct RestrictedFids {
    pub tolerant: Vec<(FieldId, Weight)>,
    pub exact: Vec<(FieldId, Weight)>,
}

impl RestrictedFids {
    pub fn contains(&self, fid: &FieldId) -> bool {
        self.tolerant.iter().any(|(id, _)| id == fid) || self.exact.iter().any(|(id, _)| id == fid)
    }
}

/// Apply the [`TermsMatchingStrategy`] to the query graph and resolve it.
fn resolve_maximally_reduced_query_graph(
    ctx: &mut SearchContext<'_>,
    universe: &RoaringBitmap,
    query_graph: &QueryGraph,
    matching_strategy: TermsMatchingStrategy,
    logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<RoaringBitmap> {
    let mut graph = query_graph.clone();

    let nodes_to_remove = match matching_strategy {
        TermsMatchingStrategy::Last => query_graph
            .removal_order_for_terms_matching_strategy_last(ctx)
            .iter()
            .flat_map(|x| x.iter())
            .collect(),
        TermsMatchingStrategy::Frequency => query_graph
            .removal_order_for_terms_matching_strategy_frequency(ctx)?
            .iter()
            .flat_map(|x| x.iter())
            .collect(),
        TermsMatchingStrategy::All => vec![],
    };
    graph.remove_nodes_keep_edges(&nodes_to_remove);

    logger.query_for_initial_universe(&graph);
    let docids = compute_query_graph_docids(ctx, &graph, universe)?;

    Ok(docids)
}

#[tracing::instrument(level = "trace", skip_all, target = "search::universe")]
fn resolve_universe(
    ctx: &mut SearchContext<'_>,
    initial_universe: &RoaringBitmap,
    query_graph: &QueryGraph,
    matching_strategy: TermsMatchingStrategy,
    logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<RoaringBitmap> {
    resolve_maximally_reduced_query_graph(
        ctx,
        initial_universe,
        query_graph,
        matching_strategy,
        logger,
    )
}

#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
fn resolve_negative_words(
    ctx: &mut SearchContext<'_>,
    universe: Option<&RoaringBitmap>,
    negative_words: &[Word],
) -> Result<RoaringBitmap> {
    let mut negative_bitmap = RoaringBitmap::new();
    for &word in negative_words {
        if let Some(bitmap) = ctx.word_docids(universe, word)? {
            negative_bitmap |= bitmap;
        }
    }
    Ok(negative_bitmap)
}
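// Note (added for clarity, not in the original file): callers subtract the
// bitmap returned above from the search universe (see `execute_search` below),
// which is what implements the `-word` negative-search operator.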

#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
fn resolve_negative_phrases(
    ctx: &mut SearchContext<'_>,
    negative_phrases: &[LocatedQueryTerm],
) -> Result<RoaringBitmap> {
    let mut negative_bitmap = RoaringBitmap::new();
    for term in negative_phrases {
        let query_term = ctx.term_interner.get(term.value);
        if let Some(phrase) = query_term.original_phrase() {
            negative_bitmap |= ctx.get_phrase_docids(phrase)?;
        }
    }
    Ok(negative_bitmap)
}

/// Return the list of initialised ranking rules to be used for a placeholder search.
fn get_ranking_rules_for_placeholder_search<'ctx>(
    ctx: &SearchContext<'ctx>,
    sort_criteria: &Option<Vec<AscDesc>>,
    geo_strategy: geo_sort::Strategy,
) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
    let mut sort = false;
    let mut sorted_fields = HashSet::new();
    let mut geo_sorted = false;
    let mut ranking_rules: Vec<BoxRankingRule<'ctx, PlaceholderQuery>> = vec![];
    let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
    for rr in settings_ranking_rules {
        match rr {
            // These rules need a query to have an effect; ignore them in placeholder search
            crate::Criterion::Words
            | crate::Criterion::Typo
            | crate::Criterion::Attribute
            | crate::Criterion::Proximity
            | crate::Criterion::Exactness => continue,
            crate::Criterion::Sort => {
                if sort {
                    continue;
                }
                resolve_sort_criteria(
                    sort_criteria,
                    ctx,
                    &mut ranking_rules,
                    &mut sorted_fields,
                    &mut geo_sorted,
                    geo_strategy,
                )?;
                sort = true;
            }
            crate::Criterion::Asc(field_name) => {
                if sorted_fields.contains(&field_name) {
                    continue;
                }
                sorted_fields.insert(field_name.clone());
                ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
            }
            crate::Criterion::Desc(field_name) => {
                if sorted_fields.contains(&field_name) {
                    continue;
                }
                sorted_fields.insert(field_name.clone());
                ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
            }
        }
    }
    Ok(ranking_rules)
}
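// Note (added for clarity, not in the original file): only the first `Sort`
// criterion is honoured, and later `Asc`/`Desc` criteria on a field that was
// already sorted on are skipped, so each field contributes at most one rule.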

#[allow(clippy::too_many_arguments)]
fn get_ranking_rules_for_vector<'ctx>(
    ctx: &SearchContext<'ctx>,
    sort_criteria: &Option<Vec<AscDesc>>,
    geo_strategy: geo_sort::Strategy,
    limit_plus_offset: usize,
    target: &[f32],
    embedder_name: &str,
    embedder: &Embedder,
    quantized: bool,
) -> Result<Vec<BoxRankingRule<'ctx, PlaceholderQuery>>> {
    // vector search

    let mut sort = false;
    let mut sorted_fields = HashSet::new();
    let mut geo_sorted = false;

    let mut vector = false;
    let mut ranking_rules: Vec<BoxRankingRule<'ctx, PlaceholderQuery>> = vec![];

    let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
    for rr in settings_ranking_rules {
        match rr {
            crate::Criterion::Words
            | crate::Criterion::Typo
            | crate::Criterion::Proximity
            | crate::Criterion::Attribute
            | crate::Criterion::Exactness => {
                if !vector {
                    let vector_candidates = ctx.index.documents_ids(ctx.txn)?;
                    let vector_sort = VectorSort::new(
                        ctx,
                        target.to_vec(),
                        vector_candidates,
                        limit_plus_offset,
                        embedder_name,
                        embedder,
                        quantized,
                    )?;
                    ranking_rules.push(Box::new(vector_sort));
                    vector = true;
                }
            }
            crate::Criterion::Sort => {
                if sort {
                    continue;
                }
                resolve_sort_criteria(
                    sort_criteria,
                    ctx,
                    &mut ranking_rules,
                    &mut sorted_fields,
                    &mut geo_sorted,
                    geo_strategy,
                )?;
                sort = true;
            }
            crate::Criterion::Asc(field_name) => {
                if sorted_fields.contains(&field_name) {
                    continue;
                }
                sorted_fields.insert(field_name.clone());
                ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
            }
            crate::Criterion::Desc(field_name) => {
                if sorted_fields.contains(&field_name) {
                    continue;
                }
                sorted_fields.insert(field_name.clone());
                ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
            }
        }
    }

    Ok(ranking_rules)
}
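// Note (added for clarity, not in the original file): in vector search the
// first word-based relevance criterion encountered is replaced by a single
// `VectorSort` rule, and every later relevance criterion is absorbed by the
// `vector` flag, so similarity to the target embedding drives the ranking.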

/// Return the list of initialised ranking rules to be used for a query graph search.
fn get_ranking_rules_for_query_graph_search<'ctx>(
    ctx: &SearchContext<'ctx>,
    sort_criteria: &Option<Vec<AscDesc>>,
    geo_strategy: geo_sort::Strategy,
    terms_matching_strategy: TermsMatchingStrategy,
) -> Result<Vec<BoxRankingRule<'ctx, QueryGraph>>> {
    // query graph search
    let mut words = false;
    let mut typo = false;
    let mut proximity = false;
    let mut sort = false;
    let mut attribute = false;
    let mut exactness = false;
    let mut sorted_fields = HashSet::new();
    let mut geo_sorted = false;

    // Don't add the `words` ranking rule if the term matching strategy is `All`
    if matches!(terms_matching_strategy, TermsMatchingStrategy::All) {
        words = true;
    }

    let mut ranking_rules: Vec<BoxRankingRule<'ctx, QueryGraph>> = vec![];
    let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
    for rr in settings_ranking_rules {
        // Add Words before any of: typo, proximity, attribute, exactness
        match rr {
            crate::Criterion::Typo
            | crate::Criterion::Attribute
            | crate::Criterion::Proximity
            | crate::Criterion::Exactness => {
                if !words {
                    ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
                    words = true;
                }
            }
            _ => {}
        }
        match rr {
            crate::Criterion::Words => {
                if words {
                    continue;
                }
                ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
                words = true;
            }
            crate::Criterion::Typo => {
                if typo {
                    continue;
                }
                typo = true;
                ranking_rules.push(Box::new(Typo::new(None)));
            }
            crate::Criterion::Proximity => {
                if proximity {
                    continue;
                }
                proximity = true;
                ranking_rules.push(Box::new(Proximity::new(None)));
            }
            crate::Criterion::Attribute => {
                if attribute {
                    continue;
                }
                attribute = true;
                ranking_rules.push(Box::new(Fid::new(None)));
                ranking_rules.push(Box::new(Position::new(None)));
            }
            crate::Criterion::Sort => {
                if sort {
                    continue;
                }
                resolve_sort_criteria(
                    sort_criteria,
                    ctx,
                    &mut ranking_rules,
                    &mut sorted_fields,
                    &mut geo_sorted,
                    geo_strategy,
                )?;
                sort = true;
            }
            crate::Criterion::Exactness => {
                if exactness {
                    continue;
                }
                ranking_rules.push(Box::new(ExactAttribute::new()));
                ranking_rules.push(Box::new(Exactness::new()));
                exactness = true;
            }
            crate::Criterion::Asc(field_name) => {
                if sorted_fields.contains(&field_name) {
                    continue;
                }
                sorted_fields.insert(field_name.clone());
                ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
            }
            crate::Criterion::Desc(field_name) => {
                if sorted_fields.contains(&field_name) {
                    continue;
                }
                sorted_fields.insert(field_name.clone());
                ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
            }
        }
    }
    Ok(ranking_rules)
}
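// Note (added for clarity, not in the original file): a single `Attribute`
// criterion expands into two rules (`Fid` then `Position`) and `Exactness`
// into two as well (`ExactAttribute` then `Exactness`), which is why the
// resulting list can be longer than the configured criteria.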

fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
    sort_criteria: &Option<Vec<AscDesc>>,
    ctx: &SearchContext<'ctx>,
    ranking_rules: &mut Vec<BoxRankingRule<'ctx, Query>>,
    sorted_fields: &mut HashSet<String>,
    geo_sorted: &mut bool,
    geo_strategy: geo_sort::Strategy,
) -> Result<()> {
    let sort_criteria = sort_criteria.clone().unwrap_or_default();
    ranking_rules.reserve(sort_criteria.len());
    for criterion in sort_criteria {
        match criterion {
            AscDesc::Asc(Member::Field(field_name)) => {
                if sorted_fields.contains(&field_name) {
                    continue;
                }
                sorted_fields.insert(field_name.clone());
                ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, true)?));
            }
            AscDesc::Desc(Member::Field(field_name)) => {
                if sorted_fields.contains(&field_name) {
                    continue;
                }
                sorted_fields.insert(field_name.clone());
                ranking_rules.push(Box::new(Sort::new(ctx.index, ctx.txn, field_name, false)?));
            }
            AscDesc::Asc(Member::Geo(point)) => {
                if *geo_sorted {
                    continue;
                }
                let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?;
                ranking_rules.push(Box::new(GeoSort::new(
                    geo_strategy,
                    geo_faceted_docids,
                    point,
                    true,
                )?));
            }
            AscDesc::Desc(Member::Geo(point)) => {
                if *geo_sorted {
                    continue;
                }
                let geo_faceted_docids = ctx.index.geo_faceted_documents_ids(ctx.txn)?;
                ranking_rules.push(Box::new(GeoSort::new(
                    geo_strategy,
                    geo_faceted_docids,
                    point,
                    false,
                )?));
            }
        };
    }
    Ok(())
}
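// Note (added for clarity, not in the original file): user-supplied sort
// criteria are resolved here in order, so a request such as the hypothetical
// `sort: ["price:asc"]` yields one `Sort` rule per field, while a
// `_geoPoint(..)` criterion yields a `GeoSort` rule instead.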

#[tracing::instrument(level = "trace", skip_all, target = "search::universe")]
pub fn filtered_universe(
    index: &Index,
    txn: &RoTxn<'_>,
    filters: &Option<Filter<'_>>,
) -> Result<RoaringBitmap> {
    Ok(if let Some(filters) = filters {
        filters.evaluate(txn, index)?
    } else {
        index.documents_ids(txn)?
    })
}
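
// An illustrative sketch (added for clarity, not part of the original file):
// with no filter the universe is simply every document in the index, which
// the assertion below makes explicit.
#[allow(dead_code)]
fn filtered_universe_sketch(index: &Index, txn: &RoTxn<'_>) -> Result<()> {
    let universe = filtered_universe(index, txn, &None)?;
    assert_eq!(universe, index.documents_ids(txn)?);
    Ok(())
}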

#[allow(clippy::too_many_arguments)]
pub fn execute_vector_search(
    ctx: &mut SearchContext<'_>,
    vector: &[f32],
    scoring_strategy: ScoringStrategy,
    universe: RoaringBitmap,
    sort_criteria: &Option<Vec<AscDesc>>,
    distinct: &Option<String>,
    geo_strategy: geo_sort::Strategy,
    from: usize,
    length: usize,
    embedder_name: &str,
    embedder: &Embedder,
    quantized: bool,
    time_budget: TimeBudget,
    ranking_score_threshold: Option<f64>,
) -> Result<PartialSearchResult> {
    check_sort_criteria(ctx, sort_criteria.as_ref())?;

    // FIXME: input universe = universe & documents_with_vectors
    // for now if we're computing embeddings for ALL documents, we can assume that this is just universe
    let ranking_rules = get_ranking_rules_for_vector(
        ctx,
        sort_criteria,
        geo_strategy,
        from + length,
        vector,
        embedder_name,
        embedder,
        quantized,
    )?;

    let mut placeholder_search_logger = logger::DefaultSearchLogger;
    let placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery> =
        &mut placeholder_search_logger;

    let BucketSortOutput { docids, scores, all_candidates, degraded } = bucket_sort(
        ctx,
        ranking_rules,
        &PlaceholderQuery,
        distinct.as_deref(),
        &universe,
        from,
        length,
        scoring_strategy,
        placeholder_search_logger,
        time_budget,
        ranking_score_threshold,
    )?;

    Ok(PartialSearchResult {
        candidates: all_candidates,
        document_scores: scores,
        documents_ids: docids,
        located_query_terms: None,
        degraded,
        used_negative_operator: false,
    })
}

#[allow(clippy::too_many_arguments)]
#[tracing::instrument(level = "trace", skip_all, target = "search::main")]
pub fn execute_search(
    ctx: &mut SearchContext<'_>,
    query: Option<&str>,
    terms_matching_strategy: TermsMatchingStrategy,
    scoring_strategy: ScoringStrategy,
    exhaustive_number_hits: bool,
    mut universe: RoaringBitmap,
    sort_criteria: &Option<Vec<AscDesc>>,
    distinct: &Option<String>,
    geo_strategy: geo_sort::Strategy,
    from: usize,
    length: usize,
    words_limit: Option<usize>,
    placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
    query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
    time_budget: TimeBudget,
    ranking_score_threshold: Option<f64>,
    locales: Option<&Vec<Language>>,
) -> Result<PartialSearchResult> {
    check_sort_criteria(ctx, sort_criteria.as_ref())?;

    let mut used_negative_operator = false;
    let mut located_query_terms = None;
    let query_terms = if let Some(query) = query {
        let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder");
        let entered = span.enter();

        // We make sure that the analyzer is aware of the stop words
        // this ensures that the query builder is able to properly remove them.
        let mut tokbuilder = TokenizerBuilder::new();
        let stop_words = ctx.index.stop_words(ctx.txn)?;
        if let Some(ref stop_words) = stop_words {
            tokbuilder.stop_words(stop_words);
        }

        let separators = ctx.index.allowed_separators(ctx.txn)?;
        let separators: Option<Vec<_>> =
            separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
        if let Some(ref separators) = separators {
            tokbuilder.separators(separators);
        }

        let dictionary = ctx.index.dictionary(ctx.txn)?;
        let dictionary: Option<Vec<_>> =
            dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
        if let Some(ref dictionary) = dictionary {
            tokbuilder.words_dict(dictionary);
        }

        let db_locales;
        match locales {
            Some(locales) => {
                if !locales.is_empty() {
                    tokbuilder.allow_list(locales);
                }
            }
            None => {
                // If no locales are specified, we use the locales specified in the localized attributes rules
                let localized_attributes_rules = ctx.index.localized_attributes_rules(ctx.txn)?;
                let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
                let searchable_fields = ctx.index.searchable_fields_ids(ctx.txn)?;

                let localized_fields = match &ctx.restricted_fids {
                    // if AttributeToSearchOn is set, use the restricted list of ids
                    Some(restricted_fids) => {
                        let iter = restricted_fids
                            .exact
                            .iter()
                            .chain(restricted_fids.tolerant.iter())
                            .map(|(fid, _)| *fid);

                        LocalizedFieldIds::new(&localized_attributes_rules, &fields_ids_map, iter)
                    }
                    // Otherwise use the full list of ids coming from the index searchable fields
                    None => LocalizedFieldIds::new(
                        &localized_attributes_rules,
                        &fields_ids_map,
                        searchable_fields.into_iter(),
                    ),
                };

                db_locales = localized_fields.all_locales();
                if !db_locales.is_empty() {
                    tokbuilder.allow_list(&db_locales);
                }
            }
        };

        let tokenizer = tokbuilder.build();
        drop(entered);

        let span = tracing::trace_span!(target: "search::tokens", "tokenize");
        let entered = span.enter();
        let tokens = tokenizer.tokenize(query);
        drop(entered);

        let ExtractedTokens { query_terms, negative_words, negative_phrases } =
            located_query_terms_from_tokens(ctx, tokens, words_limit)?;
        used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty();

        let ignored_documents = resolve_negative_words(ctx, Some(&universe), &negative_words)?;
        let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?;

        universe -= ignored_documents;
        universe -= ignored_phrases;

        if query_terms.is_empty() {
            // Do a placeholder search instead
            None
        } else {
            Some(query_terms)
        }
    } else {
        None
    };

    let bucket_sort_output = if let Some(query_terms) = query_terms {
        let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?;
        located_query_terms = Some(new_located_query_terms);

        let ranking_rules = get_ranking_rules_for_query_graph_search(
            ctx,
            sort_criteria,
            geo_strategy,
            terms_matching_strategy,
        )?;

        universe &=
            resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?;

        bucket_sort(
            ctx,
            ranking_rules,
            &graph,
            distinct.as_deref(),
            &universe,
            from,
            length,
            scoring_strategy,
            query_graph_logger,
            time_budget,
            ranking_score_threshold,
        )?
    } else {
        let ranking_rules =
            get_ranking_rules_for_placeholder_search(ctx, sort_criteria, geo_strategy)?;
        bucket_sort(
            ctx,
            ranking_rules,
            &PlaceholderQuery,
            distinct.as_deref(),
            &universe,
            from,
            length,
            scoring_strategy,
            placeholder_search_logger,
            time_budget,
            ranking_score_threshold,
        )?
    };

    let BucketSortOutput { docids, scores, mut all_candidates, degraded } = bucket_sort_output;
    let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;

    // The candidates are the universe unless the exhaustive number of hits
    // is requested and a distinct attribute is set.
    if exhaustive_number_hits {
        let distinct_field = match distinct.as_deref() {
            Some(distinct) => Some(distinct),
            None => ctx.index.distinct_field(ctx.txn)?,
        };

        if let Some(f) = distinct_field {
            if let Some(distinct_fid) = fields_ids_map.id(f) {
                all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
            }
        }
    }

    Ok(PartialSearchResult {
        candidates: all_candidates,
        document_scores: scores,
        documents_ids: docids,
        located_query_terms,
        degraded,
        used_negative_operator,
    })
}

fn check_sort_criteria(
    ctx: &SearchContext<'_>,
    sort_criteria: Option<&Vec<AscDesc>>,
) -> Result<()> {
    let sort_criteria = if let Some(sort_criteria) = sort_criteria {
        sort_criteria
    } else {
        return Ok(());
    };

    if sort_criteria.is_empty() {
        return Ok(());
    }

    // We check that the sort ranking rule exists and return an
    // error if we try to sort while it is missing.
    let sort_ranking_rule_missing = !ctx.index.criteria(ctx.txn)?.contains(&crate::Criterion::Sort);
    if sort_ranking_rule_missing {
        return Err(UserError::SortRankingRuleMissing.into());
    }

    // We check that we are allowed to use the sort criteria, we check
    // that they are declared in the sortable fields.
    let sortable_fields = ctx.index.sortable_fields(ctx.txn)?;
    for asc_desc in sort_criteria {
        match asc_desc.member() {
            Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => {
                let (valid_fields, hidden_fields) =
                    ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?;

                return Err(UserError::InvalidSortableAttribute {
                    field: field.to_string(),
                    valid_fields,
                    hidden_fields,
                }
                .into());
            }
            Member::Geo(_) if !sortable_fields.contains("_geo") => {
                let (valid_fields, hidden_fields) =
                    ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?;

                return Err(UserError::InvalidSortableAttribute {
                    field: "_geo".to_string(),
                    valid_fields,
                    hidden_fields,
                }
                .into());
            }
            _ => (),
        }
    }

    Ok(())
}
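// Note (added for clarity, not in the original file): validation rejects a
// sort request in two cases: the `sort` ranking rule is absent from the
// configured criteria, or a sorted field (or `_geo`) is not declared sortable.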

pub struct PartialSearchResult {
    pub located_query_terms: Option<Vec<LocatedQueryTerm>>,
    pub candidates: RoaringBitmap,
    pub documents_ids: Vec<DocumentId>,
    pub document_scores: Vec<Vec<ScoreDetails>>,

    pub degraded: bool,
    pub used_negative_operator: bool,
}
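
// An illustrative sketch (added for clarity, not part of the original file):
// `documents_ids` and `document_scores` are parallel vectors, so callers can
// zip them to pair each returned document with its score details.
#[allow(dead_code)]
fn zip_results_sketch(result: PartialSearchResult) -> Vec<(DocumentId, Vec<ScoreDetails>)> {
    result.documents_ids.into_iter().zip(result.document_scores).collect()
}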
536
crates/milli/src/search/new/query_graph.rs
Normal file
@@ -0,0 +1,536 @@
use std::cmp::{Ordering, Reverse};
use std::collections::BTreeMap;
use std::hash::{Hash, Hasher};

use fxhash::{FxHashMap, FxHasher};
use roaring::RoaringBitmap;

use super::interner::{FixedSizeInterner, Interned};
use super::query_term::{
    self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset,
};
use super::small_bitmap::SmallBitmap;
use super::SearchContext;
use crate::search::new::interner::Interner;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::Result;

/// A node of the [`QueryGraph`].
///
/// There are four types of nodes:
/// 1. `Start` : unique, represents the start of the query
/// 2. `End` : unique, represents the end of a query
/// 3. `Deleted` : represents a node that was deleted.
///    All deleted nodes are unreachable from the start node.
/// 4. `Term` is a regular node representing a word or combination of words
///    from the user query.
#[derive(Clone)]
pub struct QueryNode {
    pub data: QueryNodeData,
    pub predecessors: SmallBitmap<QueryNode>,
    pub successors: SmallBitmap<QueryNode>,
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum QueryNodeData {
    Term(LocatedQueryTermSubset),
    Deleted,
    Start,
    End,
}

/**
A graph representing all the ways to interpret the user's search query.

## Example 1
For the search query `sunflower`, we need to register the following things:
- we need to look for the exact word `sunflower`
- but also any word which is 1 or 2 typos apart from `sunflower`
- and every word that contains the prefix `sunflower`
- and also the couple of adjacent words `sun flower`
- as well as all the user-defined synonyms of `sunflower`

All these derivations of a word will be stored in [`QueryTerm`].

## Example 2:
For the search query `summer house by`.

We also look for all word derivations of each term. And we also need to consider
the potential n-grams `summerhouse`, `summerhouseby`, and `houseby`.
Furthermore, we need to know which words these ngrams replace. This is done by creating the
following graph, where each node also contains a list of derivations:
```txt
                          ┌───────┐
                       ┌──│houseby│─────────┐
                       │  └───────┘         │
┌───────┐   ┌───────┐  │  ┌───────┐  ┌────┐ │ ┌───────┐
│ START │─┬─│summer │──┴──│ house │┌─│ by │─┼─│  END  │
└───────┘ │ └───────┘     └───────┘│ └────┘ │ └───────┘
          │ ┌────────────┐         │        │
          ├─│summerhouse │─────────┘        │
          │ └────────────┘                  │
          │        ┌─────────────┐          │
          └────────│summerhouseby│──────────┘
                   └─────────────┘
```
Note also that each node has a range of positions associated with it,
such that `summer` is known to be a word at the positions `0..=0` and `houseby`
is registered with the positions `1..=2`. When two nodes are connected by an edge,
it means that they are potentially next to each other in the user's search query
(depending on the [`TermsMatchingStrategy`](crate::search::TermsMatchingStrategy)
and the transformations that were done on the query graph).
*/
#[derive(Clone)]
pub struct QueryGraph {
    /// The index of the start node within `self.nodes`
    pub root_node: Interned<QueryNode>,
    /// The index of the end node within `self.nodes`
    pub end_node: Interned<QueryNode>,
    /// The list of all query nodes
    pub nodes: FixedSizeInterner<QueryNode>,
}

impl QueryGraph {
    /// Build the query graph from the parsed user search query, return an updated list of the located query terms
    /// which contains ngrams.
    pub fn from_query(
        ctx: &mut SearchContext<'_>,
        // The terms here must be consecutive
        terms: &[LocatedQueryTerm],
    ) -> Result<(QueryGraph, Vec<LocatedQueryTerm>)> {
        let mut new_located_query_terms = terms.to_vec();

        let nbr_typos = number_of_typos_allowed(ctx)?;

        let mut nodes_data: Vec<QueryNodeData> = vec![QueryNodeData::Start, QueryNodeData::End];
        let root_node = 0;
        let end_node = 1;

        // We could consider generalizing to 4, 5, 6, 7, etc. ngrams
        let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
            (vec![], vec![], vec![root_node]);
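        // Descriptive note (added for clarity, not in the original file):
        // `prev0` holds the nodes created for the previous position, `prev1`
        // and `prev2` those of the two positions before that; they seed the
        // 2-gram and 3-gram nodes below and are rotated on every iteration.
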
        let original_terms_len = terms.len();
        for term_idx in 0..original_terms_len {
            let mut new_nodes = vec![];

            let new_node_idx = add_node(
                &mut nodes_data,
                QueryNodeData::Term(LocatedQueryTermSubset {
                    term_subset: QueryTermSubset::full(terms[term_idx].value),
                    positions: terms[term_idx].positions.clone(),
                    term_ids: term_idx as u8..=term_idx as u8,
                }),
            );
            new_nodes.push(new_node_idx);

            if !prev1.is_empty() {
                if let Some(ngram) =
                    query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)?
                {
                    new_located_query_terms.push(ngram.clone());
                    let ngram_idx = add_node(
                        &mut nodes_data,
                        QueryNodeData::Term(LocatedQueryTermSubset {
                            term_subset: QueryTermSubset::full(ngram.value),
                            positions: ngram.positions,
                            term_ids: term_idx as u8 - 1..=term_idx as u8,
                        }),
                    );
                    new_nodes.push(ngram_idx);
                }
            }
            if !prev2.is_empty() {
                if let Some(ngram) =
                    query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)?
                {
                    new_located_query_terms.push(ngram.clone());
                    let ngram_idx = add_node(
                        &mut nodes_data,
                        QueryNodeData::Term(LocatedQueryTermSubset {
                            term_subset: QueryTermSubset::full(ngram.value),
                            positions: ngram.positions,
                            term_ids: term_idx as u8 - 2..=term_idx as u8,
                        }),
                    );
                    new_nodes.push(ngram_idx);
                }
            }
            (prev0, prev1, prev2) = (new_nodes, prev0, prev1);
        }

        let root_node = Interned::from_raw(root_node);
        let end_node = Interned::from_raw(end_node);
        let mut nodes = FixedSizeInterner::new(
            nodes_data.len() as u16,
            QueryNode {
                data: QueryNodeData::Deleted,
                predecessors: SmallBitmap::new(nodes_data.len() as u16),
                successors: SmallBitmap::new(nodes_data.len() as u16),
            },
        );
        for (node_idx, node_data) in nodes_data.into_iter().enumerate() {
            let node = nodes.get_mut(Interned::from_raw(node_idx as u16));
            node.data = node_data;
        }
        let mut graph = QueryGraph { root_node, end_node, nodes };
        graph.build_initial_edges();

        Ok((graph, new_located_query_terms))
    }

    /// Remove the given nodes, connecting all their predecessors to all their successors.
    pub fn remove_nodes_keep_edges(&mut self, nodes: &[Interned<QueryNode>]) {
        for &node_id in nodes {
            let node = self.nodes.get(node_id);
            let old_node_pred = node.predecessors.clone();
            let old_node_succ = node.successors.clone();
            for pred in old_node_pred.iter() {
                let pred_successors = &mut self.nodes.get_mut(pred).successors;
                pred_successors.remove(node_id);
                pred_successors.union(&old_node_succ);
            }
            for succ in old_node_succ.iter() {
                let succ_predecessors = &mut self.nodes.get_mut(succ).predecessors;
                succ_predecessors.remove(node_id);
                succ_predecessors.union(&old_node_pred);
            }
            let node = self.nodes.get_mut(node_id);
            node.data = QueryNodeData::Deleted;
            node.predecessors.clear();
            node.successors.clear();
        }
    }

    /// Remove the given nodes and all their edges from the query graph.
    pub fn remove_nodes(&mut self, nodes: &[Interned<QueryNode>]) {
        for &node_id in nodes {
            let node = &self.nodes.get(node_id);
            let old_node_pred = node.predecessors.clone();
            let old_node_succ = node.successors.clone();

            for pred in old_node_pred.iter() {
                self.nodes.get_mut(pred).successors.remove(node_id);
            }
            for succ in old_node_succ.iter() {
                self.nodes.get_mut(succ).predecessors.remove(node_id);
            }

            let node = self.nodes.get_mut(node_id);
            node.data = QueryNodeData::Deleted;
            node.predecessors.clear();
            node.successors.clear();
        }
    }
    /// Simplify the query graph by removing all nodes that are disconnected from
    /// the start or end nodes.
    pub fn simplify(&mut self) {
        loop {
            let mut nodes_to_remove = vec![];
            for (node_idx, node) in self.nodes.iter() {
                if (!matches!(node.data, QueryNodeData::End | QueryNodeData::Deleted)
                    && node.successors.is_empty())
                    || (!matches!(node.data, QueryNodeData::Start | QueryNodeData::Deleted)
                        && node.predecessors.is_empty())
                {
                    nodes_to_remove.push(node_idx);
                }
            }
            if nodes_to_remove.is_empty() {
                break;
            } else {
                self.remove_nodes(&nodes_to_remove);
            }
        }
    }

    fn build_initial_edges(&mut self) {
        for (_, node) in self.nodes.iter_mut() {
            node.successors.clear();
            node.predecessors.clear();
        }
        for node_id in self.nodes.indexes() {
            let node = self.nodes.get(node_id);
            let end_prev_term_id = match &node.data {
                QueryNodeData::Term(term) => *term.term_ids.end() as i16,
                QueryNodeData::Start => -1,
                QueryNodeData::Deleted => continue,
                QueryNodeData::End => continue,
            };
            let successors = {
                let mut successors = SmallBitmap::for_interned_values_in(&self.nodes);
                let mut min = i16::MAX;
                for (node_id, node) in self.nodes.iter() {
                    let start_next_term_id = match &node.data {
                        QueryNodeData::Term(term) => *term.term_ids.start() as i16,
                        QueryNodeData::End => i16::MAX,
                        QueryNodeData::Start => continue,
                        QueryNodeData::Deleted => continue,
                    };
                    if start_next_term_id <= end_prev_term_id {
                        continue;
                    }
                    match start_next_term_id.cmp(&min) {
                        Ordering::Less => {
                            min = start_next_term_id;
                            successors.clear();
                            successors.insert(node_id);
                        }
                        Ordering::Equal => {
                            successors.insert(node_id);
                        }
                        Ordering::Greater => continue,
                    }
                }
                successors
            };
            let node = self.nodes.get_mut(node_id);
            node.successors = successors.clone();
            for successor in successors.iter() {
                let successor = self.nodes.get_mut(successor);
                successor.predecessors.insert(node_id);
            }
        }
    }

    pub fn removal_order_for_terms_matching_strategy_frequency(
        &self,
        ctx: &mut SearchContext<'_>,
    ) -> Result<Vec<SmallBitmap<QueryNode>>> {
        // lookup frequency for each term
        let mut term_with_frequency: Vec<(u8, u64)> = {
            let mut term_docids: BTreeMap<u8, RoaringBitmap> = Default::default();
            for (_, node) in self.nodes.iter() {
                match &node.data {
                    QueryNodeData::Term(t) => {
                        let docids = compute_query_term_subset_docids(ctx, None, &t.term_subset)?;
                        for id in t.term_ids.clone() {
                            term_docids
                                .entry(id)
                                .and_modify(|curr| *curr |= &docids)
                                .or_insert_with(|| docids.clone());
                        }
                    }
                    QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
                }
            }
            term_docids
                .into_iter()
                .map(|(idx, docids)| match docids.len() {
                    0 => (idx, u64::MAX),
                    frequency => (idx, frequency),
                })
                .collect()
        };
        term_with_frequency.sort_by_key(|(_, frequency)| Reverse(*frequency));
        let mut term_weight = BTreeMap::new();
        let mut weight: u16 = 1;
        let mut peekable = term_with_frequency.into_iter().peekable();
        while let Some((idx, frequency)) = peekable.next() {
            term_weight.insert(idx, weight);
            if peekable.peek().map_or(false, |(_, f)| frequency != *f) {
                weight += 1;
            }
        }
        let cost_of_term_idx = move |term_idx: u8| *term_weight.get(&term_idx).unwrap();
        Ok(self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx))
    }

    pub fn removal_order_for_terms_matching_strategy_last(
        &self,
        ctx: &SearchContext<'_>,
    ) -> Vec<SmallBitmap<QueryNode>> {
        let (first_term_idx, last_term_idx) = {
            let mut first_term_idx = u8::MAX;
            let mut last_term_idx = 0u8;
            for (_, node) in self.nodes.iter() {
                match &node.data {
                    QueryNodeData::Term(t) => {
                        if *t.term_ids.end() > last_term_idx {
                            last_term_idx = *t.term_ids.end();
                        }
                        if *t.term_ids.start() < first_term_idx {
                            first_term_idx = *t.term_ids.start();
                        }
                    }
                    QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
                }
            }
            (first_term_idx, last_term_idx)
        };
        if first_term_idx >= last_term_idx {
            return vec![];
        }

        let cost_of_term_idx = |term_idx: u8| {
            let rank = 1 + last_term_idx - term_idx;
            rank as u16
        };
        self.removal_order_for_terms_matching_strategy(ctx, cost_of_term_idx)
    }
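    // Note (added for clarity, not in the original file): with the `last`
    // strategy above, the cost of a term is `1 + last_term_idx - term_idx`,
    // so for terms 0..=3 the costs are 4, 3, 2, 1: the last word of the
    // query is the cheapest to remove and is therefore dropped first.
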
    pub fn removal_order_for_terms_matching_strategy(
        &self,
        ctx: &SearchContext<'_>,
        order: impl Fn(u8) -> u16,
    ) -> Vec<SmallBitmap<QueryNode>> {
        let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
        let mut at_least_one_mandatory_term = false;
        for (node_id, node) in self.nodes.iter() {
            let QueryNodeData::Term(t) = &node.data else { continue };
            if t.term_subset.original_phrase(ctx).is_some() || t.term_subset.is_mandatory() {
                at_least_one_mandatory_term = true;
                continue;
            }
            let mut cost = 0;
            for id in t.term_ids.clone() {
                cost = std::cmp::max(cost, order(id));
            }
            nodes_to_remove
                .entry(cost)
                .or_insert_with(|| SmallBitmap::for_interned_values_in(&self.nodes))
                .insert(node_id);
        }
        let mut res: Vec<_> = nodes_to_remove.into_values().collect();
        if !at_least_one_mandatory_term {
            res.pop();
        }
        res
    }

    /// Number of words in the phrases in this query graph
    pub(crate) fn words_in_phrases_count(&self, ctx: &SearchContext<'_>) -> usize {
        let mut word_count = 0;
        for (_, node) in self.nodes.iter() {
            match &node.data {
                QueryNodeData::Term(term) => {
                    let Some(phrase) = term.term_subset.original_phrase(ctx) else { continue };
                    let phrase = ctx.phrase_interner.get(phrase);
                    word_count += phrase.words.iter().copied().filter(|a| a.is_some()).count()
                }
                _ => continue,
            }
        }
        word_count
    }
}

fn add_node(nodes_data: &mut Vec<QueryNodeData>, node_data: QueryNodeData) -> u16 {
    let new_node_idx = nodes_data.len() as u16;
    nodes_data.push(node_data);
    new_node_idx
}
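
// An illustrative sketch (added for clarity, not part of the original file):
// the start and end nodes always occupy indexes 0 and 1 in `nodes_data`, so
// the first node appended through `add_node` is assigned index 2.
#[allow(dead_code)]
fn add_node_sketch() {
    let mut nodes_data = vec![QueryNodeData::Start, QueryNodeData::End];
    let first_added = add_node(&mut nodes_data, QueryNodeData::Deleted);
    assert_eq!(first_added, 2);
}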

impl QueryGraph {
    /*
    Build a query graph from a list of paths

    The paths are composed of source and dest terms.

    For example, consider the following paths:
    ```txt
    PATH 1 : a -> b1 -> c1 -> d -> e1
    PATH 2 : a -> b2 -> c2 -> d -> e2
    ```
    Then the resulting graph will be:
    ```txt
                  ┌────┐  ┌────┐  ┌────┐  ┌────┐
               ┌──│ b1 │──│ c1 │──│ d  │──│ e1 │
        ┌────┐ │  └────┘  └────┘  └────┘  └────┘
        │ a  │─┤
        └────┘ │  ┌────┐  ┌────┐  ┌────┐  ┌────┐
               └──│ b2 │──│ c2 │──│ d  │──│ e2 │
                  └────┘  └────┘  └────┘  └────┘
    ```
    */
    pub fn build_from_paths(
        paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
    ) -> Self {
        let mut node_data = Interner::default();
        let root_node = node_data.push(QueryNodeData::Start);
        let end_node = node_data.push(QueryNodeData::End);

        let mut paths_with_single_terms = vec![];

        for path in paths {
            let mut processed_path = vec![];
            let mut prev_dest_term: Option<LocatedQueryTermSubset> = None;
            for (start_term, dest_term) in path {
                if let Some(prev_dest_term) = prev_dest_term.take() {
                    if let Some(mut start_term) = start_term {
                        if start_term.term_ids == prev_dest_term.term_ids {
                            start_term.term_subset.intersect(&prev_dest_term.term_subset);
                            processed_path.push(start_term);
                        } else {
                            processed_path.push(prev_dest_term);
                            processed_path.push(start_term);
                        }
                    } else {
                        processed_path.push(prev_dest_term);
                    }
                } else if let Some(start_term) = start_term {
                    processed_path.push(start_term);
                }
                prev_dest_term = Some(dest_term);
            }
            if let Some(prev_dest_term) = prev_dest_term {
                processed_path.push(prev_dest_term);
            }
            paths_with_single_terms.push(processed_path);
        }

        let mut paths_with_single_terms_and_suffix_hash = vec![];
        for path in paths_with_single_terms {
            let mut hasher = FxHasher::default();
            let mut path_with_hash = vec![];
            for term in path.into_iter().rev() {
                term.hash(&mut hasher);
                path_with_hash.push((term, hasher.finish()));
            }
            path_with_hash.reverse();
            paths_with_single_terms_and_suffix_hash.push(path_with_hash);
        }

        let mut node_data_id_for_term_and_suffix_hash =
            FxHashMap::<(LocatedQueryTermSubset, u64), Interned<QueryNodeData>>::default();

        let mut paths_with_ids = vec![];
        for path in paths_with_single_terms_and_suffix_hash {
            let mut path_with_ids = vec![];
            for (term, suffix_hash) in path {
                let node_data_id = node_data_id_for_term_and_suffix_hash
                    .entry((term.clone(), suffix_hash))
                    .or_insert_with(|| node_data.push(QueryNodeData::Term(term)));
                path_with_ids.push(Interned::from_raw(node_data_id.into_raw()));
            }
            paths_with_ids.push(path_with_ids);
        }

        let nodes_data = node_data.freeze();
        let nodes_data_len = nodes_data.len();
        let mut nodes = nodes_data.map_move(|n| QueryNode {
            data: n,
            predecessors: SmallBitmap::new(nodes_data_len),
            successors: SmallBitmap::new(nodes_data_len),
        });

        let root_node = Interned::<QueryNode>::from_raw(root_node.into_raw());
        let end_node = Interned::<QueryNode>::from_raw(end_node.into_raw());

        for path in paths_with_ids {
            let mut prev_node_id = root_node;
            for node_id in path {
                let prev_node = nodes.get_mut(prev_node_id);
                prev_node.successors.insert(node_id);
                let node = nodes.get_mut(node_id);
                node.predecessors.insert(prev_node_id);
                prev_node_id = node_id;
            }
            let prev_node = nodes.get_mut(prev_node_id);
            prev_node.successors.insert(end_node);
            let node = nodes.get_mut(end_node);
            node.predecessors.insert(prev_node_id);
        }

        QueryGraph { root_node, end_node, nodes }
    }
}
428
crates/milli/src/search/new/query_term/compute_derivations.rs
Normal file
@@ -0,0 +1,428 @@
use std::borrow::Cow;
use std::collections::BTreeSet;
use std::ops::ControlFlow;

use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use heed::types::DecodeIgnore;

use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::{Lazy, TwoTypoTerm};
use crate::search::new::{limits, SearchContext};
use crate::search::{build_dfa, get_first};
use crate::{Result, MAX_WORD_LENGTH};

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumberOfTypos {
    Zero,
    One,
    Two,
}

pub enum ZeroOrOneTypo {
    Zero,
    One,
}

impl Interned<QueryTerm> {
    pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
        let s = ctx.term_interner.get_mut(self);
        if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() {
            assert!(s.two_typo.is_uninit());
            // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
            self.initialize_one_typo_subterm(ctx)?;
            let s = ctx.term_interner.get_mut(self);
            assert!(s.one_typo.is_init());
            s.two_typo = Lazy::Init(TwoTypoTerm::default());
        } else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() {
            assert!(s.two_typo.is_uninit());
            self.initialize_one_and_two_typo_subterm(ctx)?;
            let s = ctx.term_interner.get_mut(self);
            assert!(s.one_typo.is_init() && s.two_typo.is_init());
        }
        Ok(())
    }
}

fn find_zero_typo_prefix_derivations(
    word_interned: Interned<String>,
    fst: fst::Set<Cow<'_, [u8]>>,
    word_interner: &mut DedupInterner<String>,
    mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
) -> Result<()> {
    let word = word_interner.get(word_interned).to_owned();
    let word = word.as_str();
    let prefix = Str::new(word).starts_with();
    let mut stream = fst.search(prefix).into_stream();

    while let Some(derived_word) = stream.next() {
        let derived_word = std::str::from_utf8(derived_word)?.to_owned();
        let derived_word_interned = word_interner.insert(derived_word);
        if derived_word_interned != word_interned {
            let cf = visit(derived_word_interned)?;
            if cf.is_break() {
                break;
            }
        }
    }
    Ok(())
}

fn find_zero_one_typo_derivations(
    ctx: &mut SearchContext<'_>,
    word_interned: Interned<String>,
    is_prefix: bool,
    mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
) -> Result<()> {
    let fst = ctx.get_words_fst()?;
    let word = ctx.word_interner.get(word_interned).to_owned();
    let word = word.as_str();

    let dfa = build_dfa(word, 1, is_prefix);
    let starts = StartsWith(Str::new(get_first(word)));
    let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();

    while let Some((derived_word, state)) = stream.next() {
        let derived_word = std::str::from_utf8(derived_word)?;
        let derived_word = ctx.word_interner.insert(derived_word.to_owned());
        let d = dfa.distance(state.1);
        match d.to_u8() {
            0 => {
                if derived_word != word_interned {
                    let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
                    if cf.is_break() {
                        break;
                    }
                }
            }
            1 => {
                let cf = visit(derived_word, ZeroOrOneTypo::One)?;
                if cf.is_break() {
                    break;
                }
            }
            _ => {
                unreachable!("One typo dfa produced multiple typos")
            }
        }
    }
    Ok(())
}

fn find_zero_one_two_typo_derivations(
    word_interned: Interned<String>,
    is_prefix: bool,
    fst: fst::Set<Cow<'_, [u8]>>,
    word_interner: &mut DedupInterner<String>,
    mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>,
) -> Result<()> {
    let word = word_interner.get(word_interned).to_owned();
    let word = word.as_str();

    let starts = StartsWith(Str::new(get_first(word)));
    let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
    let second_dfa = build_dfa(word, 2, is_prefix);
    let second = Intersection(&second_dfa, &starts);
    let automaton = Union(first, &second);

    let mut stream = fst.search_with_state(automaton).into_stream();

    while let Some((derived_word, state)) = stream.next() {
        let derived_word = std::str::from_utf8(derived_word)?;
        let derived_word_interned = word_interner.insert(derived_word.to_owned());
        // in the case the typo is on the first letter, we know the number of typos
        // is two
        if get_first(derived_word) != get_first(word) {
            let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
            if cf.is_break() {
                break;
            }
        } else {
            // Else, we know that it is the second dfa that matched and compute the
            // correct distance
            let d = second_dfa.distance((state.1).0);
            match d.to_u8() {
                0 => {
                    if derived_word_interned != word_interned {
                        let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
                        if cf.is_break() {
                            break;
                        }
                    }
                }
                1 => {
                    let cf = visit(derived_word_interned, NumberOfTypos::One)?;
                    if cf.is_break() {
                        break;
                    }
                }
                2 => {
                    let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
                    if cf.is_break() {
                        break;
                    }
                }
                _ => unreachable!("2 typos DFA produced a distance greater than 2"),
            }
        }
    }
    Ok(())
}
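// Note (added for clarity, not in the original file): the union above encodes
// the rule that a typo on the first letter counts as two typos. The first
// automaton accepts candidates within one edit whose first letter changed
// (later reported as two typos), while the second accepts candidates within
// two edits that keep the original first letter.
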
pub fn partially_initialized_term_from_word(
    ctx: &mut SearchContext<'_>,
    word: &str,
    max_typo: u8,
    is_prefix: bool,
    is_ngram: bool,
) -> Result<QueryTerm> {
    let word_interned = ctx.word_interner.insert(word.to_owned());

    if word.len() > MAX_WORD_LENGTH {
        return Ok({
            QueryTerm {
                original: ctx.word_interner.insert(word.to_owned()),
                ngram_words: None,
                is_prefix: false,
                max_levenshtein_distance: 0,
                zero_typo: <_>::default(),
                one_typo: Lazy::Init(<_>::default()),
                two_typo: Lazy::Init(<_>::default()),
            }
        });
    }

    let fst = ctx.index.words_fst(ctx.txn)?;

    let use_prefix_db = is_prefix
        && (ctx
            .index
            .word_prefix_docids
            .remap_data_type::<DecodeIgnore>()
            .get(ctx.txn, word)?
            .is_some()
            || (!is_ngram
                && ctx
                    .index
                    .exact_word_prefix_docids
                    .remap_data_type::<DecodeIgnore>()
                    .get(ctx.txn, word)?
                    .is_some()));
    let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };

    let mut zero_typo = None;
    let mut prefix_of = BTreeSet::new();

    if fst.contains(word) {
        zero_typo = Some(word_interned);
    }

    if is_prefix && use_prefix_db.is_none() {
        find_zero_typo_prefix_derivations(
            word_interned,
            fst,
            &mut ctx.word_interner,
            |derived_word| {
                if prefix_of.len() < limits::MAX_PREFIX_COUNT {
                    prefix_of.insert(derived_word);
                    Ok(ControlFlow::Continue(()))
                } else {
                    Ok(ControlFlow::Break(()))
                }
            },
        )?;
    }
    let synonyms = ctx.index.synonyms(ctx.txn)?;
    let mut synonym_word_count = 0;
    let synonyms = synonyms
        .get(&vec![word.to_owned()])
        .cloned()
        .unwrap_or_default()
        .into_iter()
        .take(limits::MAX_SYNONYM_PHRASE_COUNT)
        .filter_map(|words| {
            if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT {
                return None;
            }
            synonym_word_count += words.len();
            let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
            Some(ctx.phrase_interner.insert(Phrase { words }))
        })
        .collect();
    let zero_typo =
        ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db };

    Ok(QueryTerm {
        original: word_interned,
        ngram_words: None,
        max_levenshtein_distance: max_typo,
        is_prefix,
        zero_typo,
        one_typo: Lazy::Uninit,
        two_typo: Lazy::Uninit,
    })
}
fn find_split_words(ctx: &mut SearchContext<'_>, word: &str) -> Result<Option<Interned<Phrase>>> {
|
||||
if let Some((l, r)) = split_best_frequency(ctx, word)? {
|
||||
Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
impl Interned<QueryTerm> {
|
||||
fn initialize_one_typo_subterm(self, ctx: &mut SearchContext<'_>) -> Result<()> {
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
|
||||
let allows_split_words = self_mut.allows_split_words();
|
||||
let QueryTerm {
|
||||
original,
|
||||
is_prefix,
|
||||
one_typo,
|
||||
max_levenshtein_distance: max_nbr_typos,
|
||||
..
|
||||
} = self_mut;
|
||||
|
||||
let original = *original;
|
||||
let is_prefix = *is_prefix;
|
||||
// let original_str = ctx.word_interner.get(*original).to_owned();
|
||||
if one_typo.is_init() {
|
||||
return Ok(());
|
||||
}
|
||||
let mut one_typo_words = BTreeSet::new();
|
||||
|
||||
if *max_nbr_typos > 0 {
|
||||
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
|
||||
match nbr_typos {
|
||||
ZeroOrOneTypo::Zero => {}
|
||||
ZeroOrOneTypo::One => {
|
||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||
one_typo_words.insert(derived_word);
|
||||
} else {
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ControlFlow::Continue(()))
|
||||
})?;
|
||||
}
|
||||
|
||||
let split_words = if allows_split_words {
|
||||
let original_str = ctx.word_interner.get(original).to_owned();
|
||||
find_split_words(ctx, original_str.as_str())?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
|
||||
// Only add the split words to the derivations if:
|
||||
// 1. the term is neither an ngram nor a phrase; OR
|
||||
// 2. the term is an ngram, but the split words are different from the ngram's component words
|
||||
let split_words = if let Some((ngram_words, split_words)) =
|
||||
self_mut.ngram_words.as_ref().zip(split_words.as_ref())
|
||||
{
|
||||
let Phrase { words } = ctx.phrase_interner.get(*split_words);
|
||||
if ngram_words.iter().ne(words.iter().flatten()) {
|
||||
Some(*split_words)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
split_words
|
||||
};
|
||||
let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
|
||||
|
||||
self_mut.one_typo = Lazy::Init(one_typo);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext<'_>) -> Result<()> {
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
let QueryTerm {
|
||||
original,
|
||||
is_prefix,
|
||||
two_typo,
|
||||
max_levenshtein_distance: max_nbr_typos,
|
||||
..
|
||||
} = self_mut;
|
||||
let original_str = ctx.word_interner.get(*original).to_owned();
|
||||
if two_typo.is_init() {
|
||||
return Ok(());
|
||||
}
|
||||
let mut one_typo_words = BTreeSet::new();
|
||||
let mut two_typo_words = BTreeSet::new();
|
||||
|
||||
if *max_nbr_typos > 0 {
|
||||
find_zero_one_two_typo_derivations(
|
||||
*original,
|
||||
*is_prefix,
|
||||
ctx.index.words_fst(ctx.txn)?,
|
||||
&mut ctx.word_interner,
|
||||
|derived_word, nbr_typos| {
|
||||
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
|
||||
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
|
||||
{
|
||||
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
|
||||
return Ok(ControlFlow::Break(()));
|
||||
}
|
||||
match nbr_typos {
|
||||
NumberOfTypos::Zero => {}
|
||||
NumberOfTypos::One => {
|
||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
||||
one_typo_words.insert(derived_word);
|
||||
}
|
||||
}
|
||||
NumberOfTypos::Two => {
|
||||
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
|
||||
two_typo_words.insert(derived_word);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(ControlFlow::Continue(()))
|
||||
},
|
||||
)?;
|
||||
}
|
||||
|
||||
let split_words = find_split_words(ctx, original_str.as_str())?;
|
||||
let self_mut = ctx.term_interner.get_mut(self);
|
||||
|
||||
let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
|
||||
|
||||
let two_typo = TwoTypoTerm { two_typos: two_typo_words };
|
||||
|
||||
self_mut.one_typo = Lazy::Init(one_typo);
|
||||
self_mut.two_typo = Lazy::Init(two_typo);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Split the original word into the two words that appear the
|
||||
/// most next to each other in the index.
|
||||
///
|
||||
/// Return `None` if the original word cannot be split.
|
||||
fn split_best_frequency(
|
||||
ctx: &mut SearchContext<'_>,
|
||||
original: &str,
|
||||
) -> Result<Option<(Interned<String>, Interned<String>)>> {
|
||||
let chars = original.char_indices().skip(1);
|
||||
let mut best = None;
|
||||
|
||||
for (i, _) in chars {
|
||||
let (left, right) = original.split_at(i);
|
||||
let left = ctx.word_interner.insert(left.to_owned());
|
||||
let right = ctx.word_interner.insert(right.to_owned());
|
||||
|
||||
if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(None, left, right, 1)? {
|
||||
if best.map_or(true, |(old, _, _)| frequency > old) {
|
||||
best = Some((frequency, left, right));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(best.map(|(_, left, right)| (left, right)))
|
||||
}
|
||||
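Aside: a minimal, self-contained sketch of the split-best-frequency idea above, usable outside the index. The `pair_frequency` map is a hypothetical stand-in for the word-pair proximity database queried through `get_db_word_pair_proximity_docids_len`; everything else mirrors the function's logic.

// Sketch only: `pair_frequency` is a hypothetical stand-in for the index.
use std::collections::HashMap;

fn split_best_frequency_sketch(
    original: &str,
    pair_frequency: &HashMap<(String, String), u64>,
) -> Option<(String, String)> {
    let mut best: Option<(u64, String, String)> = None;
    // Try every split point after the first character and keep the most frequent pair.
    for (i, _) in original.char_indices().skip(1) {
        let (left, right) = original.split_at(i);
        if let Some(&freq) = pair_frequency.get(&(left.to_owned(), right.to_owned())) {
            if best.as_ref().map_or(true, |(old, _, _)| freq > *old) {
                best = Some((freq, left.to_owned(), right.to_owned()));
            }
        }
    }
    best.map(|(_, l, r)| (l, r))
}

For example, with `("sun", "flower") -> 42` as the only entry, `"sunflower"` splits into `("sun", "flower")`.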
510
crates/milli/src/search/new/query_term/mod.rs
Normal file
@@ -0,0 +1,510 @@
mod compute_derivations;
mod ntypo_subset;
mod parse_query;
mod phrase;

use std::collections::BTreeSet;
use std::iter::FromIterator;
use std::ops::RangeInclusive;

use either::Either;
pub use ntypo_subset::NTypoTermSubset;
pub use parse_query::{
    located_query_terms_from_tokens, make_ngram, number_of_typos_allowed, ExtractedTokens,
};
pub use phrase::Phrase;

use super::interner::{DedupInterner, Interned};
use super::{limits, SearchContext, Word};
use crate::Result;

/// A set of word derivations attached to a location in the search query.
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct LocatedQueryTermSubset {
    pub term_subset: QueryTermSubset,
    pub positions: RangeInclusive<u16>,
    pub term_ids: RangeInclusive<u8>,
}

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct QueryTermSubset {
    original: Interned<QueryTerm>,
    zero_typo_subset: NTypoTermSubset,
    one_typo_subset: NTypoTermSubset,
    two_typo_subset: NTypoTermSubset,
    /// `true` if the term cannot be deleted through the term matching strategy
    ///
    /// Note that there are other reasons for which a term cannot be deleted, such as
    /// being a phrase. In that case, this field could be set to `false`, but it
    /// still wouldn't be deletable by the term matching strategy.
    mandatory: bool,
}

#[derive(Clone, PartialEq, Eq, Hash)]
pub struct QueryTerm {
    original: Interned<String>,
    ngram_words: Option<Vec<Interned<String>>>,
    max_levenshtein_distance: u8,
    is_prefix: bool,
    zero_typo: ZeroTypoTerm,
    // May not be computed yet
    one_typo: Lazy<OneTypoTerm>,
    // May not be computed yet
    two_typo: Lazy<TwoTypoTerm>,
}

// SubTerms will be in a dedup interner
#[derive(Default, Clone, PartialEq, Eq, Hash)]
struct ZeroTypoTerm {
    /// The original phrase, if any
    phrase: Option<Interned<Phrase>>,
    /// A single word equivalent to the original term, with zero typos
    exact: Option<Interned<String>>,
    /// All the words that contain the original word as a prefix
    prefix_of: BTreeSet<Interned<String>>,
    /// All the synonyms of the original word or phrase
    synonyms: BTreeSet<Interned<Phrase>>,
    /// A prefix in the prefix databases matching the original word
    use_prefix_db: Option<Interned<String>>,
}
#[derive(Default, Clone, PartialEq, Eq, Hash)]
struct OneTypoTerm {
    /// The original word split into multiple consecutive words
    split_words: Option<Interned<Phrase>>,
    /// Words that are 1 typo away from the original word
    one_typo: BTreeSet<Interned<String>>,
}
#[derive(Default, Clone, PartialEq, Eq, Hash)]
struct TwoTypoTerm {
    /// Words that are 2 typos away from the original word
    two_typos: BTreeSet<Interned<String>>,
}

#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Lazy<T> {
    Uninit,
    Init(T),
}
impl<T> Lazy<T> {
    pub fn is_init(&self) -> bool {
        match self {
            Lazy::Uninit => false,
            Lazy::Init(_) => true,
        }
    }
    pub fn is_uninit(&self) -> bool {
        match self {
            Lazy::Uninit => true,
            Lazy::Init(_) => false,
        }
    }
}
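// Aside: `Lazy<T>` above is a deliberately explicit alternative to `Option<T>`.
// A small sketch of how the deferred one-typo computation is meant to flow;
// `DerivationsSketch` is a simplified stand-in, not the real `QueryTerm`.
struct DerivationsSketch {
    one_typo: Lazy<Vec<String>>,
}

impl DerivationsSketch {
    fn one_typo_words(&mut self) -> &[String] {
        if self.one_typo.is_uninit() {
            // The expensive FST traversal would happen here, only on first access.
            self.one_typo = Lazy::Init(vec!["hello".to_owned()]);
        }
        match &self.one_typo {
            Lazy::Init(words) => words,
            Lazy::Uninit => unreachable!("initialized just above"),
        }
    }
}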
#[derive(Clone, Copy)]
pub enum ExactTerm {
    Phrase(Interned<Phrase>),
    Word(Interned<String>),
}

impl ExactTerm {
    pub fn interned_words<'ctx>(
        &self,
        ctx: &'ctx SearchContext<'ctx>,
    ) -> impl Iterator<Item = Option<Interned<String>>> + 'ctx {
        match *self {
            ExactTerm::Phrase(phrase) => {
                let phrase = ctx.phrase_interner.get(phrase);
                Either::Left(phrase.words.iter().copied())
            }
            ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))),
        }
    }
}

impl QueryTermSubset {
    pub fn is_mandatory(&self) -> bool {
        self.mandatory
    }
    pub fn make_mandatory(&mut self) {
        self.mandatory = true;
    }
    pub fn exact_term(&self, ctx: &SearchContext<'_>) -> Option<ExactTerm> {
        let full_query_term = ctx.term_interner.get(self.original);
        if full_query_term.ngram_words.is_some() {
            return None;
        }
        if let Some(phrase) = full_query_term.zero_typo.phrase {
            self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
        } else if let Some(word) = full_query_term.zero_typo.exact {
            self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word))
        } else {
            None
        }
    }

    pub fn empty(for_term: Interned<QueryTerm>) -> Self {
        Self {
            original: for_term,
            zero_typo_subset: NTypoTermSubset::Nothing,
            one_typo_subset: NTypoTermSubset::Nothing,
            two_typo_subset: NTypoTermSubset::Nothing,
            mandatory: false,
        }
    }
    pub fn full(for_term: Interned<QueryTerm>) -> Self {
        Self {
            original: for_term,
            zero_typo_subset: NTypoTermSubset::All,
            one_typo_subset: NTypoTermSubset::All,
            two_typo_subset: NTypoTermSubset::All,
            mandatory: false,
        }
    }

    pub fn union(&mut self, other: &Self) {
        assert!(self.original == other.original);
        self.zero_typo_subset.union(&other.zero_typo_subset);
        self.one_typo_subset.union(&other.one_typo_subset);
        self.two_typo_subset.union(&other.two_typo_subset);
    }
    pub fn intersect(&mut self, other: &Self) {
        assert!(self.original == other.original);
        self.zero_typo_subset.intersect(&other.zero_typo_subset);
        self.one_typo_subset.intersect(&other.one_typo_subset);
        self.two_typo_subset.intersect(&other.two_typo_subset);
    }

    pub fn use_prefix_db(&self, ctx: &SearchContext<'_>) -> Option<Word> {
        let original = ctx.term_interner.get(self.original);
        let use_prefix_db = original.zero_typo.use_prefix_db?;
        let word = match &self.zero_typo_subset {
            NTypoTermSubset::All => Some(use_prefix_db),
            NTypoTermSubset::Subset { words, phrases: _ } => {
                if words.contains(&use_prefix_db) {
                    Some(use_prefix_db)
                } else {
                    None
                }
            }
            NTypoTermSubset::Nothing => None,
        };
        word.map(|word| {
            if original.ngram_words.is_some() {
                Word::Derived(word)
            } else {
                Word::Original(word)
            }
        })
    }
    pub fn all_single_words_except_prefix_db(
        &self,
        ctx: &mut SearchContext<'_>,
    ) -> Result<BTreeSet<Word>> {
        let mut result = BTreeSet::default();
        if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
            self.original.compute_fully_if_needed(ctx)?;
        }

        let original = ctx.term_interner.get_mut(self.original);
        match &self.zero_typo_subset {
            NTypoTermSubset::All => {
                let ZeroTypoTerm {
                    phrase: _,
                    exact: zero_typo,
                    prefix_of,
                    synonyms: _,
                    use_prefix_db: _,
                } = &original.zero_typo;
                result.extend(zero_typo.iter().copied().map(|w| {
                    if original.ngram_words.is_some() {
                        Word::Derived(w)
                    } else {
                        Word::Original(w)
                    }
                }));
                result.extend(prefix_of.iter().copied().map(|w| {
                    if original.ngram_words.is_some() {
                        Word::Derived(w)
                    } else {
                        Word::Original(w)
                    }
                }));
            }
            NTypoTermSubset::Subset { words, phrases: _ } => {
                let ZeroTypoTerm {
                    phrase: _,
                    exact: zero_typo,
                    prefix_of,
                    synonyms: _,
                    use_prefix_db: _,
                } = &original.zero_typo;
                if let Some(zero_typo) = zero_typo {
                    if words.contains(zero_typo) {
                        if original.ngram_words.is_some() {
                            result.insert(Word::Derived(*zero_typo));
                        } else {
                            result.insert(Word::Original(*zero_typo));
                        }
                    }
                }
                result.extend(prefix_of.intersection(words).copied().map(|w| {
                    if original.ngram_words.is_some() {
                        Word::Derived(w)
                    } else {
                        Word::Original(w)
                    }
                }));
            }
            NTypoTermSubset::Nothing => {}
        }

        match &self.one_typo_subset {
            NTypoTermSubset::All => {
                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo
                else {
                    panic!()
                };
                result.extend(one_typo.iter().copied().map(Word::Derived))
            }
            NTypoTermSubset::Subset { words, phrases: _ } => {
                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo
                else {
                    panic!()
                };
                result.extend(one_typo.intersection(words).copied().map(Word::Derived));
            }
            NTypoTermSubset::Nothing => {}
        };

        match &self.two_typo_subset {
            NTypoTermSubset::All => {
                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { panic!() };
                result.extend(two_typos.iter().copied().map(Word::Derived));
            }
            NTypoTermSubset::Subset { words, phrases: _ } => {
                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { panic!() };
                result.extend(two_typos.intersection(words).copied().map(Word::Derived));
            }
            NTypoTermSubset::Nothing => {}
        };

        Ok(result)
    }
    pub fn all_phrases(&self, ctx: &mut SearchContext<'_>) -> Result<BTreeSet<Interned<Phrase>>> {
        let mut result = BTreeSet::default();

        if !self.one_typo_subset.is_empty() {
            self.original.compute_fully_if_needed(ctx)?;
        }
        let original = ctx.term_interner.get_mut(self.original);

        let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } =
            &original.zero_typo;
        result.extend(phrase.iter().copied());
        result.extend(synonyms.iter().copied());

        match &self.one_typo_subset {
            NTypoTermSubset::All => {
                let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo
                else {
                    panic!();
                };
                result.extend(split_words.iter().copied());
            }
            NTypoTermSubset::Subset { phrases, .. } => {
                let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo
                else {
                    panic!();
                };
                if let Some(split_words) = split_words {
                    if phrases.contains(split_words) {
                        result.insert(*split_words);
                    }
                }
            }
            NTypoTermSubset::Nothing => {}
        }

        Ok(result)
    }

    pub fn original_phrase(&self, ctx: &SearchContext<'_>) -> Option<Interned<Phrase>> {
        let t = ctx.term_interner.get(self.original);
        if let Some(p) = t.zero_typo.phrase {
            if self.zero_typo_subset.contains_phrase(p) {
                return Some(p);
            }
        }
        None
    }
    pub fn max_typo_cost(&self, ctx: &SearchContext<'_>) -> u8 {
        let t = ctx.term_interner.get(self.original);
        match t.max_levenshtein_distance {
            0 => {
                if t.allows_split_words() {
                    1
                } else {
                    0
                }
            }
            1 => {
                if self.one_typo_subset.is_empty() {
                    0
                } else {
                    1
                }
            }
            2 => {
                if self.two_typo_subset.is_empty() {
                    if self.one_typo_subset.is_empty() {
                        0
                    } else {
                        1
                    }
                } else {
                    2
                }
            }
            _ => panic!(),
        }
    }
    pub fn keep_only_exact_term(&mut self, ctx: &SearchContext<'_>) {
        if let Some(term) = self.exact_term(ctx) {
            match term {
                ExactTerm::Phrase(p) => {
                    self.zero_typo_subset = NTypoTermSubset::Subset {
                        words: BTreeSet::new(),
                        phrases: BTreeSet::from_iter([p]),
                    };
                    self.clear_one_typo_subset();
                    self.clear_two_typo_subset();
                }
                ExactTerm::Word(w) => {
                    self.zero_typo_subset = NTypoTermSubset::Subset {
                        words: BTreeSet::from_iter([w]),
                        phrases: BTreeSet::new(),
                    };
                    self.clear_one_typo_subset();
                    self.clear_two_typo_subset();
                }
            }
        }
    }
    pub fn clear_zero_typo_subset(&mut self) {
        self.zero_typo_subset = NTypoTermSubset::Nothing;
    }
    pub fn clear_one_typo_subset(&mut self) {
        self.one_typo_subset = NTypoTermSubset::Nothing;
    }
    pub fn clear_two_typo_subset(&mut self) {
        self.two_typo_subset = NTypoTermSubset::Nothing;
    }
    pub fn description(&self, ctx: &SearchContext<'_>) -> String {
        let t = ctx.term_interner.get(self.original);
        ctx.word_interner.get(t.original).to_owned()
    }
}

impl ZeroTypoTerm {
    fn is_empty(&self) -> bool {
        let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self;
        phrase.is_none()
            && zero_typo.is_none()
            && prefix_of.is_empty()
            && synonyms.is_empty()
            && use_prefix_db.is_none()
    }
}
impl OneTypoTerm {
    fn is_empty(&self) -> bool {
        let OneTypoTerm { split_words, one_typo } = self;
        one_typo.is_empty() && split_words.is_none()
    }
}
impl TwoTypoTerm {
    fn is_empty(&self) -> bool {
        let TwoTypoTerm { two_typos } = self;
        two_typos.is_empty()
    }
}

impl QueryTerm {
    fn is_empty(&self) -> bool {
        let Lazy::Init(one_typo) = &self.one_typo else {
            return false;
        };
        let Lazy::Init(two_typo) = &self.two_typo else {
            return false;
        };

        self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
    }
    fn allows_split_words(&self) -> bool {
        self.zero_typo.phrase.is_none()
    }
}

impl Interned<QueryTerm> {
    /// Return the original word from the given query term
    fn original_single_word(self, ctx: &SearchContext<'_>) -> Option<Interned<String>> {
        let self_ = ctx.term_interner.get(self);
        if self_.ngram_words.is_some() {
            None
        } else {
            Some(self_.original)
        }
    }
}

/// A query term coupled with its position in the user's search query.
#[derive(Clone)]
pub struct LocatedQueryTerm {
    pub value: Interned<QueryTerm>,
    pub positions: RangeInclusive<u16>,
}

impl LocatedQueryTerm {
    /// Return `true` iff the term is empty
    pub fn is_empty(&self, interner: &DedupInterner<QueryTerm>) -> bool {
        interner.get(self.value).is_empty()
    }
}

impl QueryTerm {
    pub fn is_cached_prefix(&self) -> bool {
        self.zero_typo.use_prefix_db.is_some()
    }
    pub fn is_prefix(&self) -> bool {
        self.is_prefix
    }
    pub fn original_word(&self, ctx: &SearchContext<'_>) -> String {
        ctx.word_interner.get(self.original).clone()
    }

    pub fn original_phrase(&self) -> Option<Interned<Phrase>> {
        self.zero_typo.phrase
    }

    pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
        let mut words = BTreeSet::new();
        let mut phrases = BTreeSet::new();

        let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
            &self.zero_typo;
        words.extend(zero_typo.iter().copied());
        words.extend(prefix_of.iter().copied());
        phrases.extend(phrase.iter().copied());
        phrases.extend(synonyms.iter().copied());

        if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = &self.one_typo {
            words.extend(one_typo.iter().copied());
            phrases.extend(split_words.iter().copied());
        };

        if let Lazy::Init(TwoTypoTerm { two_typos }) = &self.two_typo {
            words.extend(two_typos.iter().copied());
        };

        (words.into_iter().collect(), phrases.into_iter().collect())
    }
}
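Aside: `ExactTerm::interned_words` above relies on `either::Either` implementing `Iterator` whenever both variants do, which lets a single `impl Iterator` return type cover two concrete iterator types. A standalone sketch of the same trick, with made-up names:

// Sketch of the Either-as-iterator trick used by `interned_words`.
use either::Either;

fn digits_or_once(all: bool) -> impl Iterator<Item = u32> {
    if all {
        Either::Left(0..10) // one concrete iterator type...
    } else {
        Either::Right(std::iter::once(7)) // ...and another, unified by Either
    }
}

fn main() {
    assert_eq!(digits_or_once(false).collect::<Vec<_>>(), vec![7]);
    assert_eq!(digits_or_once(true).count(), 10);
}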
79
crates/milli/src/search/new/query_term/ntypo_subset.rs
Normal file
@@ -0,0 +1,79 @@
use std::collections::BTreeSet;

use super::Phrase;
use crate::search::new::interner::Interned;

#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum NTypoTermSubset {
    All,
    Subset {
        words: BTreeSet<Interned<String>>,
        phrases: BTreeSet<Interned<Phrase>>,
        // TODO: prefixes: BTreeSet<Interned<String>>,
    },
    Nothing,
}

impl NTypoTermSubset {
    pub fn contains_word(&self, word: Interned<String>) -> bool {
        match self {
            NTypoTermSubset::All => true,
            NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word),
            NTypoTermSubset::Nothing => false,
        }
    }
    pub fn contains_phrase(&self, phrase: Interned<Phrase>) -> bool {
        match self {
            NTypoTermSubset::All => true,
            NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase),
            NTypoTermSubset::Nothing => false,
        }
    }
    pub fn is_empty(&self) -> bool {
        match self {
            NTypoTermSubset::All => false,
            NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(),
            NTypoTermSubset::Nothing => true,
        }
    }
    pub fn union(&mut self, other: &Self) {
        match self {
            Self::All => {}
            Self::Subset { words, phrases } => match other {
                Self::All => {
                    *self = Self::All;
                }
                Self::Subset { words: w2, phrases: p2 } => {
                    words.extend(w2);
                    phrases.extend(p2);
                }
                Self::Nothing => {}
            },
            Self::Nothing => {
                *self = other.clone();
            }
        }
    }
    pub fn intersect(&mut self, other: &Self) {
        match self {
            Self::All => *self = other.clone(),
            Self::Subset { words, phrases } => match other {
                Self::All => {}
                Self::Subset { words: w2, phrases: p2 } => {
                    let mut ws = BTreeSet::new();
                    for w in words.intersection(w2) {
                        ws.insert(*w);
                    }
                    let mut ps = BTreeSet::new();
                    for p in phrases.intersection(p2) {
                        ps.insert(*p);
                    }
                    *words = ws;
                    *phrases = ps;
                }
                Self::Nothing => *self = Self::Nothing,
            },
            Self::Nothing => {}
        }
    }
}
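Aside: `union`/`intersect` above implement a small three-valued set algebra in which `All` is the top element and `Nothing` the bottom. A simplified sketch over plain strings, mirroring the intersection rules (`TriSet` is illustrative, not the real type):

// Simplified three-state set, mirroring the intersection semantics above.
use std::collections::BTreeSet;

#[derive(Clone, Debug, PartialEq)]
enum TriSet {
    All,
    Subset(BTreeSet<&'static str>),
    Nothing,
}

impl TriSet {
    fn intersect(&mut self, other: &Self) {
        match self {
            TriSet::All => *self = other.clone(), // All ∩ x = x
            TriSet::Subset(words) => match other {
                TriSet::All => {} // x ∩ All = x
                TriSet::Subset(w2) => words.retain(|w| w2.contains(w)),
                TriSet::Nothing => *self = TriSet::Nothing, // x ∩ ∅ = ∅
            },
            TriSet::Nothing => {} // ∅ ∩ x = ∅
        }
    }
}

fn main() {
    let mut a = TriSet::Subset(BTreeSet::from(["cat", "car"]));
    a.intersect(&TriSet::Subset(BTreeSet::from(["car", "cart"])));
    assert_eq!(a, TriSet::Subset(BTreeSet::from(["car"])));

    let mut b = TriSet::All;
    b.intersect(&TriSet::Nothing);
    assert_eq!(b, TriSet::Nothing);
}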
382
crates/milli/src/search/new/query_term/parse_query.rs
Normal file
@@ -0,0 +1,382 @@
use std::collections::BTreeSet;

use charabia::normalizer::NormalizedTokenIter;
use charabia::{SeparatorKind, TokenKind};

use super::compute_derivations::partially_initialized_term_from_word;
use super::{LocatedQueryTerm, ZeroTypoTerm};
use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
use crate::search::new::Word;
use crate::{Result, SearchContext, MAX_WORD_LENGTH};

#[derive(Clone)]
/// Extraction of the content of a query.
pub struct ExtractedTokens {
    /// The terms to search for in the database.
    pub query_terms: Vec<LocatedQueryTerm>,
    /// The words that must not appear in the results.
    pub negative_words: Vec<Word>,
    /// The phrases that must not appear in the results.
    pub negative_phrases: Vec<LocatedQueryTerm>,
}

/// Convert the tokenised search query into a list of located query terms.
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
pub fn located_query_terms_from_tokens(
    ctx: &mut SearchContext<'_>,
    query: NormalizedTokenIter<'_, '_, '_, '_>,
    words_limit: Option<usize>,
) -> Result<ExtractedTokens> {
    let nbr_typos = number_of_typos_allowed(ctx)?;

    let mut query_terms = Vec::new();

    let mut negative_phrase = false;
    let mut phrase: Option<PhraseBuilder> = None;
    let mut encountered_whitespace = true;
    let mut negative_next_token = false;
    let mut negative_words = Vec::new();
    let mut negative_phrases = Vec::new();

    let parts_limit = words_limit.unwrap_or(usize::MAX);

    // start with the last position as we will wrap around to position 0 at the beginning of the loop below.
    let mut position = u16::MAX;

    let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable();
    while let Some(token) = peekable.next() {
        if token.lemma().is_empty() {
            continue;
        }

        // early return if the word limit is exceeded
        if query_terms.len() >= parts_limit {
            return Ok(ExtractedTokens { query_terms, negative_words, negative_phrases });
        }

        match token.kind {
            TokenKind::Word | TokenKind::StopWord => {
                // On the first loop, goes from u16::MAX to 0, then normal increment.
                position = position.wrapping_add(1);

                // 1. if the word is quoted, we push it into a phrase buffer waiting for the ending quote,
                // 2. if the word is not the last token of the query and is not a stop word, we push it as a non-prefix word,
                // 3. if the word is the last token of the query, we push it as a prefix word.
                if let Some(phrase) = &mut phrase {
                    phrase.push_word(ctx, &token, position)
                } else if negative_next_token {
                    let word = token.lemma().to_string();
                    let word = Word::Original(ctx.word_interner.insert(word));
                    negative_words.push(word);
                    negative_next_token = false;
                } else if peekable.peek().is_some() {
                    match token.kind {
                        TokenKind::Word => {
                            let word = token.lemma();
                            let term = partially_initialized_term_from_word(
                                ctx,
                                word,
                                nbr_typos(word),
                                false,
                                false,
                            )?;
                            let located_term = LocatedQueryTerm {
                                value: ctx.term_interner.push(term),
                                positions: position..=position,
                            };
                            query_terms.push(located_term);
                        }
                        TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (),
                    }
                } else {
                    let word = token.lemma();
                    let term = partially_initialized_term_from_word(
                        ctx,
                        word,
                        nbr_typos(word),
                        true,
                        false,
                    )?;
                    let located_term = LocatedQueryTerm {
                        value: ctx.term_interner.push(term),
                        positions: position..=position,
                    };
                    query_terms.push(located_term);
                }
            }
            TokenKind::Separator(separator_kind) => {
                // add a penalty for hard separators
                if let SeparatorKind::Hard = separator_kind {
                    position = position.wrapping_add(7);
                }

                phrase = 'phrase: {
                    let phrase = phrase.take();

                    // If we have a hard separator inside a phrase, we immediately start a new phrase
                    let phrase = if separator_kind == SeparatorKind::Hard {
                        if let Some(phrase) = phrase {
                            if let Some(located_query_term) = phrase.build(ctx) {
                                // if we are evaluating a negative operator, we put the phrase
                                // in the negative list, *but* we don't reset the negative operator,
                                // as we are immediately starting a new negative phrase.
                                if negative_phrase {
                                    negative_phrases.push(located_query_term);
                                } else {
                                    query_terms.push(located_query_term);
                                }
                            }
                            Some(PhraseBuilder::empty())
                        } else {
                            None
                        }
                    } else {
                        phrase
                    };

                    // We close and start a new phrase depending on the number of double quotes
                    let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
                    if quote_count == 0 {
                        break 'phrase phrase;
                    }

                    // Consume the closing quote and the phrase
                    if let Some(phrase) = phrase {
                        // Per the check above, quote_count > 0
                        quote_count -= 1;
                        if let Some(located_query_term) = phrase.build(ctx) {
                            // we were evaluating a negative operator, so we
                            // put the phrase in the negative phrases
                            if negative_phrase {
                                negative_phrases.push(located_query_term);
                                negative_phrase = false;
                            } else {
                                query_terms.push(located_query_term);
                            }
                        }
                    }

                    // Start a new phrase if the token ends with an opening quote
                    if quote_count % 2 == 1 {
                        negative_phrase = negative_next_token;
                        Some(PhraseBuilder::empty())
                    } else {
                        None
                    }
                };

                negative_next_token =
                    phrase.is_none() && token.lemma() == "-" && encountered_whitespace;
            }
            _ => (),
        }

        encountered_whitespace =
            token.lemma().chars().last().filter(|c| c.is_whitespace()).is_some();
    }

    // If a quote is never closed, we consider the remainder of the query as a phrase.
    if let Some(phrase) = phrase.take() {
        if let Some(located_query_term) = phrase.build(ctx) {
            // put the phrase in the negative set if we are evaluating a negative operator.
            if negative_phrase {
                negative_phrases.push(located_query_term);
            } else {
                query_terms.push(located_query_term);
            }
        }
    }

    Ok(ExtractedTokens { query_terms, negative_words, negative_phrases })
}

pub fn number_of_typos_allowed<'ctx>(
    ctx: &SearchContext<'ctx>,
) -> Result<impl Fn(&str) -> u8 + 'ctx> {
    let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
    let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
    let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;

    let exact_words = ctx.index.exact_words(ctx.txn)?;

    Ok(Box::new(move |word: &str| {
        if !authorize_typos
            || word.len() < min_len_one_typo as usize
            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
        {
            0
        } else if word.len() < min_len_two_typos as usize {
            1
        } else {
            2
        }
    }))
}

pub fn make_ngram(
    ctx: &mut SearchContext<'_>,
    terms: &[LocatedQueryTerm],
    number_of_typos_allowed: &impl Fn(&str) -> u8,
) -> Result<Option<LocatedQueryTerm>> {
    assert!(!terms.is_empty());
    for t in terms {
        if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() {
            return Ok(None);
        }
    }
    for ts in terms.windows(2) {
        let [t1, t2] = ts else { panic!() };
        if *t1.positions.end() != t2.positions.start() - 1 {
            return Ok(None);
        }
    }
    let mut words_interned = vec![];
    for term in terms {
        if let Some(original_term_word) = term.value.original_single_word(ctx) {
            words_interned.push(original_term_word);
        } else {
            return Ok(None);
        }
    }
    let words =
        words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::<Vec<_>>();

    let start = *terms.first().as_ref().unwrap().positions.start();
    let end = *terms.last().as_ref().unwrap().positions.end();
    let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix;
    let ngram_str = words.join("");
    if ngram_str.len() > MAX_WORD_LENGTH {
        return Ok(None);
    }
    let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone());

    let max_nbr_typos =
        number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);

    let mut term =
        partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?;

    // Now add the synonyms
    let index_synonyms = ctx.index.synonyms(ctx.txn)?;

    term.zero_typo.synonyms.extend(
        index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| {
            let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
            ctx.phrase_interner.insert(Phrase { words })
        }),
    );

    let term = QueryTerm {
        original: ngram_str_interned,
        ngram_words: Some(words_interned),
        is_prefix,
        max_levenshtein_distance: max_nbr_typos,
        zero_typo: term.zero_typo,
        one_typo: Lazy::Uninit,
        two_typo: Lazy::Uninit,
    };

    let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end };

    Ok(Some(term))
}
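Aside: `make_ngram` above only merges terms sitting on strictly consecutive positions, and the tokenizer loop adds a +7 position penalty on hard separators, which naturally prevents ngrams from forming across sentences. A tiny sketch of the adjacency check:

// Sketch of the adjacency rule: terms can merge only if each one starts
// right after the previous one ends.
fn are_consecutive(positions: &[std::ops::RangeInclusive<u16>]) -> bool {
    positions.windows(2).all(|w| *w[0].end() + 1 == *w[1].start())
}

fn main() {
    assert!(are_consecutive(&[0..=0, 1..=1])); // "new york" is an ngram candidate
    assert!(!are_consecutive(&[0..=0, 8..=8])); // a hard separator added the +7 penalty
}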
struct PhraseBuilder {
    words: Vec<Option<crate::search::new::Interned<String>>>,
    start: u16,
    end: u16,
}

impl PhraseBuilder {
    fn empty() -> Self {
        Self { words: Default::default(), start: u16::MAX, end: u16::MAX }
    }

    fn is_empty(&self) -> bool {
        self.words.is_empty() || self.words.iter().all(Option::is_none)
    }

    // precondition: token has kind Word or StopWord
    fn push_word(
        &mut self,
        ctx: &mut SearchContext<'_>,
        token: &charabia::Token<'_>,
        position: u16,
    ) {
        if self.is_empty() {
            self.start = position;
        }
        self.end = position;
        if let TokenKind::StopWord = token.kind {
            self.words.push(None);
        } else {
            // token has kind Word
            let word = ctx.word_interner.insert(token.lemma().to_string());
            self.words.push(Some(word));
        }
    }

    fn build(self, ctx: &mut SearchContext<'_>) -> Option<LocatedQueryTerm> {
        if self.is_empty() {
            return None;
        }
        Some(LocatedQueryTerm {
            value: ctx.term_interner.push({
                let phrase = ctx.phrase_interner.insert(Phrase { words: self.words });
                let phrase_desc = phrase.description(ctx);
                QueryTerm {
                    original: ctx.word_interner.insert(phrase_desc),
                    ngram_words: None,
                    max_levenshtein_distance: 0,
                    is_prefix: false,
                    zero_typo: ZeroTypoTerm {
                        phrase: Some(phrase),
                        exact: None,
                        prefix_of: BTreeSet::default(),
                        synonyms: BTreeSet::default(),
                        use_prefix_db: None,
                    },
                    one_typo: Lazy::Uninit,
                    two_typo: Lazy::Uninit,
                }
            }),
            positions: self.start..=self.end,
        })
    }
}

#[cfg(test)]
mod tests {
    use charabia::TokenizerBuilder;

    use super::*;
    use crate::index::tests::TempIndex;

    fn temp_index_with_documents() -> TempIndex {
        let temp_index = TempIndex::new();
        temp_index
            .add_documents(documents!([
                { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
                { "id": 2, "name": "Westfália" },
                { "id": 3, "name": "Ŵôřlḑôle" },
            ]))
            .unwrap();
        temp_index
    }

    #[test]
    fn start_with_hard_separator() -> Result<()> {
        let mut builder = TokenizerBuilder::default();
        let tokenizer = builder.build();
        let tokens = tokenizer.tokenize(".");
        let index = temp_index_with_documents();
        let rtxn = index.read_txn()?;
        let mut ctx = SearchContext::new(&index, &rtxn)?;
        // panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
        let ExtractedTokens { query_terms, .. } =
            located_query_terms_from_tokens(&mut ctx, tokens, None)?;
        assert!(query_terms.is_empty());

        Ok(())
    }
}
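Aside: a sketch of the typo-budget rule implemented by `number_of_typos_allowed` above, with the exact-words check omitted for brevity. The thresholds 5 and 9 used in the demo are Meilisearch's documented defaults for `minWordSizeForTypos`; the real values come from the index settings.

// Sketch only: thresholds are parameters, the exact-words FST check is omitted.
fn typo_budget(word: &str, min_one: usize, min_two: usize, authorize: bool) -> u8 {
    if !authorize || word.len() < min_one {
        0
    } else if word.len() < min_two {
        1
    } else {
        2
    }
}

fn main() {
    assert_eq!(typo_budget("cat", 5, 9, true), 0); // too short for any typo
    assert_eq!(typo_budget("world", 5, 9, true), 1); // one typo allowed
    assert_eq!(typo_budget("television", 5, 9, true), 2); // two typos allowed
    assert_eq!(typo_budget("television", 5, 9, false), 0); // typos disabled
}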
21
crates/milli/src/search/new/query_term/phrase.rs
Normal file
@@ -0,0 +1,21 @@
use itertools::Itertools;

use crate::search::new::interner::Interned;
use crate::SearchContext;

/// A phrase in the user's search query, consisting of several words
/// that must appear side-by-side in the search results.
#[derive(Default, Clone, PartialEq, Eq, Hash)]
pub struct Phrase {
    pub words: Vec<Option<Interned<String>>>,
}
impl Interned<Phrase> {
    pub fn description(self, ctx: &SearchContext<'_>) -> String {
        let p = ctx.phrase_interner.get(self);
        p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ")
    }
    pub fn words(self, ctx: &SearchContext<'_>) -> Vec<Option<Interned<String>>> {
        let p = ctx.phrase_interner.get(self);
        p.words.clone()
    }
}
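Aside: `description` above renders a phrase back to a string by skipping the `None` slots left by stop words. A standalone sketch of the same flatten-and-join step:

// Stop words are stored as `None` and simply disappear from the rendering.
fn describe(words: &[Option<&str>]) -> String {
    words.iter().flatten().copied().collect::<Vec<_>>().join(" ")
}

fn main() {
    // "the" was a stop word, so its slot is None and is skipped.
    assert_eq!(describe(&[Some("split"), None, Some("world")]), "split world");
}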
92
crates/milli/src/search/new/ranking_rule_graph/build.rs
Normal file
@@ -0,0 +1,92 @@
use std::collections::HashSet;

use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, MappedInterner};
use crate::search::new::query_graph::{QueryNode, QueryNodeData};
use crate::search::new::small_bitmap::SmallBitmap;
use crate::search::new::{QueryGraph, SearchContext};
use crate::Result;

impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
    /// Build the ranking rule graph from the given query graph
    pub fn build(
        ctx: &mut SearchContext<'_>,
        query_graph: QueryGraph,
        cost_of_ignoring_node: MappedInterner<QueryNode, Option<(u32, SmallBitmap<QueryNode>)>>,
    ) -> Result<Self> {
        let QueryGraph { nodes: graph_nodes, .. } = &query_graph;

        let mut conditions_interner = DedupInterner::default();

        let mut edges_store = DedupInterner::default();
        let mut edges_of_node = query_graph.nodes.map(|_| HashSet::new());

        for (source_id, source_node) in graph_nodes.iter() {
            let new_edges = edges_of_node.get_mut(source_id);

            for dest_idx in source_node.successors.iter() {
                let src_term = match &source_node.data {
                    QueryNodeData::Term(t) => Some(t),
                    QueryNodeData::Start => None,
                    QueryNodeData::Deleted | QueryNodeData::End => panic!(),
                };
                let dest_node = graph_nodes.get(dest_idx);
                let dest_term = match &dest_node.data {
                    QueryNodeData::Term(t) => t,
                    QueryNodeData::End => {
                        let new_edge_id = edges_store.insert(Some(Edge {
                            source_node: source_id,
                            dest_node: dest_idx,
                            cost: 0,
                            condition: None,
                            nodes_to_skip: SmallBitmap::for_interned_values_in(graph_nodes),
                        }));
                        new_edges.insert(new_edge_id);
                        continue;
                    }
                    QueryNodeData::Deleted | QueryNodeData::Start => panic!(),
                };
                if let Some((cost_of_ignoring, forbidden_nodes)) =
                    cost_of_ignoring_node.get(dest_idx)
                {
                    let dest = graph_nodes.get(dest_idx);
                    let dest_size = match &dest.data {
                        QueryNodeData::Term(term) => term.term_ids.len(),
                        _ => panic!(),
                    };
                    let new_edge_id = edges_store.insert(Some(Edge {
                        source_node: source_id,
                        dest_node: dest_idx,
                        cost: *cost_of_ignoring * dest_size as u32,
                        condition: None,
                        nodes_to_skip: forbidden_nodes.clone(),
                    }));
                    new_edges.insert(new_edge_id);
                }

                let edges = G::build_edges(ctx, &mut conditions_interner, src_term, dest_term)?;
                if edges.is_empty() {
                    continue;
                }

                for (cost, condition) in edges {
                    let new_edge_id = edges_store.insert(Some(Edge {
                        source_node: source_id,
                        dest_node: dest_idx,
                        cost,
                        condition: Some(condition),
                        nodes_to_skip: SmallBitmap::for_interned_values_in(graph_nodes),
                    }));
                    new_edges.insert(new_edge_id);
                }
            }
        }
        let edges_store = edges_store.freeze();
        let edges_of_node =
            edges_of_node.map(|edges| SmallBitmap::from_iter(edges.iter().copied(), &edges_store));

        let conditions_interner = conditions_interner.freeze();

        Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node, conditions_interner })
    }
}
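Aside: in `build` above, skipping a node costs the per-term ignore penalty multiplied by the number of term ids the destination node spans, so skipping an ngram node is proportionally more expensive than skipping a single word. A trivial sketch of that cost rule:

// Sketch of the skip-edge cost used when `cost_of_ignoring_node` is set.
fn skip_edge_cost(cost_of_ignoring: u32, term_ids_in_dest: usize) -> u32 {
    cost_of_ignoring * term_ids_in_dest as u32
}

fn main() {
    // A node covering two original terms is twice as expensive to skip.
    assert_eq!(skip_edge_cost(1, 1), 1);
    assert_eq!(skip_edge_cost(1, 2), 2);
}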
400
crates/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
Normal file
@@ -0,0 +1,400 @@
/** Implements a "PathVisitor" which finds all paths of a certain cost
from the START to END node of a ranking rule graph.

A path is a list of conditions. A condition is the data associated with
an edge, given by the ranking rule. Some edges don't have a condition associated
with them; they are "unconditional". These kinds of edges are used to "skip" a node.

The algorithm uses a depth-first search. It benefits from two main optimisations:
- The list of all possible costs to go from any node to the END node is precomputed
- The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges
  untraversable depending on what other edges were selected.

These two optimisations are meant to avoid traversing edges that wouldn't lead
to a valid path. In practically all cases, we avoid the exponential complexity
that is inherent to depth-first search in a large ranking rule graph.

The DeadEndsCache is a sort of prefix tree which associates a list of forbidden
conditions to a list of traversed conditions.
For example, the DeadEndsCache could say the following:
- Immediately, from the start, the conditions `[a,b]` are forbidden
- if we take the condition `c`, then the conditions `[e]` are also forbidden
    - and if after that, we take `f`, then `[h,i]` are also forbidden
        - etc.
- if we take `g`, then `[f]` is also forbidden
    - etc.
- etc.

As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden
conditions in memory. Then, we know to avoid all edges which have a condition that is forbidden.

When a path is found from START to END, we give it to the `visit` closure.
This closure takes a mutable reference to the `DeadEndsCache`. This means that
the caller can update this cache. Therefore, we must handle the case where the
DeadEndsCache has been updated. This means potentially backtracking up to the point
where the traversed conditions are all allowed by the new DeadEndsCache.

The algorithm also implements the `TermsMatchingStrategy` logic.
Some edges are augmented with a list of "nodes_to_skip". Skipping
a node means "reaching this node through an unconditional edge". If we have
already traversed (i.e. not skipped) a node that is in this list, then we know that we
can't traverse this edge. Otherwise, we traverse the edge but make sure to skip any
future node that was present in the "nodes_to_skip" list.

The caller can decide to stop the path finding algorithm
by returning a `ControlFlow::Break` from the `visit` closure.
*/
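// Aside: an illustrative toy of the DeadEndsCache idea described above. It is
// not the real type; it models the prefix tree from the example as nested
// nodes and answers "which conditions are forbidden after this prefix?".
mod dead_ends_cache_sketch {
    use std::collections::BTreeMap;

    #[derive(Default)]
    struct Node {
        forbidden: Vec<char>,
        children: BTreeMap<char, Node>,
    }

    /// Collect the forbidden conditions accumulated along a prefix of
    /// traversed conditions, as the traversal above does.
    fn forbidden_after(root: &Node, prefix: &[char]) -> Vec<char> {
        let mut forbidden = root.forbidden.clone();
        let mut node = root;
        for c in prefix {
            match node.children.get(c) {
                Some(child) => {
                    forbidden.extend(child.forbidden.iter().copied());
                    node = child;
                }
                None => break,
            }
        }
        forbidden
    }

    fn demo() {
        // Mirror the example from the doc comment:
        // - from the start, `[a, b]` are forbidden
        // - after `c`, `[e]` is also forbidden
        // - after `c` then `f`, `[h, i]` are also forbidden
        let mut root = Node { forbidden: vec!['a', 'b'], ..Default::default() };
        let mut c = Node { forbidden: vec!['e'], ..Default::default() };
        c.children.insert('f', Node { forbidden: vec!['h', 'i'], ..Default::default() });
        root.children.insert('c', c);

        assert_eq!(forbidden_after(&root, &[]), vec!['a', 'b']);
        assert_eq!(forbidden_after(&root, &['c', 'f']), vec!['a', 'b', 'e', 'h', 'i']);
    }
}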
use std::collections::{BTreeSet, VecDeque};
|
||||
use std::iter::FromIterator;
|
||||
use std::ops::ControlFlow;
|
||||
|
||||
use fxhash::FxHashSet;
|
||||
|
||||
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::search::new::interner::{Interned, MappedInterner};
|
||||
use crate::search::new::query_graph::QueryNode;
|
||||
use crate::search::new::small_bitmap::SmallBitmap;
|
||||
use crate::Result;
|
||||
|
||||
/// Closure which processes a path found by the `PathVisitor`
|
||||
type VisitFn<'f, G> = &'f mut dyn FnMut(
|
||||
// the path as a list of conditions
|
||||
&[Interned<<G as RankingRuleGraphTrait>::Condition>],
|
||||
&mut RankingRuleGraph<G>,
|
||||
// a mutable reference to the DeadEndsCache, to update it in case the given
|
||||
// path doesn't resolve to any valid document ids
|
||||
&mut DeadEndsCache<<G as RankingRuleGraphTrait>::Condition>,
|
||||
) -> Result<ControlFlow<()>>;
|
||||
|
||||
/// A structure which is kept but not updated during the traversal of the graph.
|
||||
/// It can however be updated by the `visit` closure once a valid path has been found.
|
||||
struct VisitorContext<'a, G: RankingRuleGraphTrait> {
|
||||
graph: &'a mut RankingRuleGraph<G>,
|
||||
all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
|
||||
dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
|
||||
}
|
||||
|
||||
/// The internal state of the traversal algorithm
|
||||
struct VisitorState<G: RankingRuleGraphTrait> {
|
||||
/// Budget from the current node to the end node
|
||||
remaining_cost: u64,
|
||||
/// Previously visited conditions, in order.
|
||||
path: Vec<Interned<G::Condition>>,
|
||||
/// Previously visited conditions, as an efficient and compact set.
|
||||
visited_conditions: SmallBitmap<G::Condition>,
|
||||
/// Previously visited (ie not skipped) nodes, as an efficient and compact set.
|
||||
visited_nodes: SmallBitmap<QueryNode>,
|
||||
/// The conditions that cannot be visited anymore
|
||||
forbidden_conditions: SmallBitmap<G::Condition>,
|
||||
/// The nodes that cannot be visited anymore (they must be skipped)
|
||||
nodes_to_skip: SmallBitmap<QueryNode>,
|
||||
}
|
||||
|
||||
/// See module documentation
|
||||
pub struct PathVisitor<'a, G: RankingRuleGraphTrait> {
|
||||
state: VisitorState<G>,
|
||||
ctx: VisitorContext<'a, G>,
|
||||
}
|
||||
impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
|
||||
pub fn new(
|
||||
cost: u64,
|
||||
graph: &'a mut RankingRuleGraph<G>,
|
||||
all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
|
||||
dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
|
||||
) -> Self {
|
||||
Self {
|
||||
state: VisitorState {
|
||||
remaining_cost: cost,
|
||||
path: vec![],
|
||||
visited_conditions: SmallBitmap::for_interned_values_in(&graph.conditions_interner),
|
||||
visited_nodes: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
|
||||
forbidden_conditions: SmallBitmap::for_interned_values_in(
|
||||
&graph.conditions_interner,
|
||||
),
|
||||
nodes_to_skip: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
|
||||
},
|
||||
ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache },
|
||||
}
|
||||
}
|
||||
|
||||
/// See module documentation
|
||||
pub fn visit_paths(mut self, visit: VisitFn<'_, G>) -> Result<()> {
|
||||
let _ =
|
||||
self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<G: RankingRuleGraphTrait> VisitorState<G> {
|
||||
/// Visits a node: traverse all its valid conditional and unconditional edges.
|
||||
///
|
||||
/// Returns ControlFlow::Break if the path finding algorithm should stop.
|
||||
/// Returns whether a valid path was found from this node otherwise.
|
||||
fn visit_node(
|
||||
&mut self,
|
||||
from_node: Interned<QueryNode>,
|
||||
visit: VisitFn<'_, G>,
|
||||
ctx: &mut VisitorContext<'_, G>,
|
||||
) -> Result<ControlFlow<(), bool>> {
|
||||
// any valid path will be found from this point
|
||||
// if a valid path was found, then we know that the DeadEndsCache may have been updated,
|
||||
// and we will need to do more work to potentially backtrack
|
||||
let mut any_valid = false;
|
||||
|
||||
let edges = ctx.graph.edges_of_node.get(from_node).clone();
|
||||
for edge_idx in edges.iter() {
|
||||
// could be none if the edge was deleted
|
||||
let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue };
|
||||
|
||||
if self.remaining_cost < edge.cost as u64 {
|
||||
continue;
|
||||
}
|
||||
self.remaining_cost -= edge.cost as u64;
|
||||
|
||||
let cf = match edge.condition {
|
||||
Some(condition) => self.visit_condition(
|
||||
condition,
|
||||
edge.dest_node,
|
||||
&edge.nodes_to_skip,
|
||||
visit,
|
||||
ctx,
|
||||
)?,
|
||||
None => self.visit_no_condition(edge.dest_node, &edge.nodes_to_skip, visit, ctx)?,
|
||||
};
|
||||
self.remaining_cost += edge.cost as u64;
|
||||
|
||||
let ControlFlow::Continue(next_any_valid) = cf else {
|
||||
return Ok(ControlFlow::Break(()));
|
||||
};
|
||||
any_valid |= next_any_valid;
|
||||
if next_any_valid {
|
||||
// backtrack as much as possible if a valid path was found and the dead_ends_cache
|
||||
// was updated such that the current prefix is now invalid
|
||||
self.forbidden_conditions = ctx
|
||||
.dead_ends_cache
|
||||
.forbidden_conditions_for_all_prefixes_up_to(self.path.iter().copied());
|
||||
if self.visited_conditions.intersects(&self.forbidden_conditions) {
|
||||
return Ok(ControlFlow::Continue(true));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ControlFlow::Continue(any_valid))
|
||||
}
|
||||
|
||||
/// Visits an unconditional edge.
|
||||
///
|
||||
/// Returns ControlFlow::Break if the path finding algorithm should stop.
|
||||
/// Returns whether a valid path was found from this node otherwise.
|
||||
fn visit_no_condition(
|
||||
&mut self,
|
||||
dest_node: Interned<QueryNode>,
|
||||
edge_new_nodes_to_skip: &SmallBitmap<QueryNode>,
|
||||
visit: VisitFn<'_, G>,
|
||||
ctx: &mut VisitorContext<'_, G>,
|
||||
) -> Result<ControlFlow<(), bool>> {
|
||||
if !ctx
|
||||
.all_costs_from_node
|
||||
.get(dest_node)
|
||||
.iter()
|
||||
.any(|next_cost| *next_cost == self.remaining_cost)
|
||||
{
|
||||
return Ok(ControlFlow::Continue(false));
|
||||
}
|
||||
// We've reached the END node!
|
||||
if dest_node == ctx.graph.query_graph.end_node {
|
||||
let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?;
|
||||
// We could change the return type of the visit closure such that the caller
|
||||
// tells us whether the dead ends cache was updated or not.
|
||||
// Alternatively, maybe the DeadEndsCache should have a generation number
|
||||
// to it, so that we don't need to play with these booleans at all.
|
||||
match control_flow {
|
||||
ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)),
|
||||
ControlFlow::Break(_) => Ok(ControlFlow::Break(())),
|
||||
}
|
||||
} else {
|
||||
let old_fbct = self.nodes_to_skip.clone();
|
||||
self.nodes_to_skip.union(edge_new_nodes_to_skip);
|
||||
let cf = self.visit_node(dest_node, visit, ctx)?;
|
||||
self.nodes_to_skip = old_fbct;
|
||||
Ok(cf)
|
||||
}
|
||||
}
|
||||
/// Visits a conditional edge.
|
||||
///
|
||||
/// Returns ControlFlow::Break if the path finding algorithm should stop.
|
||||
/// Returns whether a valid path was found from this node otherwise.
|
||||
fn visit_condition(
|
||||
&mut self,
|
||||
condition: Interned<G::Condition>,
|
||||
dest_node: Interned<QueryNode>,
|
||||
edge_new_nodes_to_skip: &SmallBitmap<QueryNode>,
|
||||
visit: VisitFn<'_, G>,
|
||||
ctx: &mut VisitorContext<'_, G>,
|
||||
) -> Result<ControlFlow<(), bool>> {
|
||||
assert!(dest_node != ctx.graph.query_graph.end_node);
|
||||
|
||||
if self.forbidden_conditions.contains(condition)
|
||||
|| self.nodes_to_skip.contains(dest_node)
|
||||
|| edge_new_nodes_to_skip.intersects(&self.visited_nodes)
|
||||
{
|
||||
return Ok(ControlFlow::Continue(false));
|
||||
}
|
||||
|
||||
// Checking that from the destination node, there is at least
|
||||
// one cost that we can visit that corresponds to our remaining budget.
|
||||
if !ctx
|
||||
.all_costs_from_node
|
||||
.get(dest_node)
|
||||
.iter()
|
||||
.any(|next_cost| *next_cost == self.remaining_cost)
|
||||
{
|
||||
return Ok(ControlFlow::Continue(false));
|
||||
}
|
||||
|
||||
self.path.push(condition);
|
||||
self.visited_nodes.insert(dest_node);
|
||||
self.visited_conditions.insert(condition);
|
||||
|
||||
let old_forb_cond = self.forbidden_conditions.clone();
|
||||
if let Some(next_forbidden) =
|
||||
ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied())
|
||||
{
|
||||
self.forbidden_conditions.union(&next_forbidden);
|
||||
}
|
||||
let old_nodes_to_skip = self.nodes_to_skip.clone();
|
||||
self.nodes_to_skip.union(edge_new_nodes_to_skip);
|
||||
|
||||
let cf = self.visit_node(dest_node, visit, ctx)?;
|
||||
|
||||
self.nodes_to_skip = old_nodes_to_skip;
|
||||
self.forbidden_conditions = old_forb_cond;
|
||||
|
||||
self.visited_conditions.remove(condition);
|
||||
self.visited_nodes.remove(dest_node);
|
||||
self.path.pop();
|
||||
|
||||
Ok(cf)
|
||||
}
|
||||
}

impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
    pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> {
        let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]);

        self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| {
            if cur_node == self.query_graph.end_node {
                *costs_to_end.get_mut(self.query_graph.end_node) = vec![0];
                return;
            }
            let mut self_costs = Vec::<u64>::new();

            let cur_node_edges = &self.edges_of_node.get(cur_node);
            for edge_idx in cur_node_edges.iter() {
                let edge = self.edges_store.get(edge_idx).as_ref().unwrap();
                let succ_node = edge.dest_node;
                let succ_costs = costs_to_end.get(succ_node);
                for succ_cost in succ_costs {
                    self_costs.push(edge.cost as u64 + succ_cost);
                }
            }
            self_costs.sort_unstable();
            self_costs.dedup();

            *costs_to_end.get_mut(cur_node) = self_costs;
        });
        costs_to_end
    }
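
    // Worked example for `find_all_costs_to_end` (illustrative graph, not
    // taken from the codebase): with nodes S, A, B, END and edges
    // S->A (cost 1), S->B (cost 2), A->END (cost 3), B->END (cost 1),
    // the backward traversal computes
    //   costs_to_end[END] = [0]
    //   costs_to_end[A]   = [3]      (3 + 0)
    //   costs_to_end[B]   = [1]      (1 + 0)
    //   costs_to_end[S]   = [3, 4]   (2 + 1 and 1 + 3, sorted and deduped)
    // so the vector attached to each node lists every total cost at which
    // the end node can be reached from it.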

    pub fn update_all_costs_before_node(
        &self,
        node_with_removed_outgoing_conditions: Interned<QueryNode>,
        costs: &mut MappedInterner<QueryNode, Vec<u64>>,
    ) {
        // Traverse the graph backward from the target node, recomputing the cost for each
        // of its predecessors. We first check that no other node is contributing the same
        // total cost to a predecessor before removing the cost from the predecessor.
        self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| {
            let mut costs_to_remove = FxHashSet::default();
            costs_to_remove.extend(costs.get(cur_node).iter().copied());

            let cur_node_edges = &self.edges_of_node.get(cur_node);
            for edge_idx in cur_node_edges.iter() {
                let edge = self.edges_store.get(edge_idx).as_ref().unwrap();
                for cost in costs.get(edge.dest_node).iter() {
                    costs_to_remove.remove(&(*cost + edge.cost as u64));
                    if costs_to_remove.is_empty() {
                        return;
                    }
                }
            }
            if costs_to_remove.is_empty() {
                return;
            }
            let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied());
            for c in costs_to_remove {
                new_costs.remove(&c);
            }
            *costs.get_mut(cur_node) = new_costs.into_iter().collect();
        });
    }
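
    // Continuing the illustrative graph above, for `update_all_costs_before_node`:
    // if the only outgoing condition of A is removed, costs_to_end[A] becomes [].
    // When S is then updated, costs_to_remove starts as {3, 4}; the surviving
    // edge S->B (cost 2) still supports 2 + 1 = 3, so 3 is dropped from the
    // removal set and only 4 is removed, leaving costs_to_end[S] = [3].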

    /// Traverses the graph backward from the given node such that, every time
    /// a node is visited, all of its successors are guaranteed to either:
    /// 1. have already been visited; OR
    /// 2. have been unreachable from the given node.
    ///
    /// A runnable sketch of this invariant, using plain `std` types, is given
    /// at the end of this file.
    pub fn traverse_breadth_first_backward(
        &self,
        from: Interned<QueryNode>,
        mut visit: impl FnMut(Interned<QueryNode>),
    ) {
        let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
        {
            // go backward to get the set of all reachable nodes from the given node
            // the nodes that are not reachable will be set as `visited`
            let mut stack = VecDeque::new();
            let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
            enqueued.insert(from);
            stack.push_back(from);
            while let Some(n) = stack.pop_front() {
                if reachable.contains(n) {
                    continue;
                }
                reachable.insert(n);
                for prev_node in self.query_graph.nodes.get(n).predecessors.iter() {
                    if !enqueued.contains(prev_node) && !reachable.contains(prev_node) {
                        stack.push_back(prev_node);
                        enqueued.insert(prev_node);
                    }
                }
            }
        }
        let mut unreachable_or_visited =
            SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
        for (n, _) in self.query_graph.nodes.iter() {
            if !reachable.contains(n) {
                unreachable_or_visited.insert(n);
            }
        }

        let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
        let mut stack = VecDeque::new();

        enqueued.insert(from);
        stack.push_back(from);

        while let Some(cur_node) = stack.pop_front() {
            if !self.query_graph.nodes.get(cur_node).successors.is_subset(&unreachable_or_visited) {
                stack.push_back(cur_node);
                continue;
            }
            unreachable_or_visited.insert(cur_node);
            visit(cur_node);
            for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
                if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) {
                    stack.push_back(prev_node);
                    enqueued.insert(prev_node);
                }
            }
        }
    }
}
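
// Self-contained sketch (not part of milli) of the traversal invariant
// documented on `traverse_breadth_first_backward`: a node is only handed to
// `visit` once all of its successors have either been visited or were
// unreachable from the starting node. Plain `std` types stand in for the
// interner-based bitmaps, and the graph is assumed to be a DAG.
#[cfg(test)]
mod backward_bfs_sketch {
    use std::collections::{HashSet, VecDeque};

    /// `succ[n]` / `pred[n]` list the successors / predecessors of node `n`.
    fn traverse_backward(
        succ: &[Vec<usize>],
        pred: &[Vec<usize>],
        from: usize,
        mut visit: impl FnMut(usize),
    ) {
        // Phase 1: collect every node reachable backward from `from`.
        let mut reachable = HashSet::new();
        let mut stack = VecDeque::from([from]);
        while let Some(n) = stack.pop_front() {
            if reachable.insert(n) {
                stack.extend(pred[n].iter().copied());
            }
        }
        // Phase 2: visit a node only once all of its successors are done
        // (or were never reachable); otherwise requeue it.
        let mut done: HashSet<usize> =
            (0..succ.len()).filter(|n| !reachable.contains(n)).collect();
        let mut queue = VecDeque::from([from]);
        let mut enqueued: HashSet<usize> = queue.iter().copied().collect();
        while let Some(n) = queue.pop_front() {
            if !succ[n].iter().all(|s| done.contains(s)) {
                queue.push_back(n);
                continue;
            }
            done.insert(n);
            visit(n);
            for &p in &pred[n] {
                if enqueued.insert(p) {
                    queue.push_back(p);
                }
            }
        }
    }

    #[test]
    fn successors_visited_first() {
        // 0 -> 1 -> 3 and 0 -> 2 -> 3; traverse backward from node 3.
        let succ = vec![vec![1, 2], vec![3], vec![3], vec![]];
        let pred = vec![vec![], vec![0], vec![0], vec![1, 2]];
        let mut order = Vec::new();
        traverse_backward(&succ, &pred, 3, |n| order.push(n));
        assert_eq!(order.remove(0), 3); // the start node is visited first
        assert_eq!(order.pop(), Some(0)); // all successors of 0 come before it
    }
}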