Update the criteria to the new ones

This commit is contained in:
Clément Renault
2019-12-11 17:02:10 +01:00
parent ea148575cf
commit 248ccfc0d8
20 changed files with 693 additions and 1775 deletions

View File

@@ -1,9 +1,6 @@
use std::ops::Deref;
use std::fmt;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::HashSet;
use std::io::Write;
use std::mem;
use std::ops::Range;
use std::rc::Rc;
@@ -17,15 +14,15 @@ use meilisearch_tokenizer::{is_cjk, split_query_string};
use meilisearch_types::{DocIndex, Highlight};
use sdset::{Set, SetBuf};
use slice_group_by::{GroupBy, GroupByMut};
use itertools::EitherOrBoth;
use crate::automaton::NGRAMS;
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::automaton::{normalize_str, split_best_frequency};
use crate::criterion2::*;
use crate::criterion::Criteria;
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::raw_document::RawDocument;
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult};
@@ -33,6 +30,7 @@ pub fn bucket_sort<'c>(
reader: &heed::RoTxn<MainT>,
query: &str,
range: Range<usize>,
criteria: Criteria<'c>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
documents_fields_counts_store: store::DocumentsFieldsCounts,
@@ -76,17 +74,7 @@ pub fn bucket_sort<'c>(
let mut groups = vec![raw_documents.as_mut_slice()];
let criteria = [
Box::new(Typo) as Box<dyn Criterion>,
Box::new(Words),
Box::new(Proximity),
Box::new(Attribute),
Box::new(WordsPosition),
Box::new(Exact),
Box::new(StableDocId),
];
'criteria: for criterion in &criteria {
'criteria: for criterion in criteria.as_ref() {
let tmp_groups = mem::replace(&mut groups, Vec::new());
let mut documents_seen = 0;
@@ -131,7 +119,7 @@ pub fn bucket_sort<'c>(
}).collect();
Document {
id: d.raw_matches[0].document_id,
id: d.id,
highlights,
#[cfg(test)] matches: Vec::new(),
}
@@ -140,88 +128,6 @@ pub fn bucket_sort<'c>(
Ok(iter.collect())
}
/// A candidate document seen by the bucket sort: its raw matches plus
/// two caches that start out empty (see `RawDocument::new`).
pub struct RawDocument<'a, 'tag> {
/// The matches of this document; `RawDocument::new` sorts them by `query_index`.
pub raw_matches: &'a mut [BareMatch<'tag>],
/// Created empty by `RawDocument::new`.
/// NOTE(review): presumably filled lazily by the criteria that need it — confirm.
pub processed_matches: Vec<SimpleMatch>,
/// The list of minimum `distance` found
pub processed_distances: Vec<Option<u8>>,
}
impl<'a, 'tag> RawDocument<'a, 'tag> {
/// Builds a `RawDocument` from the matches of a single document and
/// prunes the postings lists of phrase-query words that do not actually
/// form the phrase in this document.
///
/// `raw_matches` is sorted in place by `query_index`. Postings lists
/// stored in `postings_lists` may be rewritten (narrowed or emptied).
///
/// Returns `None` when every postings list referenced by `raw_matches`
/// is empty once the pruning is done.
fn new<'txn>(
raw_matches: &'a mut [BareMatch<'tag>],
automatons: &[QueryWordAutomaton],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
) -> Option<RawDocument<'a, 'tag>>
{
raw_matches.sort_unstable_by_key(|m| m.query_index);
// query index of the last phrase first-word that kept at least one posting
let mut previous_word = None;
for i in 0..raw_matches.len() {
let a = &raw_matches[i];
let auta = &automatons[a.query_index as usize];
// NOTE(review): `phrase_query` looks like `(position in the phrase, _)`;
// only positions 0 and 1 are handled here — confirm against the automaton.
match auta.phrase_query {
Some((0, _)) => {
// the first phrase word is useless without the match that follows it
let b = match raw_matches.get(i + 1) {
Some(b) => b,
None => {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
continue;
}
};
// the following match must belong to the very next query word
if a.query_index + 1 != b.query_index {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
continue
}
let pla = &postings_lists[a.postings_list];
let plb = &postings_lists[b.postings_list];
// pair postings of `a` with postings of `b` located in the same
// attribute exactly one word further
let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
});
let mut newa = Vec::new();
let mut newb = Vec::new();
for eb in iter {
// postings present on one side only are dropped
if let EitherOrBoth::Both(a, b) = eb {
newa.push(*a);
newb.push(*b);
}
}
if !newa.is_empty() {
previous_word = Some(a.query_index);
}
// both lists are narrowed down to the postings that form the phrase
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
},
Some((1, _)) => {
// the second phrase word is kept only when the word right before
// it was validated above; `take` resets the marker in all cases
if previous_word.take() != Some(a.query_index - 1) {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
}
},
Some((_, _)) => unreachable!(),
None => (),
}
}
// nothing is left to rank when all the postings lists ended up empty
if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
return None
}
Some(RawDocument {
raw_matches,
processed_matches: Vec::new(),
processed_distances: Vec::new(),
})
}
}
pub struct BareMatch<'tag> {
pub document_id: DocumentId,
pub query_index: u16,

View File

@@ -0,0 +1,48 @@
use std::cmp::{self, Ordering};
use compact_arena::SmallArena;
use slice_group_by::GroupBy;
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
use crate::RawDocument;
use super::{Criterion, prepare_raw_matches};
/// Criterion comparing documents on the lowest attribute found among
/// the first processed match of each query word.
pub struct Attribute;

impl Criterion for Attribute {
    fn name(&self) -> &str {
        "attribute"
    }

    /// Delegates to `prepare_raw_matches`; `evaluate` reads the
    /// `processed_matches` of both documents.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
    }

    /// Orders the document with the smaller "lowest attribute" first.
    fn evaluate<'a, 'tag, 'txn>(
        &self,
        lhs: &RawDocument<'a, 'tag>,
        rhs: &RawDocument<'a, 'tag>,
        postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
    ) -> Ordering
    {
        /// Smallest attribute among the first match of every query-index
        /// group, or `u16::max_value()` when there is no match at all.
        #[inline]
        fn lowest_attribute(matches: &[SimpleMatch]) -> u16 {
            matches
                .linear_group_by_key(|m| m.query_index)
                .fold(u16::max_value(), |lowest, group| cmp::min(lowest, group[0].attribute))
        }

        let left = lowest_attribute(&lhs.processed_matches);
        let right = lowest_attribute(&rhs.processed_matches);
        left.cmp(&right)
    }
}

View File

@@ -1,16 +1,37 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use std::cmp::Ordering;
#[derive(Debug, Clone, Copy)]
use compact_arena::SmallArena;
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
use crate::RawDocument;
use super::Criterion;
pub struct DocumentId;
impl Criterion for DocumentId {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
lhs.id.cmp(&rhs.id)
fn name(&self) -> &str { "stable document id" }
fn prepare(
&self,
documents: &mut [RawDocument],
postings_lists: &mut SmallArena<PostingsListView>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
// ...
}
fn name(&self) -> &str {
"DocumentId"
fn evaluate(
&self,
lhs: &RawDocument,
rhs: &RawDocument,
postings_lists: &SmallArena<PostingsListView>,
) -> Ordering
{
let lhs = &lhs.id;
let rhs = &rhs.id;
lhs.cmp(rhs)
}
}

View File

@@ -1,131 +1,51 @@
use std::cmp::Ordering;
use std::cmp::{Ordering, Reverse};
use sdset::Set;
use compact_arena::SmallArena;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::{AttrCount, RawDocument};
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{PostingsListView, BareMatch, QueryWordAutomaton};
use crate::RawDocument;
use super::Criterion;
/// Counts the query words (runs of equal values in `query_index`) that
/// have at least one exact match.
///
/// Returns `usize::max_value()` as soon as an exact match sits in an
/// attribute whose entry in `fields_counts` has a `count` of 1.
#[inline]
fn number_exact_matches(
    query_index: &[u32],
    attribute: &[u16],
    is_exact: &[bool],
    fields_counts: &Set<AttrCount>,
) -> usize {
    let mut exact_words = 0;
    let mut start = 0;

    for group in query_index.linear_group() {
        let end = start + group.len();
        let flags = &is_exact[start..end];
        let mut any_exact = false;

        for (offset, _) in flags.iter().enumerate().filter(|(_, flag)| **flag) {
            any_exact = true;
            let attr = attribute[start + offset];
            match fields_counts.binary_search_by_key(&attr, |ac| ac.attr) {
                // a field made of a single word that matches exactly wins outright
                Ok(found) if fields_counts[found].count == 1 => return usize::max_value(),
                _ => (),
            }
        }

        if any_exact {
            exact_words += 1;
        }
        start = end;
    }

    exact_words
}
#[derive(Debug, Clone, Copy)]
pub struct Exact;
impl Criterion for Exact {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
let lhs = {
let query_index = lhs.query_index();
let is_exact = lhs.is_exact();
let attribute = lhs.attribute();
let fields_counts = lhs.fields_counts.as_ref().unwrap();
fn name(&self) -> &str { "exact" }
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
fn prepare(
&self,
documents: &mut [RawDocument],
postings_lists: &mut SmallArena<PostingsListView>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
for document in documents {
document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
}
}
let rhs = {
let query_index = rhs.query_index();
let is_exact = rhs.is_exact();
let attribute = rhs.attribute();
let fields_counts = rhs.fields_counts.as_ref().unwrap();
fn evaluate(
&self,
lhs: &RawDocument,
rhs: &RawDocument,
postings_lists: &SmallArena<PostingsListView>,
) -> Ordering
{
#[inline]
fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
let mut sum_exact_query_words = 0;
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
for group in matches.linear_group_by_key(|bm| bm.query_index) {
sum_exact_query_words += group[0].is_exact as usize;
}
sum_exact_query_words
}
let lhs = sum_exact_query_words(&lhs.raw_matches);
let rhs = sum_exact_query_words(&rhs.raw_matches);
lhs.cmp(&rhs).reverse()
}
fn name(&self) -> &str {
"Exact"
}
}
// Unit tests for `number_exact_matches`; each one compares the scores of
// two documents the same way `Exact::evaluate` does (`cmp` then `reverse`).
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "souliereres rouge"
//
// doc0 has one exact query word and doc1 has none, so doc0 must come first.
#[test]
fn easy_case() {
let doc0 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let doc1 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[false];
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 2 }]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "soulier"
//
// doc0: { 0. "soulier" }
// doc1: { 0. "soulier bleu et blanc" }
//
// Both match exactly, but the matching attribute of doc0 has a count of 1,
// which makes `number_exact_matches` return `usize::max_value()`.
#[test]
fn basic() {
let doc0 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 1 }]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
let doc1 = {
let query_index = &[0];
let attribute = &[0];
let is_exact = &[true];
let fields_counts = Set::new(&[AttrCount { attr: 0, count: 4 }]).unwrap();
number_exact_matches(query_index, attribute, is_exact, fields_counts)
};
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}

View File

@@ -1,58 +1,58 @@
mod document_id;
mod exact;
mod number_of_words;
mod sort_by_attr;
mod sum_of_typos;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod words_proximity;
use std::cmp::{self, Ordering};
use compact_arena::SmallArena;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
use crate::RawDocument;
use std::cmp::Ordering;
pub use self::{
document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords,
sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos,
sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition,
words_proximity::WordsProximity,
};
mod typo;
mod words;
mod proximity;
mod attribute;
mod words_position;
mod exact;
mod document_id;
mod sort_by_attr;
pub trait Criterion: Send + Sync {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
pub use self::typo::Typo;
pub use self::words::Words;
pub use self::proximity::Proximity;
pub use self::attribute::Attribute;
pub use self::words_position::WordsPosition;
pub use self::exact::Exact;
pub use self::document_id::DocumentId;
pub use self::sort_by_attr::SortByAttr;
pub trait Criterion {
fn name(&self) -> &str;
fn prepare<'a, 'tag, 'txn>(
&self,
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
);
fn evaluate<'a, 'tag, 'txn>(
&self,
lhs: &RawDocument<'a, 'tag>,
rhs: &RawDocument<'a, 'tag>,
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
) -> Ordering;
#[inline]
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
self.evaluate(lhs, rhs) == Ordering::Equal
}
}
impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
}
}
impl<T: Criterion + ?Sized> Criterion for Box<T> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
(**self).evaluate(lhs, rhs)
}
fn name(&self) -> &str {
(**self).name()
}
fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
(**self).eq(lhs, rhs)
fn eq<'a, 'tag, 'txn>(
&self,
lhs: &RawDocument<'a, 'tag>,
rhs: &RawDocument<'a, 'tag>,
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
) -> bool
{
self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal
}
}
@@ -103,11 +103,11 @@ pub struct Criteria<'a> {
impl<'a> Default for Criteria<'a> {
fn default() -> Self {
CriteriaBuilder::with_capacity(7)
.add(SumOfTypos)
.add(NumberOfWords)
.add(WordsProximity)
.add(SumOfWordsAttribute)
.add(SumOfWordsPosition)
.add(Typo)
.add(Words)
.add(Proximity)
.add(Attribute)
.add(WordsPosition)
.add(Exact)
.add(DocumentId)
.build()
@@ -119,3 +119,165 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
&self.inner
}
}
/// Fills `processed_distances` for every document that does not have it yet.
///
/// For each raw match whose postings list is not empty, the query indices
/// returned by `query_enhancer.replacement` receive the smallest distance
/// seen so far; indices never touched stay `None`.
fn prepare_query_distances<'a, 'tag, 'txn>(
    documents: &mut [RawDocument<'a, 'tag>],
    query_enhancer: &QueryEnhancer,
    automatons: &[QueryWordAutomaton],
    postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
) {
    // documents with a non-empty cache are left as they are
    let pending = documents.iter_mut().filter(|doc| doc.processed_distances.is_empty());

    for document in pending {
        let mut distances = Vec::new();

        for raw_match in document.raw_matches.iter() {
            if postings_lists[raw_match.postings_list].is_empty() {
                continue;
            }

            let covered = query_enhancer.replacement(raw_match.query_index as u32);

            // grow (never shrink) so that every covered index is addressable
            let needed = covered.end as usize;
            if distances.len() < needed {
                distances.resize(needed, None);
            }

            for index in covered {
                let slot = index as usize;
                let best = match distances[slot] {
                    Some(known) => cmp::min(known, raw_match.distance),
                    None => raw_match.distance,
                };
                distances[slot] = Some(best);
            }
        }

        document.processed_distances = distances;
    }
}
/// Fills `processed_matches` for every document that does not have it yet.
///
/// Each raw match is expanded into one `SimpleMatch` per entry of its
/// postings list, then the whole set goes through
/// `multiword_rewrite_matches` before being stored on the document.
fn prepare_raw_matches<'a, 'tag, 'txn>(
    documents: &mut [RawDocument<'a, 'tag>],
    postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
    query_enhancer: &QueryEnhancer,
    automatons: &[QueryWordAutomaton],
) {
    // documents with a non-empty cache are left as they are
    let pending = documents.iter_mut().filter(|doc| doc.processed_matches.is_empty());

    for document in pending {
        let mut expanded = Vec::new();

        for raw_match in document.raw_matches.iter() {
            let view = &postings_lists[raw_match.postings_list];
            expanded.reserve(view.len());

            for doc_index in view.as_ref() {
                expanded.push(SimpleMatch {
                    query_index: raw_match.query_index,
                    distance: raw_match.distance,
                    attribute: doc_index.attribute,
                    word_index: doc_index.word_index,
                    is_exact: raw_match.is_exact,
                });
            }
        }

        let rewritten = multiword_rewrite_matches(&mut expanded, query_enhancer, automatons);
        document.processed_matches = rewritten.into_vec();
    }
}
/// Rewrites the query indices and word indices of `matches` according to
/// the replacement ranges given by `query_enhancer`.
///
/// `matches` is sorted in place by `(attribute, word_index)`. Every match
/// is re-emitted with the first query index of its replacement range; the
/// remaining indices of the range become extra "padding" matches at the
/// following word positions, and the matches that come later in the same
/// attribute are shifted by the accumulated padding.
///
/// The result is returned as a `SetBuf` built with `SetBuf::from_dirty`.
/// `automatons` is not read by this function.
fn multiword_rewrite_matches(
matches: &mut [SimpleMatch],
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) -> SetBuf<SimpleMatch>
{
matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));
let mut padded_matches = Vec::with_capacity(matches.len());
// let before_padding = Instant::now();
// for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
// padding will only be applied
// to word indices in the same attribute
let mut padding = 0;
let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);
// for each match at the same position
// in this document attribute
// (a `while let` is used because `iter.remainder()` is read inside the loop)
while let Some(same_word_index) = iter.next() {
// find the biggest padding
let mut biggest = 0;
for match_ in same_word_index {
let mut replacement = query_enhancer.replacement(match_.query_index as u32);
// length taken before `next()` below consumes the first index
let replacement_len = replacement.len();
// groups of matches located after the current word index
let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);
// the match itself takes the first query index of the range;
// `replacement` only holds the padding indices afterwards
if let Some(query_index) = replacement.next() {
let word_index = match_.word_index + padding as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
let mut found = false;
// look ahead and if there already is a match
// corresponding to this padding word, abort the padding
'padding: for (x, next_group) in nexts.enumerate() {
for (i, query_index) in replacement.clone().enumerate().skip(x) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let padmatch = SimpleMatch { query_index, word_index, ..*match_ };
for nmatch_ in next_group {
let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
let query_index = rep.next().unwrap() as u16;
if query_index == padmatch.query_index {
if !found {
// if we find a corresponding padding for the
// first time we must push preceding paddings
for (i, query_index) in replacement.clone().enumerate().take(i)
{
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
biggest = biggest.max(i + 1);
}
}
padded_matches.push(padmatch);
found = true;
continue 'padding;
}
}
}
// if we do not find a corresponding padding in the
// next groups so stop here and pad what was found
break;
}
if !found {
// if no padding was found in the following matches
// we must insert the entire padding
for (i, query_index) in replacement.enumerate() {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let query_index = query_index as u16;
let match_ = SimpleMatch { query_index, word_index, ..*match_ };
padded_matches.push(match_);
}
biggest = biggest.max(replacement_len - 1);
}
}
// later word indices of this attribute are shifted by the widest padding
padding += biggest;
}
}
// debug!("padding matches took {:.02?}", before_padding.elapsed());
// With this check we can see that the loop above takes something
// like 43% of the search time even when no rewrite is needed.
// assert_eq!(before_matches, padded_matches);
SetBuf::from_dirty(padded_matches)
}

View File

@@ -1,31 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
/// Number of runs of consecutive equal values in `query_index`, that is
/// the number of distinct query words when the slice is grouped by word.
#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {
    let runs = query_index.linear_group();
    runs.count()
}
/// Criterion that puts first the document matching the most query words.
#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;

impl Criterion for NumberOfWords {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let left_words = number_of_query_words(lhs.query_index());
        let right_words = number_of_query_words(rhs.query_index());

        // operands are swapped: the bigger count must sort first
        right_words.cmp(&left_words)
    }

    fn name(&self) -> &str {
        "NumberOfWords"
    }
}

View File

@@ -0,0 +1,79 @@
use std::cmp::{self, Ordering};
use compact_arena::SmallArena;
use slice_group_by::GroupBy;
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton};
use crate::RawDocument;
use super::{Criterion, prepare_raw_matches};
/// Criterion comparing documents on how close to each other the matches
/// of consecutive query words are; the smaller total comes first.
pub struct Proximity;

impl Criterion for Proximity {
    fn name(&self) -> &str {
        "proximity"
    }

    /// Delegates to `prepare_raw_matches`; `evaluate` reads the
    /// `processed_matches` of both documents.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
    }

    fn evaluate<'a, 'tag, 'txn>(
        &self,
        lhs: &RawDocument<'a, 'tag>,
        rhs: &RawDocument<'a, 'tag>,
        postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
    ) -> Ordering
    {
        const MAX_DISTANCE: u16 = 8;

        /// Capped gap between two word indices; when `lhs` is not strictly
        /// before `rhs` the capped gap gets one extra point.
        fn index_proximity(lhs: u16, rhs: u16) -> u16 {
            if lhs >= rhs {
                return cmp::min(lhs - rhs, MAX_DISTANCE) + 1;
            }
            cmp::min(rhs - lhs, MAX_DISTANCE)
        }

        /// Matches located in different attributes are `MAX_DISTANCE` apart.
        fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
            if lhs.attribute == rhs.attribute {
                index_proximity(lhs.word_index, rhs.word_index)
            } else {
                MAX_DISTANCE
            }
        }

        /// Best proximity over every pair made of one match from each
        /// group, `u16::max_value()` when there is no pair.
        fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
            lhs.iter()
                .flat_map(|a| rhs.iter().map(move |b| attribute_proximity(*a, *b)))
                .fold(u16::max_value(), cmp::min)
        }

        /// Sum of the best proximities between each pair of consecutive
        /// query-index groups.
        fn matches_proximity(matches: &[SimpleMatch]) -> u16 {
            let mut groups = matches.linear_group_by_key(|m| m.query_index);
            let mut previous = match groups.next() {
                Some(group) => group,
                None => return 0,
            };

            let mut total = 0;
            for current in groups {
                total += min_proximity(previous, current);
                previous = current;
            }
            total
        }

        let left = matches_proximity(&lhs.processed_matches);
        let right = matches_proximity(&rhs.processed_matches);
        left.cmp(&right)
    }
}

View File

@@ -2,9 +2,13 @@ use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use compact_arena::SmallArena;
use meilisearch_schema::{Schema, SchemaAttr};
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
use crate::criterion::Criterion;
use crate::{RankedMap, RawDocument};
use meilisearch_schema::{Schema, SchemaAttr};
/// A helper struct that allows sorting documents by
/// some of their stored attributes.
@@ -28,11 +32,11 @@ use meilisearch_schema::{Schema, SchemaAttr};
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
///
/// let builder = CriteriaBuilder::with_capacity(8)
/// .add(SumOfTypos)
/// .add(NumberOfWords)
/// .add(WordsProximity)
/// .add(SumOfWordsAttribute)
/// .add(SumOfWordsPosition)
/// .add(Typo)
/// .add(Words)
/// .add(Proximity)
/// .add(Attribute)
/// .add(WordsPosition)
/// .add(Exact)
/// .add(custom_ranking)
/// .add(DocumentId);
@@ -86,8 +90,28 @@ impl<'a> SortByAttr<'a> {
}
}
impl<'a> Criterion for SortByAttr<'a> {
fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
impl Criterion for SortByAttr<'_> {
fn name(&self) -> &str {
"sort by attribute"
}
fn prepare<'a, 'tag, 'txn>(
&self,
documents: &mut [RawDocument<'a, 'tag>],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
query_enhancer: &QueryEnhancer,
automatons: &[QueryWordAutomaton],
) {
// ...
}
fn evaluate<'a, 'tag, 'txn>(
&self,
lhs: &RawDocument<'a, 'tag>,
rhs: &RawDocument<'a, 'tag>,
postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
) -> Ordering
{
let lhs = self.ranked_map.get(lhs.id, self.attr);
let rhs = self.ranked_map.get(rhs.id, self.attr);
@@ -105,10 +129,6 @@ impl<'a> Criterion for SortByAttr<'a> {
(None, None) => Ordering::Equal,
}
}
fn name(&self) -> &str {
"SortByAttr"
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]

View File

@@ -1,116 +0,0 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::criterion::Criterion;
use crate::RawDocument;
// Shifted base-10 logarithm lookup: returns `log10(n + 1)` instead of
// `log10(n)`, for `n` between 0 and 3 only.
// Any other input panics, which is considered safe because
// the number of typos is never bigger than 3.
#[inline]
fn custom_log10(n: u8) -> f32 {
    // log(1), log(2), log(3), log(4)
    const SHIFTED_LOG10: [f32; 4] = [0.0, 0.30102, 0.47712, 0.60205];

    match SHIFTED_LOG10.get(usize::from(n)) {
        Some(value) => *value,
        None => panic!("invalid number"),
    }
}
/// Typo score of a document: number of query words divided by
/// `1 + sum of custom_log10(typos)`, scaled by 1000 and truncated.
///
/// Only the first `distance` of each query-word run is taken into account.
#[inline]
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
    let (words, typo_weight, _) = query_index.linear_group().fold(
        (0usize, 0.0f32, 0usize),
        |(words, weight, offset), group| {
            // `offset` is the position of the first match of this query word
            (words + 1, weight + custom_log10(distance[offset]), offset + group.len())
        },
    );

    (words as f32 / (typo_weight + 1.0) * 1000.0) as usize
}
/// Criterion that puts first the document with the highest typo score
/// as computed by `sum_matches_typos`.
#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;

impl Criterion for SumOfTypos {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let left_score = sum_matches_typos(lhs.query_index(), lhs.distance());
        let right_score = sum_matches_typos(rhs.query_index(), rhs.distance());

        // operands are swapped: the bigger score must sort first
        right_score.cmp(&left_score)
    }

    fn name(&self) -> &str {
        "SumOfTypos"
    }
}
// Unit tests for `sum_matches_typos`; scores are compared the same way
// `SumOfTypos::evaluate` does (`cmp` then `reverse`), so `Less` means
// that doc0 is ranked before doc1.
#[cfg(test)]
mod tests {
use super::*;
// typing: "Geox CEO"
//
// doc0: "Geox SpA: CEO and Executive"
// doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
//
// same number of words, but doc1 needs one typo
#[test]
fn one_typo_reference() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0, 1];
let distance1 = &[1, 0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchette"
//
// doc0: "bouton manchette"
// doc1: "bouton"
//
// no typo on either side, doc0 matches one more word
#[test]
fn no_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 0];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
// typing: "bouton manchztte"
//
// doc0: "bouton manchette"
// doc1: "bouton"
//
// two words with one typo must still beat a single word without typo
#[test]
fn one_typo() {
let query_index0 = &[0, 1];
let distance0 = &[0, 1];
let query_index1 = &[0];
let distance1 = &[0];
let doc0 = sum_matches_typos(query_index0, distance0);
let doc1 = sum_matches_typos(query_index1, distance1);
assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
}
}

View File

@@ -1,64 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
/// Sums the attribute of the first match of every query word, a query
/// word being a run of consecutive equal values in `query_index`.
#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
    let (total, _) = query_index
        .linear_group()
        .fold((0usize, 0usize), |(total, offset), group| {
            // `offset` is the position of the first match of this query word
            (total + attribute[offset] as usize, offset + group.len())
        });

    total
}
/// Criterion that puts first the document with the smallest sum of
/// attributes as computed by `sum_matches_attributes`.
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;

impl Criterion for SumOfWordsAttribute {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let left_sum = sum_matches_attributes(lhs.query_index(), lhs.attribute());
        let right_sum = sum_matches_attributes(rhs.query_index(), rhs.attribute());

        left_sum.cmp(&right_sum)
    }

    fn name(&self) -> &str {
        "SumOfWordsAttribute"
    }
}
// Unit tests for `sum_matches_attributes`.
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
// doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
//
// the word is found in attribute 0 for doc0 and in attribute 1 for doc1,
// the smaller sum must compare as `Less`
#[test]
fn title_vs_description() {
let query_index0 = &[0];
let attribute0 = &[0];
let query_index1 = &[0];
let attribute1 = &[1];
let doc0 = sum_matches_attributes(query_index0, attribute0);
let doc1 = sum_matches_attributes(query_index1, attribute1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@@ -1,64 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;
/// Sums the word index of the first match of every query word, a query
/// word being a run of consecutive equal values in `query_index`.
#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
    let (total, _) = query_index
        .linear_group()
        .fold((0usize, 0usize), |(total, offset), group| {
            // `offset` is the position of the first match of this query word
            (total + word_index[offset] as usize, offset + group.len())
        });

    total
}
/// Criterion that puts first the document with the smallest sum of word
/// positions as computed by `sum_matches_attribute_index`.
#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;

impl Criterion for SumOfWordsPosition {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let left_sum = sum_matches_attribute_index(lhs.query_index(), lhs.word_index());
        let right_sum = sum_matches_attribute_index(rhs.query_index(), rhs.word_index());

        left_sum.cmp(&right_sum)
    }

    fn name(&self) -> &str {
        "SumOfWordsPosition"
    }
}
// Unit tests for `sum_matches_attribute_index`.
#[cfg(test)]
mod tests {
use super::*;
// typing: "soulier"
//
// doc0: "Soulier bleu"
// doc1: "Botte rouge et soulier noir"
//
// the word is at index 0 in doc0 and at index 3 in doc1,
// the smaller sum must compare as `Less`
#[test]
fn easy_case() {
let query_index0 = &[0];
let word_index0 = &[0];
let query_index1 = &[0];
let word_index1 = &[3];
let doc0 = sum_matches_attribute_index(query_index0, word_index0);
let doc1 = sum_matches_attribute_index(query_index1, word_index1);
assert_eq!(doc0.cmp(&doc1), Ordering::Less);
}
}

View File

@@ -0,0 +1,67 @@
use std::cmp::Ordering;
use compact_arena::SmallArena;
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
use crate::RawDocument;
use super::{Criterion, prepare_query_distances};
/// Criterion that puts first the document with the highest typo score,
/// computed from its `processed_distances`.
pub struct Typo;

impl Criterion for Typo {
    fn name(&self) -> &str {
        "typo"
    }

    /// Delegates to `prepare_query_distances`; `evaluate` reads the
    /// `processed_distances` of both documents.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
    }

    fn evaluate(
        &self,
        lhs: &RawDocument,
        rhs: &RawDocument,
        postings_lists: &SmallArena<PostingsListView>,
    ) -> Ordering
    {
        // Shifted base-10 logarithm lookup: `log10(n + 1)` for `n` in 0..=3.
        // Any other input panics, which is considered safe because
        // the number of typos is never bigger than 3.
        #[inline]
        fn custom_log10(n: u8) -> f32 {
            // log(1), log(2), log(3), log(4)
            const SHIFTED_LOG10: [f32; 4] = [0.0, 0.30102, 0.47712, 0.60205];

            match SHIFTED_LOG10.get(usize::from(n)) {
                Some(value) => *value,
                None => panic!("invalid number"),
            }
        }

        /// Number of matched query words divided by
        /// `1 + sum of custom_log10(typos)`, scaled by 1000 and truncated.
        #[inline]
        fn compute_typos(distances: &[Option<u8>]) -> usize {
            let mut words: usize = 0;
            let mut typo_weight = 0.0;

            // `None` entries are query words without any match
            for typos in distances.iter().flatten() {
                typo_weight += custom_log10(*typos);
                words += 1;
            }

            (words as f32 / (typo_weight + 1.0) * 1000.0) as usize
        }

        let left_score = compute_typos(&lhs.processed_distances);
        let right_score = compute_typos(&rhs.processed_distances);

        // operands are swapped: the bigger score must sort first
        right_score.cmp(&left_score)
    }
}

View File

@@ -0,0 +1,43 @@
use std::cmp::Ordering;
use compact_arena::SmallArena;
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
use crate::RawDocument;
use super::{Criterion, prepare_query_distances};
/// Criterion that puts first the document matching the most query
/// words, counted from its `processed_distances`.
pub struct Words;

impl Criterion for Words {
    fn name(&self) -> &str {
        "words"
    }

    /// Delegates to `prepare_query_distances`; `evaluate` reads the
    /// `processed_distances` of both documents.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
    }

    fn evaluate(
        &self,
        lhs: &RawDocument,
        rhs: &RawDocument,
        postings_lists: &SmallArena<PostingsListView>,
    ) -> Ordering
    {
        /// Counts the entries holding a distance, one per matched query word.
        #[inline]
        fn number_of_query_words(distances: &[Option<u8>]) -> usize {
            distances.iter().filter(|distance| distance.is_some()).count()
        }

        let left_words = number_of_query_words(&lhs.processed_distances);
        let right_words = number_of_query_words(&rhs.processed_distances);

        // operands are swapped: the bigger count must sort first
        right_words.cmp(&left_words)
    }
}

View File

@@ -0,0 +1,48 @@
use std::cmp::Ordering;
use compact_arena::SmallArena;
use slice_group_by::GroupBy;
use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{PostingsListView, SimpleMatch, QueryWordAutomaton};
use crate::RawDocument;
use super::{Criterion, prepare_raw_matches};
/// Criterion that puts first the document whose query words are found
/// at the smallest word positions.
pub struct WordsPosition;

impl Criterion for WordsPosition {
    fn name(&self) -> &str {
        "words position"
    }

    /// Delegates to `prepare_raw_matches`; `evaluate` reads the
    /// `processed_matches` of both documents.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
    }

    fn evaluate<'a, 'tag, 'txn>(
        &self,
        lhs: &RawDocument<'a, 'tag>,
        rhs: &RawDocument<'a, 'tag>,
        postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
    ) -> Ordering
    {
        /// Sum of the word index of the first match of every
        /// query-index group.
        #[inline]
        fn sum_words_position(matches: &[SimpleMatch]) -> usize {
            matches
                .linear_group_by_key(|m| m.query_index)
                .map(|group| group[0].word_index as usize)
                .sum()
        }

        let left_sum = sum_words_position(&lhs.processed_matches);
        let right_sum = sum_words_position(&rhs.processed_matches);
        left_sum.cmp(&right_sum)
    }
}

View File

@@ -1,164 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::{self, Ordering};
const MAX_DISTANCE: u16 = 8;
/// Turns a pair of references into a pair of owned clones.
#[inline]
fn clone_tuple<T: Clone, U: Clone>(pair: (&T, &U)) -> (T, U) {
    let (first, second) = pair;
    (T::clone(first), U::clone(second))
}
/// Capped gap between two word indices.
///
/// The gap is limited to `MAX_DISTANCE`; when `lhs` is not strictly
/// before `rhs` (same index or reversed order) the capped gap gets one
/// extra point.
fn index_proximity(lhs: u16, rhs: u16) -> u16 {
    if lhs >= rhs {
        return cmp::min(lhs - rhs, MAX_DISTANCE) + 1;
    }
    cmp::min(rhs - lhs, MAX_DISTANCE)
}
/// Proximity of two `(attribute, word index)` positions: `MAX_DISTANCE`
/// across different attributes, `index_proximity` inside the same one.
fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
    if lattr == rattr {
        index_proximity(lwi, rwi)
    } else {
        MAX_DISTANCE
    }
}
/// Best `attribute_proximity` over every pair made of one position from
/// the left side and one from the right side.
///
/// Each side is given as parallel `(attributes, word indices)` slices.
/// Returns `u16::max_value()` when no pair can be formed.
fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
    lattr
        .iter()
        .zip(lwi)
        .flat_map(|left| rattr.iter().zip(rwi).map(move |right| (left, right)))
        .map(|(left, right)| attribute_proximity(clone_tuple(left), clone_tuple(right)))
        .fold(u16::max_value(), cmp::min)
}
/// Computes the total proximity of the matches of one document.
///
/// The four slices are indexed with the same ranges, so they are expected to
/// be parallel columns describing the same matches (slicing panics if they
/// are not long enough). The matches are split into runs of equal
/// `query_index`; for each pair of consecutive runs the smallest proximity
/// between their candidate positions (see `min_proximity`) is added to the
/// result.
fn matches_proximity(
    query_index: &[u32],
    distance: &[u8],
    attribute: &[u16],
    word_index: &[u16],
) -> u16 {
    let mut query_index_groups = query_index.linear_group();
    let mut proximity = 0;
    // offset of the current query-index group inside the parallel slices
    let mut index = 0;

    // Returns the (attributes, word indices) of the candidate positions of
    // the group starting at `index` and spanning `group_len` matches.
    let get_attr_wi = |index: usize, group_len: usize| {
        // retrieve the first distance group (with the lowest values)
        // NOTE(review): "lowest" presumably relies on the caller having
        // sorted the matches by distance inside a group; confirm.
        let len = distance[index..index + group_len]
            .linear_group()
            .next()
            .unwrap()
            .len();

        let rattr = &attribute[index..index + len];
        let rwi = &word_index[index..index + len];

        (rattr, rwi)
    };

    // positions of the first group, if there is any match at all
    let mut last = query_index_groups.next().map(|group| {
        let attr_wi = get_attr_wi(index, group.len());
        index += group.len();
        attr_wi
    });

    // iterate over the groups by windows of size 2
    while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
        let attr_wi = get_attr_wi(index, rhs.len());
        proximity += min_proximity(lhs, attr_wi);
        last = Some(attr_wi);
        index += rhs.len();
    }

    proximity
}
/// Criterion ordering documents by ascending total proximity of their
/// matches, as computed by `matches_proximity`.
#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;

impl Criterion for WordsProximity {
    /// Compares the proximity scores of the two documents.
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        // feeds the match columns of one document to `matches_proximity`
        let proximity_of = |document: &RawDocument| {
            matches_proximity(
                document.query_index(),
                document.distance(),
                document.attribute(),
                document.word_index(),
            )
        };

        let left_proximity = proximity_of(lhs);
        let right_proximity = proximity_of(rhs);
        left_proximity.cmp(&right_proximity)
    }

    fn name(&self) -> &str {
        "WordsProximity"
    }
}
/// Unit tests of `matches_proximity` on hand-written match columns.
#[cfg(test)]
mod tests {
    use super::*;

    /// Query words found in different attributes cost the maximum distance,
    /// even when another candidate of the same word is adjacent.
    #[test]
    fn three_different_attributes() {
        // "soup" "of the" "the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 0 }
        // { id: 2, attr: 1, attr_index: 1 }
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }

        let query_index = &[0, 1, 2, 2, 3];
        let distance = &[0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 2, 3];
        let word_index = &[0, 0, 1, 0, 1];

        // soup -> of = 8
        // + of -> the = 1
        // + the -> day = 8 (not 1)
        assert_eq!(
            matches_proximity(query_index, distance, attribute, word_index),
            17
        );
    }

    /// When every query word has a candidate in the same attribute, only the
    /// adjacent positions of that attribute contribute to the result.
    #[test]
    fn two_different_attributes() {
        // "soup day" "soup of the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 0, attr: 1, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 1 }
        // { id: 2, attr: 1, attr_index: 2 }
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }

        let query_index = &[0, 0, 1, 2, 3, 3];
        let distance = &[0, 0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 1, 0, 1];
        let word_index = &[0, 0, 1, 2, 1, 3];

        // soup -> of = 1
        // + of -> the = 1
        // + the -> day = 1
        assert_eq!(
            matches_proximity(query_index, distance, attribute, word_index),
            3
        );
    }
}

View File

@@ -1,514 +0,0 @@
use std::cmp::{self, Ordering, Reverse};
use std::borrow::Cow;
use std::sync::atomic::{self, AtomicUsize};
use slice_group_by::{GroupBy, GroupByMut};
use compact_arena::SmallArena;
use sdset::{Set, SetBuf};
use log::debug;
use crate::{DocIndex, DocumentId};
use crate::bucket_sort::{BareMatch, SimpleMatch, RawDocument, PostingsListView, QueryWordAutomaton};
use crate::automaton::QueryEnhancer;
/// Shorthand for the arena of postings list views that the criteria index
/// with the `postings_list` field of the raw matches.
type PostingsListsArena<'tag, 'txn> = SmallArena<'tag, PostingsListView<'txn>>;

/// A ranking rule able to compare two raw documents.
pub trait Criterion {
    /// Human-readable name of the criterion.
    fn name(&self) -> &str;

    /// Gives the criterion a chance to precompute, on the given documents,
    /// the data its `evaluate` needs (the implementations in this file fill
    /// the `processed_*` fields or sort the raw matches here).
    ///
    /// NOTE(review): presumably called by the bucket sort before `evaluate`
    /// is used on these documents; confirm against the caller.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    );

    /// Returns how `lhs` orders relative to `rhs` for this criterion.
    fn evaluate<'a, 'tag, 'txn>(
        &self,
        lhs: &RawDocument<'a, 'tag>,
        rhs: &RawDocument<'a, 'tag>,
        postings_lists: &PostingsListsArena<'tag, 'txn>,
    ) -> Ordering;

    /// Returns `true` when `evaluate` considers both documents equal.
    #[inline]
    fn eq<'a, 'tag, 'txn>(
        &self,
        lhs: &RawDocument<'a, 'tag>,
        rhs: &RawDocument<'a, 'tag>,
        postings_lists: &PostingsListsArena<'tag, 'txn>,
    ) -> bool
    {
        self.evaluate(lhs, rhs, postings_lists) == Ordering::Equal
    }
}
/// Fills the `processed_distances` of every document that does not have them
/// yet.
///
/// For each raw match whose postings list is not empty, the query indices
/// returned by `query_enhancer.replacement` are visited and the smallest
/// `distance` seen so far is kept for each of them. Indices never visited
/// stay `None`.
fn prepare_query_distances<'a, 'tag, 'txn>(
    documents: &mut [RawDocument<'a, 'tag>],
    query_enhancer: &QueryEnhancer,
    automatons: &[QueryWordAutomaton],
    postings_lists: &PostingsListsArena<'tag, 'txn>,
) {
    // documents that already carry distances are left untouched
    let pending = documents
        .iter_mut()
        .filter(|document| document.processed_distances.is_empty());

    for document in pending {
        let mut best_distances = Vec::new();

        let live_matches = document
            .raw_matches
            .iter()
            .filter(|m| !postings_lists[m.postings_list].is_empty());

        for m in live_matches {
            let replaced = query_enhancer.replacement(m.query_index as u32);

            // grow the table so that every replaced index has a slot
            let needed_len = cmp::max(replaced.end as usize, best_distances.len());
            best_distances.resize(needed_len, None);

            for slot in replaced {
                let slot = slot as usize;
                let smallest = match best_distances[slot] {
                    Some(known) => cmp::min(known, m.distance),
                    None => m.distance,
                };
                best_distances[slot] = Some(smallest);
            }
        }

        document.processed_distances = best_distances;
    }
}
/// Criterion favouring documents that match many query words with few typos.
pub struct Typo;

impl Criterion for Typo {
    fn name(&self) -> &str { "typo" }

    /// Makes sure the `processed_distances` of the documents are computed.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
    }

    /// Compares the typo scores of both documents, the highest score first.
    fn evaluate(
        &self,
        lhs: &RawDocument,
        rhs: &RawDocument,
        postings_lists: &PostingsListsArena,
    ) -> Ordering
    {
        // Deliberately shifted base-10 logarithm: entry `n` holds log10(n + 1).
        // Panicking above 3 is accepted because the number of typos
        // is never bigger than that.
        #[inline]
        fn custom_log10(n: u8) -> f32 {
            const SHIFTED_LOG10: [f32; 4] = [
                0.0,     // log(1)
                0.30102, // log(2)
                0.47712, // log(3)
                0.60205, // log(4)
            ];

            match SHIFTED_LOG10.get(n as usize) {
                Some(value) => *value,
                None => panic!("invalid number"),
            }
        }

        // Score = matched words / (1 + sum of the shifted logs of their
        // distances), scaled by 1000 and truncated to an integer.
        #[inline]
        fn compute_typos(distances: &[Option<u8>]) -> usize {
            let (number_words, sum_typos) = distances
                .iter()
                .flatten()
                .fold((0usize, 0.0f32), |(words, typos), distance| {
                    (words + 1, typos + custom_log10(*distance))
                });

            (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
        }

        let left_score = compute_typos(&lhs.processed_distances);
        let right_score = compute_typos(&rhs.processed_distances);
        left_score.cmp(&right_score).reverse()
    }
}
/// Criterion favouring documents that match the most query words.
pub struct Words;

impl Criterion for Words {
    fn name(&self) -> &str { "words" }

    /// Makes sure the `processed_distances` of the documents are computed.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        prepare_query_distances(documents, query_enhancer, automatons, postings_lists);
    }

    /// Compares the number of matched query words, the highest count first.
    fn evaluate(
        &self,
        lhs: &RawDocument,
        rhs: &RawDocument,
        postings_lists: &PostingsListsArena,
    ) -> Ordering
    {
        // a query word is matched when a distance was recorded for it
        #[inline]
        fn number_of_query_words(distances: &[Option<u8>]) -> usize {
            distances.iter().filter(|distance| distance.is_some()).count()
        }

        let left_count = number_of_query_words(&lhs.processed_distances);
        let right_count = number_of_query_words(&rhs.processed_distances);
        left_count.cmp(&right_count).reverse()
    }
}
/// Fills the `processed_matches` of every document that does not have them
/// yet.
///
/// Each raw match is expanded into one `SimpleMatch` per entry of its
/// postings list, then the whole set goes through
/// `multiword_rewrite_matches`.
fn prepare_raw_matches<'a, 'tag, 'txn>(
    documents: &mut [RawDocument<'a, 'tag>],
    postings_lists: &mut PostingsListsArena<'tag, 'txn>,
    query_enhancer: &QueryEnhancer,
    automatons: &[QueryWordAutomaton],
) {
    for document in documents.iter_mut() {
        // only documents without processed matches need the work
        if document.processed_matches.is_empty() {
            let mut expanded = Vec::new();

            for raw in document.raw_matches.iter() {
                let postings_list = &postings_lists[raw.postings_list];
                expanded.reserve(postings_list.len());

                for doc_index in postings_list.as_ref() {
                    expanded.push(SimpleMatch {
                        query_index: raw.query_index,
                        distance: raw.distance,
                        attribute: doc_index.attribute,
                        word_index: doc_index.word_index,
                        is_exact: raw.is_exact,
                    });
                }
            }

            let rewritten = multiword_rewrite_matches(&mut expanded, query_enhancer, automatons);
            document.processed_matches = rewritten.into_vec();
        }
    }
}
/// Criterion favouring documents whose matched query words are close to each
/// other and in the query order.
pub struct Proximity;

impl Criterion for Proximity {
    fn name(&self) -> &str { "proximity" }

    /// Makes sure the `processed_matches` of the documents are computed.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
    }

    /// Compares the total proximity of both documents, the smallest first.
    fn evaluate<'a, 'tag, 'txn>(
        &self,
        lhs: &RawDocument<'a, 'tag>,
        rhs: &RawDocument<'a, 'tag>,
        postings_lists: &PostingsListsArena<'tag, 'txn>,
    ) -> Ordering
    {
        const MAX_DISTANCE: u16 = 8;

        // capped distance between two word indices, plus one when the
        // right word is not strictly after the left one
        fn index_proximity(lhs: u16, rhs: u16) -> u16 {
            if lhs >= rhs {
                return cmp::min(lhs - rhs, MAX_DISTANCE) + 1;
            }
            cmp::min(rhs - lhs, MAX_DISTANCE)
        }

        // matches in different attributes are always MAX_DISTANCE apart
        fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
            if lhs.attribute == rhs.attribute {
                index_proximity(lhs.word_index, rhs.word_index)
            } else {
                MAX_DISTANCE
            }
        }

        // best proximity over every (left, right) combination
        fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
            lhs.iter()
                .flat_map(|a| rhs.iter().map(move |b| attribute_proximity(*a, *b)))
                .fold(u16::max_value(), cmp::min)
        }

        // sum of the best proximities of consecutive query-index groups
        fn matches_proximity(matches: &[SimpleMatch]) -> u16 {
            let mut groups = matches.linear_group_by_key(|m| m.query_index);
            let mut total = 0;

            if let Some(mut previous) = groups.next() {
                for current in groups {
                    total += min_proximity(previous, current);
                    previous = current;
                }
            }

            total
        }

        let left_proximity = matches_proximity(&lhs.processed_matches);
        let right_proximity = matches_proximity(&rhs.processed_matches);
        left_proximity.cmp(&right_proximity)
    }
}
/// Criterion favouring documents matched in the attribute with the lowest
/// index.
pub struct Attribute;

impl Criterion for Attribute {
    fn name(&self) -> &str { "attribute" }

    /// Makes sure the `processed_matches` of the documents are computed.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
    }

    /// Compares the best attribute of both documents, the lowest first.
    fn evaluate<'a, 'tag, 'txn>(
        &self,
        lhs: &RawDocument<'a, 'tag>,
        rhs: &RawDocument<'a, 'tag>,
        postings_lists: &PostingsListsArena<'tag, 'txn>,
    ) -> Ordering
    {
        // lowest attribute among the first match of each query-index group,
        // `u16::max_value()` when there is no match at all
        #[inline]
        fn best_attribute(matches: &[SimpleMatch]) -> u16 {
            matches
                .linear_group_by_key(|m| m.query_index)
                .map(|group| group[0].attribute)
                .fold(u16::max_value(), cmp::min)
        }

        let left_best = best_attribute(&lhs.processed_matches);
        let right_best = best_attribute(&rhs.processed_matches);
        left_best.cmp(&right_best)
    }
}
/// Criterion favouring documents whose query words are matched at the lowest
/// word positions.
pub struct WordsPosition;

impl Criterion for WordsPosition {
    fn name(&self) -> &str { "words position" }

    /// Makes sure the `processed_matches` of the documents are computed.
    fn prepare<'a, 'tag, 'txn>(
        &self,
        documents: &mut [RawDocument<'a, 'tag>],
        postings_lists: &mut PostingsListsArena<'tag, 'txn>,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        prepare_raw_matches(documents, postings_lists, query_enhancer, automatons);
    }

    /// Compares the position scores of both documents, the smallest first.
    fn evaluate<'a, 'tag, 'txn>(
        &self,
        lhs: &RawDocument<'a, 'tag>,
        rhs: &RawDocument<'a, 'tag>,
        postings_lists: &PostingsListsArena<'tag, 'txn>,
    ) -> Ordering
    {
        // adds up the word index of the first match of every
        // query-index group
        #[inline]
        fn position_score(matches: &[SimpleMatch]) -> usize {
            matches
                .linear_group_by_key(|m| m.query_index)
                .map(|group| group[0].word_index as usize)
                .sum()
        }

        let left_score = position_score(&lhs.processed_matches);
        let right_score = position_score(&rhs.processed_matches);
        left_score.cmp(&right_score)
    }
}
/// Criterion favouring documents that contain the most exactly matched query
/// words.
pub struct Exact;

impl Criterion for Exact {
    fn name(&self) -> &str { "exact" }

    /// Sorts the raw matches of every document by query index, exact matches
    /// first inside a same query index, so that `evaluate` only has to look
    /// at the head of each group.
    fn prepare(
        &self,
        documents: &mut [RawDocument],
        postings_lists: &mut PostingsListsArena,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        documents.iter_mut().for_each(|document| {
            document
                .raw_matches
                .sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
        });
    }

    /// Compares the number of exact query words, the highest count first.
    fn evaluate(
        &self,
        lhs: &RawDocument,
        rhs: &RawDocument,
        postings_lists: &PostingsListsArena,
    ) -> Ordering
    {
        // counts the query-index groups whose first match is exact
        #[inline]
        fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
            matches
                .linear_group_by_key(|bm| bm.query_index)
                .map(|group| group[0].is_exact as usize)
                .sum()
        }

        let left_exact = sum_exact_query_words(&lhs.raw_matches);
        let right_exact = sum_exact_query_words(&rhs.raw_matches);
        left_exact.cmp(&right_exact).reverse()
    }
}
/// Criterion ordering documents by ascending document id, read from their
/// first raw match.
pub struct StableDocId;

impl Criterion for StableDocId {
    fn name(&self) -> &str { "stable document id" }

    /// Nothing has to be precomputed for this criterion.
    fn prepare(
        &self,
        documents: &mut [RawDocument],
        postings_lists: &mut PostingsListsArena,
        query_enhancer: &QueryEnhancer,
        automatons: &[QueryWordAutomaton],
    ) {
        // ...
    }

    /// Compares the document ids; indexing panics if a document has no raw
    /// match.
    fn evaluate(
        &self,
        lhs: &RawDocument,
        rhs: &RawDocument,
        postings_lists: &PostingsListsArena,
    ) -> Ordering
    {
        let left_id = &lhs.raw_matches[0].document_id;
        left_id.cmp(&rhs.raw_matches[0].document_id)
    }
}
/// Rewrites the matches of one document so that a match whose query index is
/// replaced by several query indices (see `QueryEnhancer::replacement`)
/// becomes one match per replacement index.
///
/// `matches` is sorted in place by `(attribute, word_index)`. Inside each
/// attribute, the first replacement index keeps the (padded) position of the
/// original match and the following ones are placed on the next word
/// indices; the `padding` accumulated so far shifts every later match of the
/// same attribute. The `automatons` parameter is not read by this function.
///
/// Returns the rewritten matches wrapped by `SetBuf::from_dirty`.
pub fn multiword_rewrite_matches(
    matches: &mut [SimpleMatch],
    query_enhancer: &QueryEnhancer,
    automatons: &[QueryWordAutomaton],
) -> SetBuf<SimpleMatch>
{
    matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));

    let mut padded_matches = Vec::with_capacity(matches.len());

    // let before_padding = Instant::now();
    // for each attribute of each document
    for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
        // padding will only be applied
        // to word indices in the same attribute
        let mut padding = 0;
        let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);

        // for each match at the same position
        // in this document attribute
        while let Some(same_word_index) = iter.next() {
            // find the biggest padding
            let mut biggest = 0;
            for match_ in same_word_index {
                let mut replacement = query_enhancer.replacement(match_.query_index as u32);
                let replacement_len = replacement.len();
                // groups of matches located after the current word index
                let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);

                // the first replacement index takes the place of the match
                if let Some(query_index) = replacement.next() {
                    let word_index = match_.word_index + padding as u16;
                    let query_index = query_index as u16;
                    let match_ = SimpleMatch { query_index, word_index, ..*match_ };
                    padded_matches.push(match_);
                }

                let mut found = false;

                // look ahead and if there already is a match
                // corresponding to this padding word, abort the padding
                'padding: for (x, next_group) in nexts.enumerate() {
                    for (i, query_index) in replacement.clone().enumerate().skip(x) {
                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                        let query_index = query_index as u16;
                        let padmatch = SimpleMatch { query_index, word_index, ..*match_ };

                        for nmatch_ in next_group {
                            let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
                            let query_index = rep.next().unwrap() as u16;
                            if query_index == padmatch.query_index {
                                if !found {
                                    // if we find a corresponding padding for the
                                    // first time we must push preceding paddings
                                    for (i, query_index) in replacement.clone().enumerate().take(i)
                                    {
                                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                                        let query_index = query_index as u16;
                                        let match_ = SimpleMatch { query_index, word_index, ..*match_ };
                                        padded_matches.push(match_);
                                        biggest = biggest.max(i + 1);
                                    }
                                }

                                padded_matches.push(padmatch);
                                found = true;
                                continue 'padding;
                            }
                        }
                    }

                    // if we do not find a corresponding padding in the
                    // next groups, stop here and pad what was found
                    break;
                }

                if !found {
                    // if no padding was found in the following matches
                    // we must insert the entire padding
                    for (i, query_index) in replacement.enumerate() {
                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                        let query_index = query_index as u16;
                        let match_ = SimpleMatch { query_index, word_index, ..*match_ };
                        padded_matches.push(match_);
                    }

                    // NOTE(review): underflows if `replacement` can ever be
                    // empty (`replacement_len == 0`); confirm that
                    // `QueryEnhancer::replacement` never returns an empty range.
                    biggest = biggest.max(replacement_len - 1);
                }
            }

            padding += biggest;
        }
    }

    // debug!("padding matches took {:.02?}", before_padding.elapsed());

    // With this check we can see that the loop above takes something
    // like 43% of the search time even when no rewrite is needed.
    // assert_eq!(before_matches, padded_matches);

    SetBuf::from_dirty(padded_matches)
}

View File

@@ -20,7 +20,6 @@ mod update;
// TODO replace
mod bucket_sort;
mod criterion2;
pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT};
pub use self::error::{Error, MResult};
@@ -31,62 +30,13 @@ pub use self::store::Index;
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount};
/// A single match of a query word inside a document attribute.
#[doc(hidden)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TmpMatch {
    /// Index of the matched query word.
    pub query_index: u32,
    /// Distance between the query word and the matched word
    /// (presumably the number of typos; confirm against the automaton code).
    pub distance: u8,
    /// Attribute in which the word was found.
    pub attribute: u16,
    /// Position of the word inside the attribute.
    pub word_index: u16,
    /// Whether the match is considered exact.
    pub is_exact: bool,
}
/// A document returned by a search.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
    /// Identifier of the document.
    pub id: DocumentId,
    /// Areas of the document to highlight.
    pub highlights: Vec<Highlight>,
    /// Matches that produced this document; only compiled in test builds.
    #[cfg(test)]
    pub matches: Vec<TmpMatch>,
}
impl Document {
    /// Builds a `Document` from a `RawDocument`, keeping only its id and
    /// highlights (non-test builds have no `matches` field).
    #[cfg(not(test))]
    fn from_raw(raw: RawDocument) -> Document {
        Document {
            id: raw.id,
            highlights: raw.highlights,
        }
    }

    /// Builds a `Document` from a `RawDocument` and, in test builds, also
    /// reassembles its matches from the per-field columns exposed by the
    /// raw document.
    #[cfg(test)]
    fn from_raw(raw: RawDocument) -> Document {
        let len = raw.query_index().len();
        let mut matches = Vec::with_capacity(len);

        let query_index = raw.query_index();
        let distance = raw.distance();
        let attribute = raw.attribute();
        let word_index = raw.word_index();
        let is_exact = raw.is_exact();

        // the columns are read at the same position to rebuild each match
        for i in 0..len {
            let match_ = TmpMatch {
                query_index: query_index[i],
                distance: distance[i],
                attribute: attribute[i],
                word_index: word_index[i],
                is_exact: is_exact[i],
            };
            matches.push(match_);
        }

        Document {
            id: raw.id,
            matches,
            highlights: raw.highlights,
        }
    }
    // #[cfg(test)]
    // pub matches: Vec<TmpMatch>,
}
#[cfg(test)]

View File

@@ -1,21 +1,8 @@
use hashbrown::HashMap;
use std::convert::TryFrom;
use std::ops::Range;
use std::rc::Rc;
use std::time::{Duration, Instant};
use std::{cmp, mem};
use fst::{IntoStreamer, Streamer};
use log::debug;
use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut};
use std::time::Duration;
use crate::{bucket_sort::bucket_sort, database::MainT};
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::raw_document::{raw_documents_from, RawDocument};
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount};
use crate::{criterion::Criteria, Document, DocumentId};
use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
pub struct QueryBuilder<'c, 'f, 'd> {
@@ -30,292 +17,6 @@ pub struct QueryBuilder<'c, 'f, 'd> {
synonyms_store: store::Synonyms,
}
/// Rewrites the matches of all documents so that a match whose query index
/// is replaced by several query indices (see `QueryEnhancer::replacement`)
/// becomes one match per replacement index.
///
/// The matches are first sorted by `(document id, attribute, word_index)`.
/// Inside each attribute of each document, the first replacement index keeps
/// the (padded) position of the original match and the following ones are
/// placed on the next word indices; the `padding` accumulated so far shifts
/// every later match of the same attribute. The matches of each document are
/// sorted again before being returned.
fn multiword_rewrite_matches(
    mut matches: Vec<(DocumentId, TmpMatch)>,
    query_enhancer: &QueryEnhancer,
) -> SetBuf<(DocumentId, TmpMatch)> {
    let mut padded_matches = Vec::with_capacity(matches.len());

    let before_sort = Instant::now();
    // we sort the matches by word index to make them rewritable
    matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index));
    debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());

    let before_padding = Instant::now();
    // for each attribute of each document
    for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
        // padding will only be applied
        // to word indices in the same attribute
        let mut padding = 0;
        let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index);

        // for each match at the same position
        // in this document attribute
        while let Some(same_word_index) = iter.next() {
            // find the biggest padding
            let mut biggest = 0;
            for (id, match_) in same_word_index {
                let mut replacement = query_enhancer.replacement(match_.query_index);
                let replacement_len = replacement.len();
                // groups of matches located after the current word index
                let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);

                // the first replacement index takes the place of the match
                if let Some(query_index) = replacement.next() {
                    let word_index = match_.word_index + padding as u16;
                    let match_ = TmpMatch {
                        query_index,
                        word_index,
                        ..*match_
                    };
                    padded_matches.push((*id, match_));
                }

                let mut found = false;

                // look ahead and if there already is a match
                // corresponding to this padding word, abort the padding
                'padding: for (x, next_group) in nexts.enumerate() {
                    for (i, query_index) in replacement.clone().enumerate().skip(x) {
                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                        let padmatch = TmpMatch {
                            query_index,
                            word_index,
                            ..*match_
                        };

                        for (_, nmatch_) in next_group {
                            let mut rep = query_enhancer.replacement(nmatch_.query_index);
                            let query_index = rep.next().unwrap();
                            if query_index == padmatch.query_index {
                                if !found {
                                    // if we find a corresponding padding for the
                                    // first time we must push preceding paddings
                                    for (i, query_index) in replacement.clone().enumerate().take(i)
                                    {
                                        let word_index =
                                            match_.word_index + padding as u16 + (i + 1) as u16;
                                        let match_ = TmpMatch {
                                            query_index,
                                            word_index,
                                            ..*match_
                                        };
                                        padded_matches.push((*id, match_));
                                        biggest = biggest.max(i + 1);
                                    }
                                }

                                padded_matches.push((*id, padmatch));
                                found = true;
                                continue 'padding;
                            }
                        }
                    }

                    // if we do not find a corresponding padding in the
                    // next groups, stop here and pad what was found
                    break;
                }

                if !found {
                    // if no padding was found in the following matches
                    // we must insert the entire padding
                    for (i, query_index) in replacement.enumerate() {
                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                        let match_ = TmpMatch {
                            query_index,
                            word_index,
                            ..*match_
                        };
                        padded_matches.push((*id, match_));
                    }

                    // NOTE(review): underflows if `replacement` can ever be
                    // empty (`replacement_len == 0`); confirm that
                    // `QueryEnhancer::replacement` never returns an empty range.
                    biggest = biggest.max(replacement_len - 1);
                }
            }

            padding += biggest;
        }
    }

    // the padding may have broken the order inside a document: sort again
    for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) {
        document_matches.sort_unstable();
    }

    debug!("padding matches took {:.02?}", before_padding.elapsed());

    // With this check we can see that the loop above takes something
    // like 43% of the search time even when no rewrite is needed.
    // assert_eq!(before_matches, padded_matches);

    // NOTE(review): `new_unchecked` skips any validation; this relies on the
    // per-document sort above and on the absence of duplicates; confirm.
    SetBuf::new_unchecked(padded_matches)
}
/// Collects the matches and highlights of every document matching the given
/// automaton groups and turns them into raw documents.
///
/// For each automaton, the words FST is searched with its DFA and the
/// postings list of every accepted word is read from `postings_lists_store`.
/// When `searchables` is given, attributes are remapped through it and the
/// entries for which it returns `None` are dropped. Phrase-query groups only
/// keep pairs of matches that follow each other. The matches then go through
/// `multiword_rewrite_matches` and are grouped by `raw_documents_from`.
///
/// Returns an empty list when the main store has no words FST, and
/// propagates the errors of the store lookups.
fn fetch_raw_documents(
    reader: &heed::RoTxn<MainT>,
    automatons_groups: &[AutomatonGroup],
    query_enhancer: &QueryEnhancer,
    searchables: Option<&ReorderedAttrs>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
) -> MResult<Vec<RawDocument>> {
    let mut matches = Vec::new();
    let mut highlights = Vec::new();

    let words = match main_store.words_fst(reader)? {
        Some(words) => words,
        None => return Ok(Vec::new()),
    };

    // accumulated timings, only used by the debug logs below
    let before_automatons_groups_loop = Instant::now();
    let mut doc_indexes_rewrite = Duration::default();
    let mut retrieve_postings_lists = Duration::default();
    let mut stream_reserve = Duration::default();
    let mut covered_area_time = Duration::default();
    let mut eval_time = Duration::default();

    for group in automatons_groups {
        let AutomatonGroup { is_phrase_query, automatons } = group;
        let phrase_query_len = automatons.len();

        // (document id, automaton position in the group, match, highlight)
        let mut tmp_matches = Vec::new();
        for (id, automaton) in automatons.into_iter().enumerate() {
            let Automaton { index, is_exact, query_len, query, .. } = automaton;
            let dfa = automaton.dfa();

            let before_stream_loop = Instant::now();
            let mut stream_count = 0;

            let mut stream = words.search(&dfa).into_stream();
            while let Some(input) = stream.next() {
                let before_eval_time = Instant::now();
                let distance = dfa.eval(input).to_u8();
                eval_time += before_eval_time.elapsed();

                // exact only if the automaton allows it, there is no typo
                // and the word has the length of the query word
                let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
                stream_count += 1;

                // length of the word that will be highlighted
                let before_covered_area = Instant::now();
                let covered_area = if *query_len > input.len() {
                    input.len()
                } else {
                    prefix_damerau_levenshtein(query.as_bytes(), input).1
                };
                covered_area_time += before_covered_area.elapsed();

                let before_retrieve_postings_lists = Instant::now();
                let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
                    Some(doc_indexes) => doc_indexes,
                    // note that this lookup is not timed when nothing is found
                    None => continue,
                };
                retrieve_postings_lists += before_retrieve_postings_lists.elapsed();

                let before_stream_reserve = Instant::now();
                tmp_matches.reserve(doc_indexes.len());
                stream_reserve += before_stream_reserve.elapsed();

                let before_doc_indexes_rewrite = Instant::now();
                for di in doc_indexes.as_ref() {
                    // without `searchables` the attribute is kept as is
                    let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
                    if let Some(attribute) = attribute {
                        let match_ = TmpMatch {
                            query_index: *index as u32,
                            distance,
                            attribute,
                            word_index: di.word_index,
                            is_exact,
                        };

                        // the highlight never covers more than the indexed word
                        let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value());
                        let covered_area = cmp::min(covered_area, di.char_length);

                        // the highlight keeps the original attribute,
                        // not the remapped one
                        let highlight = Highlight {
                            attribute: di.attribute,
                            char_index: di.char_index,
                            char_length: covered_area,
                        };

                        tmp_matches.push((di.document_id, id, match_, highlight));
                    }
                }
                doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed();
            }
            debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count);
        }

        if *is_phrase_query {
            tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
            for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
                for window in group.windows(2) {
                    let (ida, ia, ma, ha) = window[0];
                    let (idb, ib, mb, hb) = window[1];

                    debug_assert_eq!(ida, idb);

                    // if the two matches must be consecutive and actually follow each other
                    if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
                        // TODO we must make it work for phrase query longer than 2
                        // if the second match is the last phrase query word
                        if ib + 1 == phrase_query_len {
                            // insert first match
                            matches.push((ida, ma));
                            highlights.push((ida, ha));

                            // insert second match
                            matches.push((idb, mb));
                            highlights.push((idb, hb));
                        }
                    }
                }
            }
        } else {
            // not a phrase query: every temporary match is kept
            let before_rerewrite = Instant::now();

            matches.reserve(tmp_matches.len());
            highlights.reserve(tmp_matches.len());

            for (id, _, match_, highlight) in tmp_matches {
                matches.push((id, match_));
                highlights.push((id, highlight));
            }
            debug!("rerewrite took {:.02?}", before_rerewrite.elapsed());
        }
    }

    debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
    debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite);
    debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists);
    debug!("stream reserve took {:.02?}", stream_reserve);
    debug!("covered area took {:.02?}", covered_area_time);
    debug!("eval value took {:.02?}", eval_time);

    // {
    //     let mut cloned = matches.clone();
    //     let before_sort_test = Instant::now();
    //     cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance));
    //     debug!("sorting test took {:.02?}", before_sort_test.elapsed());
    // }

    let before_multiword_rewrite_matches = Instant::now();
    debug!("number of matches before rewrite {}", matches.len());
    debug!("{:?}", query_enhancer);
    let matches = multiword_rewrite_matches(matches, &query_enhancer);
    debug!("number of matches after rewrite {}", matches.len());
    debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed());

    let before_highlight_sorting = Instant::now();
    let highlights = {
        // only sorted by document id, which is what the grouping below needs
        highlights.sort_unstable_by_key(|(id, _)| *id);
        SetBuf::new_unchecked(highlights)
    };
    debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());

    let before_raw_documents = Instant::now();
    let raw_documents = raw_documents_from(matches, highlights);
    debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
    debug!("documents to worry about: {}", raw_documents.len());

    Ok(raw_documents)
}
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
pub fn new(
main: store::Main,
@@ -389,7 +90,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
reader,
query,
range,
// self.criteria,
self.criteria,
self.main_store,
self.postings_lists_store,
self.documents_fields_counts_store,

View File

@@ -1,183 +1,89 @@
use std::fmt;
use std::sync::Arc;
use compact_arena::SmallArena;
use itertools::EitherOrBoth;
use sdset::SetBuf;
use slice_group_by::GroupBy;
use crate::{DocumentId, Highlight, TmpMatch, AttrCount};
use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};
#[derive(Clone)]
pub struct RawDocument {
pub id: DocumentId,
pub matches: SharedMatches,
pub highlights: Vec<Highlight>,
pub fields_counts: Option<SetBuf<AttrCount>>,
pub struct RawDocument<'a, 'tag> {
pub id: crate::DocumentId,
pub raw_matches: &'a mut [BareMatch<'tag>],
pub processed_matches: Vec<SimpleMatch>,
/// The list of minimum `distance` found
pub processed_distances: Vec<Option<u8>>,
}
impl RawDocument {
pub fn query_index(&self) -> &[u32] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe {
&self
.matches
.matches
.query_index
.get_unchecked(r.start..r.end)
impl<'a, 'tag> RawDocument<'a, 'tag> {
pub fn new<'txn>(
raw_matches: &'a mut [BareMatch<'tag>],
automatons: &[QueryWordAutomaton],
postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
) -> Option<RawDocument<'a, 'tag>>
{
raw_matches.sort_unstable_by_key(|m| m.query_index);
let mut previous_word = None;
for i in 0..raw_matches.len() {
let a = &raw_matches[i];
let auta = &automatons[a.query_index as usize];
match auta.phrase_query {
Some((0, _)) => {
let b = match raw_matches.get(i + 1) {
Some(b) => b,
None => {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
continue;
}
};
if a.query_index + 1 != b.query_index {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
continue
}
let pla = &postings_lists[a.postings_list];
let plb = &postings_lists[b.postings_list];
let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
});
let mut newa = Vec::new();
let mut newb = Vec::new();
for eb in iter {
if let EitherOrBoth::Both(a, b) = eb {
newa.push(*a);
newb.push(*b);
}
}
if !newa.is_empty() {
previous_word = Some(a.query_index);
}
postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
},
Some((1, _)) => {
if previous_word.take() != Some(a.query_index - 1) {
postings_lists[a.postings_list].rewrite_with(SetBuf::default());
}
},
Some((_, _)) => unreachable!(),
None => (),
}
}
}
pub fn distance(&self) -> &[u8] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
}
pub fn attribute(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
}
pub fn word_index(&self) -> &[u16] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe {
&self
.matches
.matches
.word_index
.get_unchecked(r.start..r.end)
if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
return None
}
}
pub fn is_exact(&self) -> &[bool] {
let r = self.matches.range;
// it is safe because construction/modifications
// can only be done in this module
unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
}
}
impl fmt::Debug for RawDocument {
    /// Prints the id followed by every match column, one field per line,
    /// with the field names right-aligned on 15 characters.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("RawDocument {\r\n")?;
        write!(f, "{:>15}: {:?},\r\n", "id", self.id)?;
        write!(f, "{:>15}: {:^5?},\r\n", "query_index", self.query_index())?;
        write!(f, "{:>15}: {:^5?},\r\n", "distance", self.distance())?;
        write!(f, "{:>15}: {:^5?},\r\n", "attribute", self.attribute())?;
        write!(f, "{:>15}: {:^5?},\r\n", "word_index", self.word_index())?;
        write!(f, "{:>15}: {:^5?},\r\n", "is_exact", self.is_exact())?;
        f.write_str("}")
    }
}
pub fn raw_documents_from(
matches: SetBuf<(DocumentId, TmpMatch)>,
highlights: SetBuf<(DocumentId, Highlight)>
) -> Vec<RawDocument> {
let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
let mut matches2 = Matches::with_capacity(matches.len());
let matches = matches.linear_group_by_key(|(id, _)| *id);
let highlights = highlights.linear_group_by_key(|(id, _)| *id);
for (mgroup, hgroup) in matches.zip(highlights) {
assert_eq!(mgroup[0].0, hgroup[0].0);
let document_id = mgroup[0].0;
let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
let end = start + mgroup.len();
let highlights = hgroup.iter().map(|(_, h)| *h).collect();
let fields_counts = None;
docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
// TODO we could try to keep both data
// - the data oriented one and,
// - the raw one, the one that comes from the arguments of this function
// This way we would be able to only produce data oriented lazily.
//
// For example the default first criterion is `SumOfTypos`
// and just needs the `query_index` and the `distance` fields.
// It would probably be good to avoid wasting time sorting other fields of documents
// that will never ever reach the second criterion.
matches2.extend_from_slice(mgroup);
}
let matches = Arc::new(matches2);
docs_ranges
.into_iter()
.map(|(id, range, highlights, fields_counts)| {
let matches = SharedMatches { range, matches: matches.clone() };
RawDocument { id, matches, highlights, fields_counts }
Some(RawDocument {
id: raw_matches[0].document_id,
raw_matches,
processed_matches: Vec::new(),
processed_distances: Vec::new(),
})
.collect()
}
/// Half-open `start..end` interval of positions inside the shared `Matches`
/// columns; unlike `std::ops::Range` it is `Copy`.
#[derive(Debug, Copy, Clone)]
struct Range {
    /// Position of the first match of the document.
    start: usize,
    /// Position just after the last match of the document.
    end: usize,
}
/// The matches of one document: a range inside columns that are shared, via
/// an `Arc`, between all the documents.
#[derive(Clone)]
pub struct SharedMatches {
    /// Positions of this document's matches inside `matches`.
    range: Range,
    /// Columns holding the matches of every document.
    matches: Arc<Matches>,
}
/// Column-oriented storage of matches: position `i` of every vector
/// describes the same match.
#[derive(Clone)]
struct Matches {
    /// Query index of each match.
    query_index: Vec<u32>,
    /// Distance of each match.
    distance: Vec<u8>,
    /// Attribute of each match.
    attribute: Vec<u16>,
    /// Word index of each match.
    word_index: Vec<u16>,
    /// Exactness flag of each match.
    is_exact: Vec<bool>,
}
impl Matches {
    /// Creates empty columns, each able to hold `cap` matches without
    /// reallocating.
    fn with_capacity(cap: usize) -> Matches {
        let query_index = Vec::with_capacity(cap);
        let distance = Vec::with_capacity(cap);
        let attribute = Vec::with_capacity(cap);
        let word_index = Vec::with_capacity(cap);
        let is_exact = Vec::with_capacity(cap);

        Matches { query_index, distance, attribute, word_index, is_exact }
    }

    /// Appends the given matches to the columns, one field per column; the
    /// document ids are ignored.
    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
        for entry in matches.iter().map(|(_, match_)| match_) {
            self.query_index.push(entry.query_index);
            self.distance.push(entry.distance);
            self.attribute.push(entry.attribute);
            self.word_index.push(entry.word_index);
            self.is_exact.push(entry.is_exact);
        }
    }
}