mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-08-02 11:50:03 +00:00
Update the criteria to the new ones
This commit is contained in:
@ -1,21 +1,8 @@
|
||||
use hashbrown::HashMap;
|
||||
use std::convert::TryFrom;
|
||||
use std::ops::Range;
|
||||
use std::rc::Rc;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{cmp, mem};
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use log::debug;
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::{GroupBy, GroupByMut};
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::{bucket_sort::bucket_sort, database::MainT};
|
||||
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
|
||||
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
||||
use crate::levenshtein::prefix_damerau_levenshtein;
|
||||
use crate::raw_document::{raw_documents_from, RawDocument};
|
||||
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount};
|
||||
use crate::{criterion::Criteria, Document, DocumentId};
|
||||
use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
|
||||
|
||||
pub struct QueryBuilder<'c, 'f, 'd> {
|
||||
@ -30,292 +17,6 @@ pub struct QueryBuilder<'c, 'f, 'd> {
|
||||
synonyms_store: store::Synonyms,
|
||||
}
|
||||
|
||||
fn multiword_rewrite_matches(
|
||||
mut matches: Vec<(DocumentId, TmpMatch)>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
) -> SetBuf<(DocumentId, TmpMatch)> {
|
||||
let mut padded_matches = Vec::with_capacity(matches.len());
|
||||
|
||||
let before_sort = Instant::now();
|
||||
// we sort the matches by word index to make them rewritable
|
||||
matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index));
|
||||
debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
|
||||
|
||||
let before_padding = Instant::now();
|
||||
// for each attribute of each document
|
||||
for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
|
||||
// padding will only be applied
|
||||
// to word indices in the same attribute
|
||||
let mut padding = 0;
|
||||
let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index);
|
||||
|
||||
// for each match at the same position
|
||||
// in this document attribute
|
||||
while let Some(same_word_index) = iter.next() {
|
||||
// find the biggest padding
|
||||
let mut biggest = 0;
|
||||
for (id, match_) in same_word_index {
|
||||
let mut replacement = query_enhancer.replacement(match_.query_index);
|
||||
let replacement_len = replacement.len();
|
||||
let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);
|
||||
|
||||
if let Some(query_index) = replacement.next() {
|
||||
let word_index = match_.word_index + padding as u16;
|
||||
let match_ = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..*match_
|
||||
};
|
||||
padded_matches.push((*id, match_));
|
||||
}
|
||||
|
||||
let mut found = false;
|
||||
|
||||
// look ahead and if there already is a match
|
||||
// corresponding to this padding word, abort the padding
|
||||
'padding: for (x, next_group) in nexts.enumerate() {
|
||||
for (i, query_index) in replacement.clone().enumerate().skip(x) {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let padmatch = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..*match_
|
||||
};
|
||||
|
||||
for (_, nmatch_) in next_group {
|
||||
let mut rep = query_enhancer.replacement(nmatch_.query_index);
|
||||
let query_index = rep.next().unwrap();
|
||||
if query_index == padmatch.query_index {
|
||||
if !found {
|
||||
// if we find a corresponding padding for the
|
||||
// first time we must push preceding paddings
|
||||
for (i, query_index) in replacement.clone().enumerate().take(i)
|
||||
{
|
||||
let word_index =
|
||||
match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let match_ = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..*match_
|
||||
};
|
||||
padded_matches.push((*id, match_));
|
||||
biggest = biggest.max(i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
padded_matches.push((*id, padmatch));
|
||||
found = true;
|
||||
continue 'padding;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if we do not find a corresponding padding in the
|
||||
// next groups so stop here and pad what was found
|
||||
break;
|
||||
}
|
||||
|
||||
if !found {
|
||||
// if no padding was found in the following matches
|
||||
// we must insert the entire padding
|
||||
for (i, query_index) in replacement.enumerate() {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let match_ = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..*match_
|
||||
};
|
||||
padded_matches.push((*id, match_));
|
||||
}
|
||||
|
||||
biggest = biggest.max(replacement_len - 1);
|
||||
}
|
||||
}
|
||||
|
||||
padding += biggest;
|
||||
}
|
||||
}
|
||||
|
||||
for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) {
|
||||
document_matches.sort_unstable();
|
||||
}
|
||||
|
||||
debug!("padding matches took {:.02?}", before_padding.elapsed());
|
||||
|
||||
// With this check we can see that the loop above takes something
|
||||
// like 43% of the search time even when no rewrite is needed.
|
||||
// assert_eq!(before_matches, padded_matches);
|
||||
|
||||
SetBuf::new_unchecked(padded_matches)
|
||||
}
|
||||
|
||||
fn fetch_raw_documents(
|
||||
reader: &heed::RoTxn<MainT>,
|
||||
automatons_groups: &[AutomatonGroup],
|
||||
query_enhancer: &QueryEnhancer,
|
||||
searchables: Option<&ReorderedAttrs>,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
) -> MResult<Vec<RawDocument>> {
|
||||
let mut matches = Vec::new();
|
||||
let mut highlights = Vec::new();
|
||||
|
||||
let words = match main_store.words_fst(reader)? {
|
||||
Some(words) => words,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
|
||||
let before_automatons_groups_loop = Instant::now();
|
||||
let mut doc_indexes_rewrite = Duration::default();
|
||||
let mut retrieve_postings_lists = Duration::default();
|
||||
let mut stream_reserve = Duration::default();
|
||||
let mut covered_area_time = Duration::default();
|
||||
let mut eval_time = Duration::default();
|
||||
|
||||
for group in automatons_groups {
|
||||
let AutomatonGroup { is_phrase_query, automatons } = group;
|
||||
let phrase_query_len = automatons.len();
|
||||
|
||||
let mut tmp_matches = Vec::new();
|
||||
for (id, automaton) in automatons.into_iter().enumerate() {
|
||||
let Automaton { index, is_exact, query_len, query, .. } = automaton;
|
||||
let dfa = automaton.dfa();
|
||||
|
||||
let before_stream_loop = Instant::now();
|
||||
let mut stream_count = 0;
|
||||
|
||||
let mut stream = words.search(&dfa).into_stream();
|
||||
while let Some(input) = stream.next() {
|
||||
let before_eval_time = Instant::now();
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
eval_time += before_eval_time.elapsed();
|
||||
|
||||
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
|
||||
|
||||
stream_count += 1;
|
||||
|
||||
let before_covered_area = Instant::now();
|
||||
let covered_area = if *query_len > input.len() {
|
||||
input.len()
|
||||
} else {
|
||||
prefix_damerau_levenshtein(query.as_bytes(), input).1
|
||||
};
|
||||
covered_area_time += before_covered_area.elapsed();
|
||||
|
||||
let before_retrieve_postings_lists = Instant::now();
|
||||
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
|
||||
Some(doc_indexes) => doc_indexes,
|
||||
None => continue,
|
||||
};
|
||||
retrieve_postings_lists += before_retrieve_postings_lists.elapsed();
|
||||
|
||||
let before_stream_reserve = Instant::now();
|
||||
tmp_matches.reserve(doc_indexes.len());
|
||||
stream_reserve += before_stream_reserve.elapsed();
|
||||
|
||||
let before_doc_indexes_rewrite = Instant::now();
|
||||
for di in doc_indexes.as_ref() {
|
||||
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
|
||||
if let Some(attribute) = attribute {
|
||||
let match_ = TmpMatch {
|
||||
query_index: *index as u32,
|
||||
distance,
|
||||
attribute,
|
||||
word_index: di.word_index,
|
||||
is_exact,
|
||||
};
|
||||
|
||||
let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value());
|
||||
let covered_area = cmp::min(covered_area, di.char_length);
|
||||
|
||||
let highlight = Highlight {
|
||||
attribute: di.attribute,
|
||||
char_index: di.char_index,
|
||||
char_length: covered_area,
|
||||
};
|
||||
|
||||
tmp_matches.push((di.document_id, id, match_, highlight));
|
||||
}
|
||||
}
|
||||
doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed();
|
||||
}
|
||||
debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count);
|
||||
}
|
||||
|
||||
if *is_phrase_query {
|
||||
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
|
||||
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
|
||||
for window in group.windows(2) {
|
||||
let (ida, ia, ma, ha) = window[0];
|
||||
let (idb, ib, mb, hb) = window[1];
|
||||
|
||||
debug_assert_eq!(ida, idb);
|
||||
|
||||
// if matches must follow and actually follows themselves
|
||||
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
|
||||
// TODO we must make it work for phrase query longer than 2
|
||||
// if the second match is the last phrase query word
|
||||
if ib + 1 == phrase_query_len {
|
||||
// insert first match
|
||||
matches.push((ida, ma));
|
||||
highlights.push((ida, ha));
|
||||
|
||||
// insert second match
|
||||
matches.push((idb, mb));
|
||||
highlights.push((idb, hb));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let before_rerewrite = Instant::now();
|
||||
|
||||
matches.reserve(tmp_matches.len());
|
||||
highlights.reserve(tmp_matches.len());
|
||||
|
||||
for (id, _, match_, highlight) in tmp_matches {
|
||||
matches.push((id, match_));
|
||||
highlights.push((id, highlight));
|
||||
}
|
||||
debug!("rerewrite took {:.02?}", before_rerewrite.elapsed());
|
||||
}
|
||||
}
|
||||
debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
|
||||
debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite);
|
||||
debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists);
|
||||
debug!("stream reserve took {:.02?}", stream_reserve);
|
||||
debug!("covered area took {:.02?}", covered_area_time);
|
||||
debug!("eval value took {:.02?}", eval_time);
|
||||
|
||||
// {
|
||||
// let mut cloned = matches.clone();
|
||||
// let before_sort_test = Instant::now();
|
||||
// cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance));
|
||||
// debug!("sorting test took {:.02?}", before_sort_test.elapsed());
|
||||
// }
|
||||
|
||||
let before_multiword_rewrite_matches = Instant::now();
|
||||
debug!("number of matches before rewrite {}", matches.len());
|
||||
debug!("{:?}", query_enhancer);
|
||||
let matches = multiword_rewrite_matches(matches, &query_enhancer);
|
||||
debug!("number of matches after rewrite {}", matches.len());
|
||||
debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed());
|
||||
|
||||
let before_highlight_sorting = Instant::now();
|
||||
let highlights = {
|
||||
highlights.sort_unstable_by_key(|(id, _)| *id);
|
||||
SetBuf::new_unchecked(highlights)
|
||||
};
|
||||
debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());
|
||||
|
||||
let before_raw_documents = Instant::now();
|
||||
let raw_documents = raw_documents_from(matches, highlights);
|
||||
debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
|
||||
debug!("documents to worry about: {}", raw_documents.len());
|
||||
|
||||
Ok(raw_documents)
|
||||
}
|
||||
|
||||
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
pub fn new(
|
||||
main: store::Main,
|
||||
@ -389,7 +90,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
|
||||
reader,
|
||||
query,
|
||||
range,
|
||||
// self.criteria,
|
||||
self.criteria,
|
||||
self.main_store,
|
||||
self.postings_lists_store,
|
||||
self.documents_fields_counts_store,
|
||||
|
Reference in New Issue
Block a user