Update the criteria to the new ones

This commit is contained in:
Clément Renault
2019-12-11 17:02:10 +01:00
parent ea148575cf
commit 248ccfc0d8
20 changed files with 693 additions and 1775 deletions

View File

@ -1,21 +1,8 @@
use hashbrown::HashMap;
use std::convert::TryFrom;
use std::ops::Range;
use std::rc::Rc;
use std::time::{Duration, Instant};
use std::{cmp, mem};
use fst::{IntoStreamer, Streamer};
use log::debug;
use sdset::SetBuf;
use slice_group_by::{GroupBy, GroupByMut};
use std::time::Duration;
use crate::{bucket_sort::bucket_sort, database::MainT};
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::levenshtein::prefix_damerau_levenshtein;
use crate::raw_document::{raw_documents_from, RawDocument};
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount};
use crate::{criterion::Criteria, Document, DocumentId};
use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
pub struct QueryBuilder<'c, 'f, 'd> {
@ -30,292 +17,6 @@ pub struct QueryBuilder<'c, 'f, 'd> {
synonyms_store: store::Synonyms,
}
fn multiword_rewrite_matches(
mut matches: Vec<(DocumentId, TmpMatch)>,
query_enhancer: &QueryEnhancer,
) -> SetBuf<(DocumentId, TmpMatch)> {
let mut padded_matches = Vec::with_capacity(matches.len());
let before_sort = Instant::now();
// we sort the matches by word index to make them rewritable
matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index));
debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
let before_padding = Instant::now();
// for each attribute of each document
for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
// padding will only be applied
// to word indices in the same attribute
let mut padding = 0;
let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index);
// for each match at the same position
// in this document attribute
while let Some(same_word_index) = iter.next() {
// find the biggest padding
let mut biggest = 0;
for (id, match_) in same_word_index {
let mut replacement = query_enhancer.replacement(match_.query_index);
let replacement_len = replacement.len();
let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);
if let Some(query_index) = replacement.next() {
let word_index = match_.word_index + padding as u16;
let match_ = TmpMatch {
query_index,
word_index,
..*match_
};
padded_matches.push((*id, match_));
}
let mut found = false;
// look ahead and if there already is a match
// corresponding to this padding word, abort the padding
'padding: for (x, next_group) in nexts.enumerate() {
for (i, query_index) in replacement.clone().enumerate().skip(x) {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let padmatch = TmpMatch {
query_index,
word_index,
..*match_
};
for (_, nmatch_) in next_group {
let mut rep = query_enhancer.replacement(nmatch_.query_index);
let query_index = rep.next().unwrap();
if query_index == padmatch.query_index {
if !found {
// if we find a corresponding padding for the
// first time we must push preceding paddings
for (i, query_index) in replacement.clone().enumerate().take(i)
{
let word_index =
match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = TmpMatch {
query_index,
word_index,
..*match_
};
padded_matches.push((*id, match_));
biggest = biggest.max(i + 1);
}
}
padded_matches.push((*id, padmatch));
found = true;
continue 'padding;
}
}
}
// if we do not find a corresponding padding in the
// next groups so stop here and pad what was found
break;
}
if !found {
// if no padding was found in the following matches
// we must insert the entire padding
for (i, query_index) in replacement.enumerate() {
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
let match_ = TmpMatch {
query_index,
word_index,
..*match_
};
padded_matches.push((*id, match_));
}
biggest = biggest.max(replacement_len - 1);
}
}
padding += biggest;
}
}
for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) {
document_matches.sort_unstable();
}
debug!("padding matches took {:.02?}", before_padding.elapsed());
// With this check we can see that the loop above takes something
// like 43% of the search time even when no rewrite is needed.
// assert_eq!(before_matches, padded_matches);
SetBuf::new_unchecked(padded_matches)
}
fn fetch_raw_documents(
reader: &heed::RoTxn<MainT>,
automatons_groups: &[AutomatonGroup],
query_enhancer: &QueryEnhancer,
searchables: Option<&ReorderedAttrs>,
main_store: store::Main,
postings_lists_store: store::PostingsLists,
) -> MResult<Vec<RawDocument>> {
let mut matches = Vec::new();
let mut highlights = Vec::new();
let words = match main_store.words_fst(reader)? {
Some(words) => words,
None => return Ok(Vec::new()),
};
let before_automatons_groups_loop = Instant::now();
let mut doc_indexes_rewrite = Duration::default();
let mut retrieve_postings_lists = Duration::default();
let mut stream_reserve = Duration::default();
let mut covered_area_time = Duration::default();
let mut eval_time = Duration::default();
for group in automatons_groups {
let AutomatonGroup { is_phrase_query, automatons } = group;
let phrase_query_len = automatons.len();
let mut tmp_matches = Vec::new();
for (id, automaton) in automatons.into_iter().enumerate() {
let Automaton { index, is_exact, query_len, query, .. } = automaton;
let dfa = automaton.dfa();
let before_stream_loop = Instant::now();
let mut stream_count = 0;
let mut stream = words.search(&dfa).into_stream();
while let Some(input) = stream.next() {
let before_eval_time = Instant::now();
let distance = dfa.eval(input).to_u8();
eval_time += before_eval_time.elapsed();
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
stream_count += 1;
let before_covered_area = Instant::now();
let covered_area = if *query_len > input.len() {
input.len()
} else {
prefix_damerau_levenshtein(query.as_bytes(), input).1
};
covered_area_time += before_covered_area.elapsed();
let before_retrieve_postings_lists = Instant::now();
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
Some(doc_indexes) => doc_indexes,
None => continue,
};
retrieve_postings_lists += before_retrieve_postings_lists.elapsed();
let before_stream_reserve = Instant::now();
tmp_matches.reserve(doc_indexes.len());
stream_reserve += before_stream_reserve.elapsed();
let before_doc_indexes_rewrite = Instant::now();
for di in doc_indexes.as_ref() {
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
if let Some(attribute) = attribute {
let match_ = TmpMatch {
query_index: *index as u32,
distance,
attribute,
word_index: di.word_index,
is_exact,
};
let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value());
let covered_area = cmp::min(covered_area, di.char_length);
let highlight = Highlight {
attribute: di.attribute,
char_index: di.char_index,
char_length: covered_area,
};
tmp_matches.push((di.document_id, id, match_, highlight));
}
}
doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed();
}
debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count);
}
if *is_phrase_query {
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
for window in group.windows(2) {
let (ida, ia, ma, ha) = window[0];
let (idb, ib, mb, hb) = window[1];
debug_assert_eq!(ida, idb);
// if matches must follow and actually follows themselves
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
// TODO we must make it work for phrase query longer than 2
// if the second match is the last phrase query word
if ib + 1 == phrase_query_len {
// insert first match
matches.push((ida, ma));
highlights.push((ida, ha));
// insert second match
matches.push((idb, mb));
highlights.push((idb, hb));
}
}
}
}
} else {
let before_rerewrite = Instant::now();
matches.reserve(tmp_matches.len());
highlights.reserve(tmp_matches.len());
for (id, _, match_, highlight) in tmp_matches {
matches.push((id, match_));
highlights.push((id, highlight));
}
debug!("rerewrite took {:.02?}", before_rerewrite.elapsed());
}
}
debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite);
debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists);
debug!("stream reserve took {:.02?}", stream_reserve);
debug!("covered area took {:.02?}", covered_area_time);
debug!("eval value took {:.02?}", eval_time);
// {
// let mut cloned = matches.clone();
// let before_sort_test = Instant::now();
// cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance));
// debug!("sorting test took {:.02?}", before_sort_test.elapsed());
// }
let before_multiword_rewrite_matches = Instant::now();
debug!("number of matches before rewrite {}", matches.len());
debug!("{:?}", query_enhancer);
let matches = multiword_rewrite_matches(matches, &query_enhancer);
debug!("number of matches after rewrite {}", matches.len());
debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed());
let before_highlight_sorting = Instant::now();
let highlights = {
highlights.sort_unstable_by_key(|(id, _)| *id);
SetBuf::new_unchecked(highlights)
};
debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());
let before_raw_documents = Instant::now();
let raw_documents = raw_documents_from(matches, highlights);
debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
debug!("documents to worry about: {}", raw_documents.len());
Ok(raw_documents)
}
impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
pub fn new(
main: store::Main,
@ -389,7 +90,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
reader,
query,
range,
// self.criteria,
self.criteria,
self.main_store,
self.postings_lists_store,
self.documents_fields_counts_store,