Update the criteria to the new ones

2025-09-21 20:26:25 +00:00 · 2019-12-11 17:02:10 +01:00
parent ea148575cf
commit 248ccfc0d8
20 changed files with 693 additions and 1775 deletions
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@@ -1,21 +1,8 @@
-use hashbrown::HashMap;
-use std::convert::TryFrom;
 use std::ops::Range;
-use std::rc::Rc;
-use std::time::{Duration, Instant};
-use std::{cmp, mem};
-
-use fst::{IntoStreamer, Streamer};
-use log::debug;
-use sdset::SetBuf;
-use slice_group_by::{GroupBy, GroupByMut};
+use std::time::Duration;

 use crate::{bucket_sort::bucket_sort, database::MainT};
-use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
-use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
-use crate::levenshtein::prefix_damerau_levenshtein;
-use crate::raw_document::{raw_documents_from, RawDocument};
-use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount};
+use crate::{criterion::Criteria, Document, DocumentId};
 use crate::{reordered_attrs::ReorderedAttrs, store, MResult};

 pub struct QueryBuilder<'c, 'f, 'd> {
@@ -30,292 +17,6 @@ pub struct QueryBuilder<'c, 'f, 'd> {
    synonyms_store: store::Synonyms,
 }

-fn multiword_rewrite_matches(
-    mut matches: Vec<(DocumentId, TmpMatch)>,
-    query_enhancer: &QueryEnhancer,
-) -> SetBuf<(DocumentId, TmpMatch)> {
-    let mut padded_matches = Vec::with_capacity(matches.len());
-
-    let before_sort = Instant::now();
-    // we sort the matches by word index to make them rewritable
-    matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index));
-    debug!("sorting dirty matches took {:.02?}", before_sort.elapsed());
-
-    let before_padding = Instant::now();
-    // for each attribute of each document
-    for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
-        // padding will only be applied
-        // to word indices in the same attribute
-        let mut padding = 0;
-        let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index);
-
-        // for each match at the same position
-        // in this document attribute
-        while let Some(same_word_index) = iter.next() {
-            // find the biggest padding
-            let mut biggest = 0;
-            for (id, match_) in same_word_index {
-                let mut replacement = query_enhancer.replacement(match_.query_index);
-                let replacement_len = replacement.len();
-                let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);
-
-                if let Some(query_index) = replacement.next() {
-                    let word_index = match_.word_index + padding as u16;
-                    let match_ = TmpMatch {
-                        query_index,
-                        word_index,
-                        ..*match_
-                    };
-                    padded_matches.push((*id, match_));
-                }
-
-                let mut found = false;
-
-                // look ahead and if there already is a match
-                // corresponding to this padding word, abort the padding
-                'padding: for (x, next_group) in nexts.enumerate() {
-                    for (i, query_index) in replacement.clone().enumerate().skip(x) {
-                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
-                        let padmatch = TmpMatch {
-                            query_index,
-                            word_index,
-                            ..*match_
-                        };
-
-                        for (_, nmatch_) in next_group {
-                            let mut rep = query_enhancer.replacement(nmatch_.query_index);
-                            let query_index = rep.next().unwrap();
-                            if query_index == padmatch.query_index {
-                                if !found {
-                                    // if we find a corresponding padding for the
-                                    // first time we must push preceding paddings
-                                    for (i, query_index) in replacement.clone().enumerate().take(i)
-                                    {
-                                        let word_index =
-                                            match_.word_index + padding as u16 + (i + 1) as u16;
-                                        let match_ = TmpMatch {
-                                            query_index,
-                                            word_index,
-                                            ..*match_
-                                        };
-                                        padded_matches.push((*id, match_));
-                                        biggest = biggest.max(i + 1);
-                                    }
-                                }
-
-                                padded_matches.push((*id, padmatch));
-                                found = true;
-                                continue 'padding;
-                            }
-                        }
-                    }
-
-                    // if we do not find a corresponding padding in the
-                    // next groups so stop here and pad what was found
-                    break;
-                }
-
-                if !found {
-                    // if no padding was found in the following matches
-                    // we must insert the entire padding
-                    for (i, query_index) in replacement.enumerate() {
-                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
-                        let match_ = TmpMatch {
-                            query_index,
-                            word_index,
-                            ..*match_
-                        };
-                        padded_matches.push((*id, match_));
-                    }
-
-                    biggest = biggest.max(replacement_len - 1);
-                }
-            }
-
-            padding += biggest;
-        }
-    }
-
-    for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) {
-        document_matches.sort_unstable();
-    }
-
-    debug!("padding matches took {:.02?}", before_padding.elapsed());
-
-    // With this check we can see that the loop above takes something
-    // like 43% of the search time even when no rewrite is needed.
-    // assert_eq!(before_matches, padded_matches);
-
-    SetBuf::new_unchecked(padded_matches)
-}
-
-fn fetch_raw_documents(
-    reader: &heed::RoTxn<MainT>,
-    automatons_groups: &[AutomatonGroup],
-    query_enhancer: &QueryEnhancer,
-    searchables: Option<&ReorderedAttrs>,
-    main_store: store::Main,
-    postings_lists_store: store::PostingsLists,
-) -> MResult<Vec<RawDocument>> {
-    let mut matches = Vec::new();
-    let mut highlights = Vec::new();
-
-    let words = match main_store.words_fst(reader)? {
-        Some(words) => words,
-        None => return Ok(Vec::new()),
-    };
-
-    let before_automatons_groups_loop = Instant::now();
-    let mut doc_indexes_rewrite = Duration::default();
-    let mut retrieve_postings_lists = Duration::default();
-    let mut stream_reserve = Duration::default();
-    let mut covered_area_time = Duration::default();
-    let mut eval_time = Duration::default();
-
-    for group in automatons_groups {
-        let AutomatonGroup { is_phrase_query, automatons } = group;
-        let phrase_query_len = automatons.len();
-
-        let mut tmp_matches = Vec::new();
-        for (id, automaton) in automatons.into_iter().enumerate() {
-            let Automaton { index, is_exact, query_len, query, .. } = automaton;
-            let dfa = automaton.dfa();
-
-            let before_stream_loop = Instant::now();
-            let mut stream_count = 0;
-
-            let mut stream = words.search(&dfa).into_stream();
-            while let Some(input) = stream.next() {
-                let before_eval_time = Instant::now();
-                let distance = dfa.eval(input).to_u8();
-                eval_time += before_eval_time.elapsed();
-
-                let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
-
-                stream_count += 1;
-
-                let before_covered_area = Instant::now();
-                let covered_area = if *query_len > input.len() {
-                    input.len()
-                } else {
-                    prefix_damerau_levenshtein(query.as_bytes(), input).1
-                };
-                covered_area_time += before_covered_area.elapsed();
-
-                let before_retrieve_postings_lists = Instant::now();
-                let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
-                    Some(doc_indexes) => doc_indexes,
-                    None => continue,
-                };
-                retrieve_postings_lists += before_retrieve_postings_lists.elapsed();
-
-                let before_stream_reserve = Instant::now();
-                tmp_matches.reserve(doc_indexes.len());
-                stream_reserve += before_stream_reserve.elapsed();
-
-                let before_doc_indexes_rewrite = Instant::now();
-                for di in doc_indexes.as_ref() {
-                    let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
-                    if let Some(attribute) = attribute {
-                        let match_ = TmpMatch {
-                            query_index: *index as u32,
-                            distance,
-                            attribute,
-                            word_index: di.word_index,
-                            is_exact,
-                        };
-
-                        let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value());
-                        let covered_area = cmp::min(covered_area, di.char_length);
-
-                        let highlight = Highlight {
-                            attribute: di.attribute,
-                            char_index: di.char_index,
-                            char_length: covered_area,
-                        };
-
-                        tmp_matches.push((di.document_id, id, match_, highlight));
-                    }
-                }
-                doc_indexes_rewrite += before_doc_indexes_rewrite.elapsed();
-            }
-            debug!("{:?} took {:.02?} ({} words)", query, before_stream_loop.elapsed(), stream_count);
-        }
-
-        if *is_phrase_query {
-            tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
-            for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
-                for window in group.windows(2) {
-                    let (ida, ia, ma, ha) = window[0];
-                    let (idb, ib, mb, hb) = window[1];
-
-                    debug_assert_eq!(ida, idb);
-
-                    // if matches must follow and actually follows themselves
-                    if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
-                        // TODO we must make it work for phrase query longer than 2
-                        // if the second match is the last phrase query word
-                        if ib + 1 == phrase_query_len {
-                            // insert first match
-                            matches.push((ida, ma));
-                            highlights.push((ida, ha));
-
-                            // insert second match
-                            matches.push((idb, mb));
-                            highlights.push((idb, hb));
-                        }
-                    }
-                }
-            }
-        } else {
-            let before_rerewrite = Instant::now();
-
-            matches.reserve(tmp_matches.len());
-            highlights.reserve(tmp_matches.len());
-
-            for (id, _, match_, highlight) in tmp_matches {
-                matches.push((id, match_));
-                highlights.push((id, highlight));
-            }
-            debug!("rerewrite took {:.02?}", before_rerewrite.elapsed());
-        }
-    }
-    debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());
-    debug!("doc_indexes_rewrite took {:.02?}", doc_indexes_rewrite);
-    debug!("retrieve_postings_lists took {:.02?}", retrieve_postings_lists);
-    debug!("stream reserve took {:.02?}", stream_reserve);
-    debug!("covered area took {:.02?}", covered_area_time);
-    debug!("eval value took {:.02?}", eval_time);
-
-    // {
-    //     let mut cloned = matches.clone();
-    //     let before_sort_test = Instant::now();
-    //     cloned.sort_unstable_by_key(|(id, m)| (*id, m.query_index, m.distance));
-    //     debug!("sorting test took {:.02?}", before_sort_test.elapsed());
-    // }
-
-    let before_multiword_rewrite_matches = Instant::now();
-    debug!("number of matches before rewrite {}", matches.len());
-    debug!("{:?}", query_enhancer);
-    let matches = multiword_rewrite_matches(matches, &query_enhancer);
-    debug!("number of matches after rewrite {}", matches.len());
-    debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed());
-
-    let before_highlight_sorting = Instant::now();
-    let highlights = {
-        highlights.sort_unstable_by_key(|(id, _)| *id);
-        SetBuf::new_unchecked(highlights)
-    };
-    debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());
-
-    let before_raw_documents = Instant::now();
-    let raw_documents = raw_documents_from(matches, highlights);
-    debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
-    debug!("documents to worry about: {}", raw_documents.len());
-
-    Ok(raw_documents)
-}
-
 impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
    pub fn new(
        main: store::Main,
@@ -389,7 +90,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
                reader,
                query,
                range,
-                // self.criteria,
+                self.criteria,
                self.main_store,
                self.postings_lists_store,
                self.documents_fields_counts_store,