Before improving fields AttrCount

Removing the fields_count fetching cut the search time in half; we should look at lazily pulling them from the criteria that need them. ugly-test: Make the fields_count fetching lazy, just before running the exactness criterion
2025-09-21 20:26:25 +00:00 · 2019-11-29 16:31:47 +01:00
parent 11f3d7782d
commit ef6a4db182
4 changed files with 79 additions and 60 deletions
--- a/meilisearch-core/src/query_builder.rs
+++ b/meilisearch-core/src/query_builder.rs
@ -6,6 +6,7 @@ use std::time::{Duration, Instant};
 use std::{cmp, mem};

 use fst::{IntoStreamer, Streamer};
+use log::debug;
 use sdset::SetBuf;
 use slice_group_by::{GroupBy, GroupByMut};

@ -14,7 +15,7 @@ use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhanc
 use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
 use crate::levenshtein::prefix_damerau_levenshtein;
 use crate::raw_document::{raw_documents_from, RawDocument};
-use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
+use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch, AttrCount};
 use crate::{reordered_attrs::ReorderedAttrs, store, MResult};

 pub struct QueryBuilder<'c, 'f, 'd> {
@ -146,27 +147,18 @@ fn fetch_raw_documents(
    searchables: Option<&ReorderedAttrs>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
-    documents_fields_counts_store: store::DocumentsFieldsCounts,
 ) -> MResult<Vec<RawDocument>> {
    let mut matches = Vec::new();
    let mut highlights = Vec::new();

+    let before_automatons_groups_loop = Instant::now();
    for group in automatons_groups {
-        let AutomatonGroup {
-            is_phrase_query,
-            automatons,
-        } = group;
+        let AutomatonGroup { is_phrase_query, automatons } = group;
        let phrase_query_len = automatons.len();

        let mut tmp_matches = Vec::new();
        for (id, automaton) in automatons.into_iter().enumerate() {
-            let Automaton {
-                index,
-                is_exact,
-                query_len,
-                query,
-                ..
-            } = automaton;
+            let Automaton { index, is_exact, query_len, query, .. } = automaton;
            let dfa = automaton.dfa();

            let words = match main_store.words_fst(reader)? {
@ -250,26 +242,26 @@ fn fetch_raw_documents(
            }
        }
    }
+    debug!("automatons_groups_loop took {:.02?}", before_automatons_groups_loop.elapsed());

+    let before_multiword_rewrite_matches = Instant::now();
    let matches = multiword_rewrite_matches(matches, &query_enhancer);
+    debug!("multiword_rewrite_matches took {:.02?}", before_multiword_rewrite_matches.elapsed());
+
+    let before_highlight_sorting = Instant::now();
    let highlights = {
        highlights.sort_unstable_by_key(|(id, _)| *id);
        SetBuf::new_unchecked(highlights)
    };
+    debug!("highlight_sorting {:.02?}", before_highlight_sorting.elapsed());

-    let fields_counts = {
-        let mut fields_counts = Vec::new();
-        for group in matches.linear_group_by_key(|(id, ..)| *id) {
-            let id = group[0].0;
-            for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
-                let (attr, count) = result?;
-                fields_counts.push((id, attr, count));
-            }
-        }
-        SetBuf::new(fields_counts).unwrap()
-    };

-    Ok(raw_documents_from(matches, highlights, fields_counts))
+    let before_raw_documents = Instant::now();
+    let raw_documents = raw_documents_from(matches, highlights);
+    debug!("raw_documents took {:.02?}", before_raw_documents.elapsed());
+    debug!("documents to worry about: {}", raw_documents.len());
+
+    Ok(raw_documents)
 }

 impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
@ -434,6 +426,11 @@ where
    for auts in automaton_producer {
        automatons.push(auts);

+        for (i, group) in automatons.iter().enumerate() {
+            debug!("group {} automatons {:?}", i, group.automatons);
+        }
+
+        let before_fetch_raw_documents = Instant::now();
        // we must retrieve the documents associated
        // with the current automatons
        let mut raw_documents = fetch_raw_documents(
@ -443,8 +440,8 @@ where
            searchable_attrs.as_ref(),
            main_store,
            postings_lists_store,
-            documents_fields_counts_store,
        )?;
+        debug!("fetch_raw_documents took {:.02?}", before_fetch_raw_documents.elapsed());

        // stop processing when time is running out
        if let Some(timeout) = timeout {
@ -468,6 +465,20 @@ where
                    continue;
                }

+                // we must pull the fields counts of these documents
+                // TODO it would be great to have a "dependency" thing for each criterion
+                //      and make it so that we can be lazy on pulling/computing some data.
+                if criterion.name() == "Exact" {
+                    for document in group.iter_mut() {
+                        let mut fields_counts = Vec::new();
+                        for result in documents_fields_counts_store.document_fields_counts(reader, document.id)? {
+                            let (attr, count) = result?;
+                            fields_counts.push(AttrCount { attr: attr.0, count });
+                        }
+                        document.fields_counts = Some(SetBuf::new(fields_counts).unwrap());
+                    }
+                }
+
                group.sort_unstable_by(|a, b| criterion.evaluate(a, b));

                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
@ -561,7 +572,6 @@ where
            searchable_attrs.as_ref(),
            main_store,
            postings_lists_store,
-            documents_fields_counts_store,
        )?;

        // stop processing when time is running out