Update the criteria to the new ones

2025-07-27 08:41:00 +00:00 · 2019-12-11 17:02:10 +01:00
parent ea148575cf
commit 248ccfc0d8
20 changed files with 693 additions and 1775 deletions
--- a/meilisearch-core/src/raw_document.rs
+++ b/meilisearch-core/src/raw_document.rs
@@ -1,183 +1,89 @@
-use std::fmt;
-use std::sync::Arc;
-
+use compact_arena::SmallArena;
+use itertools::EitherOrBoth;
 use sdset::SetBuf;
-use slice_group_by::GroupBy;

-use crate::{DocumentId, Highlight, TmpMatch, AttrCount};
+use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};

-#[derive(Clone)]
-pub struct RawDocument {
-    pub id: DocumentId,
-    pub matches: SharedMatches,
-    pub highlights: Vec<Highlight>,
-    pub fields_counts: Option<SetBuf<AttrCount>>,
+pub struct RawDocument<'a, 'tag> {
+    pub id: crate::DocumentId,
+    pub raw_matches: &'a mut [BareMatch<'tag>],
+    pub processed_matches: Vec<SimpleMatch>,
+    /// The list of minimum `distance` found
+    pub processed_distances: Vec<Option<u8>>,
 }

-impl RawDocument {
-    pub fn query_index(&self) -> &[u32] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe {
-            &self
-                .matches
-                .matches
-                .query_index
-                .get_unchecked(r.start..r.end)
+impl<'a, 'tag> RawDocument<'a, 'tag> {
+    pub fn new<'txn>(
+        raw_matches: &'a mut [BareMatch<'tag>],
+        automatons: &[QueryWordAutomaton],
+        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
+    ) -> Option<RawDocument<'a, 'tag>>
+    {
+        raw_matches.sort_unstable_by_key(|m| m.query_index);
+
+        let mut previous_word = None;
+        for i in 0..raw_matches.len() {
+            let a = &raw_matches[i];
+            let auta = &automatons[a.query_index as usize];
+
+            match auta.phrase_query {
+                Some((0, _)) => {
+                    let b = match raw_matches.get(i + 1) {
+                        Some(b) => b,
+                        None => {
+                            postings_lists[a.postings_list].rewrite_with(SetBuf::default());
+                            continue;
+                        }
+                    };
+
+                    if a.query_index + 1 != b.query_index {
+                        postings_lists[a.postings_list].rewrite_with(SetBuf::default());
+                        continue
+                    }
+
+                    let pla = &postings_lists[a.postings_list];
+                    let plb = &postings_lists[b.postings_list];
+
+                    let mut iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
+                        a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
+                    });
+
+                    let mut newa = Vec::new();
+                    let mut newb = Vec::new();
+
+                    for eb in iter {
+                        if let EitherOrBoth::Both(a, b) = eb {
+                            newa.push(*a);
+                            newb.push(*b);
+                        }
+                    }
+
+                    if !newa.is_empty() {
+                        previous_word = Some(a.query_index);
+                    }
+
+                    postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
+                    postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
+                },
+                Some((1, _)) => {
+                    if previous_word.take() != Some(a.query_index - 1) {
+                        postings_lists[a.postings_list].rewrite_with(SetBuf::default());
+                    }
+                },
+                Some((_, _)) => unreachable!(),
+                None => (),
+            }
        }
-    }

-    pub fn distance(&self) -> &[u8] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn attribute(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
-    }
-
-    pub fn word_index(&self) -> &[u16] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe {
-            &self
-                .matches
-                .matches
-                .word_index
-                .get_unchecked(r.start..r.end)
+        if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
+            return None
        }
-    }

-    pub fn is_exact(&self) -> &[bool] {
-        let r = self.matches.range;
-        // it is safe because construction/modifications
-        // can only be done in this module
-        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
-    }
-}
-
-impl fmt::Debug for RawDocument {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.write_str("RawDocument {\r\n")?;
-        f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
-        f.write_fmt(format_args!(
-            "{:>15}: {:^5?},\r\n",
-            "query_index",
-            self.query_index()
-        ))?;
-        f.write_fmt(format_args!(
-            "{:>15}: {:^5?},\r\n",
-            "distance",
-            self.distance()
-        ))?;
-        f.write_fmt(format_args!(
-            "{:>15}: {:^5?},\r\n",
-            "attribute",
-            self.attribute()
-        ))?;
-        f.write_fmt(format_args!(
-            "{:>15}: {:^5?},\r\n",
-            "word_index",
-            self.word_index()
-        ))?;
-        f.write_fmt(format_args!(
-            "{:>15}: {:^5?},\r\n",
-            "is_exact",
-            self.is_exact()
-        ))?;
-        f.write_str("}")?;
-        Ok(())
-    }
-}
-
-pub fn raw_documents_from(
-    matches: SetBuf<(DocumentId, TmpMatch)>,
-    highlights: SetBuf<(DocumentId, Highlight)>
-) -> Vec<RawDocument> {
-    let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
-    let mut matches2 = Matches::with_capacity(matches.len());
-
-    let matches = matches.linear_group_by_key(|(id, _)| *id);
-    let highlights = highlights.linear_group_by_key(|(id, _)| *id);
-
-    for (mgroup, hgroup) in matches.zip(highlights) {
-        assert_eq!(mgroup[0].0, hgroup[0].0);
-
-        let document_id = mgroup[0].0;
-        let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
-        let end = start + mgroup.len();
-        let highlights = hgroup.iter().map(|(_, h)| *h).collect();
-        let fields_counts = None;
-
-        docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
-        // TODO we could try to keep both data
-        //  - the data oriented one and,
-        //  - the raw one, the one that comes from the arguments of this function
-        // This way we would be able to only produce data oriented lazily.
-        //
-        // For example the default first criterion is `SumOfTypos`
-        // and just needs the `query_index` and the `distance` fields.
-        // It would probably be good to avoid wasting time sorting other fields of documents
-        // that will never ever reach the second criterion.
-        matches2.extend_from_slice(mgroup);
-    }
-
-    let matches = Arc::new(matches2);
-    docs_ranges
-        .into_iter()
-        .map(|(id, range, highlights, fields_counts)| {
-            let matches = SharedMatches { range, matches: matches.clone() };
-            RawDocument { id, matches, highlights, fields_counts }
+        Some(RawDocument {
+            id: raw_matches[0].document_id,
+            raw_matches,
+            processed_matches: Vec::new(),
+            processed_distances: Vec::new(),
        })
-        .collect()
-}
-
-#[derive(Debug, Copy, Clone)]
-struct Range {
-    start: usize,
-    end: usize,
-}
-
-#[derive(Clone)]
-pub struct SharedMatches {
-    range: Range,
-    matches: Arc<Matches>,
-}
-
-#[derive(Clone)]
-struct Matches {
-    query_index: Vec<u32>,
-    distance: Vec<u8>,
-    attribute: Vec<u16>,
-    word_index: Vec<u16>,
-    is_exact: Vec<bool>,
-}
-
-impl Matches {
-    fn with_capacity(cap: usize) -> Matches {
-        Matches {
-            query_index: Vec::with_capacity(cap),
-            distance: Vec::with_capacity(cap),
-            attribute: Vec::with_capacity(cap),
-            word_index: Vec::with_capacity(cap),
-            is_exact: Vec::with_capacity(cap),
-        }
-    }
-
-    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
-        for (_, match_) in matches {
-            self.query_index.push(match_.query_index);
-            self.distance.push(match_.distance);
-            self.attribute.push(match_.attribute);
-            self.word_index.push(match_.word_index);
-            self.is_exact.push(match_.is_exact);
-        }
    }
 }