Introduce a first basic working positions-based engine

2025-07-27 16:51:01 +00:00 · 2020-06-05 20:12:52 +02:00
parent f51a63e4ef
commit dfdaceb410
4 changed files with 135 additions and 83 deletions
--- a/src/lib.rs
+++ b/src/lib.rs
@ -32,6 +32,7 @@ pub type AttributeId = u32;
 pub struct Index {
    pub main: PolyDatabase,
    pub postings_attrs: Database<Str, ByteSlice>,
+    pub prefix_postings_attrs: Database<ByteSlice, ByteSlice>,
    pub postings_ids: Database<ByteSlice, ByteSlice>,
    pub prefix_postings_ids: Database<ByteSlice, ByteSlice>,
    pub documents: Database<OwnedType<BEU32>, ByteSlice>,
@ -41,11 +42,12 @@ impl Index {
    pub fn new(env: &heed::Env) -> heed::Result<Index> {
        let main = env.create_poly_database(None)?;
        let postings_attrs = env.create_database(Some("postings-attrs"))?;
+        let prefix_postings_attrs = env.create_database(Some("prefix-postings-attrs"))?;
        let postings_ids = env.create_database(Some("postings-ids"))?;
        let prefix_postings_ids = env.create_database(Some("prefix-postings-ids"))?;
        let documents = env.create_database(Some("documents"))?;

-        Ok(Index { main, postings_attrs, postings_ids, prefix_postings_ids, documents })
+        Ok(Index { main, postings_attrs, prefix_postings_attrs, postings_ids, prefix_postings_ids, documents })
    }

    pub fn headers<'t>(&self, rtxn: &'t heed::RoTxn) -> heed::Result<Option<&'t [u8]>> {
@ -63,7 +65,7 @@ impl Index {
        let words: Vec<_> = QueryTokens::new(query).collect();
        let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
        let number_of_words = words.len();
-        let dfas: Vec<_> = words.into_iter().enumerate().map(|(i, word)| {
+        let dfas = words.into_iter().enumerate().map(|(i, word)| {
            let (word, quoted) = match word {
                QueryToken::Free(word) => (word.cow_to_lowercase(), false),
                QueryToken::Quoted(word) => (Cow::Borrowed(word), true),
@ -83,73 +85,74 @@ impl Index {
            };

            (word, is_prefix, dfa)
-        })
-        .collect();
+        });

-        let mut intersect_attrs: Option<RoaringBitmap> = None;
-        for (_word, _is_prefix, dfa) in &dfas {
-            let mut union_result = RoaringBitmap::default();
-            let mut stream = fst.search(dfa).into_stream();
-            while let Some(word) = stream.next() {
-                let word = std::str::from_utf8(word)?;
-                if let Some(attrs) = self.postings_attrs.get(rtxn, word)? {
-                    let right = RoaringBitmap::deserialize_from(attrs)?;
-                    union_result.union_with(&right);
+        let mut words_positions = Vec::new();
+        for (word, is_prefix, dfa) in dfas {
+            let mut count = 0;
+            let mut union_positions = RoaringBitmap::default();
+            if word.len() <= 4 && is_prefix {
+                if let Some(ids) = self.prefix_postings_attrs.get(rtxn, word.as_bytes())? {
+                    let right = RoaringBitmap::deserialize_from(ids)?;
+                    union_positions.union_with(&right);
+                    count = 1;
+                }
+            } else {
+                let mut stream = fst.search(&dfa).into_stream();
+                while let Some(word) = stream.next() {
+                    let word = std::str::from_utf8(word)?;
+                    if let Some(attrs) = self.postings_attrs.get(rtxn, word)? {
+                        let right = RoaringBitmap::deserialize_from(attrs)?;
+                        union_positions.union_with(&right);
+                        count += 1;
+                    }
                }
            }

-            match &mut intersect_attrs {
-                Some(left) => left.intersect_with(&union_result),
-                None => intersect_attrs = Some(union_result),
-            }
+            eprintln!("{} words for {:?} we have found positions {:?}", count, word, union_positions);
+            words_positions.push((word, is_prefix, dfa, union_positions));
        }

-        eprintln!("we should only look for documents with attrs {:?}", intersect_attrs);
+        use itertools::EitherOrBoth;
+        let (a, b) = (&words_positions[0].3, &words_positions[1].3);
+        let positions: Vec<_> = itertools::merge_join_by(a, b, |a, b| (a + 1).cmp(b)).flat_map(EitherOrBoth::both).collect();
+
+        if positions.is_empty() { return Ok(Vec::new()); }

        let mut intersect_docids: Option<RoaringBitmap> = None;
-        // TODO would be faster to store and use the words
-        //      seen in the previous attrs loop
-        for (word, is_prefix, dfa) in &dfas {
-            let mut union_result = RoaringBitmap::default();
-            for attr in intersect_attrs.as_ref().unwrap_or(&RoaringBitmap::default()) {
-                let before = Instant::now();
+        for (i, (word, is_prefix, dfa, _)) in words_positions.into_iter().take(2).enumerate() {
+            let mut count = 0;
+            let mut union_docids = RoaringBitmap::default();

-                let mut count = 0;
-                if word.len() <= 4 && *is_prefix {
-                    let mut key = word.as_bytes()[..word.len().min(5)].to_vec();
-                    key.extend_from_slice(&attr.to_be_bytes());
-                    if let Some(ids) = self.prefix_postings_ids.get(rtxn, &key)? {
-                        let right = RoaringBitmap::deserialize_from(ids)?;
-                        union_result.union_with(&right);
-                        count = 1;
-                    }
-                } else {
-                    let mut stream = fst.search(dfa).into_stream();
-                    while let Some(word) = stream.next() {
+            if word.len() <= 4 && is_prefix {
+                let mut key = word.as_bytes()[..word.len().min(5)].to_vec();
+                let pos = if i == 0 { positions[0].0 } else { positions[0].1 };
+                key.extend_from_slice(&pos.to_be_bytes());
+                if let Some(ids) = self.prefix_postings_ids.get(rtxn, &key)? {
+                    let right = RoaringBitmap::deserialize_from(ids)?;
+                    union_docids.union_with(&right);
+                    count = 1;
+                }
+            } else {
+                let mut stream = fst.search(dfa).into_stream();
+                while let Some(word) = stream.next() {
+                    let word = std::str::from_utf8(word)?;
+                    let mut key = word.as_bytes().to_vec();
+                    let pos = if i == 0 { positions[0].0 } else { positions[0].1 };
+                    key.extend_from_slice(&pos.to_be_bytes());
+                    if let Some(attrs) = self.postings_ids.get(rtxn, &key)? {
+                        let right = RoaringBitmap::deserialize_from(attrs)?;
+                        union_docids.union_with(&right);
                        count += 1;
-                        let word = std::str::from_utf8(word)?;
-                        let mut key = word.as_bytes().to_vec();
-                        key.extend_from_slice(&attr.to_be_bytes());
-                        if let Some(ids) = self.postings_ids.get(rtxn, &key)? {
-                            let right = RoaringBitmap::deserialize_from(ids)?;
-                            union_result.union_with(&right);
-                        }
                    }
                }
-
-                eprintln!("with {:?} similar words (for attr {}) union for {:?} gives {:?} took {:.02?}",
-                    count, attr, word, union_result.len(), before.elapsed());
            }

+            let _ = count;
+
            match &mut intersect_docids {
-                Some(left) => {
-                    let before = Instant::now();
-                    let left_len = left.len();
-                    left.intersect_with(&union_result);
-                    eprintln!("intersect between {:?} and {:?} gives {:?} took {:.02?}",
-                        left_len, union_result.len(), left.len(), before.elapsed());
-                },
-                None => intersect_docids = Some(union_result),
+                Some(left) => left.intersect_with(&union_docids),
+                None => intersect_docids = Some(union_docids),
            }
        }