mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Index the intra documents word pair proximities
This commit is contained in:
		| @@ -37,6 +37,7 @@ const WORDS_FST_KEY: &[u8] = b"\x06words-fst"; | ||||
| const HEADERS_BYTE: u8 = 0; | ||||
| const WORD_DOCID_POSITIONS_BYTE: u8 = 1; | ||||
| const WORD_DOCIDS_BYTE: u8 = 2; | ||||
| const WORDS_PROXIMITIES_BYTE: u8 = 5; | ||||
| const DOCUMENTS_IDS_BYTE: u8 = 4; | ||||
|  | ||||
| #[cfg(target_os = "linux")] | ||||
| @@ -128,6 +129,35 @@ fn create_writer(type_: CompressionType, level: Option<u32>, file: File) -> Writ | ||||
|     builder.build(file) | ||||
| } | ||||
|  | ||||
| fn compute_words_pair_proximities( | ||||
|     word_positions: &HashMap<String, RoaringBitmap>, | ||||
| ) -> HashMap<(&str, &str), RoaringBitmap> | ||||
| { | ||||
|     use itertools::Itertools; | ||||
|  | ||||
|     let mut words_pair_proximities = HashMap::new(); | ||||
|     for (w1, w2) in word_positions.keys().cartesian_product(word_positions.keys()) { | ||||
|         let mut distances = RoaringBitmap::new(); | ||||
|         let positions1: Vec<_> = word_positions[w1].iter().collect(); | ||||
|         let positions2: Vec<_> = word_positions[w2].iter().collect(); | ||||
|         for (ps1, ps2) in positions1.iter().cartesian_product(positions2.iter()) { | ||||
|             let prox = milli::proximity::positions_proximity(*ps1, *ps2); | ||||
|             // We don't care about a word that appear at the | ||||
|             // same position or too far from the other. | ||||
|             if prox > 0 && prox < 8 { distances.insert(prox); } | ||||
|         } | ||||
|         if !distances.is_empty() { | ||||
|             // We only store the proximites under one word pair. | ||||
|             let (w1, w2) = if w1 > w2 { (w2, w1) } else { (w1, w2) }; | ||||
|             words_pair_proximities.entry((w1.as_str(), w2.as_str())) | ||||
|                 .or_insert_with(RoaringBitmap::new) | ||||
|                 .union_with(&distances); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     words_pair_proximities | ||||
| } | ||||
|  | ||||
| type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>; | ||||
|  | ||||
| struct Store { | ||||
| @@ -213,6 +243,43 @@ impl Store { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     // FIXME We must store those pairs in an ArcCache to reduce the number of I/O operations, | ||||
|     //       We must store the documents ids associated with the words pairs and proximities. | ||||
|     fn write_words_proximities( | ||||
|         sorter: &mut Sorter<MergeFn>, | ||||
|         document_id: DocumentId, | ||||
|         words_pair_proximities: &HashMap<(&str, &str), RoaringBitmap>, | ||||
|     ) -> anyhow::Result<()> | ||||
|     { | ||||
|         // words proximities keys are all prefixed | ||||
|         let mut key = vec![WORDS_PROXIMITIES_BYTE]; | ||||
|         let mut buffer = Vec::new(); | ||||
|  | ||||
|         for ((w1, w2), proximities) in words_pair_proximities { | ||||
|             assert!(w1 <= w2); | ||||
|             key.truncate(1); | ||||
|             key.extend_from_slice(w1.as_bytes()); | ||||
|             key.push(0); | ||||
|             key.extend_from_slice(w2.as_bytes()); | ||||
|             let pair_len = key.len(); | ||||
|             for prox in proximities { | ||||
|                 key.truncate(pair_len); | ||||
|                 key.push(u8::try_from(prox).unwrap()); | ||||
|                 // We serialize the document ids into a buffer | ||||
|                 buffer.clear(); | ||||
|                 let ids = RoaringBitmap::from_iter(Some(document_id)); | ||||
|                 buffer.reserve(ids.serialized_size()); | ||||
|                 ids.serialize_into(&mut buffer)?; | ||||
|                 // that we write under the generated key into MTBL | ||||
|                 if lmdb_key_valid_size(&key) { | ||||
|                     sorter.insert(&key, &buffer)?; | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn write_docid_word_positions( | ||||
|         sorter: &mut Sorter<MergeFn>, | ||||
|         id: DocumentId, | ||||
| @@ -307,6 +374,9 @@ impl Store { | ||||
|                     } | ||||
|                 } | ||||
|  | ||||
|                 let words_pair_proximities = compute_words_pair_proximities(&word_positions); | ||||
|                 Self::write_words_proximities(&mut self.sorter, document_id, &words_pair_proximities)?; | ||||
|  | ||||
|                 // We write the document in the documents store. | ||||
|                 self.write_document(document_id, &word_positions, &document)?; | ||||
|                 word_positions.clear(); | ||||
| @@ -386,7 +456,7 @@ fn merge(key: &[u8], values: &[Vec<u8>]) -> Result<Vec<u8>, ()> { | ||||
|                 assert!(values.windows(2).all(|vs| vs[0] == vs[1])); | ||||
|                 Ok(values[0].to_vec()) | ||||
|             }, | ||||
|             DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE => { | ||||
|             DOCUMENTS_IDS_BYTE | WORD_DOCIDS_BYTE | WORDS_PROXIMITIES_BYTE => { | ||||
|                 let (head, tail) = values.split_first().unwrap(); | ||||
|  | ||||
|                 let mut head = RoaringBitmap::deserialize_from(head.as_slice()).unwrap(); | ||||
| @@ -428,6 +498,10 @@ fn lmdb_writer(wtxn: &mut heed::RwTxn, index: &Index, key: &[u8], val: &[u8]) -> | ||||
|         // Write the postings lists | ||||
|         index.docid_word_positions.as_polymorph() | ||||
|             .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; | ||||
|     } else if key.starts_with(&[WORDS_PROXIMITIES_BYTE]) { | ||||
|         // Write the word pair proximity document ids | ||||
|         index.word_pair_proximity_docids.as_polymorph() | ||||
|             .put::<_, ByteSlice, ByteSlice>(wtxn, &key[1..], val)?; | ||||
|     } | ||||
|  | ||||
|     Ok(()) | ||||
|   | ||||
| @@ -17,8 +17,8 @@ use heed::{PolyDatabase, Database}; | ||||
| pub use self::search::{Search, SearchResult}; | ||||
| pub use self::criterion::{Criterion, default_criteria}; | ||||
| pub use self::heed_codec::{ | ||||
|     RoaringBitmapCodec, BEU32StrCodec, CsvStringRecordCodec, | ||||
|     ByteorderXRoaringBitmapCodec, | ||||
|     RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, | ||||
|     CsvStringRecordCodec, ByteorderXRoaringBitmapCodec, | ||||
| }; | ||||
|  | ||||
| pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; | ||||
| @@ -45,6 +45,8 @@ pub struct Index { | ||||
|     pub docid_word_positions: Database<BEU32StrCodec, ByteorderXRoaringBitmapCodec>, | ||||
|     /// Maps the document id to the document as a CSV line. | ||||
|     pub documents: Database<OwnedType<BEU32>, ByteSlice>, | ||||
|     /// Maps the proximity between a pair of words with all the docids where this relation appears. | ||||
|     pub word_pair_proximity_docids: Database<StrStrU8Codec, RoaringBitmapCodec>, | ||||
| } | ||||
|  | ||||
| impl Index { | ||||
| @@ -54,6 +56,7 @@ impl Index { | ||||
|             word_docids: env.create_database(Some("word-docids"))?, | ||||
|             docid_word_positions: env.create_database(Some("docid-word-positions"))?, | ||||
|             documents: env.create_database(Some("documents"))?, | ||||
|             word_pair_proximity_docids: env.create_database(Some("word-pair-proximity-docids"))?, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -1,5 +1,4 @@ | ||||
| use std::collections::{HashMap, HashSet}; | ||||
| use std::cmp; | ||||
|  | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| use levenshtein_automata::DFA; | ||||
| @@ -12,7 +11,7 @@ use near_proximity::near_proximity; | ||||
|  | ||||
| use crate::proximity::path_proximity; | ||||
| use crate::query_tokens::{QueryTokens, QueryToken}; | ||||
| use crate::{Index, DocumentId, Position}; | ||||
| use crate::{Index, DocumentId}; | ||||
|  | ||||
| // Building these factories is not free. | ||||
| static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user