mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 01:46:28 +00:00 
			
		
		
		
	Refine some details in word_prefix_pair_proximity indexing code
This commit is contained in:
		
				
					committed by
					
						
						Loïc Lecrenier
					
				
			
			
				
	
			
			
			
						parent
						
							e6e76fbefe
						
					
				
				
					commit
					ab2f6f3aa4
				
			@@ -35,9 +35,6 @@ pub fn index_prefix_word_database(
 | 
			
		||||
        .filter(|s| s.len() <= max_prefix_length)
 | 
			
		||||
        .collect();
 | 
			
		||||
 | 
			
		||||
    // If the prefix trie is not empty, then we can iterate over all new
 | 
			
		||||
    // word pairs to look for new (word1, common_prefix, proximity) elements
 | 
			
		||||
    // to insert in the DB
 | 
			
		||||
    for proximity in 1..=max_proximity - 1 {
 | 
			
		||||
        for prefix in common_prefixes.iter() {
 | 
			
		||||
            let mut prefix_key = vec![];
 | 
			
		||||
@@ -135,13 +132,11 @@ pub fn index_prefix_word_database(
 | 
			
		||||
    Ok(())
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
 | 
			
		||||
/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
 | 
			
		||||
///
 | 
			
		||||
/// Its main arguments are:
 | 
			
		||||
/// 1. a sorted prefix iterator over ((word1, word2, proximity), docids) elements
 | 
			
		||||
/// 2. a closure to describe how to handle the newly computed (word1, prefix, proximity) elements
 | 
			
		||||
///
 | 
			
		||||
/// For more information about what this function does, read the module documentation.
 | 
			
		||||
/// Its arguments are:
 | 
			
		||||
/// - an iterator over the words following the given `prefix` with the given `proximity`
 | 
			
		||||
/// - a closure to describe how to handle the newly computed (proximity, prefix, word2) elements
 | 
			
		||||
fn execute_on_word_pairs_and_prefixes<I>(
 | 
			
		||||
    proximity: u8,
 | 
			
		||||
    prefix: &[u8],
 | 
			
		||||
@@ -151,28 +146,32 @@ fn execute_on_word_pairs_and_prefixes<I>(
 | 
			
		||||
) -> Result<()> {
 | 
			
		||||
    let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = <_>::default();
 | 
			
		||||
 | 
			
		||||
    while let Some((word2, data)) = next_word2_and_docids(iter)? {
 | 
			
		||||
    // Memory usage check:
 | 
			
		||||
    // The content of the loop will be called for each `word2` that follows a word beginning
 | 
			
		||||
    // with `prefix` with the given proximity.
 | 
			
		||||
    // In practice, the batch is not expected to ever get too big.
 | 
			
		||||
    while let Some((word2, docids)) = next_word2_and_docids(iter)? {
 | 
			
		||||
        let entry = batch.entry(word2.to_owned()).or_default();
 | 
			
		||||
        entry.push(Cow::Owned(data.to_owned()));
 | 
			
		||||
        entry.push(Cow::Owned(docids.to_owned()));
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    let mut key_buffer = Vec::with_capacity(8);
 | 
			
		||||
    let mut key_buffer = Vec::with_capacity(512);
 | 
			
		||||
    key_buffer.push(proximity);
 | 
			
		||||
    key_buffer.extend_from_slice(prefix);
 | 
			
		||||
    key_buffer.push(0);
 | 
			
		||||
 | 
			
		||||
    let mut value_buffer = Vec::with_capacity(65_536);
 | 
			
		||||
 | 
			
		||||
    for (key, values) in batch {
 | 
			
		||||
    for (word2, docids) in batch {
 | 
			
		||||
        key_buffer.truncate(prefix.len() + 2);
 | 
			
		||||
        value_buffer.clear();
 | 
			
		||||
 | 
			
		||||
        key_buffer.extend_from_slice(&key);
 | 
			
		||||
        let data = if values.len() > 1 {
 | 
			
		||||
            CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?;
 | 
			
		||||
        key_buffer.extend_from_slice(&word2);
 | 
			
		||||
        let data = if docids.len() > 1 {
 | 
			
		||||
            CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?;
 | 
			
		||||
            value_buffer.as_slice()
 | 
			
		||||
        } else {
 | 
			
		||||
            &values[0]
 | 
			
		||||
            &docids[0]
 | 
			
		||||
        };
 | 
			
		||||
        insert(key_buffer.as_slice(), data)?;
 | 
			
		||||
    }
 | 
			
		||||
 
 | 
			
		||||
@@ -1,5 +1,4 @@
 | 
			
		||||
/*!
 | 
			
		||||
 ## What is WordPrefix?
 | 
			
		||||
The word-prefix-pair-proximity-docids database is a database whose keys are of
 | 
			
		||||
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
 | 
			
		||||
the documents which contain `word` followed by another word starting with
 | 
			
		||||
@@ -320,7 +319,7 @@ fn execute_on_word_pairs_and_prefixes<I>(
 | 
			
		||||
    let mut merge_buffer = Vec::with_capacity(65_536);
 | 
			
		||||
 | 
			
		||||
    while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? {
 | 
			
		||||
        // skip this iteration if the proximity is over the threshold
 | 
			
		||||
        // stop indexing if the proximity is over the threshold
 | 
			
		||||
        if proximity > max_proximity {
 | 
			
		||||
            break;
 | 
			
		||||
        };
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user