Simplify word level position DB into a word position DB

This commit is contained in:
many
2021-10-05 11:18:42 +02:00
parent 75d341d928
commit 3296bb243c
18 changed files with 220 additions and 545 deletions

View File

@ -14,13 +14,13 @@ use crate::{DocumentId, Result};
/// Returns a grenad reader with the list of extracted words at positions and
/// document ids from the given chunk of docid word positions.
#[logging_timer::time]
pub fn extract_word_level_position_docids<R: io::Read>(
pub fn extract_word_position_docids<R: io::Read>(
mut docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
let max_memory = indexer.max_memory_by_thread();
let mut word_level_position_docids_sorter = create_sorter(
let mut word_position_docids_sorter = create_sorter(
merge_cbo_roaring_bitmaps,
indexer.chunk_compression_type,
indexer.chunk_compression_level,
@ -37,15 +37,11 @@ pub fn extract_word_level_position_docids<R: io::Read>(
for position in read_u32_ne_bytes(value) {
key_buffer.clear();
key_buffer.extend_from_slice(word_bytes);
key_buffer.push(0); // tree level
// Levels are composed of left and right bounds.
key_buffer.extend_from_slice(&position.to_be_bytes());
key_buffer.extend_from_slice(&position.to_be_bytes());
word_level_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
word_position_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
}
}
sorter_into_reader(word_level_position_docids_sorter, indexer)
sorter_into_reader(word_position_docids_sorter, indexer)
}

View File

@ -5,8 +5,8 @@ mod extract_fid_docid_facet_values;
mod extract_fid_word_count_docids;
mod extract_geo_points;
mod extract_word_docids;
mod extract_word_level_position_docids;
mod extract_word_pair_proximity_docids;
mod extract_word_position_docids;
use std::collections::HashSet;
use std::fs::File;
@ -22,8 +22,8 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
use self::extract_geo_points::extract_geo_points;
use self::extract_word_docids::extract_word_docids;
use self::extract_word_level_position_docids::extract_word_level_position_docids;
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{
into_clonable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
@ -98,10 +98,10 @@ pub(crate) fn data_from_obkv_documents(
docid_word_positions_chunks.clone(),
indexer.clone(),
lmdb_writer_sx.clone(),
extract_word_level_position_docids,
extract_word_position_docids,
merge_cbo_roaring_bitmaps,
TypedChunk::WordLevelPositionDocids,
"word-level-position-docids",
TypedChunk::WordPositionDocids,
"word-position-docids",
);
spawn_extraction_task(

View File

@ -27,7 +27,7 @@ pub use self::transform::{Transform, TransformOutput};
use crate::documents::DocumentBatchReader;
use crate::update::{
Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
WordsLevelPositions, WordsPrefixesFst,
WordPrefixPositionDocids, WordsPrefixesFst,
};
use crate::{Index, Result};
@ -412,8 +412,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
total_databases: TOTAL_POSTING_DATABASE_COUNT,
});
// Run the words level positions update operation.
let mut builder = WordsLevelPositions::new(self.wtxn, self.index);
// Run the word prefix position docids update operation.
let mut builder = WordPrefixPositionDocids::new(self.wtxn, self.index);
builder.chunk_compression_type = self.chunk_compression_type;
builder.chunk_compression_level = self.chunk_compression_level;
builder.max_nb_chunks = self.max_nb_chunks;

View File

@ -22,7 +22,7 @@ pub(crate) enum TypedChunk {
FieldIdWordcountDocids(grenad::Reader<File>),
NewDocumentsIds(RoaringBitmap),
WordDocids(grenad::Reader<File>),
WordLevelPositionDocids(grenad::Reader<File>),
WordPositionDocids(grenad::Reader<File>),
WordPairProximityDocids(grenad::Reader<File>),
FieldIdFacetStringDocids(grenad::Reader<File>),
FieldIdFacetNumberDocids(grenad::Reader<File>),
@ -110,10 +110,10 @@ pub(crate) fn write_typed_chunk_into_index(
index.put_words_fst(wtxn, &fst)?;
is_merged_database = true;
}
TypedChunk::WordLevelPositionDocids(word_level_position_docids_iter) => {
TypedChunk::WordPositionDocids(word_position_docids_iter) => {
append_entries_into_database(
word_level_position_docids_iter,
&index.word_level_position_docids,
word_position_docids_iter,
&index.word_position_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),