Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-07-21 13:51:05 +00:00

Compare commits: 20 commits between two `prototype-` branches (branch names truncated in the source):

b0adc73ce6, 72d3fa4898, 772964125d, 378deb0bef, 1f36410541, 264b10ec20, 825257da76, f8289cd974, 3053e01c05, b11c2afac0, 9cef800b2a, db2fb86b8b, 882ab9cc85, 5a9c96e1db, 70ce40828c, 688266c83e, 6dab826908, 1e2fbc6a42, 523519fdbf, ef6fa10f7a
@@ -864,22 +864,12 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBi
     let indexer_config = IndexerConfig::default();
     for ids in document_ids_to_delete {
-        let external_documents_ids = index.external_documents_ids();
-        // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
-        // Since what we have is an iterator, it would be better to delete in chunks
-        let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
-            external_documents_ids
-                .find_external_id_of(&wtxn, ids)
-                .unwrap()
-                .only_external_ids()
-                .collect();
-        let ids = external_to_internal.unwrap();
         let config = IndexDocumentsConfig::default();

         let mut builder =
             IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
                 .unwrap();
-        (builder, _) = builder.remove_documents(ids).unwrap();
+        (builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap();
         builder.execute().unwrap();
     }
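
Note: the hunk above switches the test helper from resolving external ids to handing the internal ids straight to `remove_documents_from_db_no_batch`. A minimal sketch, assuming only the `roaring` crate (which milli uses for internal docids), of the bitmap these ids travel in:

    use roaring::RoaringBitmap;

    fn main() {
        // Internal document ids are stored as a compressed bitmap of u32s.
        let mut ids = RoaringBitmap::new();
        ids.insert(0);
        ids.insert(42);
        ids.insert(100_000);
        assert_eq!(ids.len(), 3);
        // Iteration yields the ids in ascending order.
        assert_eq!(ids.iter().collect::<Vec<u32>>(), vec![0, 42, 100_000]);
    }
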
@@ -104,12 +104,6 @@ pub(crate) enum IndexOperation {
         operations: Vec<DocumentOperation>,
         tasks: Vec<Task>,
     },
-    DocumentDeletion {
-        index_uid: String,
-        // The vec associated with each document deletion tasks.
-        documents: Vec<Vec<String>>,
-        tasks: Vec<Task>,
-    },
     IndexDocumentDeletionByFilter {
         index_uid: String,
         task: Task,
@@ -161,7 +155,6 @@ impl Batch {
             }
             Batch::IndexOperation { op, .. } => match op {
                 IndexOperation::DocumentOperation { tasks, .. }
-                | IndexOperation::DocumentDeletion { tasks, .. }
                 | IndexOperation::Settings { tasks, .. }
                 | IndexOperation::DocumentClear { tasks, .. } => {
                     tasks.iter().map(|task| task.uid).collect()
@@ -226,7 +219,6 @@ impl IndexOperation {
     pub fn index_uid(&self) -> &str {
         match self {
             IndexOperation::DocumentOperation { index_uid, .. }
-            | IndexOperation::DocumentDeletion { index_uid, .. }
            | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
            | IndexOperation::DocumentClear { index_uid, .. }
            | IndexOperation::Settings { index_uid, .. }
@@ -242,9 +234,6 @@ impl fmt::Display for IndexOperation {
             IndexOperation::DocumentOperation { .. } => {
                 f.write_str("IndexOperation::DocumentOperation")
             }
-            IndexOperation::DocumentDeletion { .. } => {
-                f.write_str("IndexOperation::DocumentDeletion")
-            }
             IndexOperation::IndexDocumentDeletionByFilter { .. } => {
                 f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
             }
@@ -347,18 +336,27 @@ impl IndexScheduler {
             BatchKind::DocumentDeletion { deletion_ids } => {
                 let tasks = self.get_existing_tasks(rtxn, deletion_ids)?;

-                let mut documents = Vec::new();
+                let mut operations = Vec::with_capacity(tasks.len());
+                let mut documents_counts = Vec::with_capacity(tasks.len());
                 for task in &tasks {
                     match task.kind {
                         KindWithContent::DocumentDeletion { ref documents_ids, .. } => {
-                            documents.push(documents_ids.clone())
+                            operations.push(DocumentOperation::Delete(documents_ids.clone()));
+                            documents_counts.push(documents_ids.len() as u64);
                         }
                         _ => unreachable!(),
                     }
                 }

                 Ok(Some(Batch::IndexOperation {
-                    op: IndexOperation::DocumentDeletion { index_uid, documents, tasks },
+                    op: IndexOperation::DocumentOperation {
+                        index_uid,
+                        primary_key: None,
+                        method: IndexDocumentsMethod::ReplaceDocuments,
+                        documents_counts,
+                        operations,
+                        tasks,
+                    },
                     must_create_index,
                 }))
             }
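
Note: with this batching change, document-deletion tasks no longer get their own batch variant; each task's ids become a `DocumentOperation::Delete` plus a per-task count. A self-contained sketch of that conversion (the enum here is a stand-in, not the real index-scheduler type):

    #[derive(Debug)]
    enum DocumentOperation {
        Delete(Vec<String>),
    }

    fn main() {
        // One Vec<String> of external ids per deletion task.
        let tasks: Vec<Vec<String>> = vec![vec!["1".into(), "2".into()], vec!["3".into()]];

        let mut operations = Vec::with_capacity(tasks.len());
        let mut documents_counts = Vec::with_capacity(tasks.len());
        for documents_ids in &tasks {
            operations.push(DocumentOperation::Delete(documents_ids.clone()));
            documents_counts.push(documents_ids.len() as u64);
        }
        assert_eq!(documents_counts, vec![2, 1]);
        assert_eq!(operations.len(), 2);
    }
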
@@ -1275,45 +1273,6 @@ impl IndexScheduler {

                 Ok(tasks)
             }
-            IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
-                let indexer_config = self.index_mapper.indexer_config();
-                let config = IndexDocumentsConfig {
-                    update_method: IndexDocumentsMethod::ReplaceDocuments,
-                    ..Default::default()
-                };
-                let must_stop_processing = self.must_stop_processing.clone();
-
-                let mut builder = milli::update::IndexDocuments::new(
-                    index_wtxn,
-                    index,
-                    indexer_config,
-                    config,
-                    |indexing_step| trace!("update: {:?}", indexing_step),
-                    || must_stop_processing.get(),
-                )?;
-
-                let document_ids = documents.iter().flatten().cloned().collect();
-
-                let (new_builder, user_result) = builder.remove_documents(document_ids)?;
-                builder = new_builder;
-                // Uses Invariant: remove documents actually always returns Ok for the inner result
-                let count = user_result.unwrap();
-
-                for (task, documents) in tasks.iter_mut().zip(documents) {
-                    task.status = Status::Succeeded;
-                    task.details = Some(Details::DocumentDeletion {
-                        provided_ids: documents.len(),
-                        deleted_documents: Some(count.min(documents.len() as u64)),
-                    });
-                }
-
-                if !tasks.iter().all(|res| res.error.is_some()) {
-                    let addition = builder.execute()?;
-                    info!("document deletion done: {:?}", addition);
-                }
-
-                Ok(tasks)
-            }
             IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
                 let filter =
                     if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
@@ -1575,18 +1534,6 @@ fn delete_document_by_filter<'a>(
             }
             e => e.into(),
         })?;
-        let external_documents_ids = index.external_documents_ids();
-        // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
-        // Since what we have is an iterator, it would be better to delete in chunks
-        let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
-            external_documents_ids
-                .find_external_id_of(wtxn, candidates)?
-                .only_external_ids()
-                .collect();
-        let document_ids = match external_to_internal {
-            Ok(external_ids) => external_ids,
-            Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids),
-        };

         let config = IndexDocumentsConfig {
             update_method: IndexDocumentsMethod::ReplaceDocuments,
@@ -1602,13 +1549,10 @@ fn delete_document_by_filter<'a>(
             || must_stop_processing.get(),
         )?;

-        let (new_builder, user_result) = builder.remove_documents(document_ids)?;
+        let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?;
         builder = new_builder;
-        // Uses Invariant: remove documents actually always returns Ok for the inner result
-        let count = user_result.unwrap();

         let _ = builder.execute()?;

         count
     } else {
         0
@@ -1,5 +1,6 @@
 mod builder;
 mod enriched;
+mod primary_key;
 mod reader;
 mod serde_impl;

@@ -11,6 +12,7 @@ use bimap::BiHashMap;
 pub use builder::DocumentsBatchBuilder;
 pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
 use obkv::KvReader;
+pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY};
 pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
 use serde::{Deserialize, Serialize};

@@ -87,6 +89,12 @@ impl DocumentsBatchIndex {
     }
 }

+impl FieldIdMapper for DocumentsBatchIndex {
+    fn id(&self, name: &str) -> Option<FieldId> {
+        self.id(name)
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub enum Error {
     #[error("Error parsing number {value:?} at line {line}: {error}")]
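
Note: the new `FieldIdMapper` trait lets anything that can resolve a field name to a `FieldId` drive primary-key extraction. A self-contained sketch of the same pattern over a plain HashMap (a stand-in, not a milli type):

    use std::collections::HashMap;

    type FieldId = u16;

    trait FieldIdMapper {
        fn id(&self, name: &str) -> Option<FieldId>;
    }

    impl FieldIdMapper for HashMap<String, FieldId> {
        fn id(&self, name: &str) -> Option<FieldId> {
            self.get(name).copied()
        }
    }

    fn main() {
        let mut fields = HashMap::new();
        fields.insert("id".to_string(), 0u16);
        assert_eq!(FieldIdMapper::id(&fields, "id"), Some(0));
        assert_eq!(FieldIdMapper::id(&fields, "missing"), None);
    }
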
milli/src/documents/primary_key.rs (new file, 172 lines)
@@ -0,0 +1,172 @@
use std::iter;
use std::result::Result as StdResult;

use serde_json::Value;

use crate::{FieldId, InternalError, Object, Result, UserError};

/// The symbol used to define levels in a nested primary key.
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';

/// The default primary key that is used when not specified.
pub const DEFAULT_PRIMARY_KEY: &str = "id";

/// Trait for objects that can map the name of a field to its [`FieldId`].
pub trait FieldIdMapper {
    /// Attempts to map the passed name to its [`FieldId`].
    ///
    /// `None` if the field with this name was not found.
    fn id(&self, name: &str) -> Option<FieldId>;
}

/// A type that represents the type of primary key that has been set
/// for this index, a classic flat one or a nested one.
#[derive(Debug, Clone, Copy)]
pub enum PrimaryKey<'a> {
    Flat { name: &'a str, field_id: FieldId },
    Nested { name: &'a str },
}

pub enum DocumentIdExtractionError {
    InvalidDocumentId(UserError),
    MissingDocumentId,
    TooManyDocumentIds(usize),
}

impl<'a> PrimaryKey<'a> {
    pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> {
        Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
            Self::Nested { name: path }
        } else {
            let field_id = fields.id(path)?;
            Self::Flat { name: path, field_id }
        })
    }

    pub fn name(&self) -> &str {
        match self {
            PrimaryKey::Flat { name, .. } => name,
            PrimaryKey::Nested { name } => name,
        }
    }

    pub fn document_id(
        &self,
        document: &obkv::KvReader<FieldId>,
        fields: &impl FieldIdMapper,
    ) -> Result<StdResult<String, DocumentIdExtractionError>> {
        match self {
            PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
                Some(document_id_bytes) => {
                    let document_id = serde_json::from_slice(document_id_bytes)
                        .map_err(InternalError::SerdeJson)?;
                    match validate_document_id_value(document_id)? {
                        Ok(document_id) => Ok(Ok(document_id)),
                        Err(user_error) => {
                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
                        }
                    }
                }
                None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
            },
            nested @ PrimaryKey::Nested { .. } => {
                let mut matching_documents_ids = Vec::new();
                for (first_level_name, right) in nested.possible_level_names() {
                    if let Some(field_id) = fields.id(first_level_name) {
                        if let Some(value_bytes) = document.get(field_id) {
                            let object = serde_json::from_slice(value_bytes)
                                .map_err(InternalError::SerdeJson)?;
                            fetch_matching_values(object, right, &mut matching_documents_ids);

                            if matching_documents_ids.len() >= 2 {
                                return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(
                                    matching_documents_ids.len(),
                                )));
                            }
                        }
                    }
                }

                match matching_documents_ids.pop() {
                    Some(document_id) => match validate_document_id_value(document_id)? {
                        Ok(document_id) => Ok(Ok(document_id)),
                        Err(user_error) => {
                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
                        }
                    },
                    None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
                }
            }
        }
    }

    /// Returns an `Iterator` that gives all the possible field names the primary key
    /// can have depending on the first level name and depth of the objects.
    pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
        let name = self.name();
        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
            .chain(iter::once((name, "")))
    }
}

fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
    match value {
        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
        otherwise => output.push(otherwise),
    }
}

fn fetch_matching_values_in_object(
    object: Object,
    selector: &str,
    base_key: &str,
    output: &mut Vec<Value>,
) {
    for (key, value) in object {
        let base_key = if base_key.is_empty() {
            key.to_string()
        } else {
            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
        };

        if starts_with(selector, &base_key) {
            match value {
                Value::Object(object) => {
                    fetch_matching_values_in_object(object, selector, &base_key, output)
                }
                value => output.push(value),
            }
        }
    }
}

fn starts_with(selector: &str, key: &str) -> bool {
    selector.strip_prefix(key).map_or(false, |tail| {
        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
    })
}

// FIXME: move to a DocumentId struct

fn validate_document_id(document_id: &str) -> Option<&str> {
    if !document_id.is_empty()
        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
    {
        Some(document_id)
    } else {
        None
    }
}

pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
    match document_id {
        Value::String(string) => match validate_document_id(&string) {
            Some(s) if s.len() == string.len() => Ok(Ok(string)),
            Some(s) => Ok(Ok(s.to_string())),
            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
        },
        Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
        content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
    }
}
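
Note: the subtle part of the new file is `possible_level_names`, which enumerates every split of a nested primary key path. A standalone sketch of the same logic (a hypothetical free function, not the milli method itself):

    fn possible_level_names(name: &str) -> Vec<(&str, &str)> {
        // Split at every '.' plus a final "whole name, empty tail" entry,
        // mirroring the match_indices/chain(iter::once(..)) combinator chain above.
        name.match_indices('.')
            .map(|(i, _)| (&name[..i], &name[i + 1..]))
            .chain(std::iter::once((name, "")))
            .collect()
    }

    fn main() {
        assert_eq!(
            possible_level_names("nested.object.id"),
            vec![("nested", "object.id"), ("nested.object", "id"), ("nested.object.id", "")]
        );
    }
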
@@ -81,6 +81,12 @@ impl Default for FieldsIdsMap {
     }
 }

+impl crate::documents::FieldIdMapper for FieldsIdsMap {
+    fn id(&self, name: &str) -> Option<FieldId> {
+        self.id(name)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -12,6 +12,7 @@ use rstar::RTree;
 use time::OffsetDateTime;

 use crate::distance::NDotProductPoint;
+use crate::documents::PrimaryKey;
 use crate::error::{InternalError, UserError};
 use crate::fields_ids_map::FieldsIdsMap;
 use crate::heed_codec::facet::{
@@ -83,8 +84,6 @@ pub mod db_name {
     pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids";
     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
-    pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
-    pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
     pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
     pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
@@ -129,10 +128,6 @@ pub struct Index {

     /// Maps the proximity between a pair of words with all the docids where this relation appears.
     pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
-    pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears.
-    pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,

     /// Maps the word and the position with the docids that corresponds to it.
     pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
@@ -186,7 +181,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;

-        options.max_dbs(26);
+        options.max_dbs(24);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };

         let env = options.open(path)?;
@@ -203,10 +198,6 @@ impl Index {
             env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
         let script_language_docids =
             env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?;
-        let word_prefix_pair_proximity_docids =
-            env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
-        let prefix_word_pair_proximity_docids =
-            env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
         let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?;
         let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?;
         let field_id_word_count_docids =
@@ -247,8 +238,6 @@ impl Index {
             exact_word_prefix_docids,
             word_pair_proximity_docids,
             script_language_docids,
-            word_prefix_pair_proximity_docids,
-            prefix_word_pair_proximity_docids,
             word_position_docids,
             word_fid_docids,
             word_prefix_position_docids,
@@ -1176,6 +1165,36 @@ impl Index {
         self.iter_documents(rtxn, self.documents_ids(rtxn)?)
     }

+    pub fn external_id_of<'a, 't: 'a>(
+        &'a self,
+        rtxn: &'t RoTxn,
+        ids: impl IntoIterator<Item = DocumentId> + 'a,
+    ) -> Result<impl IntoIterator<Item = Result<String>> + 'a> {
+        let fields = self.fields_ids_map(rtxn)?;
+
+        // uses precondition "never called on an empty index"
+        let primary_key = self.primary_key(rtxn)?.ok_or(InternalError::DatabaseMissingEntry {
+            db_name: db_name::MAIN,
+            key: Some(main_key::PRIMARY_KEY_KEY),
+        })?;
+        let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| {
+            InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldName {
+                field_name: primary_key.to_owned(),
+                process: "external_id_of",
+            })
+        })?;
+        Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> {
+            let (_docid, obkv) = entry?;
+            match primary_key.document_id(&obkv, &fields)? {
+                Ok(document_id) => Ok(document_id),
+                Err(_) => Err(InternalError::DocumentsError(
+                    crate::documents::Error::InvalidDocumentFormat,
+                )
+                .into()),
+            }
+        }))
+    }
+
     pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {
         FacetDistribution::new(rtxn, self)
     }
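
Note: the new `external_id_of` resolves internal docids back to external ids lazily, surfacing failures per document instead of unwrapping. A toy model of that shape, using a HashMap as a stand-in for the documents database:

    use std::collections::HashMap;

    fn external_ids_of<'a>(
        store: &'a HashMap<u32, String>,
        ids: impl IntoIterator<Item = u32> + 'a,
    ) -> impl Iterator<Item = Result<&'a str, u32>> + 'a {
        // Each lookup yields its own Result, like the iterator external_id_of returns.
        ids.into_iter().map(move |id| store.get(&id).map(String::as_str).ok_or(id))
    }

    fn main() {
        let store: HashMap<u32, String> =
            [(0, "a".to_string()), (1, "b".to_string())].into_iter().collect();
        let resolved: Vec<_> = external_ids_of(&store, [0, 1, 2]).collect();
        assert_eq!(resolved, vec![Ok("a"), Ok("b"), Err(2)]);
    }
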
@@ -11,7 +11,9 @@ use super::interner::Interned;
 use super::Word;
 use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
 use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
-use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext};
+use crate::{
+    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
+};

 /// A cache storing pointers to values in the LMDB databases.
 ///
@@ -23,7 +25,7 @@ pub struct DatabaseCache<'ctx> {
     pub word_pair_proximity_docids:
         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
     pub word_prefix_pair_proximity_docids:
-        FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
+        FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>,
     pub prefix_word_pair_proximity_docids:
         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
     pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
@@ -295,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> {
         prefix2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
-            self.txn,
-            (proximity, word1, prefix2),
-            &(
-                proximity,
-                self.word_interner.get(word1).as_str(),
-                self.word_interner.get(prefix2).as_str(),
-            ),
-            &mut self.db_cache.word_prefix_pair_proximity_docids,
-            self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
-        )
+        let docids = match self
+            .db_cache
+            .word_prefix_pair_proximity_docids
+            .entry((proximity, word1, prefix2))
+        {
+            Entry::Occupied(docids) => docids.get().clone(),
+            Entry::Vacant(entry) => {
+                // compute docids using prefix iter and store the result in the cache.
+                let key = U8StrStrCodec::bytes_encode(&(
+                    proximity,
+                    self.word_interner.get(word1).as_str(),
+                    self.word_interner.get(prefix2).as_str(),
+                ))
+                .unwrap()
+                .into_owned();
+                let mut prefix_docids = RoaringBitmap::new();
+                let remap_key_type = self
+                    .index
+                    .word_pair_proximity_docids
+                    .remap_key_type::<ByteSlice>()
+                    .prefix_iter(self.txn, &key)?;
+                for result in remap_key_type {
+                    let (_, docids) = result?;
+
+                    prefix_docids |= docids;
+                }
+                entry.insert(Some(prefix_docids.clone()));
+                Some(prefix_docids)
+            }
+        };
+        Ok(docids)
     }

     pub fn get_db_prefix_word_pair_proximity_docids(
         &mut self,
         left_prefix: Interned<String>,
         right: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
-            self.txn,
-            (proximity, left_prefix, right),
-            &(
-                proximity,
-                self.word_interner.get(left_prefix).as_str(),
-                self.word_interner.get(right).as_str(),
-            ),
-            &mut self.db_cache.prefix_word_pair_proximity_docids,
-            self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
-        )
+        // only accept exact matches on reverted positions
+        self.get_db_word_pair_proximity_docids(left_prefix, right, proximity)
     }

     pub fn get_db_word_fid_docids(
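
Note: since the word-prefix-pair database is removed, the cache entry is now computed on the fly by prefix-iterating word_pair_proximity_docids and unioning the bitmaps. A sketch of that union step, assuming the `roaring` crate (the string prefix test stands in for the LMDB prefix_iter):

    use roaring::RoaringBitmap;

    fn main() {
        let per_word: Vec<(&str, RoaringBitmap)> = vec![
            ("winter", RoaringBitmap::from_iter([1u32, 2])),
            ("wind", RoaringBitmap::from_iter([2u32, 3])),
            ("summer", RoaringBitmap::from_iter([9u32])),
        ];

        // Union the docids of every word starting with the prefix.
        let prefix = "win";
        let mut prefix_docids = RoaringBitmap::new();
        for (word, docids) in &per_word {
            if word.starts_with(prefix) {
                prefix_docids |= docids;
            }
        }
        assert_eq!(prefix_docids.iter().collect::<Vec<_>>(), vec![1, 2, 3]);
    }
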
@@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best s");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);

@@ -379,13 +379,13 @@ fn test_proximity_prefix_db() {
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best summer meal\"",
-        "\"summer best\"",
         "\"this is the best meal of summer\"",
-        "\"summer x best\"",
         "\"this is the best meal I have ever had in such a beautiful summer day\"",
         "\"this is the best cooked meal of the summer\"",
         "\"this is the best meal of the summer\"",
         "\"summer x y best\"",
+        "\"summer x best\"",
+        "\"summer best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
     ]
     "###);
@@ -423,20 +423,20 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best win");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);

     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best winter meal\"",
-        "\"winter best\"",
         "\"this is the best meal of winter\"",
-        "\"winter x best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
         "\"this is the best cooked meal of the winter\"",
         "\"this is the best meal of the winter\"",
         "\"winter x y best\"",
+        "\"winter x best\"",
+        "\"winter best\"",
     ]
     "###);

@@ -471,20 +471,20 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best wi");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);

     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best winter meal\"",
-        "\"winter best\"",
         "\"this is the best meal of winter\"",
-        "\"winter x best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
         "\"this is the best cooked meal of the winter\"",
         "\"this is the best meal of the winter\"",
         "\"winter x y best\"",
+        "\"winter x best\"",
+        "\"winter best\"",
     ]
     "###);
 }
@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
             },
         ),
     ],
-    [
-        Proximity(
-            Rank {
-                rank: 3,
-                max_rank: 4,
-            },
-        ),
-    ],
     [
         Proximity(
             Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 2,
+                rank: 1,
                 max_rank: 4,
             },
         ),
+    ],
+    [
+        Proximity(
+            Rank {
+                rank: 1,
+                max_rank: 4,
+            },
+        ),
@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
             },
         ),
     ],
-    [
-        Proximity(
-            Rank {
-                rank: 3,
-                max_rank: 4,
-            },
-        ),
-    ],
     [
         Proximity(
             Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
     [
        Proximity(
             Rank {
-                rank: 2,
+                rank: 1,
                 max_rank: 4,
             },
         ),
+    ],
+    [
+    Proximity(
+        Rank {
+                rank: 1,
+                max_rank: 4,
+            },
+        ),
@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
             },
         ),
     ],
-    [
-        Proximity(
-            Rank {
-                rank: 3,
-                max_rank: 4,
-            },
-        ),
-    ],
     [
         Proximity(
             Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 2,
+                rank: 1,
                 max_rank: 4,
             },
         ),
+    ],
+    [
+        Proximity(
+            Rank {
+                rank: 1,
+                max_rank: 4,
+            },
+        ),
@@ -219,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
         &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
     })
 }
-pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
-    make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |(
-        (proximity, word1, prefix),
-        b,
-    )| {
-        &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b))
-    })
-}
-pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String {
-    make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |(
-        (proximity, prefix, word2),
-        b,
-    )| {
-        &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b))
-    })
-}
 pub fn snap_word_position_docids(index: &Index) -> String {
     make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
         &format!("{word:<16} {position:<6} {}", display_bitmap(&b))
@@ -26,8 +26,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             word_prefix_docids,
             exact_word_prefix_docids,
             word_pair_proximity_docids,
-            word_prefix_pair_proximity_docids,
-            prefix_word_pair_proximity_docids,
             word_position_docids,
             word_fid_docids,
             field_id_word_count_docids,
@@ -68,8 +66,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_prefix_docids.clear(self.wtxn)?;
         exact_word_prefix_docids.clear(self.wtxn)?;
         word_pair_proximity_docids.clear(self.wtxn)?;
-        word_prefix_pair_proximity_docids.clear(self.wtxn)?;
-        prefix_word_pair_proximity_docids.clear(self.wtxn)?;
         word_position_docids.clear(self.wtxn)?;
         word_fid_docids.clear(self.wtxn)?;
         field_id_word_count_docids.clear(self.wtxn)?;
@@ -132,7 +128,6 @@ mod tests {
         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
-        assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
         assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
@@ -102,3 +102,17 @@ pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
 pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
     del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
 }
+
+/// A function that extracts and returns the Add side of a DelAdd obkv.
+/// This is useful when there is no previous value in the database and
+/// therefore we don't need to do a diff with what's already there.
+///
+/// If there is no Add side we currently write an empty buffer
+/// which is a valid CboRoaringBitmap.
+#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
+pub fn deladd_serialize_add_side<'a>(
+    obkv: &'a [u8],
+    _buffer: &mut Vec<u8>,
+) -> crate::Result<&'a [u8]> {
+    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
+}
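
Note: a toy model of the Del/Add pair behind `deladd_serialize_add_side`, using plain Options instead of the real obkv encoding: taking the Add side keeps the new value and falls back to an empty slice when the entry is a pure deletion.

    struct DelAdd<'a> {
        deletion: Option<&'a [u8]>,
        addition: Option<&'a [u8]>,
    }

    fn serialize_add_side<'a>(del_add: &DelAdd<'a>) -> &'a [u8] {
        // Mirrors get(DelAdd::Addition).unwrap_or_default(): an empty
        // buffer is still a valid CboRoaringBitmap.
        del_add.addition.unwrap_or_default()
    }

    fn main() {
        let update = DelAdd { deletion: Some(b"old"), addition: Some(b"new") };
        assert_eq!(serialize_add_side(&update), &b"new"[..]);

        let pure_deletion = DelAdd { deletion: Some(b"old"), addition: None };
        assert!(serialize_add_side(&pure_deletion).is_empty());
    }
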
@@ -1,20 +1,17 @@
+use std::fmt;
 use std::io::{BufWriter, Read, Seek};
 use std::result::Result as StdResult;
-use std::{fmt, iter};

 use serde::{Deserialize, Serialize};
 use serde_json::Value;

-use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
+use crate::documents::{
+    DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader,
+    EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY,
+};
 use crate::error::{GeoError, InternalError, UserError};
 use crate::update::index_documents::{obkv_to_object, writer_into_reader};
-use crate::{FieldId, Index, Object, Result};
-
-/// The symbol used to define levels in a nested primary key.
-const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
-
-/// The default primary that is used when not specified.
-const DEFAULT_PRIMARY_KEY: &str = "id";
+use crate::{FieldId, Index, Result};

 /// This function validates and enrich the documents by checking that:
 /// - we can infer a primary key,
@@ -41,14 +38,12 @@ pub fn enrich_documents_batch<R: Read + Seek>(
     // The primary key *field id* that has already been set for this index or the one
     // we will guess by searching for the first key that contains "id" as a substring.
     let primary_key = match index.primary_key(rtxn)? {
-        Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => {
-            PrimaryKey::nested(primary_key)
-        }
-        Some(primary_key) => match documents_batch_index.id(primary_key) {
-            Some(id) => PrimaryKey::flat(primary_key, id),
-            None if autogenerate_docids => {
-                PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key))
-            }
+        Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) {
+            Some(primary_key) => primary_key,
+            None if autogenerate_docids => PrimaryKey::Flat {
+                name: primary_key,
+                field_id: documents_batch_index.insert(primary_key),
+            },
             None => {
                 return match cursor.next_document()? {
                     Some(first_document) => Ok(Err(UserError::MissingDocumentId {
@@ -76,14 +71,14 @@ pub fn enrich_documents_batch<R: Read + Seek>(
     });

     match guesses.as_slice() {
-        [] if autogenerate_docids => PrimaryKey::flat(
-            DEFAULT_PRIMARY_KEY,
-            documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
-        ),
+        [] if autogenerate_docids => PrimaryKey::Flat {
+            name: DEFAULT_PRIMARY_KEY,
+            field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
+        },
         [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
         [(field_id, name)] => {
             log::info!("Primary key was not specified in index. Inferred to '{name}'");
-            PrimaryKey::flat(name, *field_id)
+            PrimaryKey::Flat { name, field_id: *field_id }
         }
         multiple => {
             return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
@@ -156,92 +151,24 @@ fn fetch_or_generate_document_id(
     uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
     count: u32,
 ) -> Result<StdResult<DocumentId, UserError>> {
-    match primary_key {
-        PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
-            match document.get(primary_key_id) {
-                Some(document_id_bytes) => {
-                    let document_id = serde_json::from_slice(document_id_bytes)
-                        .map_err(InternalError::SerdeJson)?;
-                    match validate_document_id_value(document_id)? {
-                        Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
-                        Err(user_error) => Ok(Err(user_error)),
-                    }
-                }
-                None if autogenerate_docids => {
-                    let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
-                    Ok(Ok(DocumentId::generated(uuid.to_string(), count)))
-                }
-                None => Ok(Err(UserError::MissingDocumentId {
-                    primary_key: primary_key.to_string(),
-                    document: obkv_to_object(document, documents_batch_index)?,
-                })),
-            }
-        }
+    Ok(match primary_key.document_id(document, documents_batch_index)? {
+        Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }),
+        Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
+        Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => {
+            let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
+            Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count })
        }
-        nested @ PrimaryKey::Nested { .. } => {
-            let mut matching_documents_ids = Vec::new();
-            for (first_level_name, right) in nested.possible_level_names() {
-                if let Some(field_id) = documents_batch_index.id(first_level_name) {
-                    if let Some(value_bytes) = document.get(field_id) {
-                        let object = serde_json::from_slice(value_bytes)
-                            .map_err(InternalError::SerdeJson)?;
-                        fetch_matching_values(object, right, &mut matching_documents_ids);
-
-                        if matching_documents_ids.len() >= 2 {
-                            return Ok(Err(UserError::TooManyDocumentIds {
-                                primary_key: nested.name().to_string(),
-                                document: obkv_to_object(document, documents_batch_index)?,
-                            }));
-                        }
-                    }
-                }
-            }
-
-            match matching_documents_ids.pop() {
-                Some(document_id) => match validate_document_id_value(document_id)? {
-                    Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
-                    Err(user_error) => Ok(Err(user_error)),
-                },
-                None => Ok(Err(UserError::MissingDocumentId {
-                    primary_key: nested.name().to_string(),
-                    document: obkv_to_object(document, documents_batch_index)?,
-                })),
-            }
+        Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId {
+            primary_key: primary_key.name().to_string(),
+            document: obkv_to_object(document, documents_batch_index)?,
+        }),
+        Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
+            Err(UserError::TooManyDocumentIds {
+                primary_key: primary_key.name().to_string(),
+                document: obkv_to_object(document, documents_batch_index)?,
+            })
         }
-    }
-}
-
-/// A type that represent the type of primary key that has been set
-/// for this index, a classic flat one or a nested one.
-#[derive(Debug, Clone, Copy)]
-enum PrimaryKey<'a> {
-    Flat { name: &'a str, field_id: FieldId },
-    Nested { name: &'a str },
-}
-
-impl PrimaryKey<'_> {
-    fn flat(name: &str, field_id: FieldId) -> PrimaryKey {
-        PrimaryKey::Flat { name, field_id }
-    }
-
-    fn nested(name: &str) -> PrimaryKey {
-        PrimaryKey::Nested { name }
-    }
-
-    fn name(&self) -> &str {
-        match self {
-            PrimaryKey::Flat { name, .. } => name,
-            PrimaryKey::Nested { name } => name,
-        }
-    }
-
-    /// Returns an `Iterator` that gives all the possible fields names the primary key
-    /// can have depending of the first level name and deepnes of the objects.
-    fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
-        let name = self.name();
-        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
-            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
-            .chain(iter::once((name, "")))
-    }
+    })
 }

 /// A type that represents a document id that has been retrieved from a document or auto-generated.
@@ -255,14 +182,6 @@ pub enum DocumentId {
 }

 impl DocumentId {
-    fn retrieved(value: String) -> DocumentId {
-        DocumentId::Retrieved { value }
-    }
-
-    fn generated(value: String, document_nth: u32) -> DocumentId {
-        DocumentId::Generated { value, document_nth }
-    }
-
     fn debug(&self) -> String {
         format!("{:?}", self)
     }
@@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId {
     }
 }

-fn starts_with(selector: &str, key: &str) -> bool {
-    selector.strip_prefix(key).map_or(false, |tail| {
-        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
-    })
-}
-
-pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
-    match value {
-        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
-        otherwise => output.push(otherwise),
-    }
-}
-
-pub fn fetch_matching_values_in_object(
-    object: Object,
-    selector: &str,
-    base_key: &str,
-    output: &mut Vec<Value>,
-) {
-    for (key, value) in object {
-        let base_key = if base_key.is_empty() {
-            key.to_string()
-        } else {
-            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
-        };
-
-        if starts_with(selector, &base_key) {
-            match value {
-                Value::Object(object) => {
-                    fetch_matching_values_in_object(object, selector, &base_key, output)
-                }
-                value => output.push(value),
-            }
-        }
-    }
-}
-
-pub fn validate_document_id(document_id: &str) -> Option<&str> {
-    if !document_id.is_empty()
-        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
-    {
-        Some(document_id)
-    } else {
-        None
-    }
-}
-
-/// Parses a Json encoded document id and validate it, returning a user error when it is one.
-pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
-    match document_id {
-        Value::String(string) => match validate_document_id(&string) {
-            Some(s) if s.len() == string.len() => Ok(Ok(string)),
-            Some(s) => Ok(Ok(s.to_string())),
-            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
-        },
-        Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
-        content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
-    }
-}
-
 /// Try to extract an `f64` from a JSON `Value` and return the `Value`
 /// in the `Err` variant if it failed.
 pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
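
Note: the refactored fetch_or_generate_document_id now branches on the extraction error instead of re-walking the document itself. A minimal model of that decision table (stand-in types, not the milli ones):

    enum ExtractionError {
        Missing,
        TooMany(usize),
    }

    fn resolve(
        extracted: Result<String, ExtractionError>,
        autogenerate: bool,
    ) -> Result<String, String> {
        match extracted {
            Ok(id) => Ok(id),
            // A missing id is only an error when autogeneration is off.
            Err(ExtractionError::Missing) if autogenerate => Ok("generated-uuid".to_string()),
            Err(ExtractionError::Missing) => Err("missing document id".to_string()),
            Err(ExtractionError::TooMany(n)) => Err(format!("{n} document ids found")),
        }
    }

    fn main() {
        assert_eq!(resolve(Ok("doc-1".into()), false).unwrap(), "doc-1");
        assert!(resolve(Err(ExtractionError::Missing), true).is_ok());
        assert!(resolve(Err(ExtractionError::TooMany(2)), true).is_err());
    }
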
@@ -1,14 +1,12 @@
 use std::borrow::Cow;
 use std::fs::File;
 use std::io::{self, BufReader, BufWriter, Seek};
-use std::time::Instant;

 use grenad::{CompressionType, Sorter};
 use heed::types::ByteSlice;
 use log::debug;

 use super::{ClonableMmap, MergeFn};
-use crate::error::InternalError;
 use crate::update::index_documents::valid_lmdb_key;
 use crate::Result;

 pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
@@ -240,45 +238,46 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
     Ok(std::iter::from_fn(move || transposer().transpose()))
 }

-pub fn sorter_into_lmdb_database(
-    wtxn: &mut heed::RwTxn,
-    database: heed::PolyDatabase,
+/// Write the provided sorter into the database using the serialize_value function.
+/// The merge_values function is used if an entry already exists in the database.
+pub fn write_sorter_into_database<K, V, FS, FM>(
     sorter: Sorter<MergeFn>,
-    merge: MergeFn,
-) -> Result<()> {
+    database: &heed::Database<K, V>,
+    wtxn: &mut heed::RwTxn,
+    index_is_empty: bool,
+    serialize_value: FS,
+    merge_values: FM,
+) -> Result<()>
+where
+    FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
+    FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
+{
     puffin::profile_function!();
     debug!("Writing MTBL sorter...");
-    let before = Instant::now();

+    let mut buffer = Vec::new();
+    let database = database.remap_types::<ByteSlice, ByteSlice>();

     let mut merger_iter = sorter.into_stream_merger_iter()?;
-    if database.is_empty(wtxn)? {
-        let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
-        while let Some((k, v)) = merger_iter.next()? {
-            // safety: we don't keep references from inside the LMDB database.
-            unsafe { out_iter.append(k, v)? };
-        }
-    } else {
-        while let Some((k, v)) = merger_iter.next()? {
-            let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
-            match iter.next().transpose()? {
-                Some((key, old_val)) if key == k => {
-                    let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
-                    let val = merge(k, &vals).map_err(|_| {
-                        // TODO just wrap this error?
-                        InternalError::IndexingMergingKeys { process: "get-put-merge" }
-                    })?;
-                    // safety: we don't keep references from inside the LMDB database.
-                    unsafe { iter.put_current(k, &val)? };
+    while let Some((key, value)) = merger_iter.next()? {
+        if valid_lmdb_key(key) {
+            buffer.clear();
+            let value = if index_is_empty {
+                Some(serialize_value(value, &mut buffer)?)
+            } else {
+                match database.get(wtxn, key)? {
+                    Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
+                    None => Some(serialize_value(value, &mut buffer)?),
                 }
-                _ => {
-                    drop(iter);
-                    database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
+            };
+            match value {
+                Some(value) => database.put(wtxn, key, value)?,
+                None => {
+                    database.delete(wtxn, key)?;
                 }
             }
         }
     }

-    debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
     Ok(())
 }
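
Note: the new `write_sorter_into_database` is a get/merge/put loop: serialize directly when the index is empty, otherwise merge with the previous value, and treat a None merge result as a delete. A toy model over a BTreeMap, where the closure plays the role of merge_values:

    use std::collections::BTreeMap;

    fn write_into_db(
        db: &mut BTreeMap<String, u64>,
        entries: Vec<(String, u64)>,
        merge: impl Fn(u64, u64) -> Option<u64>,
    ) {
        for (key, value) in entries {
            let merged = match db.get(&key) {
                Some(&prev) => merge(value, prev),
                None => Some(value),
            };
            match merged {
                Some(v) => {
                    db.insert(key, v);
                }
                // A None from the merge deletes the entry, as in the LMDB version.
                None => {
                    db.remove(&key);
                }
            }
        }
    }

    fn main() {
        let mut db = BTreeMap::from([("a".to_string(), 1u64)]);
        write_into_db(&mut db, vec![("a".into(), 2), ("b".into(), 5)], |new, prev| {
            Some(new + prev)
        });
        assert_eq!(db.get("a"), Some(&3));
        assert_eq!(db.get("b"), Some(&5));
    }
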
@@ -239,3 +239,19 @@ pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
         output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
     }
 }
+
+/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
+///
+/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
+/// the second one is the CboRoaringBitmap to merge into.
+pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
+    deladd_obkv: &[u8],
+    previous: &[u8],
+    buffer: &'a mut Vec<u8>,
+) -> Result<Option<&'a [u8]>> {
+    Ok(CboRoaringBitmapCodec::merge_deladd_into(
+        KvReaderDelAdd::new(deladd_obkv),
+        previous,
+        buffer,
+    )?)
+}
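
Note: a sketch of what merging a Del/Add of bitmaps into a previous bitmap means, in plain RoaringBitmap set operations (a stand-in for what the codec computes over encoded bytes):

    use roaring::RoaringBitmap;

    fn merge_deladd_into(
        del: &RoaringBitmap,
        add: &RoaringBitmap,
        previous: &RoaringBitmap,
    ) -> RoaringBitmap {
        // Remove the Del side from the previous value, then add the Add side.
        (previous - del) | add
    }

    fn main() {
        let previous = RoaringBitmap::from_iter([1u32, 2, 3]);
        let del = RoaringBitmap::from_iter([2u32]);
        let add = RoaringBitmap::from_iter([4u32]);
        let merged = merge_deladd_into(&del, &add, &previous);
        assert_eq!(merged.iter().collect::<Vec<_>>(), vec![1, 3, 4]);
    }
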
@@ -9,12 +9,13 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
-    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader,
+    merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
     GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
-    merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
     obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions,
     serialize_roaring_bitmap, MergeFn,
 };
@@ -20,14 +20,13 @@ use slice_group_by::GroupBy;
 use typed_chunk::{write_typed_chunk_into_index, TypedChunk};

 use self::enrich::enrich_documents_batch;
-pub use self::enrich::{
-    extract_finite_float_from_value, validate_document_id, validate_document_id_value,
-    validate_geo_from_json, DocumentId,
-};
+pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId};
 pub use self::helpers::{
     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
-    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
-    sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
+    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+    merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader,
+    ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
@@ -35,13 +34,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
-    IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids,
-    WordPrefixIntegerDocids, WordsPrefixesFst,
+    IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::{CboRoaringBitmapCodec, Index, Result};

 static MERGED_DATABASE_COUNT: usize = 7;
-static PREFIX_DATABASE_COUNT: usize = 5;
+static PREFIX_DATABASE_COUNT: usize = 4;
 static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -197,6 +195,39 @@ where
         Ok((self, Ok(deleted_documents)))
     }

+    /// Removes documents from db using their internal document ids.
+    ///
+    /// # Warning
+    ///
+    /// This function is dangerous and will only work correctly if:
+    ///
+    /// - All the passed ids currently exist in the database
+    /// - No batching using the standard `remove_documents` and `add_documents` took place
+    ///
+    /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
+    pub fn remove_documents_from_db_no_batch(
+        mut self,
+        to_delete: &RoaringBitmap,
+    ) -> Result<(Self, u64)> {
+        puffin::profile_function!();
+
+        // Early return when there is no document to delete
+        if to_delete.is_empty() {
+            return Ok((self, 0));
+        }
+
+        let deleted_documents = self
+            .transform
+            .as_mut()
+            .expect("Invalid document deletion state")
+            .remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)?
+            as u64;
+
+        self.deleted_documents += deleted_documents;
+
+        Ok((self, deleted_documents))
+    }
+
     #[logging_timer::time("IndexDocuments::{}")]
     pub fn execute(mut self) -> Result<DocumentAdditionResult> {
         puffin::profile_function!();
@@ -381,12 +412,42 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });

+        let mut word_position_docids = None;
+        let mut word_fid_docids = None;
+        let mut word_docids = None;
+        let mut exact_word_docids = None;
+
         for result in lmdb_writer_rx {
             if (self.should_abort)() {
                 return Err(Error::InternalError(InternalError::AbortedIndexation));
             }

-            let typed_chunk = result?;
+            let typed_chunk = match result? {
+                TypedChunk::WordDocids {
+                    word_docids_reader,
+                    exact_word_docids_reader,
+                    word_fid_docids_reader,
+                } => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
+                    word_docids = Some(cloneable_chunk);
+                    let cloneable_chunk =
+                        unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
+                    exact_word_docids = Some(cloneable_chunk);
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
+                    word_fid_docids = Some(cloneable_chunk);
+                    TypedChunk::WordDocids {
+                        word_docids_reader,
+                        exact_word_docids_reader,
+                        word_fid_docids_reader,
+                    }
+                }
+                TypedChunk::WordPositionDocids(chunk) => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                    word_position_docids = Some(cloneable_chunk);
+                    TypedChunk::WordPositionDocids(chunk)
+                }
+                otherwise => otherwise,
+            };

             // FIXME: return newly added as well as newly deleted documents
             let (docids, is_merged_database) =
@@ -417,17 +478,16 @@ where

         // We write the primary key field id into the main database
         self.index.put_primary_key(self.wtxn, &primary_key)?;
+        let number_of_documents = self.index.number_of_documents(self.wtxn)?;

-        // TODO: reactivate prefix DB with diff-indexing
-        // self.execute_prefix_databases(
-        //     word_docids,
-        //     exact_word_docids,
-        //     word_pair_proximity_docids,
-        //     word_position_docids,
-        //     word_fid_docids,
-        // )?;
+        self.execute_prefix_databases(
+            word_docids,
+            exact_word_docids,
+            word_position_docids,
+            word_fid_docids,
+        )?;

-        self.index.number_of_documents(self.wtxn)
+        Ok(number_of_documents)
     }

     #[logging_timer::time("IndexDocuments::{}")]
@@ -435,7 +495,6 @@ where
         self,
         word_docids: Option<grenad::Reader<CursorClonableMmap>>,
         exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
-        word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>,
         word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
         word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
     ) -> Result<()>
@@ -556,32 +615,6 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });

-        if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
-            // Run the word prefix pair proximity docids update operation.
-            PrefixWordPairsProximityDocids::new(
-                self.wtxn,
-                self.index,
-                self.indexer_config.chunk_compression_type,
-                self.indexer_config.chunk_compression_level,
-            )
-            .execute(
-                word_pair_proximity_docids,
-                &new_prefix_fst_words,
-                &common_prefix_fst_words,
-                &del_prefix_fst_words,
-            )?;
-        }
-
-        if (self.should_abort)() {
-            return Err(Error::InternalError(InternalError::AbortedIndexation));
-        }
-
-        databases_seen += 1;
-        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
-            databases_seen,
-            total_databases: TOTAL_POSTING_DATABASE_COUNT,
-        });
-
         if let Some(word_position_docids) = word_position_docids {
             // Run the words prefix position docids update operation.
             let mut builder = WordPrefixIntegerDocids::new(
@ -421,52 +421,13 @@ impl<'a, 'i> Transform<'a, 'i> {
            // Then we push the document in sorters in deletion mode.
            let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? {
                Some(docid) => {
                    self.replaced_documents_ids.insert(docid);

                    // fetch the obkv document
                    let original_key = BEU32::new(docid);
                    let base_obkv = self
                        .index
                        .documents
                        .remap_data_type::<heed::types::ByteSlice>()
                        .get(wtxn, &original_key)?
                        .ok_or(InternalError::DatabaseMissingEntry {
                            db_name: db_name::DOCUMENTS,
                            key: None,
                        })?;

                    // Key is the concatenation of the internal docid and the external one.
                    document_sorter_key_buffer.clear();
                    document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
                    document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes());
                    // push it as to delete in the original_sorter
                    document_sorter_value_buffer.clear();
                    document_sorter_value_buffer.push(Operation::Deletion as u8);
                    into_del_add_obkv(
                        KvReaderU16::new(base_obkv),
                        true,
                        false,
                    self.remove_document_from_db(
                        docid,
                        to_remove,
                        wtxn,
                        &mut document_sorter_key_buffer,
                        &mut document_sorter_value_buffer,
                    )?;
                    self.original_sorter
                        .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;

                    // flatten it and push it as to delete in the flattened_sorter
                    let flattened_obkv = KvReader::new(base_obkv);
                    if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
                        // we recreate our buffer with the flattened documents
                        document_sorter_value_buffer.clear();
                        document_sorter_value_buffer.push(Operation::Deletion as u8);
                        into_del_add_obkv(
                            KvReaderU16::new(&obkv),
                            true,
                            false,
                            &mut document_sorter_value_buffer,
                        )?;
                    }
                    self.flattened_sorter
                        .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;

                    true
                }
                None => false,
@ -481,6 +442,97 @@ impl<'a, 'i> Transform<'a, 'i> {
        Ok(documents_deleted)
    }

    /// Removes documents from db using their internal document ids.
    ///
    /// # Warning
    ///
    /// This function is dangerous and will only work correctly if:
    ///
    /// - All the passed ids currently exist in the database
    /// - No batching using the standard `remove_documents` and `add_documents` took place
    ///
    /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
    #[logging_timer::time]
    pub fn remove_documents_from_db_no_batch<FA>(
        &mut self,
        to_remove: &RoaringBitmap,
        wtxn: &mut heed::RwTxn,
        should_abort: FA,
    ) -> Result<usize>
    where
        FA: Fn() -> bool + Sync,
    {
        puffin::profile_function!();

        let mut documents_deleted = 0;
        let mut document_sorter_value_buffer = Vec::new();
        let mut document_sorter_key_buffer = Vec::new();
        let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?;

        for (internal_docid, external_docid) in to_remove.iter().zip(external_ids) {
            let external_docid = external_docid?;
            if should_abort() {
                return Err(Error::InternalError(InternalError::AbortedIndexation));
            }
            self.remove_document_from_db(
                internal_docid,
                external_docid,
                wtxn,
                &mut document_sorter_key_buffer,
                &mut document_sorter_value_buffer,
            )?;

            documents_deleted += 1;
        }

        Ok(documents_deleted)
    }

    fn remove_document_from_db(
        &mut self,
        internal_docid: u32,
        external_docid: String,
        txn: &heed::RoTxn,
        document_sorter_key_buffer: &mut Vec<u8>,
        document_sorter_value_buffer: &mut Vec<u8>,
    ) -> Result<()> {
        self.replaced_documents_ids.insert(internal_docid);

        // fetch the obkv document
        let original_key = BEU32::new(internal_docid);
        let base_obkv = self
            .index
            .documents
            .remap_data_type::<heed::types::ByteSlice>()
            .get(txn, &original_key)?
            .ok_or(InternalError::DatabaseMissingEntry {
                db_name: db_name::DOCUMENTS,
                key: None,
            })?;

        // Key is the concatenation of the internal docid and the external one.
        document_sorter_key_buffer.clear();
        document_sorter_key_buffer.extend_from_slice(&internal_docid.to_be_bytes());
        document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes());
        // push it as to delete in the original_sorter
        document_sorter_value_buffer.clear();
        document_sorter_value_buffer.push(Operation::Deletion as u8);
        into_del_add_obkv(KvReaderU16::new(base_obkv), true, false, document_sorter_value_buffer)?;
        self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;

        // flatten it and push it as to delete in the flattened_sorter
        let flattened_obkv = KvReader::new(base_obkv);
        if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
            // we recreate our buffer with the flattened documents
            document_sorter_value_buffer.clear();
            document_sorter_value_buffer.push(Operation::Deletion as u8);
            into_del_add_obkv(KvReaderU16::new(&obkv), true, false, document_sorter_value_buffer)?;
        }
        self.flattened_sorter
            .insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?;
        Ok(())
    }

    // Flatten a document from the fields ids map contained in self and insert the new
    // created fields. Returns `None` if the document doesn't need to be flattened.
    fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
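A rough sketch of how a caller might drive the new entry point above (the `Transform` value, the transaction, and the error plumbing are assumed from the surrounding crate; this wiring is illustrative and not part of the diff):

```rust
use roaring::RoaringBitmap;

// Illustrative only: `Transform`, `Result` and the heed transaction come from
// the surrounding crate; the bitmap must hold *internal* docids that all exist
// in the database, as the Warning on the method requires.
fn delete_internal_ids(
    transform: &mut Transform<'_, '_>,
    wtxn: &mut heed::RwTxn,
    to_remove: &RoaringBitmap,
) -> Result<usize> {
    // The `|| false` abort closure satisfies the `FA: Fn() -> bool + Sync`
    // bound and never aborts; a real caller would check a shared flag here.
    transform.remove_documents_from_db_no_batch(to_remove, wtxn, || false)
}
```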
@ -13,7 +13,10 @@ use obkv::{KvReader, KvWriter};
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;

use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap};
use super::helpers::{
    self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values,
    valid_lmdb_key, CursorClonableMmap,
};
use super::{ClonableMmap, MergeFn};
use crate::distance::NDotProductPoint;
use crate::error::UserError;
@ -21,12 +24,11 @@ use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::facet::FacetType;
use crate::index::db_name::DOCUMENTS;
use crate::index::Hnsw;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
use crate::update::facet::FacetsUpdate;
use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
use crate::{
    lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result,
    SerializationError, BEU32,
    lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError, BEU32,
};

pub(crate) enum TypedChunk {
@ -186,7 +188,7 @@ pub(crate) fn write_typed_chunk_into_index(
                wtxn,
                index_is_empty,
                deladd_serialize_add_side,
                merge_deladd_cbo_roaring_bitmaps,
                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
            )?;
            is_merged_database = true;
        }
@ -202,7 +204,7 @@ pub(crate) fn write_typed_chunk_into_index(
                wtxn,
                index_is_empty,
                deladd_serialize_add_side,
                merge_deladd_cbo_roaring_bitmaps,
                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
            )?;

            let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
@ -212,7 +214,7 @@ pub(crate) fn write_typed_chunk_into_index(
                wtxn,
                index_is_empty,
                deladd_serialize_add_side,
                merge_deladd_cbo_roaring_bitmaps,
                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
            )?;

            let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
@ -222,7 +224,7 @@ pub(crate) fn write_typed_chunk_into_index(
                wtxn,
                index_is_empty,
                deladd_serialize_add_side,
                merge_deladd_cbo_roaring_bitmaps,
                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
            )?;

            // create fst from word docids
@ -244,7 +246,7 @@ pub(crate) fn write_typed_chunk_into_index(
                wtxn,
                index_is_empty,
                deladd_serialize_add_side,
                merge_deladd_cbo_roaring_bitmaps,
                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
            )?;
            is_merged_database = true;
        }
@ -265,7 +267,7 @@ pub(crate) fn write_typed_chunk_into_index(
                wtxn,
                index_is_empty,
                deladd_serialize_add_side,
                merge_deladd_cbo_roaring_bitmaps,
                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
            )?;
            is_merged_database = true;
        }
@ -276,7 +278,7 @@ pub(crate) fn write_typed_chunk_into_index(
                wtxn,
                index_is_empty,
                deladd_serialize_add_side,
                merge_deladd_cbo_roaring_bitmaps,
                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
            )?;
            is_merged_database = true;
        }
@ -287,7 +289,7 @@ pub(crate) fn write_typed_chunk_into_index(
                wtxn,
                index_is_empty,
                deladd_serialize_add_side,
                merge_deladd_cbo_roaring_bitmaps,
                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
            )?;
            is_merged_database = true;
        }
@ -298,7 +300,7 @@ pub(crate) fn write_typed_chunk_into_index(
                wtxn,
                index_is_empty,
                deladd_serialize_add_side,
                merge_deladd_cbo_roaring_bitmaps,
                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
            )?;
            is_merged_database = true;
        }
@ -495,33 +497,6 @@ fn merge_word_docids_reader_into_fst(
    Ok(builder.into_set())
}

/// A function that extracts and returns the Add side of a DelAdd obkv.
/// This is useful when there is no previous value in the database and
/// therefore we don't need to do a diff with what's already there.
///
/// If there is no Add side we currently write an empty buffer
/// which is a valid CboRoaringBitmap.
#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec<u8>) -> Result<&'a [u8]> {
    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
}

/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
///
/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
/// the second one is the CboRoaringBitmap to merge into.
fn merge_deladd_cbo_roaring_bitmaps<'a>(
    deladd_obkv: &[u8],
    previous: &[u8],
    buffer: &'a mut Vec<u8>,
) -> Result<Option<&'a [u8]>> {
    Ok(CboRoaringBitmapCodec::merge_deladd_into(
        KvReaderDelAdd::new(deladd_obkv),
        previous,
        buffer,
    )?)
}

/// Write provided entries in database using serialize_value function.
/// merge_values function is used if an entry already exists in the database.
fn write_entries_into_database<R, K, V, FS, FM>(
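To make the semantics of `deladd_serialize_add_side` and `merge_deladd_cbo_roaring_bitmaps` concrete: a DelAdd obkv carries an optional Del bitmap and an optional Add bitmap, and merging it into an existing bitmap removes the Del side and unions the Add side. Below is a toy model with plain `RoaringBitmap` values; the real functions operate on serialized `CboRoaringBitmap` bytes, so this is a semantic sketch, not the codec code:

```rust
use roaring::RoaringBitmap;

// Semantic model of the DelAdd merge: `(previous - del) | add`.
fn merge_deladd_model(
    previous: &RoaringBitmap,
    del: &RoaringBitmap,
    add: &RoaringBitmap,
) -> RoaringBitmap {
    (previous - del) | add
}

fn main() {
    let previous: RoaringBitmap = (0u32..10).collect();
    let del: RoaringBitmap = (0u32..5).collect();
    let add: RoaringBitmap = (20u32..25).collect();
    let merged = merge_deladd_model(&previous, &del, &add);
    // docids 0..5 were deleted, 20..25 were added, 5..10 survive untouched.
    assert_eq!(merged, (5u32..10).chain(20u32..25).collect::<RoaringBitmap>());
}
```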
@ -8,10 +8,6 @@ pub use self::index_documents::{
    MergeFn,
};
pub use self::indexer_config::IndexerConfig;
pub use self::prefix_word_pairs::{
    PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
    MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
};
pub use self::settings::{Setting, Settings};
pub use self::update_step::UpdateIndexingStep;
pub use self::word_prefix_docids::WordPrefixDocids;
@ -24,7 +20,6 @@ pub(crate) mod del_add;
pub(crate) mod facet;
mod index_documents;
mod indexer_config;
mod prefix_word_pairs;
mod settings;
mod update_step;
mod word_prefix_docids;
@ -1,422 +0,0 @@
use std::borrow::Cow;
use std::collections::HashSet;
use std::io::{BufReader, BufWriter};

use grenad::CompressionType;
use heed::types::ByteSlice;

use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
use crate::{Index, Result};

mod prefix_word;
mod word_prefix;

pub use prefix_word::index_prefix_word_database;
pub use word_prefix::index_word_prefix_database;

pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4;
pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2;

pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
    wtxn: &'t mut heed::RwTxn<'i, 'u>,
    index: &'i Index,
    max_proximity: u8,
    max_prefix_length: usize,
    chunk_compression_type: CompressionType,
    chunk_compression_level: Option<u32>,
}
impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
    pub fn new(
        wtxn: &'t mut heed::RwTxn<'i, 'u>,
        index: &'i Index,
        chunk_compression_type: CompressionType,
        chunk_compression_level: Option<u32>,
    ) -> Self {
        Self {
            wtxn,
            index,
            max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
            max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
            chunk_compression_type,
            chunk_compression_level,
        }
    }

    #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
    pub fn execute<'a>(
        self,
        new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
        new_prefix_fst_words: &'a [String],
        common_prefix_fst_words: &[&'a [String]],
        del_prefix_fst_words: &HashSet<Vec<u8>>,
    ) -> Result<()> {
        puffin::profile_function!();

        index_word_prefix_database(
            self.wtxn,
            self.index.word_pair_proximity_docids,
            self.index.word_prefix_pair_proximity_docids,
            self.max_proximity,
            self.max_prefix_length,
            new_word_pair_proximity_docids.clone(),
            new_prefix_fst_words,
            common_prefix_fst_words,
            del_prefix_fst_words,
            self.chunk_compression_type,
            self.chunk_compression_level,
        )?;

        index_prefix_word_database(
            self.wtxn,
            self.index.word_pair_proximity_docids,
            self.index.prefix_word_pair_proximity_docids,
            self.max_proximity,
            self.max_prefix_length,
            new_word_pair_proximity_docids,
            new_prefix_fst_words,
            common_prefix_fst_words,
            del_prefix_fst_words,
            self.chunk_compression_type,
            self.chunk_compression_level,
        )?;

        Ok(())
    }
}

// This is adapted from `sorter_into_lmdb_database`
pub fn insert_into_database(
    wtxn: &mut heed::RwTxn,
    database: heed::PolyDatabase,
    new_key: &[u8],
    new_value: &[u8],
) -> Result<()> {
    let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
    match iter.next().transpose()? {
        Some((key, old_val)) if new_key == key => {
            let val =
                merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
                    .map_err(|_| {
                        // TODO just wrap this error?
                        crate::error::InternalError::IndexingMergingKeys {
                            process: "get-put-merge",
                        }
                    })?;
            // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
            unsafe { iter.put_current(new_key, &val)? };
        }
        _ => {
            drop(iter);
            database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
        }
    }
    Ok(())
}
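`insert_into_database` is a get-put-merge: look the key up, union the old and new bitmaps on a hit, and do a plain put on a miss. The same pattern with an in-memory map standing in for LMDB (illustrative, not the crate's API):

```rust
use std::collections::BTreeMap;
use roaring::RoaringBitmap;

// In-memory stand-in for the LMDB get-put-merge above: on a key conflict the
// old and new bitmaps are unioned, otherwise the value is inserted as-is.
fn insert_get_put_merge(
    db: &mut BTreeMap<Vec<u8>, RoaringBitmap>,
    new_key: &[u8],
    new_value: RoaringBitmap,
) {
    db.entry(new_key.to_vec())
        .and_modify(|old| *old |= &new_value)
        .or_insert(new_value);
}
```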
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
// but it uses `append` if the database is empty, and it assumes that the values in the
// writer don't conflict with values in the database.
pub fn write_into_lmdb_database_without_merging(
    wtxn: &mut heed::RwTxn,
    database: heed::PolyDatabase,
    writer: grenad::Writer<BufWriter<std::fs::File>>,
) -> Result<()> {
    let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
    let reader = grenad::Reader::new(BufReader::new(file))?;
    if database.is_empty(wtxn)? {
        let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
        let mut cursor = reader.into_cursor()?;
        while let Some((k, v)) = cursor.move_on_next()? {
            // safety: the key comes from the grenad reader, not the database
            unsafe { out_iter.append(k, v)? };
        }
    } else {
        let mut cursor = reader.into_cursor()?;
        while let Some((k, v)) = cursor.move_on_next()? {
            database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
        }
    }
    Ok(())
}

#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use crate::db_snap;
    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
    use crate::index::tests::TempIndex;
    use crate::update::IndexDocumentsMethod;

    fn documents_with_enough_different_words_for_prefixes(
        prefixes: &[&str],
        start_id: usize,
    ) -> Vec<crate::Object> {
        let mut documents = Vec::new();
        let mut id = start_id;
        for prefix in prefixes {
            for i in 0..50 {
                documents.push(
                    serde_json::json!({
                        "id": id,
                        "text": format!("{prefix}{i:x}"),
                    })
                    .as_object()
                    .unwrap()
                    .clone(),
                );
                id += 1;
            }
        }
        documents
    }

    #[ignore]
    #[test]
    fn add_new_documents() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let batch_reader_from_documents = |documents| {
            let mut builder = DocumentsBatchBuilder::new(Vec::new());
            for object in documents {
                builder.append_json_object(&object).unwrap();
            }
            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
        };

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(
            serde_json::json!({
                "id": "9000",
                "text": "At an amazing and beautiful house"
            })
            .as_object()
            .unwrap()
            .clone(),
        );
        documents.push(
            serde_json::json!({
                "id": "9001",
                "text": "The bell rings at 5 am"
            })
            .as_object()
            .unwrap()
            .clone(),
        );

        let documents = batch_reader_from_documents(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100);
        documents.push(
            serde_json::json!({
                "id": "9002",
                "text": "At an extraordinary house"
            })
            .as_object()
            .unwrap()
            .clone(),
        );
        let documents = batch_reader_from_documents(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, word_pair_proximity_docids, "update");
        db_snap!(index, word_prefix_pair_proximity_docids, "update");
        db_snap!(index, prefix_word_pair_proximity_docids, "update");
    }
    #[ignore]
    #[test]
    fn batch_bug_3043() {
        // https://github.com/meilisearch/meilisearch/issues/3043
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;

        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let batch_reader_from_documents = |documents| {
            let mut builder = DocumentsBatchBuilder::new(Vec::new());
            for object in documents {
                builder.append_json_object(&object).unwrap();
            }
            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
        };

        let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(
            serde_json::json!({
                "text": "x y"
            })
            .as_object()
            .unwrap()
            .clone(),
        );
        documents.push(
            serde_json::json!({
                "text": "x a y"
            })
            .as_object()
            .unwrap()
            .clone(),
        );

        let documents = batch_reader_from_documents(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, word_pair_proximity_docids);
        db_snap!(index, word_prefix_pair_proximity_docids);
        db_snap!(index, prefix_word_pair_proximity_docids);
    }

    #[ignore]
    #[test]
    fn hard_delete_and_reupdate() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let batch_reader_from_documents = |documents| {
            let mut builder = DocumentsBatchBuilder::new(Vec::new());
            for object in documents {
                builder.append_json_object(&object).unwrap();
            }
            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
        };

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(
            serde_json::json!({
                "id": 9000,
                "text": "At an amazing and beautiful house"
            })
            .as_object()
            .unwrap()
            .clone(),
        );
        documents.push(
            serde_json::json!({
                "id": 9001,
                "text": "The bell rings at 5 am"
            })
            .as_object()
            .unwrap()
            .clone(),
        );

        let documents = batch_reader_from_documents(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        index.delete_document("9000");

        db_snap!(index, documents_ids, "first_delete");
        db_snap!(index, word_docids, "first_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");

        index.delete_documents((0..50).map(|id| id.to_string()).collect());

        db_snap!(index, documents_ids, "second_delete");
        db_snap!(index, word_docids, "second_delete");
        db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
        db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");

        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database

        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "reupdate");
        db_snap!(index, word_docids, "reupdate");
        db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
        db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
    }

    #[ignore]
    #[test]
    fn replace_hard_deletion() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;

        index
            .update_settings(|settings| {
                settings.set_primary_key("id".to_owned());
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let batch_reader_from_documents = |documents| {
            let mut builder = DocumentsBatchBuilder::new(Vec::new());
            for object in documents {
                builder.append_json_object(&object).unwrap();
            }
            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
        };

        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        documents.push(
            serde_json::json!({
                "id": 9000,
                "text": "At an amazing house"
            })
            .as_object()
            .unwrap()
            .clone(),
        );
        documents.push(
            serde_json::json!({
                "id": 9001,
                "text": "The bell rings"
            })
            .as_object()
            .unwrap()
            .clone(),
        );

        let documents = batch_reader_from_documents(documents);
        index.add_documents(documents).unwrap();

        db_snap!(index, documents_ids, "initial");
        db_snap!(index, word_docids, "initial");
        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
        db_snap!(index, prefix_word_pair_proximity_docids, "initial");

        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
        index.add_documents(batch_reader_from_documents(documents)).unwrap();

        db_snap!(index, documents_ids, "replaced");
        db_snap!(index, word_docids, "replaced");
        db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
        db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
    }
}
@ -1,182 +0,0 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};

use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::BytesDecode;
use log::debug;

use crate::update::index_documents::{create_writer, CursorClonableMmap};
use crate::update::prefix_word_pairs::{
    insert_into_database, write_into_lmdb_database_without_merging,
};
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};

#[allow(clippy::too_many_arguments)]
#[logging_timer::time]
pub fn index_prefix_word_database(
    wtxn: &mut heed::RwTxn,
    word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
    prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
    max_proximity: u8,
    max_prefix_length: usize,
    new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
    new_prefix_fst_words: &[String],
    common_prefix_fst_words: &[&[String]],
    del_prefix_fst_words: &HashSet<Vec<u8>>,
    chunk_compression_type: CompressionType,
    chunk_compression_level: Option<u32>,
) -> Result<()> {
    puffin::profile_function!();

    let max_proximity = max_proximity - 1;
    debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");

    let common_prefixes: Vec<_> = common_prefix_fst_words
        .iter()
        .flat_map(|s| s.iter())
        .map(|s| s.as_str())
        .filter(|s| s.len() <= max_prefix_length)
        .collect();

    for proximity in 1..max_proximity {
        for prefix in common_prefixes.iter() {
            let mut prefix_key = vec![proximity];
            prefix_key.extend_from_slice(prefix.as_bytes());
            let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
            // This is the core of the algorithm
            execute_on_word_pairs_and_prefixes(
                proximity,
                prefix.as_bytes(),
                // the next two arguments tell how to iterate over the new word pairs
                &mut cursor,
                |cursor| {
                    if let Some((key, value)) = cursor.next()? {
                        let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
                            .ok_or(heed::Error::Decoding)?;
                        Ok(Some((word2, value)))
                    } else {
                        Ok(None)
                    }
                },
                // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
                |key, value| {
                    insert_into_database(
                        wtxn,
                        *prefix_word_pair_proximity_docids.as_polymorph(),
                        key,
                        value,
                    )
                },
            )?;
        }
    }

    // Now we do the same thing with the new prefixes and all word pairs in the DB
    let new_prefixes: Vec<_> = new_prefix_fst_words
        .iter()
        .map(|s| s.as_str())
        .filter(|s| s.len() <= max_prefix_length)
        .collect();

    // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
    // element in an intermediary grenad
    let mut writer =
        create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);

    for proximity in 1..max_proximity {
        for prefix in new_prefixes.iter() {
            let mut prefix_key = vec![proximity];
            prefix_key.extend_from_slice(prefix.as_bytes());
            let mut db_iter = word_pair_proximity_docids
                .as_polymorph()
                .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
                .remap_key_type::<UncheckedU8StrStrCodec>();
            execute_on_word_pairs_and_prefixes(
                proximity,
                prefix.as_bytes(),
                &mut db_iter,
                |db_iter| {
                    db_iter
                        .next()
                        .transpose()
                        .map(|x| x.map(|((_, _, word2), value)| (word2, value)))
                        .map_err(|e| e.into())
                },
                |key, value| writer.insert(key, value).map_err(|e| e.into()),
            )?;
            drop(db_iter);
        }
    }

    // and then we write the grenad into the DB
    // Since the grenad contains only new prefixes, we know in advance that none
    // of its elements already exist in the DB, thus there is no need to specify
    // how to merge conflicting elements
    write_into_lmdb_database_without_merging(
        wtxn,
        *prefix_word_pair_proximity_docids.as_polymorph(),
        writer,
    )?;

    // All of the word prefix pairs in the database that have a w2
    // that is contained in the `suppr_pw` set must be removed as well.
    if !del_prefix_fst_words.is_empty() {
        let mut iter =
            prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
        while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
            if del_prefix_fst_words.contains(prefix.as_bytes()) {
                // Delete this entry as the w2 prefix is no longer in the words prefix fst.
                unsafe { iter.del_current()? };
            }
        }
    }

    Ok(())
}

/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
///
/// Its arguments are:
/// - an iterator over the words following the given `prefix` with the given `proximity`
/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements
fn execute_on_word_pairs_and_prefixes<I>(
    proximity: u8,
    prefix: &[u8],
    iter: &mut I,
    mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>,
    mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
) -> Result<()> {
    let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default();

    // Memory usage check:
    // The content of the loop will be called for each `word2` that follows a word beginning
    // with `prefix` with the given proximity.
    // In practice, I don't think the batch can ever get too big.
    while let Some((word2, docids)) = next_word2_and_docids(iter)? {
        let entry = batch.entry(word2.to_owned()).or_default();
        entry.push(Cow::Owned(docids.to_owned()));
    }

    let mut key_buffer = Vec::with_capacity(512);
    key_buffer.push(proximity);
    key_buffer.extend_from_slice(prefix);
    key_buffer.push(0);

    let mut value_buffer = Vec::with_capacity(65_536);

    for (word2, docids) in batch {
        key_buffer.truncate(prefix.len() + 2);
        value_buffer.clear();

        key_buffer.extend_from_slice(&word2);
        let data = if docids.len() > 1 {
            CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?;
            value_buffer.as_slice()
        } else {
            &docids[0]
        };
        insert(key_buffer.as_slice(), data)?;
    }
    Ok(())
}
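Both passes above build their cursor keys by hand: one proximity byte followed by the prefix bytes. This matches the `(u8, str, str)` layout assumed for `U8StrStrCodec` keys, where the two strings are separated by a 0 byte, the same layout the batch-flushing code in word_prefix.rs reconstructs. A small sketch of that layout (hypothetical helper, not a crate function):

```rust
// Builds a (proximity, word1, word2) key in the layout assumed above:
// [proximity byte][word1 bytes][0][word2 bytes].
fn u8_str_str_key(proximity: u8, word1: &str, word2: &str) -> Vec<u8> {
    let mut key = vec![proximity];
    key.extend_from_slice(word1.as_bytes());
    key.push(0);
    key.extend_from_slice(word2.as_bytes());
    key
}

fn main() {
    assert_eq!(u8_str_str_key(1, "good", "do"), b"\x01good\x00do".to_vec());
}
```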
@ -1,728 +0,0 @@
/*!
The word-prefix-pair-proximity-docids database is a database whose keys are of
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
the documents which contain `word` followed by another word starting with
`prefix` at a distance of `proximity`.

The prefixes present in this database are only those that correspond to many
different words in the documents.

## How is it created/updated? (simplified version)
To compute it, we have access to (mainly) two inputs:

* a list of sorted prefixes, such as:
```text
c
ca
cat
d
do
dog
```
Note that only prefixes which correspond to more than a certain number of
different words from the database are included in this list.

* a sorted list of proximities and word pairs (the proximity is the distance between the two words),
associated with a roaring bitmap, such as:
```text
1 good doggo -> docids1: [8]
1 good door  -> docids2: [7, 19, 20]
1 good ghost -> docids3: [1]
2 good dog   -> docids4: [2, 5, 6]
2 horror cathedral -> docids5: [1, 2]
```

I illustrate a simplified version of the algorithm to create the word-prefix
pair-proximity database below:

1. **Outer loop:** First, we iterate over each proximity and word pair:
```text
proximity: 1
word1    : good
word2    : doggo
```
2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
in the list of sorted prefixes. And we insert the key `prefix`
and the value (`docids`) to a sorted map which we call the “batch”. For example,
at the end of the first outer loop, we may have:
```text
Outer loop 1:
------------------------------
proximity: 1
word1    : good
word2    : doggo
docids   : docids1

prefixes: [d, do, dog]

batch: [
    d   -> [docids1]
    do  -> [docids1]
    dog -> [docids1]
]
```
3. For illustration purposes, let's run through a second iteration of the outer loop:
```text
Outer loop 2:
------------------------------
proximity: 1
word1    : good
word2    : door
docids   : docids2

prefixes: [d, do, doo]

batch: [
    d   -> [docids1, docids2]
    do  -> [docids1, docids2]
    dog -> [docids1]
    doo -> [docids2]
]
```
Notice that there were some conflicts which were resolved by merging the
conflicting values together. Also, an additional prefix was added at the
end of the batch.

4. On the third iteration of the outer loop, we have:
```text
Outer loop 3:
------------------------------
proximity: 1
word1    : good
word2    : ghost
```
Because `word2` begins with a different letter than the previous `word2`,
we know that all the prefixes of `word2` are greater than the prefixes of the previous word2.

Therefore, we know that we can insert every element from the batch into the
database before proceeding any further. This operation is called
“flushing the batch”. Flushing the batch should also be done whenever:
* `proximity` is different than the previous `proximity`.
* `word1` is different than the previous `word1`.
* `word2` starts with a different letter than the previous word2.

5. **Flushing the batch:** to flush the batch, we iterate over its elements:
```text
Flushing Batch loop 1:
------------------------------
proximity : 1
word1     : good
prefix    : d

docids    : [docids2, docids3]
```
We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using
`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a
roaring bitmap of all the document ids where `word1` is followed by `prefix`
at a distance of `proximity`.
Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids`
into the database.

6. That's it! ... except...

## How is it created/updated (continued)

I lied a little bit about the input data. In reality, we get two sets of the
inputs described above, which come from different places:

* For the list of sorted prefixes, we have:
    1. `new_prefixes`, which are all the prefixes that were not present in the
       database before the insertion of the new documents

    2. `common_prefixes` which are the prefixes that are present both in the
       database and in the newly added documents

* For the list of word pairs and proximities, we have:
    1. `new_word_pairs`, which is the list of word pairs and their proximities
       present in the newly added documents

    2. `word_pairs_db`, which is the list of word pairs from the database.
       This list includes all elements in `new_word_pairs` since `new_word_pairs`
       was added to the database prior to calling the `WordPrefix::execute`
       function.

To update the prefix database correctly, we call the algorithm described earlier first
on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`).
Thus:

1. For all the word pairs that were already present in the DB, we insert them
   again with the `new_prefixes`. Calling the algorithm on them with the
   `common_prefixes` would not result in any new data.

2. For all the new word pairs, we insert them twice: first with the `common_prefixes`,
   and then, because they are part of `word_pairs_db`, with the `new_prefixes`.

Note, also, that since we read data from the database when iterating over
`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-
docids from the batch directly into the database (we would have a concurrent
reader and writer). Therefore, when calling the algorithm on
`(new_prefixes, word_pairs_db)`, we insert the computed
`((proximity, word, prefix), docids)` elements in an intermediary grenad
Writer instead of the DB. At the end of the outer loop, we finally read from
the grenad and insert its elements in the database.
*/
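A toy version of steps 2 through 5 above, with a `BTreeMap` standing in for the `PrefixAndProximityBatch` defined below; the payloads are opaque strings here, whereas the real code merges serialized bitmaps with `merge_cbo_roaring_bitmap`:

```rust
use std::collections::BTreeMap;

// Toy model of the "batch": prefixes map to the payloads seen for them,
// conflicts pile up on the same entry, and a flush drains the map in key order.
fn main() {
    let mut batch: BTreeMap<&str, Vec<&str>> = BTreeMap::new();
    // Outer loop 1: (1, good, doggo) -> docids1, prefixes [d, do, dog]
    for prefix in ["d", "do", "dog"] {
        batch.entry(prefix).or_default().push("docids1");
    }
    // Outer loop 2: (1, good, door) -> docids2, prefixes [d, do, doo]
    for prefix in ["d", "do", "doo"] {
        batch.entry(prefix).or_default().push("docids2");
    }
    // Flushing the batch: iterate in sorted key order and merge the payloads.
    for (prefix, payloads) in batch {
        // d   -> ["docids1", "docids2"]
        // do  -> ["docids1", "docids2"]
        // dog -> ["docids1"]
        // doo -> ["docids2"]
        println!("{prefix} -> {payloads:?}");
    }
}
```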
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashSet;
|
||||
|
||||
use grenad::CompressionType;
|
||||
use heed::types::ByteSlice;
|
||||
use heed::BytesDecode;
|
||||
use log::debug;
|
||||
|
||||
use crate::update::index_documents::{create_writer, CursorClonableMmap};
|
||||
use crate::update::prefix_word_pairs::{
|
||||
insert_into_database, write_into_lmdb_database_without_merging,
|
||||
};
|
||||
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[logging_timer::time]
|
||||
pub fn index_word_prefix_database(
|
||||
wtxn: &mut heed::RwTxn,
|
||||
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
max_proximity: u8,
|
||||
max_prefix_length: usize,
|
||||
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
||||
new_prefix_fst_words: &[String],
|
||||
common_prefix_fst_words: &[&[String]],
|
||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||
chunk_compression_type: CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||
|
||||
// Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
|
||||
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
||||
common_prefix_fst_words
|
||||
.iter()
|
||||
.flat_map(|s| s.iter())
|
||||
.map(|s| s.as_str())
|
||||
.filter(|s| s.len() <= max_prefix_length),
|
||||
);
|
||||
|
||||
// If the prefix trie is not empty, then we can iterate over all new
|
||||
// word pairs to look for new (proximity, word1, common_prefix) elements
|
||||
// to insert in the DB
|
||||
if !prefixes.is_empty() {
|
||||
let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
|
||||
// This is the core of the algorithm
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
// the first two arguments tell how to iterate over the new word pairs
|
||||
&mut cursor,
|
||||
|cursor| {
|
||||
if let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (proximity, word1, word2) =
|
||||
UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
|
||||
Ok(Some(((proximity, word1, word2), value)))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
},
|
||||
&prefixes,
|
||||
max_proximity,
|
||||
// and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap)
|
||||
|key, value| {
|
||||
insert_into_database(
|
||||
wtxn,
|
||||
*word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
key,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
}
|
||||
|
||||
// Now we do the same thing with the new prefixes and all word pairs in the DB
|
||||
|
||||
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
||||
new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length),
|
||||
);
|
||||
|
||||
if !prefixes.is_empty() {
|
||||
let mut db_iter = word_pair_proximity_docids
|
||||
.remap_key_type::<UncheckedU8StrStrCodec>()
|
||||
.remap_data_type::<ByteSlice>()
|
||||
.iter(wtxn)?;
|
||||
|
||||
// Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix)
|
||||
// element in an intermediary grenad
|
||||
let mut writer =
|
||||
create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
|
||||
|
||||
execute_on_word_pairs_and_prefixes(
|
||||
&mut db_iter,
|
||||
|db_iter| db_iter.next().transpose().map_err(|e| e.into()),
|
||||
&prefixes,
|
||||
max_proximity,
|
||||
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
||||
)?;
|
||||
drop(db_iter);
|
||||
|
||||
// and then we write the grenad into the DB
|
||||
// Since the grenad contains only new prefixes, we know in advance that none
|
||||
// of its elements already exist in the DB, thus there is no need to specify
|
||||
// how to merge conflicting elements
|
||||
write_into_lmdb_database_without_merging(
|
||||
wtxn,
|
||||
*word_prefix_pair_proximity_docids.as_polymorph(),
|
||||
writer,
|
||||
)?;
|
||||
}
|
||||
|
||||
// All of the word prefix pairs in the database that have a w2
|
||||
// that is contained in the `suppr_pw` set must be removed as well.
|
||||
if !del_prefix_fst_words.is_empty() {
|
||||
let mut iter =
|
||||
word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
|
||||
while let Some(((_, _, prefix), _)) = iter.next().transpose()? {
|
||||
if del_prefix_fst_words.contains(prefix.as_bytes()) {
|
||||
// Delete this entry as the w2 prefix is no more in the words prefix fst.
|
||||
unsafe { iter.del_current()? };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
|
||||
///
|
||||
/// Its main arguments are:
|
||||
/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements
|
||||
/// 2. a prefix trie
|
||||
/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements
|
||||
///
|
||||
/// For more information about what this function does, read the module documentation.
|
||||
fn execute_on_word_pairs_and_prefixes<I>(
|
||||
iter: &mut I,
|
||||
mut next_word_pair_proximity: impl for<'a> FnMut(
|
||||
&'a mut I,
|
||||
) -> Result<
|
||||
Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>,
|
||||
>,
|
||||
prefixes: &PrefixTrieNode,
|
||||
max_proximity: u8,
|
||||
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
let mut batch = PrefixAndProximityBatch::default();
|
||||
let mut prev_word2_start = 0;
|
||||
|
||||
// Optimisation: the index at the root of the prefix trie where to search for
|
||||
let mut prefix_search_start = PrefixTrieNodeSearchStart(0);
|
||||
|
||||
// Optimisation: true if there are no potential prefixes for the current word2 based on its first letter
|
||||
let mut empty_prefixes = false;
|
||||
|
||||
let mut prefix_buffer = Vec::with_capacity(8);
|
||||
let mut merge_buffer = Vec::with_capacity(65_536);
|
||||
|
||||
while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? {
|
||||
// stop indexing if the proximity is over the threshold
|
||||
if proximity > max_proximity {
|
||||
break;
|
||||
};
|
||||
let word2_start_different_than_prev = word2[0] != prev_word2_start;
|
||||
// if there were no potential prefixes for the previous word2 based on its first letter,
|
||||
// and if the current word2 starts with the same letter, then there is also no potential
|
||||
// prefixes for the current word2, and we can skip to the next iteration
|
||||
if empty_prefixes && !word2_start_different_than_prev {
|
||||
continue;
|
||||
}
|
||||
|
||||
// if the proximity is different to the previous one, OR
|
||||
// if word1 is different than the previous word1, OR
|
||||
// if the start of word2 is different than the previous start of word2,
|
||||
// THEN we'll need to flush the batch
|
||||
let prox_different_than_prev = proximity != batch.proximity;
|
||||
let word1_different_than_prev = word1 != batch.word1;
|
||||
if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
|
||||
{
|
||||
batch.flush(&mut merge_buffer, &mut insert)?;
|
||||
batch.proximity = proximity;
|
||||
// don't forget to reset the value of batch.word1 and prev_word2_start
|
||||
if word1_different_than_prev {
|
||||
batch.word1.clear();
|
||||
batch.word1.extend_from_slice(word1);
|
||||
}
|
||||
if word2_start_different_than_prev {
|
||||
prev_word2_start = word2[0];
|
||||
}
|
||||
prefix_search_start.0 = 0;
|
||||
// Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
|
||||
empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
|
||||
}
|
||||
|
||||
if !empty_prefixes {
|
||||
// All conditions are satisfied, we can now insert each new prefix of word2 into the batch
|
||||
prefix_buffer.clear();
|
||||
prefixes.for_each_prefix_of(
|
||||
word2,
|
||||
&mut prefix_buffer,
|
||||
&prefix_search_start,
|
||||
|prefix_buffer| {
|
||||
batch.insert(prefix_buffer, data.to_vec());
|
||||
},
|
||||
);
|
||||
}
|
||||
}
|
||||
batch.flush(&mut merge_buffer, &mut insert)?;
|
||||
Ok(())
|
||||
}
|
||||
/**
|
||||
A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps).
|
||||
The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
|
||||
|
||||
It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently.
|
||||
|
||||
The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content
|
||||
can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments:
|
||||
- key : (proximity, word1, prefix) as bytes
|
||||
- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes
|
||||
*/
|
||||
#[derive(Default)]
|
||||
struct PrefixAndProximityBatch {
|
||||
proximity: u8,
|
||||
word1: Vec<u8>,
|
||||
#[allow(clippy::type_complexity)]
|
||||
batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>,
|
||||
}
|
||||
|
||||
impl PrefixAndProximityBatch {
|
||||
/// Insert the new key and value into the batch
|
||||
///
|
||||
/// The key must either exist in the batch or be greater than all existing keys
|
||||
fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
|
||||
match self.batch.iter_mut().find(|el| el.0 == new_key) {
|
||||
Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)),
|
||||
None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])),
|
||||
}
|
||||
}
|
||||
|
||||
/// Empties the batch, calling `insert` on each element.
|
||||
///
|
||||
/// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap.
|
||||
fn flush(
|
||||
&mut self,
|
||||
merge_buffer: &mut Vec<u8>,
|
||||
insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
let PrefixAndProximityBatch { proximity, word1, batch } = self;
|
||||
if batch.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
merge_buffer.clear();
|
||||
|
||||
let mut buffer = Vec::with_capacity(word1.len() + 1 + 6);
|
||||
buffer.push(*proximity);
|
||||
buffer.extend_from_slice(word1);
|
||||
buffer.push(0);
|
||||
|
||||
for (key, mergeable_data) in batch.drain(..) {
|
||||
buffer.truncate(1 + word1.len() + 1);
|
||||
buffer.extend_from_slice(key.as_slice());
|
||||
|
||||
let data = if mergeable_data.len() > 1 {
|
||||
CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?;
|
||||
merge_buffer.as_slice()
|
||||
} else {
|
||||
&mergeable_data[0]
|
||||
};
|
||||
insert(buffer.as_slice(), data)?;
|
||||
merge_buffer.clear();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
|
||||
within a set.
|
||||
|
||||
## Structure
|
||||
The trie is made of nodes composed of:
|
||||
1. a byte character (e.g. 'a')
|
||||
2. whether the node is an end node or not
|
||||
3. a list of children nodes, sorted by their byte character
|
||||
|
||||
For example, the trie that stores the strings `[ac, ae, ar, ch, cei, cel, ch, r, rel, ri]`
|
||||
is drawn below. Nodes with a double border are "end nodes".
|
||||
|
||||
┌──────────────────────┐ ┌──────────────────────┐ ╔══════════════════════╗
|
||||
│ a │ │ c │ ║ r ║
|
||||
└──────────────────────┘ └──────────────────────┘ ╚══════════════════════╝
|
||||
╔══════╗╔══════╗╔══════╗ ┌─────────┐ ╔═════════╗ ┌─────────┐ ╔══════════╗
|
||||
║ c ║║ e ║║ r ║ │ e │ ║ h ║ │ e │ ║ i ║
|
||||
╚══════╝╚══════╝╚══════╝ └─────────┘ ╚═════════╝ └─────────┘ ╚══════════╝
|
||||
╔═══╗ ╔═══╗ ╔═══╗
|
||||
║ i ║ ║ l ║ ║ l ║
|
||||
╚═══╝ ╚═══╝ ╚═══╝
|
||||
*/
|
||||
#[derive(Default, Debug)]
|
||||
struct PrefixTrieNode {
|
||||
children: Vec<(PrefixTrieNode, u8)>,
|
||||
is_end_node: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct PrefixTrieNodeSearchStart(usize);
|
||||
|
||||
impl PrefixTrieNode {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.children.is_empty()
|
||||
}
|
||||
|
||||
/// Returns false if the trie does not contain a prefix of the given word.
|
||||
/// Returns true if the trie *may* contain a prefix of the given word.
|
||||
///
|
||||
/// Moves the search start to the first node equal to the first letter of the word,
|
||||
/// or to 0 otherwise.
|
||||
fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool {
|
||||
let byte = word[0];
|
||||
if self.children[search_start.0].1 == byte {
|
||||
true
|
||||
} else {
|
||||
match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) {
|
||||
Ok(position) => {
|
||||
search_start.0 += position;
|
||||
true
|
||||
}
|
||||
Err(_) => {
|
||||
search_start.0 = 0;
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn from_sorted_prefixes<'a>(prefixes: impl Iterator<Item = &'a str>) -> Self {
|
||||
let mut node = PrefixTrieNode::default();
|
||||
for prefix in prefixes {
|
||||
node.insert_sorted_prefix(prefix.as_bytes().iter());
|
||||
}
|
||||
node
|
||||
}
|
||||
fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter<u8>) {
|
||||
if let Some(&c) = prefix.next() {
|
||||
if let Some((node, byte)) = self.children.last_mut() {
|
||||
if *byte == c {
|
||||
node.insert_sorted_prefix(prefix);
|
||||
return;
|
||||
}
|
||||
}
|
||||
let mut new_node = PrefixTrieNode::default();
|
||||
new_node.insert_sorted_prefix(prefix);
|
||||
self.children.push((new_node, c));
|
||||
} else {
|
||||
self.is_end_node = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// Call the given closure on each prefix of the word contained in the prefix trie.
|
||||
///
|
||||
/// The search starts from the given `search_start`.
|
||||
fn for_each_prefix_of(
|
||||
&self,
|
||||
word: &[u8],
|
||||
buffer: &mut Vec<u8>,
|
||||
search_start: &PrefixTrieNodeSearchStart,
|
||||
mut do_fn: impl FnMut(&mut Vec<u8>),
|
||||
) {
|
||||
let first_byte = word[0];
|
||||
let mut cur_node = self;
|
||||
buffer.push(first_byte);
|
||||
if let Some((child_node, c)) =
|
||||
cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte)
|
||||
{
|
||||
if *c == first_byte {
|
||||
cur_node = child_node;
|
||||
if cur_node.is_end_node {
|
||||
do_fn(buffer);
|
||||
}
|
||||
for &byte in &word[1..] {
|
||||
buffer.push(byte);
|
||||
if let Some((child_node, c)) =
|
||||
cur_node.children.iter().find(|(_, c)| *c >= byte)
|
||||
{
|
||||
if *c == byte {
|
||||
cur_node = child_node;
|
||||
if cur_node.is_end_node {
|
||||
do_fn(buffer);
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::*;
|
||||
use crate::{CboRoaringBitmapCodec, U8StrStrCodec};
|
||||
|
||||
fn check_prefixes(
|
||||
trie: &PrefixTrieNode,
|
||||
search_start: &PrefixTrieNodeSearchStart,
|
||||
word: &str,
|
||||
expected_prefixes: &[&str],
|
||||
) {
|
||||
let mut actual_prefixes = vec![];
|
||||
trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| {
|
||||
let s = String::from_utf8(x.to_owned()).unwrap();
|
||||
actual_prefixes.push(s);
|
||||
});
|
||||
assert_eq!(actual_prefixes, expected_prefixes);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_trie() {
|
||||
let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
|
||||
"1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au",
|
||||
"b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c",
|
||||
"ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com",
|
||||
"comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des",
|
||||
"di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f",
|
||||
"fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi",
|
||||
"gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i",
|
||||
"im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka",
|
||||
"ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar",
|
||||
"mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni",
|
||||
"no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi",
|
||||
"pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res",
|
||||
"ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si",
|
||||
"sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t",
|
||||
"ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve",
|
||||
"vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z",
|
||||
]));
|
||||
|
||||
let mut search_start = PrefixTrieNodeSearchStart(0);
|
||||
|
||||
let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start);
|
||||
assert!(!is_empty);
|
||||
assert_eq!(search_start.0, 2);
|
||||
|
||||
check_prefixes(&trie, &search_start, "affair", &["a"]);
|
||||
check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]);
|
||||
|
||||
let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start);
|
||||
assert!(!is_empty);
|
||||
assert_eq!(trie.children[search_start.0].1, b'u');
|
||||
|
||||
check_prefixes(&trie, &search_start, "unique", &["u", "un"]);
|
||||
|
||||
// NOTE: this should fail, because the search start is already beyong 'a'
|
||||
let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start);
|
||||
assert!(!is_empty);
|
||||
// search start is reset
|
||||
assert_eq!(search_start.0, 0);

        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
            "arb", "arbre", "cat", "catto",
        ]));
        check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]);
        check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]);
    }

    #[test]
    fn test_execute_on_word_pairs_and_prefixes() {
        let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
            "arb", "arbre", "cat", "catto",
        ]));

        let mut serialised_bitmap123 = vec![];
        let mut bitmap123 = RoaringBitmap::new();
        bitmap123.insert(1);
        bitmap123.insert(2);
        bitmap123.insert(3);
        CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123);

        let mut serialised_bitmap456 = vec![];
        let mut bitmap456 = RoaringBitmap::new();
        bitmap456.insert(4);
        bitmap456.insert(5);
        bitmap456.insert(6);
        CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456);

        let mut serialised_bitmap789 = vec![];
        let mut bitmap789 = RoaringBitmap::new();
        bitmap789.insert(7);
        bitmap789.insert(8);
        bitmap789.insert(9);
        CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789);

        let mut serialised_bitmap_ranges = vec![];
        let mut bitmap_ranges = RoaringBitmap::new();
        bitmap_ranges.insert_range(63_000..65_000);
        bitmap_ranges.insert_range(123_000..128_000);
        CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);

        let word_pairs = [
            ((1, "healthy", "arbres"), &serialised_bitmap123),
            ((1, "healthy", "boat"), &serialised_bitmap123),
            ((1, "healthy", "ca"), &serialised_bitmap123),
            ((1, "healthy", "cats"), &serialised_bitmap456),
            ((1, "healthy", "cattos"), &serialised_bitmap123),
            ((1, "jittery", "cat"), &serialised_bitmap123),
            ((1, "jittery", "cata"), &serialised_bitmap456),
            ((1, "jittery", "catb"), &serialised_bitmap789),
            ((1, "jittery", "catc"), &serialised_bitmap_ranges),
            ((2, "healthy", "arbre"), &serialised_bitmap123),
            ((2, "healthy", "arbres"), &serialised_bitmap456),
            ((2, "healthy", "cats"), &serialised_bitmap789),
            ((2, "healthy", "cattos"), &serialised_bitmap_ranges),
            ((3, "healthy", "arbre"), &serialised_bitmap456),
            ((3, "healthy", "arbres"), &serialised_bitmap789),
        ];

        let expected_result = [
            ((1, "healthy", "arb"), bitmap123.clone()),
            ((1, "healthy", "arbre"), bitmap123.clone()),
            ((1, "healthy", "cat"), &bitmap456 | &bitmap123),
            ((1, "healthy", "catto"), bitmap123.clone()),
            ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
            ((2, "healthy", "arb"), &bitmap123 | &bitmap456),
            ((2, "healthy", "arbre"), &bitmap123 | &bitmap456),
            ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges),
            ((2, "healthy", "catto"), bitmap_ranges.clone()),
        ];

        let mut result = vec![];

        let mut iter =
            IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| {
                ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice())
            });
        execute_on_word_pairs_and_prefixes(
            &mut iter,
            |iter| Ok(iter.next()),
            &prefixes,
            2,
            |k, v| {
                let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
                let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
                result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
                Ok(())
            },
        )
        .unwrap();

        for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) {
            let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x;
            let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y;

            assert_eq!(actual_word1, expected_word1);
            assert_eq!(actual_prefix, expected_prefix);
            assert_eq!(actual_proximity, expected_proximity);
            assert_eq!(actual_bitmap, expected_bitmap);
        }
    }
}
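The hunks that follow move the word-prefix sorters from raw bitmap values to DelAdd-wrapped payloads: every docids blob fetched from the database is tagged as an `Addition` through `KvWriterDelAdd` before it reaches the sorter. Below is a minimal standalone sketch of that tagging pattern; the `DelAdd` and `KvWriterDelAdd` definitions here are simplified stand-ins written for illustration, not milli's real types.

```rust
// Simplified stand-ins for milli's DelAdd / KvWriterDelAdd (assumed shapes,
// not the real API): a sorter value becomes a tiny key-value blob whose tag
// says whether the payload belongs to the deletion or the addition side.
#[allow(dead_code)]
#[derive(Clone, Copy)]
enum DelAdd {
    Deletion = 0,
    Addition = 1,
}

/// Writes (side, length, payload) entries into a caller-provided buffer.
struct KvWriterDelAdd<'a> {
    buffer: &'a mut Vec<u8>,
}

impl<'a> KvWriterDelAdd<'a> {
    fn new(buffer: &'a mut Vec<u8>) -> Self {
        Self { buffer }
    }

    /// Appends one entry tagged with its deletion/addition side.
    fn insert(&mut self, side: DelAdd, data: &[u8]) {
        self.buffer.push(side as u8);
        self.buffer.extend_from_slice(&(data.len() as u32).to_le_bytes());
        self.buffer.extend_from_slice(data);
    }

    fn into_inner(self) -> &'a [u8] {
        self.buffer.as_slice()
    }
}

fn main() {
    // Mirrors the loop bodies in the hunks below: clear the scratch buffer,
    // wrap the raw docids payload as an Addition, then hand the wrapped
    // bytes to the sorter (here we only assert on the tag byte).
    let data = b"serialized docids bitmap";
    let mut buffer = Vec::new();
    buffer.clear();
    let mut writer = KvWriterDelAdd::new(&mut buffer);
    writer.insert(DelAdd::Addition, data);
    let value = writer.into_inner();
    assert_eq!(value[0], DelAdd::Addition as u8);
}
```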
@@ -4,9 +4,11 @@ use grenad::CompressionType;
 use heed::types::{ByteSlice, Str};
 use heed::Database;
 
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
-    CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeFn,
 };
 use crate::{CboRoaringBitmapCodec, Result};
 
@@ -51,7 +53,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         // and write into it at the same time, therefore we write into another file.
         let mut prefix_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_cbo_roaring_bitmaps,
+            merge_deladd_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -92,11 +94,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
 
         // We fetch the docids associated to the newly added word prefix fst only.
         let db = self.word_docids.remap_data_type::<ByteSlice>();
+        let mut buffer = Vec::new();
         for prefix in new_prefix_fst_words {
             let prefix = std::str::from_utf8(prefix.as_bytes())?;
             for result in db.prefix_iter(self.wtxn, prefix)? {
                 let (_word, data) = result?;
-                prefix_docids_sorter.insert(prefix, data)?;
+                buffer.clear();
+                let mut writer = KvWriterDelAdd::new(&mut buffer);
+                writer.insert(DelAdd::Addition, data)?;
+
+                prefix_docids_sorter.insert(prefix, writer.into_inner()?)?;
             }
         }
 
@@ -110,12 +117,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
 
         drop(iter);
 
+        let database_is_empty = self.word_prefix_docids.is_empty(self.wtxn)?;
+
         // We finally write the word prefix docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.word_prefix_docids.as_polymorph(),
+        write_sorter_into_database(
             prefix_docids_sorter,
-            merge_cbo_roaring_bitmaps,
+            &self.word_prefix_docids,
+            self.wtxn,
+            database_is_empty,
+            deladd_serialize_add_side,
+            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
 
         Ok(())
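Note the changed write path in the last hunk above: instead of dumping the sorter with `sorter_into_lmdb_database`, `write_sorter_into_database` receives `deladd_serialize_add_side` plus a merge function whose name indicates it folds a DelAdd payload into the plain bitmap already stored in LMDB. Here is a plausible model of that fold, sketched with the `roaring` crate; `resolve_del_add` is a hypothetical helper for illustration, not milli's actual function.

```rust
use roaring::RoaringBitmap;

/// Hypothetical model of what a "merge deladd into cbo roaring bitmap" step
/// achieves: apply the deletion side first, then the addition side, to the
/// bitmap currently stored in the database. This is an assumption drawn from
/// the function name, not milli's implementation.
fn resolve_del_add(
    current: &RoaringBitmap,
    deletions: &RoaringBitmap,
    additions: &RoaringBitmap,
) -> RoaringBitmap {
    (current - deletions) | additions
}

fn main() {
    let current: RoaringBitmap = (1u32..=5).collect();
    let deletions: RoaringBitmap = [2u32, 3].into_iter().collect();
    let additions: RoaringBitmap = [9u32].into_iter().collect();

    let merged = resolve_del_add(&current, &deletions, &additions);
    let expected: RoaringBitmap = [1u32, 4, 5, 9].into_iter().collect();
    assert_eq!(merged, expected);
}
```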
@@ -9,9 +9,11 @@ use log::debug;
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
-    CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeFn,
 };
 use crate::{CboRoaringBitmapCodec, Result};
 
@@ -55,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
 
         let mut prefix_integer_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_cbo_roaring_bitmaps,
+            merge_deladd_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -108,6 +110,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
 
         // We fetch the docids associated to the newly added word prefix fst only.
         let db = self.word_database.remap_data_type::<ByteSlice>();
+        let mut buffer = Vec::new();
         for prefix_bytes in new_prefix_fst_words {
             let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| {
                 SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) }
@@ -123,7 +126,11 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
                 if word.starts_with(prefix) {
                     let key = (prefix, pos);
                     let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
-                    prefix_integer_docids_sorter.insert(bytes, data)?;
+
+                    buffer.clear();
+                    let mut writer = KvWriterDelAdd::new(&mut buffer);
+                    writer.insert(DelAdd::Addition, data)?;
+                    prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?;
                 }
             }
         }
@@ -143,12 +150,16 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
             drop(iter);
         }
 
+        let database_is_empty = self.prefix_database.is_empty(self.wtxn)?;
+
         // We finally write all the word prefix integer docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.prefix_database.as_polymorph(),
+        write_sorter_into_database(
             prefix_integer_docids_sorter,
-            merge_cbo_roaring_bitmaps,
+            &self.prefix_database,
+            self.wtxn,
+            database_is_empty,
+            deladd_serialize_add_side,
+            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
 
         Ok(())
@@ -159,6 +170,7 @@ fn write_prefixes_in_sorter(
     prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
     sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
+    // TODO: Merge before insertion.
     for (key, data_slices) in prefixes.drain() {
         for data in data_slices {
             if valid_lmdb_key(&key) {
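The added `// TODO: Merge before insertion.` hints that `write_prefixes_in_sorter` currently inserts one sorter entry per accumulated data slice under the same key, leaving the merge work to the sorter itself. A sketch of what merging up front could look like, under the simplifying assumption that each slice is a plain serialized `RoaringBitmap` (in this branch the values are DelAdd-wrapped, so a real fix would merge each side separately); `merge_slices` is hypothetical.

```rust
use roaring::RoaringBitmap;

/// Hypothetical realization of the TODO above, assuming each data slice
/// deserializes to a plain RoaringBitmap: union all slices for a key into
/// one bitmap, then insert a single value instead of N values per key.
fn merge_slices(data_slices: &[Vec<u8>]) -> std::io::Result<Vec<u8>> {
    let mut merged = RoaringBitmap::new();
    for data in data_slices {
        merged |= RoaringBitmap::deserialize_from(&data[..])?;
    }
    let mut out = Vec::new();
    merged.serialize_into(&mut out)?;
    Ok(out)
}

fn main() -> std::io::Result<()> {
    // Two slices for the same key, with overlapping docids.
    let mut a = Vec::new();
    RoaringBitmap::from_iter([1u32, 2]).serialize_into(&mut a)?;
    let mut b = Vec::new();
    RoaringBitmap::from_iter([2u32, 3]).serialize_into(&mut b)?;

    let merged = merge_slices(&[a, b])?;
    let expected = RoaringBitmap::from_iter([1u32, 2, 3]);
    assert_eq!(RoaringBitmap::deserialize_from(&merged[..])?, expected);
    Ok(())
}
```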