mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-09 14:16:33 +00:00
Post processing of the merge
This commit is contained in:
@ -1,73 +1,140 @@
|
||||
use std::cell::RefCell;
|
||||
|
||||
use bumpalo::Bump;
|
||||
use hashbrown::HashMap;
|
||||
|
||||
use super::DelAddRoaringBitmap;
|
||||
use crate::update::new::channel::DocumentsSender;
|
||||
use crate::update::new::document::write_to_obkv;
|
||||
use crate::update::new::document::{write_to_obkv, Document as _};
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
DocumentChangeContext, Extractor, FullySend, RefCellExt as _,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::Result;
|
||||
|
||||
/// Extractor (see the `Extractor` impl for this type) that forwards document
/// changes to a borrowed `DocumentsSender`.
// NOTE(review): this text is a rendered diff, so the pre-change field
// (`documents_sender`) and the post-change fields (`document_sender`,
// `embedders`) both appear below — confirm against the repository which
// revision is intended before compiling.
pub struct DocumentsExtractor<'a> {
|
||||
// Sender used by `process` to call `delete` and `uncompressed` (older field name).
documents_sender: &'a DocumentsSender<'a>,
|
||||
// Sender used by `process` to call `delete` and `uncompressed` (newer field name).
document_sender: &'a DocumentsSender<'a>,
|
||||
// Embedding configuration handed to `merged_vectors` / `inserted_vectors` in `process`.
embedders: &'a EmbeddingConfigs,
|
||||
}
|
||||
|
||||
// NOTE(review): two revisions of `new` are interleaved here (rendered diff).
// The first takes only the sender; the second also takes the embedder
// configuration. Only one closing brace for `new` is present in this text.
impl<'a> DocumentsExtractor<'a> {
|
||||
/// Older constructor: stores the borrowed sender as-is.
pub fn new(documents_sender: &'a DocumentsSender<'a>) -> Self {
|
||||
Self { documents_sender }
|
||||
/// Newer constructor: stores the borrowed sender and the borrowed embedder configs.
pub fn new(document_sender: &'a DocumentsSender<'a>, embedders: &'a EmbeddingConfigs) -> Self {
|
||||
Self { document_sender, embedders }
|
||||
}
|
||||
}
|
||||
|
||||
/// Data accumulated by `DocumentsExtractor::process` while handling document changes.
///
/// It is created through `Default` in `init_data` and wrapped in
/// `FullySend<RefCell<_>>` as the extractor's `Data` type.
#[derive(Default)]
|
||||
pub struct DocumentExtractorData {
|
||||
/// Document ids recorded as deleted (`insert_del_u32`) or added (`insert_add_u32`).
pub docids_delta: DelAddRoaringBitmap,
|
||||
/// Signed counter per top-level field: decremented for each field yielded by the
/// current version of a deleted/updated document, incremented for each field
/// yielded by the updated/inserted version.
pub field_distribution_delta: HashMap<String, i64>,
|
||||
}
|
||||
|
||||
// `Extractor` implementation for `DocumentsExtractor`.
//
// NOTE(review): this text is a rendered diff, so removed and added lines are
// interleaved. The older revision processed a single `DocumentChange` and tracked
// a `DelAddRoaringBitmap`; the newer revision iterates over a stream of changes
// and tracks a `DocumentExtractorData`. Pick one revision before compiling.
impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
|
||||
// Older associated data type: only the deleted/added docids.
type Data = FullySend<RefCell<DelAddRoaringBitmap>>;
|
||||
// Newer associated data type: docids delta plus field distribution delta.
type Data = FullySend<RefCell<DocumentExtractorData>>;
|
||||
|
||||
/// Creates the initial (empty / default) data for this extractor.
/// The `_extractor_alloc` bump allocator is not used.
fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(FullySend(RefCell::new(DelAddRoaringBitmap::empty())))
|
||||
Ok(FullySend(Default::default()))
|
||||
}
|
||||
|
||||
// Handles document changes: updates the data stored in `context.data` and
// forwards deletions / serialized documents through the sender.
// Errors from reading or serializing documents are propagated with `?`;
// the results of the sender calls are `unwrap`ped, so a failed send panics.
// The first signature below is the older single-change form, the second the
// newer form taking an iterator of fallible changes.
fn process(
|
||||
fn process<'doc>(
|
||||
&self,
|
||||
change: DocumentChange,
|
||||
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
// Older revision: heap `Vec` buffer and a mutable borrow of the docids bitmap.
let mut document_buffer = Vec::new();
|
||||
let mut delta_documents_ids = context.data.0.borrow_mut_or_yield();
|
||||
// Newer revision: buffer allocated in `context.doc_alloc`, and a mutable borrow
// of the whole `DocumentExtractorData`.
let mut document_buffer = bumpalo::collections::Vec::new_in(&context.doc_alloc);
|
||||
let mut document_extractor_data = context.data.0.borrow_mut_or_yield();
|
||||
|
||||
// Older revision: shared borrow of the fields ids map, narrowed with `local_map()`.
let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield();
|
||||
let new_fields_ids_map = &*new_fields_ids_map;
|
||||
let new_fields_ids_map = new_fields_ids_map.local_map();
|
||||
// Newer revision: mutable borrow, later passed as `&mut` to `write_to_obkv`.
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
|
||||
|
||||
// Older revision: a single change, so the external docid is read once.
let external_docid = change.external_docid().to_owned();
|
||||
// Newer revision: loop over every change, stopping at the first `Err`.
for change in changes {
|
||||
let change = change?;
|
||||
let external_docid = change.external_docid().to_owned();
|
||||
|
||||
// document but we need to create a function that collects and compresses documents.
|
||||
// Older revision of the match; its closing braces are not present in this
// rendered diff.
match change {
|
||||
DocumentChange::Deletion(deletion) => {
|
||||
let docid = deletion.docid();
|
||||
self.documents_sender.delete(docid, external_docid).unwrap();
|
||||
delta_documents_ids.insert_del_u32(docid);
|
||||
}
|
||||
/// TODO: change NONE by SOME(vector) when implemented
|
||||
DocumentChange::Update(update) => {
|
||||
let docid = update.docid();
|
||||
let content =
|
||||
update.new(&context.txn, context.index, &context.db_fields_ids_map)?;
|
||||
let content =
|
||||
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
|
||||
self.documents_sender.uncompressed(docid, external_docid, content).unwrap();
|
||||
}
|
||||
DocumentChange::Insertion(insertion) => {
|
||||
let docid = insertion.docid();
|
||||
let content = insertion.new();
|
||||
let content =
|
||||
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
|
||||
self.documents_sender.uncompressed(docid, external_docid, content).unwrap();
|
||||
delta_documents_ids.insert_add_u32(docid);
|
||||
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
|
||||
// document but we need to create a function that collects and compresses documents.
|
||||
// Newer revision of the match.
match change {
|
||||
// Deletion: decrement the counter of every top-level field of the current
// document, record the docid as deleted, then send the deletion.
DocumentChange::Deletion(deletion) => {
|
||||
let docid = deletion.docid();
|
||||
let content = deletion.current(
|
||||
&context.txn,
|
||||
context.index,
|
||||
&context.db_fields_ids_map,
|
||||
)?;
|
||||
for res in content.iter_top_level_fields() {
|
||||
let (f, _) = res?;
|
||||
let entry = document_extractor_data
|
||||
.field_distribution_delta
|
||||
.entry_ref(f)
|
||||
.or_default();
|
||||
*entry -= 1;
|
||||
}
|
||||
document_extractor_data.docids_delta.insert_del_u32(docid);
|
||||
self.document_sender.delete(docid, external_docid).unwrap();
|
||||
}
|
||||
// Update: decrement counters for the fields of the current document, increment
// them for the fields of `update.updated()`, then serialize the merged document
// together with its merged vectors and send it. `docids_delta` is not modified
// in this arm.
DocumentChange::Update(update) => {
|
||||
let docid = update.docid();
|
||||
let content =
|
||||
update.current(&context.txn, context.index, &context.db_fields_ids_map)?;
|
||||
for res in content.iter_top_level_fields() {
|
||||
let (f, _) = res?;
|
||||
let entry = document_extractor_data
|
||||
.field_distribution_delta
|
||||
.entry_ref(f)
|
||||
.or_default();
|
||||
*entry -= 1;
|
||||
}
|
||||
let content = update.updated();
|
||||
for res in content.iter_top_level_fields() {
|
||||
let (f, _) = res?;
|
||||
let entry = document_extractor_data
|
||||
.field_distribution_delta
|
||||
.entry_ref(f)
|
||||
.or_default();
|
||||
*entry += 1;
|
||||
}
|
||||
|
||||
let content =
|
||||
update.merged(&context.txn, context.index, &context.db_fields_ids_map)?;
|
||||
let vector_content = update.merged_vectors(
|
||||
&context.txn,
|
||||
context.index,
|
||||
&context.db_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
self.embedders,
|
||||
)?;
|
||||
let content = write_to_obkv(
|
||||
&content,
|
||||
vector_content.as_ref(),
|
||||
&mut new_fields_ids_map,
|
||||
&mut document_buffer,
|
||||
)?;
|
||||
self.document_sender.uncompressed(docid, external_docid, content).unwrap();
|
||||
}
|
||||
// Insertion: increment the counter of every top-level field of the inserted
// document, serialize it with its inserted vectors, record the docid as added,
// then send it.
DocumentChange::Insertion(insertion) => {
|
||||
let docid = insertion.docid();
|
||||
let content = insertion.inserted();
|
||||
for res in content.iter_top_level_fields() {
|
||||
let (f, _) = res?;
|
||||
let entry = document_extractor_data
|
||||
.field_distribution_delta
|
||||
.entry_ref(f)
|
||||
.or_default();
|
||||
*entry += 1;
|
||||
}
|
||||
let inserted_vectors =
|
||||
insertion.inserted_vectors(&context.doc_alloc, self.embedders)?;
|
||||
let content = write_to_obkv(
|
||||
&content,
|
||||
inserted_vectors.as_ref(),
|
||||
&mut new_fields_ids_map,
|
||||
&mut document_buffer,
|
||||
)?;
|
||||
document_extractor_data.docids_delta.insert_add_u32(docid);
|
||||
self.document_sender.uncompressed(docid, external_docid, content).unwrap();
|
||||
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user