Fix the tests for the new DocumentsBatchBuilder/Reader

This commit is contained in:
Kerollmops
2022-06-14 16:04:27 +02:00
parent 419ce3966c
commit e8297ad27e
9 changed files with 292 additions and 374 deletions

View File

@@ -25,7 +25,7 @@ pub use self::helpers::{
};
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use crate::documents::DocumentBatchReader;
use crate::documents::DocumentsBatchReader;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
@@ -121,7 +121,7 @@ where
/// builder, and the builder must be discarded.
///
/// Returns the number of documents added to the builder.
pub fn add_documents<R>(&mut self, reader: DocumentBatchReader<R>) -> Result<u64>
pub fn add_documents<R>(&mut self, reader: DocumentsBatchReader<R>) -> Result<u64>
where
R: Read + Seek,
{
@@ -590,9 +590,8 @@ mod tests {
use maplit::hashset;
use super::*;
use crate::documents::DocumentBatchBuilder;
use crate::documents::DocumentsBatchBuilder;
use crate::update::DeleteDocuments;
use crate::HashMap;
#[test]
fn simple_document_replacement() {
@@ -1252,21 +1251,17 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut big_object = HashMap::new();
big_object.insert(S("id"), "wow");
let mut big_object = serde_json::Map::new();
big_object.insert(S("id"), serde_json::Value::from("wow"));
for i in 0..1000 {
let key = i.to_string();
big_object.insert(key, "I am a text!");
big_object.insert(key, serde_json::Value::from("I am a text!"));
}
let mut cursor = Cursor::new(Vec::new());
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap());
builder.extend_from_json(big_object).unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let content = DocumentBatchReader::from_reader(cursor).unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(&big_object).unwrap();
let vector = builder.into_inner().unwrap();
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
@@ -1288,23 +1283,19 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut big_object = HashMap::new();
big_object.insert(S("id"), "wow");
let mut big_object = serde_json::Map::new();
big_object.insert(S("id"), serde_json::Value::from("wow"));
let content: String = (0..=u16::MAX)
.into_iter()
.map(|p| p.to_string())
.reduce(|a, b| a + " " + b.as_ref())
.unwrap();
big_object.insert("content".to_string(), &content);
big_object.insert("content".to_string(), serde_json::Value::from(content));
let mut cursor = Cursor::new(Vec::new());
let big_object = serde_json::to_string(&big_object).unwrap();
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.extend_from_json(&mut big_object.as_bytes()).unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let content = DocumentBatchReader::from_reader(cursor).unwrap();
let mut builder = DocumentsBatchBuilder::new(Vec::new());
builder.append_json_object(&big_object).unwrap();
let vector = builder.into_inner().unwrap();
let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
let config = IndexerConfig::default();
let indexing_config = IndexDocumentsConfig::default();
@@ -1843,18 +1834,20 @@ mod tests {
// Create 200 documents with a long text
let content = {
let documents: Vec<_> = (0..200i32)
let documents_iter = (0..200i32)
.into_iter()
.map(|i| serde_json::json!({ "id": i, "script": script }))
.collect();
.filter_map(|json| match json {
serde_json::Value::Object(object) => Some(object),
_ => None,
});
let mut writer = std::io::Cursor::new(Vec::new());
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
let documents = serde_json::to_vec(&documents).unwrap();
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
builder.finish().unwrap();
writer.set_position(0);
crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
for object in documents_iter {
builder.append_json_object(&object).unwrap();
}
let vector = builder.into_inner().unwrap();
crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap()
};
// Index those 200 long documents

View File

@@ -14,7 +14,7 @@ use smartstring::SmartString;
use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentBatchReader, DocumentsBatchIndex};
use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
@@ -152,7 +152,7 @@ impl<'a, 'i> Transform<'a, 'i> {
pub fn read_documents<R, F>(
&mut self,
mut reader: DocumentBatchReader<R>,
reader: DocumentsBatchReader<R>,
wtxn: &mut heed::RwTxn,
progress_callback: F,
) -> Result<usize>
@@ -160,7 +160,8 @@ impl<'a, 'i> Transform<'a, 'i> {
R: Read + Seek,
F: Fn(UpdateIndexingStep) + Sync,
{
let fields_index = reader.index();
let mut cursor = reader.into_cursor();
let fields_index = cursor.documents_batch_index();
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
@@ -186,7 +187,8 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut documents_count = 0;
let mut external_id_buffer = Vec::new();
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
while let Some((addition_index, document)) = reader.next_document_with_index()? {
let addition_index = cursor.documents_batch_index().clone();
while let Some(document) = cursor.next_document()? {
let mut field_buffer_cache = drop_and_reuse(field_buffer);
if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
@@ -840,7 +842,7 @@ fn update_primary_key<'a>(
None => {
let mut json = Map::new();
for (key, value) in document.iter() {
let key = addition_index.name(key).cloned();
let key = addition_index.name(key).map(ToString::to_string);
let value = serde_json::from_slice::<Value>(&value).ok();
if let Some((k, v)) = key.zip(value) {