Implement documents format

document reader transform

remove update format

support document sequences

fix document transform

clean transform

improve error handling

add documents! macro

fix transform bug

fix tests

remove csv dependency

Add comments on the transform process

replace search cli

fmt

review edits

fix http ui

fix clippy warnings

Revert "fix clippy warnings"

This reverts commit a1ce3cd96e603633dbf43e9e0b12b2453c9c5620.

fix review comments

remove smallvec in transform loop

review edits
This commit is contained in:
mpostma
2021-08-31 11:44:15 +02:00
parent 94764e5c7c
commit aa6c5df0bc
25 changed files with 5114 additions and 713 deletions

View File

@@ -82,7 +82,7 @@ mod tests {
use heed::EnvOpenOptions;
use super::*;
use crate::update::{IndexDocuments, UpdateFormat};
use crate::update::IndexDocuments;
#[test]
fn clear_documents() {
@@ -92,14 +92,12 @@ mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "id": 0, "name": "kevin", "age": 20 },
{ "id": 1, "name": "kevina" },
{ "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
]);
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
// Clear all documents from the database.
let builder = ClearDocuments::new(&mut wtxn, &index, 1);

View File

@@ -567,7 +567,7 @@ mod tests {
use maplit::hashset;
use super::*;
use crate::update::{IndexDocuments, Settings, UpdateFormat};
use crate::update::{IndexDocuments, Settings};
use crate::FilterCondition;
#[test]
@@ -578,13 +578,12 @@ mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
// delete those documents, ids are synchronous therefore 0, 1, and 2.
@@ -609,13 +608,12 @@ mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "mysuperid": 0, "name": "kevin" },
{ "mysuperid": 1, "name": "kevina" },
{ "mysuperid": 2, "name": "benoit" }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
// Delete not all of the documents but some of them.
@@ -640,7 +638,7 @@ mod tests {
builder.set_filterable_fields(hashset! { S("label") });
builder.execute(|_, _| ()).unwrap();
let content = &br#"[
let content = documents!([
{"docid":"1_4","label":"sign"},
{"docid":"1_5","label":"letter"},
{"docid":"1_7","label":"abstract,cartoon,design,pattern"},
@@ -661,9 +659,8 @@ mod tests {
{"docid":"1_58","label":"abstract,art,cartoon"},
{"docid":"1_68","label":"design"},
{"docid":"1_69","label":"geometry"}
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
// Delete not all of the documents but some of them.
@@ -692,7 +689,7 @@ mod tests {
builder.set_sortable_fields(hashset!(S("_geo")));
builder.execute(|_, _| ()).unwrap();
let content = &r#"[
let content = documents!([
{"id":"1","city":"Lille", "_geo": { "lat": 50.629973371633746, "lng": 3.0569447399419570 } },
{"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.641586120121050, "lng": 3.1106593480348670 } },
{"id":"3","city":"Hellemmes", "_geo": { "lat": 50.631220965518080, "lng": 3.1106399673339933 } },
@@ -713,12 +710,10 @@ mod tests {
{"id":"18","city":"Amiens", "_geo": { "lat": 49.931472529669996, "lng": 2.2710499758317080 } },
{"id":"19","city":"Compiègne", "_geo": { "lat": 49.444980887725656, "lng": 2.7913841281529015 } },
{"id":"20","city":"Paris", "_geo": { "lat": 48.902100060895480, "lng": 2.3708400867406930 } }
]"#[..];
]);
let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
let external_document_ids = index.external_documents_ids(&wtxn).unwrap();
let ids_to_delete: Vec<u32> = external_ids_to_delete

View File

@@ -4,7 +4,7 @@ mod transform;
mod typed_chunk;
use std::collections::HashSet;
use std::io::{self, BufRead, BufReader};
use std::io::{Read, Seek};
use std::iter::FromIterator;
use std::num::{NonZeroU32, NonZeroUsize};
use std::time::Instant;
@@ -24,6 +24,7 @@ pub use self::helpers::{
};
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use crate::documents::DocumentBatchReader;
use crate::update::{
Facets, UpdateBuilder, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
WordsLevelPositions, WordsPrefixesFst,
@@ -57,17 +58,6 @@ pub enum WriteMethod {
GetMergePut,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[non_exhaustive]
pub enum UpdateFormat {
/// The given update is a real **comma seperated** CSV with headers on the first line.
Csv,
/// The given update is a JSON array with documents inside.
Json,
/// The given update is a JSON stream with a document on each line.
JsonStream,
}
pub struct IndexDocuments<'t, 'u, 'i, 'a> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
@@ -85,7 +75,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> {
words_positions_level_group_size: Option<NonZeroU32>,
words_positions_min_level_size: Option<NonZeroU32>,
update_method: IndexDocumentsMethod,
update_format: UpdateFormat,
autogenerate_docids: bool,
update_id: u64,
}
@@ -113,18 +102,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
words_positions_level_group_size: None,
words_positions_min_level_size: None,
update_method: IndexDocumentsMethod::ReplaceDocuments,
update_format: UpdateFormat::Json,
autogenerate_docids: false,
update_id,
}
}
pub fn index_documents_method(&mut self, method: IndexDocumentsMethod) {
self.update_method = method;
pub fn log_every_n(&mut self, n: usize) {
self.log_every_n = Some(n);
}
pub fn update_format(&mut self, format: UpdateFormat) {
self.update_format = format;
pub fn index_documents_method(&mut self, method: IndexDocumentsMethod) {
self.update_method = method;
}
pub fn enable_autogenerate_docids(&mut self) {
@@ -136,16 +124,17 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
}
#[logging_timer::time("IndexDocuments::{}")]
pub fn execute<R, F>(self, reader: R, progress_callback: F) -> Result<DocumentAdditionResult>
pub fn execute<R, F>(
self,
reader: DocumentBatchReader<R>,
progress_callback: F,
) -> Result<DocumentAdditionResult>
where
R: io::Read,
R: Read + Seek,
F: Fn(UpdateIndexingStep, u64) + Sync,
{
let mut reader = BufReader::new(reader);
reader.fill_buf()?;
// Early return when there is no document to add
if reader.buffer().is_empty() {
if reader.is_empty() {
return Ok(DocumentAdditionResult { nb_documents: 0 });
}
@@ -165,14 +154,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
autogenerate_docids: self.autogenerate_docids,
};
let output = match self.update_format {
UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?,
UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?,
UpdateFormat::JsonStream => {
transform.output_from_json_stream(reader, &progress_callback)?
}
};
let output = transform.read_documents(reader, progress_callback)?;
let nb_documents = output.documents_count;
info!("Update transformed in {:.02?}", before_transform.elapsed());
@@ -462,6 +444,7 @@ mod tests {
use heed::EnvOpenOptions;
use super::*;
use crate::documents::DocumentBatchBuilder;
use crate::update::DeleteDocuments;
use crate::HashMap;
@@ -474,9 +457,12 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
let content = documents!([
{ "id": 1, "name": "kevin" },
{ "id": 2, "name": "kevina" },
{ "id": 3, "name": "benoit" }
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -488,9 +474,8 @@ mod tests {
// Second we send 1 document with id 1, to erase the previous ones.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n1,updated kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Csv);
let content = documents!([ { "id": 1, "name": "updated kevin" } ]);
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -502,9 +487,12 @@ mod tests {
// Third we send 3 documents again to replace the existing ones.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 2);
builder.update_format(UpdateFormat::Csv);
let content = documents!([
{ "id": 1, "name": "updated second kevin" },
{ "id": 2, "name": "updated kevina" },
{ "id": 3, "name": "updated benoit" }
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 2);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -525,9 +513,12 @@ mod tests {
// First we send 3 documents with duplicate ids and
// change the index method to merge documents.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n1,kevin\n1,kevina\n1,benoit\n"[..];
let content = documents!([
{ "id": 1, "name": "kevin" },
{ "id": 1, "name": "kevina" },
{ "id": 1, "name": "benoit" }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -552,9 +543,8 @@ mod tests {
// Second we send 1 document with id 1, to force it to be merged with the previous one.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,age\n1,25\n"[..];
let content = documents!([ { "id": 1, "age": 25 } ]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Csv);
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -574,13 +564,13 @@ mod tests {
let mut doc_iter = doc.iter();
assert_eq!(doc_iter.next(), Some((0, &br#""1""#[..])));
assert_eq!(doc_iter.next(), Some((1, &br#""benoit""#[..])));
assert_eq!(doc_iter.next(), Some((2, &br#""25""#[..])));
assert_eq!(doc_iter.next(), Some((2, &br#"25"#[..])));
assert_eq!(doc_iter.next(), None);
drop(rtxn);
}
#[test]
fn not_auto_generated_csv_documents_ids() {
fn not_auto_generated_documents_ids() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
@@ -588,35 +578,12 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
assert!(builder.execute(content, |_, _| ()).is_err());
wtxn.commit().unwrap();
// Check that there is no document.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 0);
drop(rtxn);
}
#[test]
fn not_auto_generated_json_documents_ids() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// First we send 3 documents and 2 without ids.
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
{ "name": "kevina", "id": 21 },
let content = documents!([
{ "name": "kevin" },
{ "name": "kevina" },
{ "name": "benoit" }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
assert!(builder.execute(content, |_, _| ()).is_err());
wtxn.commit().unwrap();
@@ -636,10 +603,13 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
let content = documents!([
{ "name": "kevin" },
{ "name": "kevina" },
{ "name": "benoit" }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -655,10 +625,9 @@ mod tests {
// Second we send 1 document with the generated uuid, to erase the previous ones.
let mut wtxn = index.write_txn().unwrap();
let content = format!("id,name\n{},updated kevin", kevin_uuid);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Csv);
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]);
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is **always** 3 documents.
@@ -689,9 +658,12 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
let content = documents!([
{ "id": 1, "name": "kevin" },
{ "id": 2, "name": "kevina" },
{ "id": 3, "name": "benoit" }
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -703,9 +675,9 @@ mod tests {
// Second we send 1 document without specifying the id.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name\nnew kevin"[..];
let content = documents!([ { "name": "new kevin" } ]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Csv);
builder.enable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -717,7 +689,7 @@ mod tests {
}
#[test]
fn empty_csv_update() {
fn empty_update() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
@@ -725,9 +697,8 @@ mod tests {
// First we send 0 documents and only headers.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
let content = documents!([]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -738,83 +709,6 @@ mod tests {
drop(rtxn);
}
#[test]
fn json_documents() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// First we send 3 documents with an id for only one of them.
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
{ "name": "kevin" },
{ "name": "kevina", "id": 21 },
{ "name": "benoit" }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is 3 documents now.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
drop(rtxn);
}
#[test]
fn empty_json_update() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// First we send 0 documents.
let mut wtxn = index.write_txn().unwrap();
let content = &b"[]"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is no documents.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 0);
drop(rtxn);
}
#[test]
fn json_stream_documents() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
let index = Index::new(options, &path).unwrap();
// First we send 3 documents with an id for only one of them.
let mut wtxn = index.write_txn().unwrap();
let content = &br#"
{ "name": "kevin" }
{ "name": "kevina", "id": 21 }
{ "name": "benoit" }
"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::JsonStream);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
// Check that there is 3 documents now.
let rtxn = index.read_txn().unwrap();
let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3);
drop(rtxn);
}
#[test]
fn invalid_documents_ids() {
let path = tempfile::tempdir().unwrap();
@@ -825,18 +719,16 @@ mod tests {
// First we send 1 document with an invalid id.
let mut wtxn = index.write_txn().unwrap();
// There is a space in the document id.
let content = &b"id,name\nbrume bleue,kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
assert!(builder.execute(content, |_, _| ()).is_err());
wtxn.commit().unwrap();
// First we send 1 document with a valid id.
let mut wtxn = index.write_txn().unwrap();
// There is a space in the document id.
let content = &b"id,name\n32,kevin\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Csv);
let content = documents!([ { "id": 32, "name": "kevin" } ]);
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -848,7 +740,7 @@ mod tests {
}
#[test]
fn complex_json_documents() {
fn complex_documents() {
let path = tempfile::tempdir().unwrap();
let mut options = EnvOpenOptions::new();
options.map_size(10 * 1024 * 1024); // 10 MB
@@ -856,13 +748,12 @@ mod tests {
// First we send 3 documents with an id for only one of them.
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
{ "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
{ "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -893,33 +784,31 @@ mod tests {
// First we send 3 documents with an id for only one of them.
let mut wtxn = index.write_txn().unwrap();
let documents = &r#"[
let documents = documents!([
{ "id": 2, "title": "Pride and Prejudice", "author": "Jane Austin", "genre": "romance", "price": 3.5, "_geo": { "lat": 12, "lng": 42 } },
{ "id": 456, "title": "Le Petit Prince", "author": "Antoine de Saint-Exupéry", "genre": "adventure" , "price": 10.0 },
{ "id": 1, "title": "Alice In Wonderland", "author": "Lewis Carroll", "genre": "fantasy", "price": 25.99 },
{ "id": 1344, "title": "The Hobbit", "author": "J. R. R. Tolkien", "genre": "fantasy" },
{ "id": 4, "title": "Harry Potter and the Half-Blood Prince", "author": "J. K. Rowling", "genre": "fantasy" },
{ "id": 42, "title": "The Hitchhiker's Guide to the Galaxy", "author": "Douglas Adams", "_geo": { "lat": 35, "lng": 23 } }
]"#[..];
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
builder.execute(Cursor::new(documents), |_, _| ()).unwrap();
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();
let mut wtxn = index.write_txn().unwrap();
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
let documents = &r#"[
let documents = documents!([
{
"id": 2,
"author": "J. Austen",
"date": "1813"
}
]"#[..];
]);
builder.execute(Cursor::new(documents), |_, _| ()).unwrap();
builder.execute(documents, |_, _| ()).unwrap();
wtxn.commit().unwrap();
}
@@ -931,15 +820,13 @@ mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
let content = documents!([
{ "objectId": 123, "title": "Pride and Prejudice", "comment": "A great book" },
{ "objectId": 456, "title": "Le Petit Prince", "comment": "A french book" },
{ "objectId": 1, "title": "Alice In Wonderland", "comment": "A weird book" },
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
]);
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));
@@ -951,22 +838,18 @@ mod tests {
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
assert!(external_documents_ids.get("30").is_none());
let content = &br#"[
let content = documents!([
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
]);
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
assert!(external_documents_ids.get("30").is_some());
let content = &br#"[
let content = documents!([
{ "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
]);
IndexDocuments::new(&mut wtxn, &index, 0).execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
}
@@ -987,12 +870,16 @@ mod tests {
big_object.insert(key, "I am a text!");
}
let content = vec![big_object];
let content = serde_json::to_string(&content).unwrap();
let mut cursor = Cursor::new(Vec::new());
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Json);
builder.execute(Cursor::new(content), |_, _| ()).unwrap();
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
builder.add_documents(big_object).unwrap();
builder.finish().unwrap();
cursor.set_position(0);
let content = DocumentBatchReader::from_reader(cursor).unwrap();
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
}
@@ -1005,16 +892,38 @@ mod tests {
let index = Index::new(options, &path).unwrap();
let mut wtxn = index.write_txn().unwrap();
let content = r#"#id,title,au{hor,genre,price$
2,"Prideand Prejudice","Jane Austin","romance",3.5$
456,"Le Petit Prince","Antoine de Saint-Exupéry","adventure",10.0$
1,Wonderland","Lewis Carroll","fantasy",25.99$
4,"Harry Potter ing","fantasy\0lood Prince","J. K. Rowling","fantasy\0,
"#;
let content = documents!([
{
"id": 2,
"title": "Prideand Prejudice",
"au{hor": "Jane Austin",
"genre": "romance",
"price$": "3.5$",
},
{
"id": 456,
"title": "Le Petit Prince",
"au{hor": "Antoine de Saint-Exupéry",
"genre": "adventure",
"price$": "10.0$",
},
{
"id": 1,
"title": "Wonderland",
"au{hor": "Lewis Carroll",
"genre": "fantasy",
"price$": "25.99$",
},
{
"id": 4,
"title": "Harry Potter ing fantasy\0lood Prince",
"au{hor": "J. K. Rowling",
"genre": "fantasy\0",
},
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
builder.execute(content.as_bytes(), |_, _| ()).unwrap();
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
}

View File

@@ -1,12 +1,12 @@
use std::borrow::Cow;
use std::collections::btree_map::Entry;
use std::collections::HashMap;
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::iter::Peekable;
use std::result::Result as StdResult;
use std::time::Instant;
use grenad::CompressionType;
use itertools::Itertools;
use log::info;
use roaring::RoaringBitmap;
use serde_json::{Map, Value};
@@ -15,7 +15,8 @@ use super::helpers::{
create_sorter, create_writer, keep_latest_obkv, merge_obkvs, merge_two_obkvs, MergeFn,
};
use super::IndexDocumentsMethod;
use crate::error::{InternalError, UserError};
use crate::documents::{DocumentBatchReader, DocumentsBatchIndex};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{ExternalDocumentsIds, FieldDistribution, FieldId, FieldsIdsMap, Index, Result, BEU32};
@@ -51,90 +52,63 @@ pub struct Transform<'t, 'i> {
pub autogenerate_docids: bool,
}
fn is_primary_key(field: impl AsRef<str>) -> bool {
field.as_ref().to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME)
/// Create a mapping between the field ids found in the document batch and the one that were
/// already present in the index.
///
/// If new fields are present in the addition, they are added to the index field ids map.
fn create_fields_mapping(
index_field_map: &mut FieldsIdsMap,
batch_field_map: &DocumentsBatchIndex,
) -> Result<HashMap<FieldId, FieldId>> {
batch_field_map
.iter()
// we sort by id here to ensure a deterministic mapping of the fields, that preserves
// the original ordering.
.sorted_by_key(|(&id, _)| id)
.map(|(field, name)| match index_field_map.id(&name) {
Some(id) => Ok((*field, id)),
None => index_field_map
.insert(&name)
.ok_or(Error::UserError(UserError::AttributeLimitReached))
.map(|id| (*field, id)),
})
.collect()
}
fn find_primary_key(index: &bimap::BiHashMap<u16, String>) -> Option<&str> {
index
.right_values()
.find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME))
.map(String::as_str)
}
impl Transform<'_, '_> {
pub fn output_from_json<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
{
self.output_from_generic_json(reader, false, progress_callback)
}
pub fn output_from_json_stream<R, F>(
pub fn read_documents<R, F>(
self,
reader: R,
mut reader: DocumentBatchReader<R>,
progress_callback: F,
) -> Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
{
self.output_from_generic_json(reader, true, progress_callback)
}
fn output_from_generic_json<R, F>(
self,
reader: R,
is_stream: bool,
progress_callback: F,
) -> Result<TransformOutput>
where
R: Read,
R: Read + Seek,
F: Fn(UpdateIndexingStep) + Sync,
{
let fields_index = reader.index();
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
let mapping = create_fields_mapping(&mut fields_ids_map, fields_index)?;
// Deserialize the whole batch of documents in memory.
let mut documents: Peekable<
Box<dyn Iterator<Item = serde_json::Result<Map<String, Value>>>>,
> = if is_stream {
let iter = serde_json::Deserializer::from_reader(reader).into_iter();
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable()
} else {
let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?;
let iter = vec.into_iter().map(Ok);
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable()
};
let alternative_name = self
.index
.primary_key(self.rtxn)?
.or_else(|| find_primary_key(fields_index))
.map(String::from);
// We extract the primary key from the first document in
// the batch if it hasn't already been defined in the index
let first = match documents.peek().map(StdResult::as_ref).transpose() {
Ok(first) => first,
Err(_) => {
let error = documents.next().unwrap().unwrap_err();
return Err(UserError::SerdeJson(error).into());
}
};
let alternative_name =
first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
let (primary_key_id, primary_key) = compute_primary_key_pair(
let (primary_key_id, primary_key_name) = compute_primary_key_pair(
self.index.primary_key(self.rtxn)?,
&mut fields_ids_map,
alternative_name,
self.autogenerate_docids,
)?;
if documents.peek().is_none() {
return Ok(TransformOutput {
primary_key,
fields_ids_map,
field_distribution: self.index.field_distribution(self.rtxn)?,
external_documents_ids: ExternalDocumentsIds::default(),
new_documents_ids: RoaringBitmap::new(),
replaced_documents_ids: RoaringBitmap::new(),
documents_count: 0,
documents_file: tempfile::tempfile()?,
});
}
// We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch.
let merge_function = match self.index_documents_method {
@@ -151,204 +125,103 @@ impl Transform<'_, '_> {
self.max_memory,
);
let mut json_buffer = Vec::new();
let mut obkv_buffer = Vec::new();
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
let mut documents_count = 0;
for result in documents {
let document = result.map_err(UserError::SerdeJson)?;
let mut external_id_buffer = Vec::new();
let mut field_buffer: Vec<(u16, &[u8])> = Vec::new();
while let Some((addition_index, document)) = reader.next_document_with_index()? {
let mut field_buffer_cache = drop_and_reuse(field_buffer);
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
documents_seen: documents_count,
});
}
obkv_buffer.clear();
let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
// We prepare the fields ids map with the documents keys.
for (key, _value) in &document {
fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?;
for (k, v) in document.iter() {
let mapped_id = *mapping.get(&k).unwrap();
field_buffer_cache.push((mapped_id, v));
}
// We retrieve the user id from the document based on the primary key name,
// if the document id isn't present we generate a uuid.
let external_id = match document.get(&primary_key) {
Some(value) => match value {
Value::String(string) => Cow::Borrowed(string.as_str()),
Value::Number(number) => Cow::Owned(number.to_string()),
content => {
return Err(
UserError::InvalidDocumentId { document_id: content.clone() }.into()
)
// We need to make sure that every document has a primary key. After we have remapped
// all the fields in the document, we try to find the primary key value. If we can find
// it, transform it into a string and validate it, and then update it in the
// document. If none is found, and we were told to generate missing document ids, then
// we create the missing field, and update the new document.
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
let external_id =
match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) {
Some((_, bytes)) => {
let value = match serde_json::from_slice(bytes).unwrap() {
Value::String(string) => match validate_document_id(&string) {
Some(s) if s.len() == string.len() => string,
Some(s) => s.to_string(),
None => {
return Err(UserError::InvalidDocumentId {
document_id: Value::String(string),
}
.into())
}
},
Value::Number(number) => number.to_string(),
content => {
return Err(UserError::InvalidDocumentId {
document_id: content.clone(),
}
.into())
}
};
serde_json::to_writer(&mut external_id_buffer, &value).unwrap();
*bytes = &external_id_buffer;
Cow::Owned(value)
}
},
None => {
if !self.autogenerate_docids {
return Err(UserError::MissingDocumentId { document }.into());
}
let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
Cow::Borrowed(uuid)
}
};
None => {
if !self.autogenerate_docids {
let mut json = Map::new();
for (key, value) in document.iter() {
let key = addition_index.get_by_left(&key).cloned();
let value = serde_json::from_slice::<Value>(&value).ok();
// We iterate in the fields ids ordered.
for (field_id, name) in fields_ids_map.iter() {
json_buffer.clear();
if let Some((k, v)) = key.zip(value) {
json.insert(k, v);
}
}
// We try to extract the value from the document and if we don't find anything
// and this should be the document id we return the one we generated.
if let Some(value) = document.get(name) {
// We serialize the attribute values.
serde_json::to_writer(&mut json_buffer, value)
.map_err(InternalError::SerdeJson)?;
writer.insert(field_id, &json_buffer)?;
}
// We validate the document id [a-zA-Z0-9\-_].
if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
return Err(UserError::MissingDocumentId { document: json }.into());
}
let uuid =
uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
serde_json::to_writer(&mut external_id_buffer, &uuid).unwrap();
field_buffer_cache.push((primary_key_id, &external_id_buffer));
Cow::Borrowed(&*uuid)
}
.into());
}
};
// Insertion in a obkv need to be done with keys ordered. For now they are ordered
// according to the document addition key order, so we sort it according to the
// fieldids map keys order.
field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(&f2));
// The last step is to build the new obkv document, and insert it in the sorter.
let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
for (k, v) in field_buffer_cache.iter() {
writer.insert(*k, v)?;
}
// We use the extracted/generated user id as the key for this document.
sorter.insert(external_id.as_bytes(), &obkv_buffer)?;
sorter.insert(&external_id.as_ref().as_bytes(), &obkv_buffer)?;
documents_count += 1;
}
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
documents_seen: documents_count,
});
// Now that we have a valid sorter that contains the user id and the obkv we
// give it to the last transforming function which returns the TransformOutput.
self.output_from_sorter(
sorter,
primary_key,
fields_ids_map,
documents_count,
external_documents_ids,
progress_callback,
)
}
pub fn output_from_csv<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
{
let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
let mut csv = csv::Reader::from_reader(reader);
let headers = csv.headers().map_err(UserError::Csv)?;
let mut fields_ids = Vec::new();
// Generate the new fields ids based on the current fields ids and this CSV headers.
for (i, header) in headers.iter().enumerate() {
let id = fields_ids_map.insert(header).ok_or(UserError::AttributeLimitReached)?;
fields_ids.push((id, i));
}
// Extract the position of the primary key in the current headers, None if not found.
let primary_key_pos = match self.index.primary_key(self.rtxn)? {
Some(primary_key) => {
// The primary key is known so we must find the position in the CSV headers.
headers.iter().position(|h| h == primary_key)
}
None => headers.iter().position(is_primary_key),
};
// Returns the field id in the fields ids map, create an "id" field
// in case it is not in the current headers.
let alternative_name = primary_key_pos.map(|pos| headers[pos].to_string());
let (primary_key_id, primary_key_name) = compute_primary_key_pair(
self.index.primary_key(self.rtxn)?,
&mut fields_ids_map,
alternative_name,
self.autogenerate_docids,
)?;
// The primary key field is not present in the header, so we need to create it.
if primary_key_pos.is_none() {
fields_ids.push((primary_key_id, usize::max_value()));
}
// We sort the fields ids by the fields ids map id, this way we are sure to iterate over
// the records fields in the fields ids map order and correctly generate the obkv.
fields_ids.sort_unstable_by_key(|(field_id, _)| *field_id);
// We initialize the sorter with the user indexing settings.
let mut sorter = create_sorter(
keep_latest_obkv,
self.chunk_compression_type,
self.chunk_compression_level,
self.max_nb_chunks,
self.max_memory,
);
// We write into the sorter to merge and deduplicate the documents
// based on the external ids.
let mut json_buffer = Vec::new();
let mut obkv_buffer = Vec::new();
let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
let mut documents_count = 0;
let mut record = csv::StringRecord::new();
while csv.read_record(&mut record).map_err(UserError::Csv)? {
obkv_buffer.clear();
let mut writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
documents_seen: documents_count,
});
}
// We extract the user id if we know where it is or generate an UUID V4 otherwise.
let external_id = match primary_key_pos {
Some(pos) => {
let external_id = &record[pos];
// We validate the document id [a-zA-Z0-9\-_].
match validate_document_id(&external_id) {
Some(valid) => valid,
None => {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
}
.into())
}
}
}
None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
};
// When the primary_key_field_id is found in the fields ids list
// we return the generated document id instead of the record field.
let iter = fields_ids.iter().map(|(fi, i)| {
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
(fi, field)
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
documents_seen: documents_count,
});
// We retrieve the field id based on the fields ids map fields ids order.
for (field_id, field) in iter {
// We serialize the attribute values as JSON strings.
json_buffer.clear();
serde_json::to_writer(&mut json_buffer, &field)
.map_err(InternalError::SerdeJson)?;
writer.insert(*field_id, &json_buffer)?;
}
// We use the extracted/generated user id as the key for this document.
sorter.insert(external_id, &obkv_buffer)?;
documents_count += 1;
obkv_buffer.clear();
field_buffer = drop_and_reuse(field_buffer_cache);
external_id_buffer.clear();
}
progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
progress_callback(UpdateIndexingStep::RemapDocumentAddition {
documents_seen: documents_count,
});
@@ -359,7 +232,6 @@ impl Transform<'_, '_> {
primary_key_name,
fields_ids_map,
documents_count,
external_documents_ids,
progress_callback,
)
}
@@ -373,12 +245,12 @@ impl Transform<'_, '_> {
primary_key: String,
fields_ids_map: FieldsIdsMap,
approximate_number_of_documents: usize,
mut external_documents_ids: ExternalDocumentsIds<'_>,
progress_callback: F,
) -> Result<TransformOutput>
where
F: Fn(UpdateIndexingStep) + Sync,
{
let mut external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
let documents_ids = self.index.documents_ids(self.rtxn)?;
let mut field_distribution = self.index.field_distribution(self.rtxn)?;
let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
@@ -610,6 +482,17 @@ fn validate_document_id(document_id: &str) -> Option<&str> {
})
}
/// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec<T>`.
///
/// The size and alignment of T and U must match.
fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {
debug_assert_eq!(std::mem::align_of::<U>(), std::mem::align_of::<T>());
debug_assert_eq!(std::mem::size_of::<U>(), std::mem::size_of::<T>());
vec.clear();
debug_assert!(vec.is_empty());
vec.into_iter().map(|_| unreachable!()).collect()
}
#[cfg(test)]
mod test {
use super::*;

View File

@@ -2,9 +2,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
pub use self::clear_documents::ClearDocuments;
pub use self::delete_documents::DeleteDocuments;
pub use self::facets::Facets;
pub use self::index_documents::{
DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat,
};
pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod};
pub use self::settings::{Setting, Settings};
pub use self::update_builder::UpdateBuilder;
pub use self::update_step::UpdateIndexingStep;

View File

@@ -111,6 +111,10 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
}
pub fn log_every_n(&mut self, n: usize) {
self.log_every_n = Some(n);
}
pub fn reset_searchable_fields(&mut self) {
self.searchable_fields = Setting::Reset;
}
@@ -501,7 +505,7 @@ mod tests {
use super::*;
use crate::error::Error;
use crate::update::{IndexDocuments, UpdateFormat};
use crate::update::IndexDocuments;
use crate::{Criterion, FilterCondition, SearchResult};
#[test]
@@ -513,9 +517,13 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"id,name,age\n0,kevin,23\n1,kevina,21\n2,benoit,34\n"[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.update_format(UpdateFormat::Csv);
let content = documents!([
{ "id": 1, "name": "kevin", "age": 23 },
{ "id": 2, "name": "kevina", "age": 21},
{ "id": 3, "name": "benoit", "age": 34 }
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -567,10 +575,13 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -611,10 +622,13 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -633,10 +647,13 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
// In the same transaction we change the displayed fields to be only the age.
@@ -678,13 +695,12 @@ mod tests {
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
let content = &br#"[
{ "name": "kevin", "age": 23 },
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]"#[..];
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.enable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -695,11 +711,19 @@ mod tests {
assert_eq!(fields_ids, hashset! { S("age") });
// Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood.
let fidmap = index.fields_ids_map(&rtxn).unwrap();
println!("fidmap: {:?}", fidmap);
for document in index.all_documents(&rtxn).unwrap() {
let document = document.unwrap();
let json = crate::obkv_to_json(&fidmap.ids().collect::<Vec<_>>(), &fidmap, document.1)
.unwrap();
println!("json: {:?}", json);
}
let count = index
.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
// The faceted field id is 2u16
.prefix_iter(&rtxn, &[0, 2, 0])
// The faceted field id is 1u16
.prefix_iter(&rtxn, &[0, 1, 0])
.unwrap()
.count();
assert_eq!(count, 3);
@@ -707,25 +731,23 @@ mod tests {
// Index a little more documents with new and current facets values.
let mut wtxn = index.write_txn().unwrap();
let content = &br#"[
{ "name": "kevin2", "age": 23 },
let content = documents!([
{ "name": "kevin2", "age": 23},
{ "name": "kevina2", "age": 21 },
{ "name": "benoit", "age": 35 }
]"#[..];
{ "name": "benoit", "age": 35 }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 2);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Json);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood.
let count = index
.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 2, 0])
.prefix_iter(&rtxn, &[0, 1, 0])
.unwrap()
.count();
assert_eq!(count, 4);
@@ -747,13 +769,12 @@ mod tests {
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
let content = &br#"[
{ "name": "kevin", "age": 23 },
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]"#[..];
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.enable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -790,7 +811,7 @@ mod tests {
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
let content = &br#"[
let content = documents!([
{ "name": "kevin", "age": 23 },
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 },
@@ -798,9 +819,8 @@ mod tests {
{ "name": "bertrand", "age": 34 },
{ "name": "bernie", "age": 34 },
{ "name": "ben", "age": 34 }
]"#[..];
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.enable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -822,10 +842,13 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23},
{ "name": "kevina", "age": 21 },
{ "name": "benoit", "age": 34 }
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -844,10 +867,13 @@ mod tests {
// First we send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23, "maxim": "I love dogs" },
{ "name": "kevina", "age": 21, "maxim": "Doggos are the best" },
{ "name": "benoit", "age": 34, "maxim": "The crepes are really good" },
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
// In the same transaction we provide some stop_words
@@ -915,10 +941,13 @@ mod tests {
// Send 3 documents with ids from 1 to 3.
let mut wtxn = index.write_txn().unwrap();
let content = &b"name,age,maxim\nkevin,23,I love dogs\nkevina,21,Doggos are the best\nbenoit,34,The crepes are really good\n"[..];
let content = documents!([
{ "name": "kevin", "age": 23, "maxim": "I love dogs"},
{ "name": "kevina", "age": 21, "maxim": "Doggos are the best"},
{ "name": "benoit", "age": 34, "maxim": "The crepes are really good"},
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
builder.enable_autogenerate_docids();
builder.update_format(UpdateFormat::Csv);
builder.execute(content, |_, _| ()).unwrap();
// In the same transaction provide some synonyms
@@ -1038,7 +1067,7 @@ mod tests {
assert_eq!(index.primary_key(&wtxn).unwrap(), Some("mykey"));
// Then index some documents with the "mykey" primary key.
let content = &br#"[
let content = documents!([
{ "mykey": 1, "name": "kevin", "age": 23 },
{ "mykey": 2, "name": "kevina", "age": 21 },
{ "mykey": 3, "name": "benoit", "age": 34 },
@@ -1046,9 +1075,8 @@ mod tests {
{ "mykey": 5, "name": "bertrand", "age": 34 },
{ "mykey": 6, "name": "bernie", "age": 34 },
{ "mykey": 7, "name": "ben", "age": 34 }
]"#[..];
]);
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
builder.disable_autogenerate_docids();
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();
@@ -1087,7 +1115,7 @@ mod tests {
builder.set_filterable_fields(hashset! { S("genres") });
builder.execute(|_, _| ()).unwrap();
let content = &br#"[
let content = documents!([
{
"id": 11,
"title": "Star Wars",
@@ -1105,9 +1133,8 @@ mod tests {
"poster": "https://image.tmdb.org/t/p/w500/gSuHDeWemA1menrwfMRChnSmMVN.jpg",
"release_date": 819676800
}
]"#[..];
let mut builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.update_format(UpdateFormat::Json);
]);
let builder = IndexDocuments::new(&mut wtxn, &index, 1);
builder.execute(content, |_, _| ()).unwrap();
wtxn.commit().unwrap();

View File

@@ -2,10 +2,9 @@ use UpdateIndexingStep::*;
#[derive(Debug, Clone, Copy)]
pub enum UpdateIndexingStep {
/// Transform from the original user given format (CSV, JSON, JSON lines)
/// into a generic format based on the obkv and grenad crates. This step also
/// deduplicate potential documents in this batch update by merging or replacing them.
TransformFromUserIntoGenericFormat { documents_seen: usize },
/// Remap document addition fields the one present in the database, adding new fields in to the
/// schema on the go.
RemapDocumentAddition { documents_seen: usize },
/// This step check the external document id, computes the internal ids and merge
/// the documents that are already present in the database.
@@ -23,7 +22,7 @@ pub enum UpdateIndexingStep {
impl UpdateIndexingStep {
pub const fn step(&self) -> usize {
match self {
TransformFromUserIntoGenericFormat { .. } => 0,
RemapDocumentAddition { .. } => 0,
ComputeIdsAndMergeDocuments { .. } => 1,
IndexDocuments { .. } => 2,
MergeDataIntoFinalDatabase { .. } => 3,