mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	Fix the benchmark tests
This commit is contained in:
		
							
								
								
									
										28
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										28
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -494,11 +494,13 @@ name = "benchmarks" | |||||||
| version = "1.11.0" | version = "1.11.0" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  |  "bumpalo", | ||||||
|  "bytes", |  "bytes", | ||||||
|  "convert_case 0.6.0", |  "convert_case 0.6.0", | ||||||
|  "criterion", |  "criterion", | ||||||
|  "csv", |  "csv", | ||||||
|  "flate2", |  "flate2", | ||||||
|  |  "memmap2", | ||||||
|  "milli", |  "milli", | ||||||
|  "mimalloc", |  "mimalloc", | ||||||
|  "rand", |  "rand", | ||||||
| @@ -506,6 +508,7 @@ dependencies = [ | |||||||
|  "reqwest", |  "reqwest", | ||||||
|  "roaring", |  "roaring", | ||||||
|  "serde_json", |  "serde_json", | ||||||
|  |  "tempfile", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| @@ -1860,9 +1863,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "fastrand" | name = "fastrand" | ||||||
| version = "2.1.0" | version = "2.2.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" | checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "file-store" | name = "file-store" | ||||||
| @@ -2869,9 +2872,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "libc" | name = "libc" | ||||||
| version = "0.2.155" | version = "0.2.164" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" | checksum = "433bfe06b8c75da9b2e3fbea6e5329ff87748f0b144ef75306e674c3f6f7c13f" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "libgit2-sys" | name = "libgit2-sys" | ||||||
| @@ -3255,9 +3258,9 @@ checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "linux-raw-sys" | name = "linux-raw-sys" | ||||||
| version = "0.4.12" | version = "0.4.14" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "c4cd1a83af159aa67994778be9070f0ae1bd732942279cabb14f86f986a21456" | checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "liquid" | name = "liquid" | ||||||
| @@ -3591,9 +3594,9 @@ checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "memmap2" | name = "memmap2" | ||||||
| version = "0.9.4" | version = "0.9.5" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "fe751422e4a8caa417e13c3ea66452215d7d63e19e604f4980461212f3ae1322" | checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "libc", |  "libc", | ||||||
|  "stable_deref_trait", |  "stable_deref_trait", | ||||||
| @@ -4801,9 +4804,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "rustix" | name = "rustix" | ||||||
| version = "0.38.31" | version = "0.38.41" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" | checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "bitflags 2.6.0", |  "bitflags 2.6.0", | ||||||
|  "errno", |  "errno", | ||||||
| @@ -5372,12 +5375,13 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "tempfile" | name = "tempfile" | ||||||
| version = "3.10.1" | version = "3.14.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" | checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "cfg-if", |  "cfg-if", | ||||||
|  "fastrand", |  "fastrand", | ||||||
|  |  "once_cell", | ||||||
|  "rustix", |  "rustix", | ||||||
|  "windows-sys 0.52.0", |  "windows-sys 0.52.0", | ||||||
| ] | ] | ||||||
|   | |||||||
| @@ -12,10 +12,13 @@ license.workspace = true | |||||||
|  |  | ||||||
| [dependencies] | [dependencies] | ||||||
| anyhow = "1.0.86" | anyhow = "1.0.86" | ||||||
|  | bumpalo = "3.16.0" | ||||||
| csv = "1.3.0" | csv = "1.3.0" | ||||||
|  | memmap2 = "0.9.5" | ||||||
| milli = { path = "../milli" } | milli = { path = "../milli" } | ||||||
| mimalloc = { version = "0.1.43", default-features = false } | mimalloc = { version = "0.1.43", default-features = false } | ||||||
| serde_json = { version = "1.0.120", features = ["preserve_order"] } | serde_json = { version = "1.0.120", features = ["preserve_order"] } | ||||||
|  | tempfile = "3.14.0" | ||||||
|  |  | ||||||
| [dev-dependencies] | [dev-dependencies] | ||||||
| criterion = { version = "0.5.1", features = ["html_reports"] } | criterion = { version = "0.5.1", features = ["html_reports"] } | ||||||
|   | |||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -1,17 +1,19 @@ | |||||||
| #![allow(dead_code)] | #![allow(dead_code)] | ||||||
|  |  | ||||||
| use std::fs::{create_dir_all, remove_dir_all, File}; | use std::fs::{create_dir_all, remove_dir_all, File}; | ||||||
| use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; | use std::io::{self, BufReader, BufWriter, Read}; | ||||||
| use std::num::ParseFloatError; | use std::num::ParseFloatError; | ||||||
| use std::path::Path; | use std::path::Path; | ||||||
| use std::str::FromStr; | use std::str::FromStr; | ||||||
|  |  | ||||||
|  | use anyhow::Context; | ||||||
|  | use bumpalo::Bump; | ||||||
| use criterion::BenchmarkId; | use criterion::BenchmarkId; | ||||||
| use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | use memmap2::Mmap; | ||||||
| use milli::heed::EnvOpenOptions; | use milli::heed::EnvOpenOptions; | ||||||
| use milli::update::{ | use milli::update::new::indexer; | ||||||
|     IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, | use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings}; | ||||||
| }; | use milli::vector::EmbeddingConfigs; | ||||||
| use milli::{Criterion, Filter, Index, Object, TermsMatchingStrategy}; | use milli::{Criterion, Filter, Index, Object, TermsMatchingStrategy}; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
|  |  | ||||||
| @@ -92,18 +94,34 @@ pub fn base_setup(conf: &Conf) -> Index { | |||||||
|  |  | ||||||
|     let config = IndexerConfig::default(); |     let config = IndexerConfig::default(); | ||||||
|     let mut wtxn = index.write_txn().unwrap(); |     let mut wtxn = index.write_txn().unwrap(); | ||||||
|     let indexing_config = IndexDocumentsConfig { |     let rtxn = index.read_txn().unwrap(); | ||||||
|         autogenerate_docids: conf.primary_key.is_none(), |     let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); | ||||||
|         update_method: IndexDocumentsMethod::ReplaceDocuments, |     let mut new_fields_ids_map = db_fields_ids_map.clone(); | ||||||
|         ..Default::default() |  | ||||||
|     }; |  | ||||||
|     let builder = |  | ||||||
|         IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap(); |  | ||||||
|     let documents = documents_from(conf.dataset, conf.dataset_format); |     let documents = documents_from(conf.dataset, conf.dataset_format); | ||||||
|     let (builder, user_error) = builder.add_documents(documents).unwrap(); |     let mut indexer = indexer::DocumentOperation::new(IndexDocumentsMethod::ReplaceDocuments); | ||||||
|     user_error.unwrap(); |     indexer.add_documents(&documents).unwrap(); | ||||||
|     builder.execute().unwrap(); |  | ||||||
|  |     let indexer_alloc = Bump::new(); | ||||||
|  |     let (document_changes, _operation_stats, primary_key) = | ||||||
|  |         indexer.into_changes(&indexer_alloc, &index, &rtxn, None, &mut new_fields_ids_map).unwrap(); | ||||||
|  |  | ||||||
|  |     indexer::index( | ||||||
|  |         &mut wtxn, | ||||||
|  |         &index, | ||||||
|  |         config.grenad_parameters(), | ||||||
|  |         &db_fields_ids_map, | ||||||
|  |         new_fields_ids_map, | ||||||
|  |         primary_key, | ||||||
|  |         &document_changes, | ||||||
|  |         EmbeddingConfigs::default(), | ||||||
|  |         &|| false, | ||||||
|  |         &|_| (), | ||||||
|  |     ) | ||||||
|  |     .unwrap(); | ||||||
|  |  | ||||||
|     wtxn.commit().unwrap(); |     wtxn.commit().unwrap(); | ||||||
|  |     drop(rtxn); | ||||||
|  |  | ||||||
|     index |     index | ||||||
| } | } | ||||||
| @@ -141,48 +159,95 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { | |||||||
| } | } | ||||||
|  |  | ||||||
| pub fn documents_from(filename: &str, filetype: &str) -> Mmap { | pub fn documents_from(filename: &str, filetype: &str) -> Mmap { | ||||||
|     let reader = File::open(filename) |     let file = File::open(filename) | ||||||
|         .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename)); |         .unwrap_or_else(|_| panic!("could not find the dataset in: {filename}")); | ||||||
|     let reader = BufReader::new(reader); |     match filetype { | ||||||
|     let documents = match filetype { |         "csv" => documents_from_csv(file).unwrap(), | ||||||
|         "csv" => documents_from_csv(reader).unwrap(), |         "json" => documents_from_json(file).unwrap(), | ||||||
|         "json" => documents_from_json(reader).unwrap(), |         "jsonl" => documents_from_jsonl(file).unwrap(), | ||||||
|         "jsonl" => documents_from_jsonl(reader).unwrap(), |         otherwise => panic!("invalid update format {otherwise:?}"), | ||||||
|         otherwise => panic!("invalid update format {:?}", otherwise), |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn documents_from_jsonl(file: File) -> anyhow::Result<Mmap> { | ||||||
|  |     unsafe { Mmap::map(&file).map_err(Into::into) } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn documents_from_json(file: File) -> anyhow::Result<Mmap> { | ||||||
|  |     let reader = BufReader::new(file); | ||||||
|  |     let documents: Vec<milli::Object> = serde_json::from_reader(reader)?; | ||||||
|  |     let mut output = tempfile::tempfile().map(BufWriter::new)?; | ||||||
|  |  | ||||||
|  |     for document in documents { | ||||||
|  |         serde_json::to_writer(&mut output, &document)?; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     let file = output.into_inner()?; | ||||||
|  |     unsafe { Mmap::map(&file).map_err(Into::into) } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn documents_from_csv(file: File) -> anyhow::Result<Mmap> { | ||||||
|  |     let output = tempfile::tempfile()?; | ||||||
|  |     let mut output = BufWriter::new(output); | ||||||
|  |     let mut reader = csv::ReaderBuilder::new().from_reader(file); | ||||||
|  |  | ||||||
|  |     let headers = reader.headers().context("while retrieving headers")?.clone(); | ||||||
|  |     let typed_fields: Vec<_> = headers.iter().map(parse_csv_header).collect(); | ||||||
|  |     let mut object: serde_json::Map<_, _> = | ||||||
|  |         typed_fields.iter().map(|(k, _)| (k.to_string(), Value::Null)).collect(); | ||||||
|  |  | ||||||
|  |     let mut line = 0; | ||||||
|  |     let mut record = csv::StringRecord::new(); | ||||||
|  |     while reader.read_record(&mut record).context("while reading a record")? { | ||||||
|  |         // We increment here and not at the end of the loop | ||||||
|  |         // to take the header offset into account. | ||||||
|  |         line += 1; | ||||||
|  |  | ||||||
|  |         // Reset the document values | ||||||
|  |         object.iter_mut().for_each(|(_, v)| *v = Value::Null); | ||||||
|  |  | ||||||
|  |         for (i, (name, atype)) in typed_fields.iter().enumerate() { | ||||||
|  |             let value = &record[i]; | ||||||
|  |             let trimmed_value = value.trim(); | ||||||
|  |             let value = match atype { | ||||||
|  |                 AllowedType::Number if trimmed_value.is_empty() => Value::Null, | ||||||
|  |                 AllowedType::Number => { | ||||||
|  |                     match trimmed_value.parse::<i64>() { | ||||||
|  |                         Ok(integer) => Value::from(integer), | ||||||
|  |                         Err(_) => match trimmed_value.parse::<f64>() { | ||||||
|  |                             Ok(float) => Value::from(float), | ||||||
|  |                             Err(error) => { | ||||||
|  |                                 anyhow::bail!("document format error on line {line}: {error}. For value: {value}") | ||||||
|  |                             } | ||||||
|  |                         }, | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |                 AllowedType::Boolean if trimmed_value.is_empty() => Value::Null, | ||||||
|  |                 AllowedType::Boolean => match trimmed_value.parse::<bool>() { | ||||||
|  |                     Ok(bool) => Value::from(bool), | ||||||
|  |                     Err(error) => { | ||||||
|  |                         anyhow::bail!( | ||||||
|  |                             "document format error on line {line}: {error}. For value: {value}" | ||||||
|  |                         ) | ||||||
|  |                     } | ||||||
|  |                 }, | ||||||
|  |                 AllowedType::String if value.is_empty() => Value::Null, | ||||||
|  |                 AllowedType::String => Value::from(value), | ||||||
|             }; |             }; | ||||||
|     DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() |  | ||||||
|  |             *object.get_mut(name).expect("encountered an unknown field") = value; | ||||||
|         } |         } | ||||||
|  |  | ||||||
| fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result<Vec<u8>> { |         serde_json::to_writer(&mut output, &object).context("while writing to disk")?; | ||||||
|     let mut documents = DocumentsBatchBuilder::new(Vec::new()); |  | ||||||
|  |  | ||||||
|     for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() { |  | ||||||
|         let object = result?; |  | ||||||
|         documents.append_json_object(&object)?; |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     documents.into_inner().map_err(Into::into) |     let output = output.into_inner()?; | ||||||
| } |     unsafe { Mmap::map(&output).map_err(Into::into) } | ||||||
|  |  | ||||||
| fn documents_from_json(reader: impl BufRead) -> anyhow::Result<Vec<u8>> { |  | ||||||
|     let mut documents = DocumentsBatchBuilder::new(Vec::new()); |  | ||||||
|  |  | ||||||
|     documents.append_json_array(reader)?; |  | ||||||
|  |  | ||||||
|     documents.into_inner().map_err(Into::into) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn documents_from_csv(reader: impl BufRead) -> anyhow::Result<Vec<u8>> { |  | ||||||
|     let csv = csv::Reader::from_reader(reader); |  | ||||||
|  |  | ||||||
|     let mut documents = DocumentsBatchBuilder::new(Vec::new()); |  | ||||||
|     documents.append_csv(csv)?; |  | ||||||
|  |  | ||||||
|     documents.into_inner().map_err(Into::into) |  | ||||||
| } | } | ||||||
|  |  | ||||||
| enum AllowedType { | enum AllowedType { | ||||||
|     String, |     String, | ||||||
|  |     Boolean, | ||||||
|     Number, |     Number, | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -191,8 +256,9 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) { | |||||||
|     match header.rsplit_once(':') { |     match header.rsplit_once(':') { | ||||||
|         Some((field_name, field_type)) => match field_type { |         Some((field_name, field_type)) => match field_type { | ||||||
|             "string" => (field_name.to_string(), AllowedType::String), |             "string" => (field_name.to_string(), AllowedType::String), | ||||||
|  |             "boolean" => (field_name.to_string(), AllowedType::Boolean), | ||||||
|             "number" => (field_name.to_string(), AllowedType::Number), |             "number" => (field_name.to_string(), AllowedType::Number), | ||||||
|             // we may return an error in this case. |             // if the pattern isn't recognized, we keep the whole field. | ||||||
|             _otherwise => (header.to_string(), AllowedType::String), |             _otherwise => (header.to_string(), AllowedType::String), | ||||||
|         }, |         }, | ||||||
|         None => (header.to_string(), AllowedType::String), |         None => (header.to_string(), AllowedType::String), | ||||||
| @@ -230,10 +296,13 @@ impl<R: Read> Iterator for CSVDocumentDeserializer<R> { | |||||||
|                 for ((field_name, field_type), value) in |                 for ((field_name, field_type), value) in | ||||||
|                     self.headers.iter().zip(csv_document.into_iter()) |                     self.headers.iter().zip(csv_document.into_iter()) | ||||||
|                 { |                 { | ||||||
|                     let parsed_value: Result<Value, ParseFloatError> = match field_type { |                     let parsed_value: anyhow::Result<Value> = match field_type { | ||||||
|                         AllowedType::Number => { |                         AllowedType::Number => { | ||||||
|                             value.parse::<f64>().map(Value::from).map_err(Into::into) |                             value.parse::<f64>().map(Value::from).map_err(Into::into) | ||||||
|                         } |                         } | ||||||
|  |                         AllowedType::Boolean => { | ||||||
|  |                             value.parse::<bool>().map(Value::from).map_err(Into::into) | ||||||
|  |                         } | ||||||
|                         AllowedType::String => Ok(Value::String(value.to_string())), |                         AllowedType::String => Ok(Value::String(value.to_string())), | ||||||
|                     }; |                     }; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -12,7 +12,7 @@ use crate::{DocumentId, Result}; | |||||||
|  |  | ||||||
| #[derive(Default)] | #[derive(Default)] | ||||||
| pub struct DocumentDeletion { | pub struct DocumentDeletion { | ||||||
|     pub to_delete: RoaringBitmap, |     to_delete: RoaringBitmap, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl DocumentDeletion { | impl DocumentDeletion { | ||||||
| @@ -26,11 +26,11 @@ impl DocumentDeletion { | |||||||
|  |  | ||||||
|     pub fn into_changes<'indexer>( |     pub fn into_changes<'indexer>( | ||||||
|         self, |         self, | ||||||
|         indexer: &'indexer Bump, |         indexer_alloc: &'indexer Bump, | ||||||
|         primary_key: PrimaryKey<'indexer>, |         primary_key: PrimaryKey<'indexer>, | ||||||
|     ) -> DocumentDeletionChanges<'indexer> { |     ) -> DocumentDeletionChanges<'indexer> { | ||||||
|         let to_delete: bumpalo::collections::Vec<_> = |         let to_delete: bumpalo::collections::Vec<_> = | ||||||
|             self.to_delete.into_iter().collect_in(indexer); |             self.to_delete.into_iter().collect_in(indexer_alloc); | ||||||
|  |  | ||||||
|         let to_delete = to_delete.into_bump_slice(); |         let to_delete = to_delete.into_bump_slice(); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -107,6 +107,12 @@ impl<'pl> DocumentOperation<'pl> { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl Default for DocumentOperation<'_> { | ||||||
|  |     fn default() -> Self { | ||||||
|  |         DocumentOperation::new(IndexDocumentsMethod::default()) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| #[allow(clippy::too_many_arguments)] | #[allow(clippy::too_many_arguments)] | ||||||
| fn extract_addition_payload_changes<'r, 'pl: 'r>( | fn extract_addition_payload_changes<'r, 'pl: 'r>( | ||||||
|     indexer: &'pl Bump, |     indexer: &'pl Bump, | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user