Compare commits


1 Commit

95 changed files with 1074 additions and 2198 deletions

Cargo.lock (generated)

@ -706,20 +706,6 @@ dependencies = [
"serde",
]
[[package]]
name = "bumparaw-collections"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ce682bdc86c2e25ef5cd95881d9d6a1902214eddf74cf9ffea88fe1464377e8"
dependencies = [
"allocator-api2",
"bitpacking",
"bumpalo",
"hashbrown 0.15.1",
"serde",
"serde_json",
]
[[package]]
name = "byte-unit"
version = "5.1.4"
@ -2631,8 +2617,6 @@ dependencies = [
"big_s",
"bincode",
"bumpalo",
"bumparaw-collections",
"convert_case 0.6.0",
"crossbeam-channel",
"csv",
"derive_builder 0.20.0",
@ -2647,6 +2631,7 @@ dependencies = [
"meilisearch-types",
"memmap2",
"page_size",
"raw-collections",
"rayon",
"roaring",
"serde",
@ -2662,12 +2647,12 @@ dependencies = [
[[package]]
name = "indexmap"
version = "2.7.0"
version = "2.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f"
checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
dependencies = [
"equivalent",
"hashbrown 0.15.1",
"hashbrown 0.14.3",
"serde",
]
@ -3564,7 +3549,6 @@ dependencies = [
"actix-web",
"anyhow",
"bumpalo",
"bumparaw-collections",
"convert_case 0.6.0",
"csv",
"deserr",
@ -3577,8 +3561,8 @@ dependencies = [
"meili-snap",
"memmap2",
"milli",
"raw-collections",
"roaring",
"rustc-hash 2.1.0",
"serde",
"serde-cs",
"serde_json",
@ -3599,12 +3583,9 @@ dependencies = [
"clap",
"dump",
"file-store",
"indexmap",
"meilisearch-auth",
"meilisearch-types",
"serde",
"serde_json",
"tempfile",
"time",
"uuid",
]
@ -3637,7 +3618,6 @@ dependencies = [
"bincode",
"bstr",
"bumpalo",
"bumparaw-collections",
"bytemuck",
"byteorder",
"candle-core",
@ -3676,12 +3656,13 @@ dependencies = [
"once_cell",
"ordered-float",
"rand",
"raw-collections",
"rayon",
"rayon-par-bridge",
"rhai",
"roaring",
"rstar",
"rustc-hash 2.1.0",
"rustc-hash 2.0.0",
"serde",
"serde_json",
"slice-group-by",
@ -4430,7 +4411,7 @@ dependencies = [
"bytes",
"rand",
"ring",
"rustc-hash 2.1.0",
"rustc-hash 2.0.0",
"rustls",
"slab",
"thiserror",
@ -4506,6 +4487,19 @@ dependencies = [
"rand",
]
[[package]]
name = "raw-collections"
version = "0.1.0"
source = "git+https://github.com/meilisearch/raw-collections.git#15e5d7bdebc0c149b2a28b2454f307c717d07f8a"
dependencies = [
"allocator-api2",
"bitpacking",
"bumpalo",
"hashbrown 0.15.1",
"serde",
"serde_json",
]
[[package]]
name = "raw-cpuid"
version = "10.7.0"
@ -4803,9 +4797,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
[[package]]
name = "rustc-hash"
version = "2.1.0"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7fb8039b3032c191086b10f11f319a6e99e1e82889c5cc6046f515c9db1d497"
checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152"
[[package]]
name = "rustc_version"
@ -4974,9 +4968,9 @@ dependencies = [
[[package]]
name = "serde_json"
version = "1.0.133"
version = "1.0.132"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377"
checksum = "d726bfaff4b320266d395898905d0eba0345aae23b54aee3a737e260fd46db03"
dependencies = [
"indexmap",
"itoa",


@ -8,7 +8,6 @@ use bumpalo::Bump;
use criterion::{criterion_group, criterion_main, Criterion};
use milli::documents::PrimaryKey;
use milli::heed::{EnvOpenOptions, RwTxn};
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
@ -152,7 +151,7 @@ fn indexing_songs_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -167,7 +166,7 @@ fn indexing_songs_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -219,7 +218,7 @@ fn reindexing_songs_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -234,7 +233,7 @@ fn reindexing_songs_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -264,7 +263,7 @@ fn reindexing_songs_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -279,7 +278,7 @@ fn reindexing_songs_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -333,7 +332,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -348,7 +347,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -410,7 +409,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -425,7 +424,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -455,7 +454,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -470,7 +469,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -496,7 +495,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -511,7 +510,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -564,7 +563,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -579,7 +578,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -631,7 +630,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -646,7 +645,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -698,7 +697,7 @@ fn indexing_wiki(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -713,7 +712,7 @@ fn indexing_wiki(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -764,7 +763,7 @@ fn reindexing_wiki(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -779,7 +778,7 @@ fn reindexing_wiki(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -809,7 +808,7 @@ fn reindexing_wiki(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -824,7 +823,7 @@ fn reindexing_wiki(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -877,7 +876,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -892,7 +891,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -954,7 +953,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -969,7 +968,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1000,7 +999,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1015,7 +1014,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1042,7 +1041,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1057,7 +1056,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1109,7 +1108,7 @@ fn indexing_movies_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1124,7 +1123,7 @@ fn indexing_movies_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1175,7 +1174,7 @@ fn reindexing_movies_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1190,7 +1189,7 @@ fn reindexing_movies_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1220,7 +1219,7 @@ fn reindexing_movies_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1235,7 +1234,7 @@ fn reindexing_movies_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1288,7 +1287,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1303,7 +1302,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1351,7 +1350,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBi
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1401,7 +1400,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1416,7 +1415,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1446,7 +1445,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1461,7 +1460,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1487,7 +1486,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1502,7 +1501,7 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1577,7 +1576,7 @@ fn indexing_nested_movies_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1592,7 +1591,7 @@ fn indexing_nested_movies_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1668,7 +1667,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1683,7 +1682,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1751,7 +1750,7 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1766,7 +1765,7 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1818,7 +1817,7 @@ fn indexing_geo(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1833,7 +1832,7 @@ fn indexing_geo(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1884,7 +1883,7 @@ fn reindexing_geo(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1899,7 +1898,7 @@ fn reindexing_geo(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1929,7 +1928,7 @@ fn reindexing_geo(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -1944,7 +1943,7 @@ fn reindexing_geo(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
@ -1997,7 +1996,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2012,7 +2011,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
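
Throughout the benchmark hunks above, the progress argument of `into_changes` and `indexer::index` switches between a `Progress` handle (`Progress::default()`) and a plain callback closure (`&|_progress| ()`). A minimal, self-contained sketch of the callback style, using a made-up payload type and function name rather than milli's real API:

```rust
// Hypothetical stand-in for callback-based progress reporting: the caller
// passes any `Fn` accepting a progress payload, including a no-op closure
// like the `&|_| ()` used in the benchmarks.
fn run_steps<F>(total: u32, send_progress: &F)
where
    F: Fn((u32, u32)), // (finished, total) — an assumed payload shape
{
    for finished in 0..=total {
        send_progress((finished, total));
    }
}

fn main() {
    // No-op reporter, as in the benchmark call sites.
    run_steps(3, &|_| ());
    // Logging reporter.
    run_steps(3, &|(done, total)| println!("step {done}/{total}"));
}
```

On the other side of the diff the same parameter takes a `Progress` value instead, so the call sites no longer need to build a closure at all.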


@ -10,7 +10,6 @@ use bumpalo::Bump;
use criterion::BenchmarkId;
use memmap2::Mmap;
use milli::heed::EnvOpenOptions;
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
@ -111,7 +110,7 @@ pub fn base_setup(conf: &Conf) -> Index {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -126,7 +125,7 @@ pub fn base_setup(conf: &Conf) -> Index {
&document_changes,
EmbeddingConfigs::default(),
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();


@ -136,14 +136,6 @@ pub struct File {
}
impl File {
pub fn from_parts(path: PathBuf, file: Option<NamedTempFile>) -> Self {
Self { path, file }
}
pub fn into_parts(self) -> (PathBuf, Option<NamedTempFile>) {
(self.path, self.file)
}
pub fn dry_file() -> Result<Self> {
Ok(Self { path: PathBuf::new(), file: None })
}


@ -10,7 +10,6 @@ use either::Either;
use fuzzers::Operation;
use milli::documents::mmap_from_objects;
use milli::heed::EnvOpenOptions;
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::{IndexDocumentsMethod, IndexerConfig};
use milli::vector::EmbeddingConfigs;
@ -129,7 +128,7 @@ fn main() {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -144,7 +143,7 @@ fn main() {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();


@ -13,9 +13,6 @@ license.workspace = true
[dependencies]
anyhow = "1.0.86"
bincode = "1.3.3"
bumpalo = "3.16.0"
bumparaw-collections = "0.1.2"
convert_case = "0.6.0"
csv = "1.3.0"
derive_builder = "0.20.0"
dump = { path = "../dump" }
@ -24,8 +21,8 @@ file-store = { path = "../file-store" }
flate2 = "1.0.30"
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
memmap2 = "0.9.4"
page_size = "0.6.0"
raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
rayon = "1.10.0"
roaring = { version = "0.10.7", features = ["serde"] }
serde = { version = "1.0.204", features = ["derive"] }
@ -33,6 +30,7 @@ serde_json = { version = "1.0.120", features = ["preserve_order"] }
synchronoise = "1.0.1"
tempfile = "3.10.1"
thiserror = "1.0.61"
memmap2 = "0.9.4"
time = { version = "0.3.36", features = [
"serde-well-known",
"formatting",
@ -42,6 +40,7 @@ time = { version = "0.3.36", features = [
tracing = "0.1.40"
ureq = "2.10.0"
uuid = { version = "1.10.0", features = ["serde", "v4"] }
bumpalo = "3.16.0"
[dev-dependencies]
arroy = "0.5.0"


@ -22,26 +22,27 @@ use std::ffi::OsStr;
use std::fmt;
use std::fs::{self, File};
use std::io::BufWriter;
use std::sync::atomic::Ordering;
use std::sync::atomic::{self, AtomicU64};
use std::time::Duration;
use bumpalo::collections::CollectIn;
use bumpalo::Bump;
use dump::IndexMetadata;
use meilisearch_types::batches::BatchId;
use meilisearch_types::error::Code;
use meilisearch_types::heed::{RoTxn, RwTxn};
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader, PrimaryKey};
use meilisearch_types::milli::heed::CompactionOption;
use meilisearch_types::milli::progress::Progress;
use meilisearch_types::milli::update::new::indexer::{self, UpdateByFunction};
use meilisearch_types::milli::update::{
DocumentAdditionResult, IndexDocumentsMethod, Settings as MilliSettings,
};
use meilisearch_types::milli::update::{IndexDocumentsMethod, Settings as MilliSettings};
use meilisearch_types::milli::vector::parsed_vectors::{
ExplicitVectors, VectorOrArrayOfVectors, RESERVED_VECTORS_FIELD_NAME,
};
use meilisearch_types::milli::{self, Filter, ThreadPoolNoAbortBuilder};
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task};
use meilisearch_types::tasks::{
Details, IndexSwap, Kind, KindWithContent, Status, Task, TaskProgress,
};
use meilisearch_types::{compression, Index, VERSION_FILE_NAME};
use roaring::RoaringBitmap;
use time::macros::format_description;
@ -49,13 +50,6 @@ use time::OffsetDateTime;
use uuid::Uuid;
use crate::autobatcher::{self, BatchKind};
use crate::processing::{
AtomicBatchStep, AtomicDocumentStep, AtomicTaskStep, AtomicUpdateFileStep, CreateIndexProgress,
DeleteIndexProgress, DocumentDeletionProgress, DocumentEditionProgress,
DocumentOperationProgress, DumpCreationProgress, InnerSwappingTwoIndexes, SettingsProgress,
SnapshotCreationProgress, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress,
UpdateIndexProgress, VariableNameStep,
};
use crate::utils::{self, swap_index_uid_in_task, ProcessingBatch};
use crate::{Error, IndexScheduler, Result, TaskId};
@ -566,12 +560,11 @@ impl IndexScheduler {
/// The list of tasks that were processed. The metadata of each task in the returned
/// list is updated accordingly, with the exception of its date fields
/// [`finished_at`](meilisearch_types::tasks::Task::finished_at) and [`started_at`](meilisearch_types::tasks::Task::started_at).
#[tracing::instrument(level = "trace", skip(self, batch, progress), target = "indexing::scheduler", fields(batch=batch.to_string()))]
#[tracing::instrument(level = "trace", skip(self, batch), target = "indexing::scheduler", fields(batch=batch.to_string()))]
pub(crate) fn process_batch(
&self,
batch: Batch,
current_batch: &mut ProcessingBatch,
progress: Progress,
) -> Result<Vec<Task>> {
#[cfg(test)]
{
@ -591,13 +584,8 @@ impl IndexScheduler {
};
let rtxn = self.env.read_txn()?;
let mut canceled_tasks = self.cancel_matched_tasks(
&rtxn,
task.uid,
current_batch,
matched_tasks,
&progress,
)?;
let mut canceled_tasks =
self.cancel_matched_tasks(&rtxn, task.uid, current_batch, matched_tasks)?;
task.status = Status::Succeeded;
match &mut task.details {
@ -628,8 +616,7 @@ impl IndexScheduler {
}
let mut wtxn = self.env.write_txn()?;
let mut deleted_tasks =
self.delete_matched_tasks(&mut wtxn, &matched_tasks, &progress)?;
let mut deleted_tasks = self.delete_matched_tasks(&mut wtxn, &matched_tasks)?;
wtxn.commit()?;
for task in tasks.iter_mut() {
@ -655,8 +642,6 @@ impl IndexScheduler {
Ok(tasks)
}
Batch::SnapshotCreation(mut tasks) => {
progress.update_progress(SnapshotCreationProgress::StartTheSnapshotCreation);
fs::create_dir_all(&self.snapshots_path)?;
let temp_snapshot_dir = tempfile::tempdir()?;
@ -677,7 +662,6 @@ impl IndexScheduler {
// two read operations as the task processing is synchronous.
// 2.1 First copy the LMDB env of the index-scheduler
progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexScheduler);
let dst = temp_snapshot_dir.path().join("tasks");
fs::create_dir_all(&dst)?;
self.env.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
@ -690,41 +674,27 @@ impl IndexScheduler {
fs::create_dir_all(&update_files_dir)?;
// 2.4 Only copy the update files of the enqueued tasks
progress.update_progress(SnapshotCreationProgress::SnapshotTheUpdateFiles);
let enqueued = self.get_status(&rtxn, Status::Enqueued)?;
let (atomic, update_file_progress) =
AtomicUpdateFileStep::new(enqueued.len() as u32);
progress.update_progress(update_file_progress);
for task_id in enqueued {
for task_id in self.get_status(&rtxn, Status::Enqueued)? {
let task = self.get_task(&rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?;
if let Some(content_uuid) = task.content_uuid() {
let src = self.file_store.get_update_path(content_uuid);
let dst = update_files_dir.join(content_uuid.to_string());
fs::copy(src, dst)?;
}
atomic.fetch_add(1, Ordering::Relaxed);
}
// 3. Snapshot every index
progress.update_progress(SnapshotCreationProgress::SnapshotTheIndexes);
let index_mapping = self.index_mapper.index_mapping;
let nb_indexes = index_mapping.len(&rtxn)? as u32;
for (i, result) in index_mapping.iter(&rtxn)?.enumerate() {
for result in self.index_mapper.index_mapping.iter(&rtxn)? {
let (name, uuid) = result?;
progress.update_progress(VariableNameStep::new(name, i as u32, nb_indexes));
let index = self.index_mapper.index(&rtxn, name)?;
let dst = temp_snapshot_dir.path().join("indexes").join(uuid.to_string());
fs::create_dir_all(&dst)?;
index
.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)
.map_err(|e| Error::from_milli(e, Some(name.to_string())))?;
index.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
}
drop(rtxn);
// 4. Snapshot the auth LMDB env
progress.update_progress(SnapshotCreationProgress::SnapshotTheApiKeys);
let dst = temp_snapshot_dir.path().join("auth");
fs::create_dir_all(&dst)?;
// TODO We can't use the open_auth_store_env function here but we should
@ -737,7 +707,6 @@ impl IndexScheduler {
auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
// 5. Copy and tarball the flat snapshot
progress.update_progress(SnapshotCreationProgress::CreateTheTarball);
// 5.1 Find the original name of the database
// TODO find a better way to get this path
let mut base_path = self.env.path().to_owned();
@ -770,7 +739,6 @@ impl IndexScheduler {
Ok(tasks)
}
Batch::Dump(mut task) => {
progress.update_progress(DumpCreationProgress::StartTheDumpCreation);
let started_at = OffsetDateTime::now_utc();
let (keys, instance_uid) =
if let KindWithContent::DumpCreation { keys, instance_uid } = &task.kind {
@ -781,7 +749,6 @@ impl IndexScheduler {
let dump = dump::DumpWriter::new(*instance_uid)?;
// 1. dump the keys
progress.update_progress(DumpCreationProgress::DumpTheApiKeys);
let mut dump_keys = dump.create_keys()?;
for key in keys {
dump_keys.push_key(key)?;
@ -791,13 +758,7 @@ impl IndexScheduler {
let rtxn = self.env.read_txn()?;
// 2. dump the tasks
progress.update_progress(DumpCreationProgress::DumpTheTasks);
let mut dump_tasks = dump.create_tasks_queue()?;
let (atomic, update_task_progress) =
AtomicTaskStep::new(self.all_tasks.len(&rtxn)? as u32);
progress.update_progress(update_task_progress);
for ret in self.all_tasks.iter(&rtxn)? {
if self.must_stop_processing.get() {
return Err(Error::AbortedTask);
@ -830,84 +791,50 @@ impl IndexScheduler {
let content_file = self.file_store.get_update(content_file)?;
let reader = DocumentsBatchReader::from_reader(content_file)
.map_err(|e| Error::from_milli(e.into(), None))?;
.map_err(milli::Error::from)?;
let (mut cursor, documents_batch_index) =
reader.into_cursor_and_fields_index();
while let Some(doc) = cursor
.next_document()
.map_err(|e| Error::from_milli(e.into(), None))?
while let Some(doc) =
cursor.next_document().map_err(milli::Error::from)?
{
dump_content_file.push_document(
&obkv_to_object(doc, &documents_batch_index)
.map_err(|e| Error::from_milli(e, None))?,
)?;
dump_content_file
.push_document(&obkv_to_object(doc, &documents_batch_index)?)?;
}
dump_content_file.flush()?;
}
}
atomic.fetch_add(1, Ordering::Relaxed);
}
dump_tasks.flush()?;
// 3. Dump the indexes
progress.update_progress(DumpCreationProgress::DumpTheIndexes);
let nb_indexes = self.index_mapper.index_mapping.len(&rtxn)? as u32;
let mut count = 0;
self.index_mapper.try_for_each_index(&rtxn, |uid, index| -> Result<()> {
progress.update_progress(VariableNameStep::new(
uid.to_string(),
count,
nb_indexes,
));
count += 1;
let rtxn = index.read_txn()?;
let metadata = IndexMetadata {
uid: uid.to_owned(),
primary_key: index.primary_key(&rtxn)?.map(String::from),
created_at: index
.created_at(&rtxn)
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?,
updated_at: index
.updated_at(&rtxn)
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?,
created_at: index.created_at(&rtxn)?,
updated_at: index.updated_at(&rtxn)?,
};
let mut index_dumper = dump.create_index(uid, &metadata)?;
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index
.embedding_configs(&rtxn)
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
let embedding_configs = index.embedding_configs(&rtxn)?;
let nb_documents = index
.number_of_documents(&rtxn)
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?
as u32;
let (atomic, update_document_progress) = AtomicDocumentStep::new(nb_documents);
progress.update_progress(update_document_progress);
let documents = index
.all_documents(&rtxn)
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
// 3.1. Dump the documents
for ret in documents {
for ret in index.all_documents(&rtxn)? {
if self.must_stop_processing.get() {
return Err(Error::AbortedTask);
}
let (id, doc) =
ret.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
let (id, doc) = ret?;
let mut document =
milli::obkv_to_json(&all_fields, &fields_ids_map, doc)
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
'inject_vectors: {
let embeddings = index
.embeddings(&rtxn, id)
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
let embeddings = index.embeddings(&rtxn, id)?;
if embeddings.is_empty() {
break 'inject_vectors;
@ -918,7 +845,7 @@ impl IndexScheduler {
.or_insert(serde_json::Value::Object(Default::default()));
let serde_json::Value::Object(vectors) = vectors else {
let user_err = milli::Error::UserError(
return Err(milli::Error::UserError(
milli::UserError::InvalidVectorsMapType {
document_id: {
if let Ok(Some(Ok(index))) = index
@ -932,9 +859,8 @@ impl IndexScheduler {
},
value: vectors.clone(),
},
);
return Err(Error::from_milli(user_err, Some(uid.to_string())));
)
.into());
};
for (embedder_name, embeddings) in embeddings {
@ -957,7 +883,6 @@ impl IndexScheduler {
}
index_dumper.push_document(&document)?;
atomic.fetch_add(1, Ordering::Relaxed);
}
// 3.2. Dump the settings
@ -965,14 +890,12 @@ impl IndexScheduler {
index,
&rtxn,
meilisearch_types::settings::SecretPolicy::RevealSecrets,
)
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
)?;
index_dumper.settings(&settings)?;
Ok(())
})?;
// 4. Dump experimental feature settings
progress.update_progress(DumpCreationProgress::DumpTheExperimentalFeatures);
let features = self.features().runtime_features();
dump.create_experimental_features(features)?;
@ -983,7 +906,6 @@ impl IndexScheduler {
if self.must_stop_processing.get() {
return Err(Error::AbortedTask);
}
progress.update_progress(DumpCreationProgress::CompressTheDump);
let path = self.dumps_path.join(format!("{}.dump", dump_uid));
let file = File::create(path)?;
dump.persist_to(BufWriter::new(file))?;
@ -1009,7 +931,7 @@ impl IndexScheduler {
.set_currently_updating_index(Some((index_uid.clone(), index.clone())));
let mut index_wtxn = index.write_txn()?;
let tasks = self.apply_index_operation(&mut index_wtxn, &index, op, progress)?;
let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?;
{
let span = tracing::trace_span!(target: "indexing::scheduler", "commit");
@ -1024,8 +946,7 @@ impl IndexScheduler {
// the entire batch.
let res = || -> Result<()> {
let index_rtxn = index.read_txn()?;
let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)
.map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?;
let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)?;
let mut wtxn = self.env.write_txn()?;
self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?;
wtxn.commit()?;
@ -1043,8 +964,6 @@ impl IndexScheduler {
Ok(tasks)
}
Batch::IndexCreation { index_uid, primary_key, task } => {
progress.update_progress(CreateIndexProgress::CreatingTheIndex);
let wtxn = self.env.write_txn()?;
if self.index_mapper.exists(&wtxn, &index_uid)? {
return Err(Error::IndexAlreadyExists(index_uid));
@ -1054,11 +973,9 @@ impl IndexScheduler {
self.process_batch(
Batch::IndexUpdate { index_uid, primary_key, task },
current_batch,
progress,
)
}
Batch::IndexUpdate { index_uid, primary_key, mut task } => {
progress.update_progress(UpdateIndexProgress::UpdatingTheIndex);
let rtxn = self.env.read_txn()?;
let index = self.index_mapper.index(&rtxn, &index_uid)?;
@ -1071,12 +988,10 @@ impl IndexScheduler {
);
builder.set_primary_key(primary_key);
let must_stop_processing = self.must_stop_processing.clone();
builder
.execute(
|indexing_step| tracing::debug!(update = ?indexing_step),
|| must_stop_processing.get(),
)
.map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))?;
builder.execute(
|indexing_step| tracing::debug!(update = ?indexing_step),
|| must_stop_processing.get(),
)?;
index_wtxn.commit()?;
}
@ -1093,8 +1008,7 @@ impl IndexScheduler {
let res = || -> Result<()> {
let mut wtxn = self.env.write_txn()?;
let index_rtxn = index.read_txn()?;
let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)
.map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?;
let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)?;
self.index_mapper.store_stats_of(&mut wtxn, &index_uid, &stats)?;
wtxn.commit()?;
Ok(())
@ -1111,16 +1025,13 @@ impl IndexScheduler {
Ok(vec![task])
}
Batch::IndexDeletion { index_uid, index_has_been_created, mut tasks } => {
progress.update_progress(DeleteIndexProgress::DeletingTheIndex);
let wtxn = self.env.write_txn()?;
// it's possible that the index doesn't exist
let number_of_documents = || -> Result<u64> {
let index = self.index_mapper.index(&wtxn, &index_uid)?;
let index_rtxn = index.read_txn()?;
index
.number_of_documents(&index_rtxn)
.map_err(|e| Error::from_milli(e, Some(index_uid.to_string())))
Ok(index.number_of_documents(&index_rtxn)?)
}()
.unwrap_or_default();
@ -1145,8 +1056,6 @@ impl IndexScheduler {
Ok(tasks)
}
Batch::IndexSwap { mut task } => {
progress.update_progress(SwappingTheIndexes::EnsuringCorrectnessOfTheSwap);
let mut wtxn = self.env.write_txn()?;
let swaps = if let KindWithContent::IndexSwap { swaps } = &task.kind {
swaps
@ -1173,20 +1082,8 @@ impl IndexScheduler {
));
}
}
progress.update_progress(SwappingTheIndexes::SwappingTheIndexes);
for (step, swap) in swaps.iter().enumerate() {
progress.update_progress(VariableNameStep::new(
format!("swapping index {} and {}", swap.indexes.0, swap.indexes.1),
step as u32,
swaps.len() as u32,
));
self.apply_index_swap(
&mut wtxn,
&progress,
task.uid,
&swap.indexes.0,
&swap.indexes.1,
)?;
for swap in swaps {
self.apply_index_swap(&mut wtxn, task.uid, &swap.indexes.0, &swap.indexes.1)?;
}
wtxn.commit()?;
task.status = Status::Succeeded;
@ -1196,15 +1093,7 @@ impl IndexScheduler {
}
/// Swap the index `lhs` with the index `rhs`.
fn apply_index_swap(
&self,
wtxn: &mut RwTxn,
progress: &Progress,
task_id: u32,
lhs: &str,
rhs: &str,
) -> Result<()> {
progress.update_progress(InnerSwappingTwoIndexes::RetrieveTheTasks);
fn apply_index_swap(&self, wtxn: &mut RwTxn, task_id: u32, lhs: &str, rhs: &str) -> Result<()> {
// 1. Verify that both lhs and rhs are existing indexes
let index_lhs_exists = self.index_mapper.index_exists(wtxn, lhs)?;
if !index_lhs_exists {
@ -1222,21 +1111,14 @@ impl IndexScheduler {
index_rhs_task_ids.remove_range(task_id..);
// 3. before_name -> new_name in the task's KindWithContent
progress.update_progress(InnerSwappingTwoIndexes::UpdateTheTasks);
let tasks_to_update = &index_lhs_task_ids | &index_rhs_task_ids;
let (atomic, task_progress) = AtomicTaskStep::new(tasks_to_update.len() as u32);
progress.update_progress(task_progress);
for task_id in tasks_to_update {
for task_id in &index_lhs_task_ids | &index_rhs_task_ids {
let mut task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?;
swap_index_uid_in_task(&mut task, (lhs, rhs));
self.all_tasks.put(wtxn, &task_id, &task)?;
atomic.fetch_add(1, Ordering::Relaxed);
}
// 4. remove the task from indexuid = before_name
// 5. add the task to indexuid = after_name
progress.update_progress(InnerSwappingTwoIndexes::UpdateTheIndexesMetadata);
self.update_index(wtxn, lhs, |lhs_tasks| {
*lhs_tasks -= &index_lhs_task_ids;
*lhs_tasks |= &index_rhs_task_ids;
@ -1258,7 +1140,7 @@ impl IndexScheduler {
/// The list of processed tasks.
#[tracing::instrument(
level = "trace",
skip(self, index_wtxn, index, progress),
skip(self, index_wtxn, index),
target = "indexing::scheduler"
)]
fn apply_index_operation<'i>(
@ -1266,18 +1148,48 @@ impl IndexScheduler {
index_wtxn: &mut RwTxn<'i>,
index: &'i Index,
operation: IndexOperation,
progress: Progress,
) -> Result<Vec<Task>> {
let indexer_alloc = Bump::new();
let started_processing_at = std::time::Instant::now();
let secs_since_started_processing_at = AtomicU64::new(0);
const PRINT_SECS_DELTA: u64 = 5;
let processing_tasks = self.processing_tasks.clone();
let must_stop_processing = self.must_stop_processing.clone();
let send_progress = |progress| {
let now = std::time::Instant::now();
let elapsed = secs_since_started_processing_at.load(atomic::Ordering::Relaxed);
let previous = started_processing_at + Duration::from_secs(elapsed);
let elapsed = now - previous;
if elapsed.as_secs() < PRINT_SECS_DELTA {
return;
}
secs_since_started_processing_at
.store((now - started_processing_at).as_secs(), atomic::Ordering::Relaxed);
let TaskProgress {
current_step,
finished_steps,
total_steps,
finished_substeps,
total_substeps,
} = processing_tasks.write().unwrap().update_progress(progress);
tracing::info!(
current_step,
finished_steps,
total_steps,
finished_substeps,
total_substeps
);
};
match operation {
IndexOperation::DocumentClear { index_uid, mut tasks } => {
let count = milli::update::ClearDocuments::new(index_wtxn, index)
.execute()
.map_err(|e| Error::from_milli(e, Some(index_uid)))?;
IndexOperation::DocumentClear { mut tasks, .. } => {
let count = milli::update::ClearDocuments::new(index_wtxn, index).execute()?;
let mut first_clear_found = false;
for task in &mut tasks {
@ -1297,13 +1209,12 @@ impl IndexScheduler {
Ok(tasks)
}
IndexOperation::DocumentOperation {
index_uid,
index_uid: _,
primary_key,
method,
operations,
mut tasks,
} => {
progress.update_progress(DocumentOperationProgress::RetrievingConfig);
// TODO: at some point, for better efficiency we might want to reuse the bumpalo for successive batches.
// this is made difficult by the fact we're doing private clones of the index scheduler and sending it
// to a fresh thread.
@ -1324,17 +1235,13 @@ impl IndexScheduler {
let mut content_files_iter = content_files.iter();
let mut indexer = indexer::DocumentOperation::new(method);
let embedders = index
.embedding_configs(index_wtxn)
.map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?;
let embedders = self.embedders(index_uid.clone(), embedders)?;
let embedders = index.embedding_configs(index_wtxn)?;
let embedders = self.embedders(embedders)?;
for operation in operations {
match operation {
DocumentOperation::Add(_content_uuid) => {
let mmap = content_files_iter.next().unwrap();
indexer
.add_documents(mmap)
.map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?;
indexer.add_documents(mmap)?;
}
DocumentOperation::Delete(document_ids) => {
let document_ids: bumpalo::collections::vec::Vec<_> = document_ids
@ -1359,22 +1266,19 @@ impl IndexScheduler {
}
};
progress.update_progress(DocumentOperationProgress::ComputingDocumentChanges);
let (document_changes, operation_stats, primary_key) = indexer
.into_changes(
&indexer_alloc,
index,
&rtxn,
primary_key.as_deref(),
&mut new_fields_ids_map,
&|| must_stop_processing.get(),
progress.clone(),
)
.map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?;
let (document_changes, operation_stats, primary_key) = indexer.into_changes(
&indexer_alloc,
index,
&rtxn,
primary_key.as_deref(),
&mut new_fields_ids_map,
&|| must_stop_processing.get(),
&send_progress,
)?;
let mut candidates_count = 0;
let mut addition = 0;
for (stats, task) in operation_stats.into_iter().zip(&mut tasks) {
candidates_count += stats.document_count;
addition += stats.document_count;
match stats.error {
Some(error) => {
task.status = Status::Failed;
@ -1404,7 +1308,6 @@ impl IndexScheduler {
}
}
progress.update_progress(DocumentOperationProgress::Indexing);
if tasks.iter().any(|res| res.error.is_none()) {
indexer::index(
index_wtxn,
@ -1417,25 +1320,15 @@ impl IndexScheduler {
&document_changes,
embedders,
&|| must_stop_processing.get(),
&progress,
)
.map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?;
let addition = DocumentAdditionResult {
indexed_documents: candidates_count,
number_of_documents: index
.number_of_documents(index_wtxn)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
};
&send_progress,
)?;
tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
}
Ok(tasks)
}
IndexOperation::DocumentEdition { index_uid, mut task } => {
progress.update_progress(DocumentEditionProgress::RetrievingConfig);
IndexOperation::DocumentEdition { mut task, .. } => {
let (filter, code) = if let KindWithContent::DocumentEdition {
filter_expr,
context: _,
@ -1449,11 +1342,16 @@ impl IndexScheduler {
};
let candidates = match filter.as_ref().map(Filter::from_json) {
Some(Ok(Some(filter))) => filter
.evaluate(index_wtxn, index)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
Some(Ok(Some(filter))) => {
filter.evaluate(index_wtxn, index).map_err(|err| match err {
milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
}
e => e.into(),
})?
}
None | Some(Ok(None)) => index.documents_ids(index_wtxn)?,
Some(Err(e)) => return Err(Error::from_milli(e, Some(index_uid.clone()))),
Some(Err(e)) => return Err(e.into()),
};
let (original_filter, context, function) = if let Some(Details::DocumentEdition {
@ -1488,9 +1386,8 @@ impl IndexScheduler {
// candidates not empty => index not empty => a primary key is set
let primary_key = index.primary_key(&rtxn)?.unwrap();
let primary_key =
PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map)
.map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?;
let primary_key = PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map)
.map_err(milli::Error::from)?;
let result_count = Ok((candidates.len(), candidates.len())) as Result<_>;
@ -1508,22 +1405,13 @@ impl IndexScheduler {
}
};
let candidates_count = candidates.len();
progress.update_progress(DocumentEditionProgress::ComputingDocumentChanges);
let indexer = UpdateByFunction::new(candidates, context.clone(), code.clone());
let document_changes = pool
.install(|| {
indexer
.into_changes(&primary_key)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))
})
.unwrap()?;
let embedders = index
.embedding_configs(index_wtxn)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
let embedders = self.embedders(index_uid.clone(), embedders)?;
let document_changes =
pool.install(|| indexer.into_changes(&primary_key)).unwrap()?;
let embedders = index.embedding_configs(index_wtxn)?;
let embedders = self.embedders(embedders)?;
progress.update_progress(DocumentEditionProgress::Indexing);
indexer::index(
index_wtxn,
index,
@ -1535,18 +1423,10 @@ impl IndexScheduler {
&document_changes,
embedders,
&|| must_stop_processing.get(),
&progress,
)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
&send_progress,
)?;
let addition = DocumentAdditionResult {
indexed_documents: candidates_count,
number_of_documents: index
.number_of_documents(index_wtxn)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
};
tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
}
match result_count {
@ -1575,9 +1455,7 @@ impl IndexScheduler {
Ok(vec![task])
}
IndexOperation::DocumentDeletion { mut tasks, index_uid } => {
progress.update_progress(DocumentDeletionProgress::RetrievingConfig);
IndexOperation::DocumentDeletion { mut tasks, index_uid: _ } => {
let mut to_delete = RoaringBitmap::new();
let external_documents_ids = index.external_documents_ids();
@ -1598,23 +1476,35 @@ impl IndexScheduler {
deleted_documents: Some(will_be_removed),
});
}
KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } => {
KindWithContent::DocumentDeletionByFilter { index_uid: _, filter_expr } => {
let before = to_delete.len();
let filter = match Filter::from_json(filter_expr) {
Ok(filter) => filter,
Err(err) => {
// theoretically, this should be caught by deserr before reaching the index-scheduler and cannot happen
task.status = Status::Failed;
task.error = Some(
Error::from_milli(err, Some(index_uid.clone())).into(),
);
task.error = match err {
milli::Error::UserError(
milli::UserError::InvalidFilterExpression { .. },
) => Some(
Error::from(err)
.with_custom_error_code(Code::InvalidDocumentFilter)
.into(),
),
e => Some(e.into()),
};
None
}
};
if let Some(filter) = filter {
let candidates = filter
.evaluate(index_wtxn, index)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())));
let candidates =
filter.evaluate(index_wtxn, index).map_err(|err| match err {
milli::Error::UserError(
milli::UserError::InvalidFilter(_),
) => Error::from(err)
.with_custom_error_code(Code::InvalidDocumentFilter),
e => e.into(),
});
match candidates {
Ok(candidates) => to_delete |= candidates,
Err(err) => {
@ -1650,9 +1540,8 @@ impl IndexScheduler {
// to_delete not empty => index not empty => primary key set
let primary_key = index.primary_key(&rtxn)?.unwrap();
let primary_key =
PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map)
.map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?;
let primary_key = PrimaryKey::new_or_insert(primary_key, &mut new_fields_ids_map)
.map_err(milli::Error::from)?;
if !tasks.iter().all(|res| res.error.is_some()) {
let local_pool;
@ -1668,17 +1557,12 @@ impl IndexScheduler {
}
};
progress.update_progress(DocumentDeletionProgress::DeleteDocuments);
let mut indexer = indexer::DocumentDeletion::new();
let candidates_count = to_delete.len();
indexer.delete_documents_by_docids(to_delete);
let document_changes = indexer.into_changes(&indexer_alloc, primary_key);
let embedders = index
.embedding_configs(index_wtxn)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
let embedders = self.embedders(index_uid.clone(), embedders)?;
let embedders = index.embedding_configs(index_wtxn)?;
let embedders = self.embedders(embedders)?;
progress.update_progress(DocumentDeletionProgress::Indexing);
indexer::index(
index_wtxn,
index,
@ -1690,24 +1574,15 @@ impl IndexScheduler {
&document_changes,
embedders,
&|| must_stop_processing.get(),
&progress,
)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
&send_progress,
)?;
let addition = DocumentAdditionResult {
indexed_documents: candidates_count,
number_of_documents: index
.number_of_documents(index_wtxn)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
};
tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
}
Ok(tasks)
}
IndexOperation::Settings { index_uid, settings, mut tasks } => {
progress.update_progress(SettingsProgress::RetrievingAndMergingTheSettings);
IndexOperation::Settings { index_uid: _, settings, mut tasks } => {
let indexer_config = self.index_mapper.indexer_config();
let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config);
@ -1721,13 +1596,10 @@ impl IndexScheduler {
task.status = Status::Succeeded;
}
progress.update_progress(SettingsProgress::ApplyTheSettings);
builder
.execute(
|indexing_step| tracing::debug!(update = ?indexing_step),
|| must_stop_processing.get(),
)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?;
builder.execute(
|indexing_step| tracing::debug!(update = ?indexing_step),
|| must_stop_processing.get(),
)?;
Ok(tasks)
}
@ -1744,14 +1616,12 @@ impl IndexScheduler {
index_uid: index_uid.clone(),
tasks: cleared_tasks,
},
progress.clone(),
)?;
let settings_tasks = self.apply_index_operation(
index_wtxn,
index,
IndexOperation::Settings { index_uid, settings, tasks: settings_tasks },
progress,
)?;
let mut tasks = settings_tasks;
@ -1768,18 +1638,15 @@ impl IndexScheduler {
&self,
wtxn: &mut RwTxn,
matched_tasks: &RoaringBitmap,
progress: &Progress,
) -> Result<RoaringBitmap> {
progress.update_progress(TaskDeletionProgress::DeletingTasksDateTime);
// 1. Remove from this list the tasks that we are not allowed to delete
let enqueued_tasks = self.get_status(wtxn, Status::Enqueued)?;
let processing_tasks = &self.processing_tasks.read().unwrap().processing.clone();
let all_task_ids = self.all_task_ids(wtxn)?;
let mut to_delete_tasks = all_task_ids & matched_tasks;
to_delete_tasks -= &**processing_tasks;
to_delete_tasks -= &enqueued_tasks;
to_delete_tasks -= processing_tasks;
to_delete_tasks -= enqueued_tasks;
// 2. We now have a list of tasks to delete, delete them
@ -1790,8 +1657,6 @@ impl IndexScheduler {
// The tasks that have been removed *per batch*.
let mut affected_batches: HashMap<BatchId, RoaringBitmap> = HashMap::new();
let (atomic_progress, task_progress) = AtomicTaskStep::new(to_delete_tasks.len() as u32);
progress.update_progress(task_progress);
for task_id in to_delete_tasks.iter() {
let task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?;
@ -1815,35 +1680,22 @@ impl IndexScheduler {
if let Some(batch_uid) = task.batch_uid {
affected_batches.entry(batch_uid).or_default().insert(task_id);
}
atomic_progress.fetch_add(1, Ordering::Relaxed);
}
progress.update_progress(TaskDeletionProgress::DeletingTasksMetadata);
let (atomic_progress, task_progress) = AtomicTaskStep::new(
(affected_indexes.len() + affected_statuses.len() + affected_kinds.len()) as u32,
);
progress.update_progress(task_progress);
for index in affected_indexes.iter() {
self.update_index(wtxn, index, |bitmap| *bitmap -= &to_delete_tasks)?;
atomic_progress.fetch_add(1, Ordering::Relaxed);
}
for status in affected_statuses.iter() {
self.update_status(wtxn, *status, |bitmap| *bitmap -= &to_delete_tasks)?;
atomic_progress.fetch_add(1, Ordering::Relaxed);
}
for kind in affected_kinds.iter() {
self.update_kind(wtxn, *kind, |bitmap| *bitmap -= &to_delete_tasks)?;
atomic_progress.fetch_add(1, Ordering::Relaxed);
}
progress.update_progress(TaskDeletionProgress::DeletingTasks);
let (atomic_progress, task_progress) = AtomicTaskStep::new(to_delete_tasks.len() as u32);
progress.update_progress(task_progress);
for task in to_delete_tasks.iter() {
self.all_tasks.delete(wtxn, &task)?;
atomic_progress.fetch_add(1, Ordering::Relaxed);
}
for canceled_by in affected_canceled_by {
if let Some(mut tasks) = self.canceled_by.get(wtxn, &canceled_by)? {
@ -1855,9 +1707,6 @@ impl IndexScheduler {
}
}
}
progress.update_progress(TaskDeletionProgress::DeletingBatches);
let (atomic_progress, batch_progress) = AtomicBatchStep::new(affected_batches.len() as u32);
progress.update_progress(batch_progress);
for (batch_id, to_delete_tasks) in affected_batches {
if let Some(mut tasks) = self.batch_to_tasks_mapping.get(wtxn, &batch_id)? {
tasks -= &to_delete_tasks;
@ -1899,7 +1748,6 @@ impl IndexScheduler {
}
}
}
atomic_progress.fetch_add(1, Ordering::Relaxed);
}
Ok(to_delete_tasks)
@ -1914,36 +1762,21 @@ impl IndexScheduler {
cancel_task_id: TaskId,
current_batch: &mut ProcessingBatch,
matched_tasks: &RoaringBitmap,
progress: &Progress,
) -> Result<Vec<Task>> {
progress.update_progress(TaskCancelationProgress::RetrievingTasks);
// 1. Remove from this list the tasks that we are not allowed to cancel
// Notice that only the _enqueued_ ones are cancelable and we should
// have already aborted the indexation of the _processing_ ones
let cancelable_tasks = self.get_status(rtxn, Status::Enqueued)?;
let tasks_to_cancel = cancelable_tasks & matched_tasks;
let (task_progress, progress_obj) = AtomicTaskStep::new(tasks_to_cancel.len() as u32);
progress.update_progress(progress_obj);
// 2. We now have a list of tasks to cancel, cancel them
let mut tasks = self.get_existing_tasks(
rtxn,
tasks_to_cancel.iter().inspect(|_| {
task_progress.fetch_add(1, Ordering::Relaxed);
}),
)?;
let mut tasks = self.get_existing_tasks(rtxn, tasks_to_cancel.iter())?;
progress.update_progress(TaskCancelationProgress::UpdatingTasks);
let (task_progress, progress_obj) = AtomicTaskStep::new(tasks_to_cancel.len() as u32);
progress.update_progress(progress_obj);
for task in tasks.iter_mut() {
task.status = Status::Canceled;
task.canceled_by = Some(cancel_task_id);
task.details = task.details.as_ref().map(|d| d.to_failed());
current_batch.processing(Some(task));
task_progress.fetch_add(1, Ordering::Relaxed);
}
Ok(tasks)
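
The `send_progress` closure added in the hunks above throttles progress reporting: it stores the number of seconds elapsed at the last forwarded update in an `AtomicU64` and drops any update arriving less than `PRINT_SECS_DELTA` seconds after it. A standalone sketch of that throttling pattern, with the progress payload reduced to a string and illustrative names rather than the scheduler's own types:

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::{Duration, Instant};

const PRINT_SECS_DELTA: u64 = 5;

/// Forwards at most one message every `PRINT_SECS_DELTA` seconds.
struct ThrottledProgress {
    started_at: Instant,
    secs_at_last_send: AtomicU64,
}

impl ThrottledProgress {
    fn new() -> Self {
        Self { started_at: Instant::now(), secs_at_last_send: AtomicU64::new(0) }
    }

    fn send(&self, msg: &str) {
        let now = Instant::now();
        let elapsed = self.secs_at_last_send.load(Ordering::Relaxed);
        let previous = self.started_at + Duration::from_secs(elapsed);
        if (now - previous).as_secs() < PRINT_SECS_DELTA {
            return; // dropped: not enough time has passed since the last send
        }
        self.secs_at_last_send
            .store((now - self.started_at).as_secs(), Ordering::Relaxed);
        println!("progress: {msg}");
    }
}

fn main() {
    let progress = ThrottledProgress::new();
    progress.send("computing document changes"); // dropped (less than 5s in)
    std::thread::sleep(Duration::from_secs(PRINT_SECS_DELTA + 1));
    progress.send("writing documents"); // printed
}
```

As in the closure above, the very first update is dropped too, so nothing is logged for operations that finish in under five seconds.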


@ -104,7 +104,7 @@ pub enum Error {
)]
InvalidTaskCanceledBy { canceled_by: String },
#[error(
"{index_uid} is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 400 bytes."
"{index_uid} is not a valid index uid. Index uid can be an integer or a string containing only alphanumeric characters, hyphens (-) and underscores (_), and can not be more than 512 bytes."
)]
InvalidIndexUid { index_uid: String },
#[error("Task `{0}` not found.")]
@ -122,11 +122,8 @@ pub enum Error {
Dump(#[from] dump::Error),
#[error(transparent)]
Heed(#[from] heed::Error),
#[error("{}", match .index_uid {
Some(uid) if !uid.is_empty() => format!("Index `{}`: {error}", uid),
_ => format!("{error}")
})]
Milli { error: milli::Error, index_uid: Option<String> },
#[error(transparent)]
Milli(#[from] milli::Error),
#[error("An unexpected crash occurred when processing the task.")]
ProcessBatchPanicked,
#[error(transparent)]
@ -193,7 +190,7 @@ impl Error {
| Error::AbortedTask
| Error::Dump(_)
| Error::Heed(_)
| Error::Milli { .. }
| Error::Milli(_)
| Error::ProcessBatchPanicked
| Error::FileStore(_)
| Error::IoError(_)
@ -212,20 +209,6 @@ impl Error {
pub fn with_custom_error_code(self, code: Code) -> Self {
Self::WithCustomErrorCode(code, Box::new(self))
}
pub fn from_milli(err: milli::Error, index_uid: Option<String>) -> Self {
match err {
milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
Self::Milli { error: err, index_uid }
.with_custom_error_code(Code::InvalidDocumentFilter)
}
milli::Error::UserError(milli::UserError::InvalidFilterExpression { .. }) => {
Self::Milli { error: err, index_uid }
.with_custom_error_code(Code::InvalidDocumentFilter)
}
_ => Self::Milli { error: err, index_uid },
}
}
}
impl ErrorCode for Error {
@ -253,7 +236,7 @@ impl ErrorCode for Error {
// TODO: not sure of the Code to use
Error::NoSpaceLeftInTaskQueue => Code::NoSpaceLeftOnDevice,
Error::Dump(e) => e.error_code(),
Error::Milli { error, .. } => error.error_code(),
Error::Milli(e) => e.error_code(),
Error::ProcessBatchPanicked => Code::Internal,
Error::Heed(e) => e.error_code(),
Error::HeedTransaction(e) => e.error_code(),
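
The `Error::Milli { error, index_uid }` variant shown in this hunk prefixes the underlying milli error with the index uid whenever one is attached. A small sketch of that conditional-prefix display pattern, with the milli error replaced by a plain `String` (it assumes the `thiserror` crate, which the workspace already uses):

```rust
use thiserror::Error;

/// Sketch of the conditional-prefix display used by `Error::Milli` above.
#[derive(Debug, Error)]
enum SchedulerError {
    #[error("{}", match .index_uid {
        Some(uid) if !uid.is_empty() => format!("Index `{uid}`: {error}"),
        _ => format!("{error}"),
    })]
    Milli { error: String, index_uid: Option<String> },
}

fn main() {
    let err = SchedulerError::Milli {
        error: "invalid filter expression".into(),
        index_uid: Some("movies".into()),
    };
    assert_eq!(err.to_string(), "Index `movies`: invalid filter expression");
}
```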


@ -1,17 +1,16 @@
use std::collections::BTreeMap;
use std::env::VarError;
use std::path::Path;
use std::str::FromStr;
use std::time::Duration;
use meilisearch_types::heed::{EnvClosingEvent, EnvFlags, EnvOpenOptions};
use meilisearch_types::milli::{Index, Result};
use meilisearch_types::milli::Index;
use time::OffsetDateTime;
use uuid::Uuid;
use super::IndexStatus::{self, Available, BeingDeleted, Closing, Missing};
use crate::clamp_to_page_size;
use crate::lru::{InsertionOutcome, LruMap};
use crate::{clamp_to_page_size, Result};
/// Keep an internally consistent view of the open indexes in memory.
///
/// This view is made of an LRU cache that will evict the least frequently used indexes when new indexes are opened.
@ -304,15 +303,7 @@ fn create_or_open_index(
) -> Result<Index> {
let mut options = EnvOpenOptions::new();
options.map_size(clamp_to_page_size(map_size));
let max_readers = match std::env::var("MEILI_EXPERIMENTAL_INDEX_MAX_READERS") {
Ok(value) => u32::from_str(&value).unwrap(),
Err(VarError::NotPresent) => 1024,
Err(VarError::NotUnicode(value)) => panic!(
"Invalid unicode for the `MEILI_EXPERIMENTAL_INDEX_MAX_READERS` env var: {value:?}"
),
};
options.max_readers(max_readers);
options.max_readers(1024);
if enable_mdb_writemap {
unsafe { options.flags(EnvFlags::WRITE_MAP) };
}
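
The lines removed in this hunk made the LMDB reader limit configurable through the `MEILI_EXPERIMENTAL_INDEX_MAX_READERS` environment variable, falling back to 1024 when the variable is absent; the other side hard-codes 1024. A rough standalone sketch of that env-override pattern (the helper name and error messages are illustrative, not Meilisearch's):

```rust
use std::env::VarError;
use std::str::FromStr;

/// Reads an optional numeric override from the environment, keeping the
/// default when the variable is not set and failing loudly on bad values.
fn max_readers_from_env(var: &str, default: u32) -> u32 {
    match std::env::var(var) {
        Ok(value) => u32::from_str(&value)
            .unwrap_or_else(|_| panic!("Invalid value for the `{var}` env var: {value:?}")),
        Err(VarError::NotPresent) => default,
        Err(VarError::NotUnicode(value)) => {
            panic!("Invalid unicode for the `{var}` env var: {value:?}")
        }
    }
}

fn main() {
    let max_readers = max_readers_from_env("MEILI_EXPERIMENTAL_INDEX_MAX_READERS", 1024);
    println!("max readers: {max_readers}");
}
```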


@ -5,7 +5,6 @@ use std::{fs, thread};
use meilisearch_types::heed::types::{SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn};
use meilisearch_types::milli;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::{FieldDistribution, Index};
use serde::{Deserialize, Serialize};
@ -122,7 +121,7 @@ impl IndexStats {
/// # Parameters
///
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
pub fn new(index: &Index, rtxn: &RoTxn) -> Result<Self> {
Ok(IndexStats {
number_of_documents: index.number_of_documents(rtxn)?,
database_size: index.on_disk_size()?,
@ -184,18 +183,13 @@ impl IndexMapper {
// Error if the UUIDv4 somehow already exists in the map, since it should be fresh.
// This is very unlikely to happen in practice.
// TODO: it would be better to lazily create the index. But we need an Index::open function for milli.
let index = self
.index_map
.write()
.unwrap()
.create(
&uuid,
&index_path,
date,
self.enable_mdb_writemap,
self.index_base_map_size,
)
.map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?;
let index = self.index_map.write().unwrap().create(
&uuid,
&index_path,
date,
self.enable_mdb_writemap,
self.index_base_map_size,
)?;
wtxn.commit()?;
@ -363,9 +357,7 @@ impl IndexMapper {
};
let index_path = self.base_path.join(uuid.to_string());
// take the lock to reopen the environment.
reopen
.reopen(&mut self.index_map.write().unwrap(), &index_path)
.map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?;
reopen.reopen(&mut self.index_map.write().unwrap(), &index_path)?;
continue;
}
BeingDeleted => return Err(Error::IndexNotFound(name.to_string())),
@ -380,15 +372,13 @@ impl IndexMapper {
Missing => {
let index_path = self.base_path.join(uuid.to_string());
break index_map
.create(
&uuid,
&index_path,
None,
self.enable_mdb_writemap,
self.index_base_map_size,
)
.map_err(|e| Error::from_milli(e, Some(uuid.to_string())))?;
break index_map.create(
&uuid,
&index_path,
None,
self.enable_mdb_writemap,
self.index_base_map_size,
)?;
}
Available(index) => break index,
Closing(_) => {
@ -470,7 +460,6 @@ impl IndexMapper {
let index = self.index(rtxn, index_uid)?;
let index_rtxn = index.read_txn()?;
IndexStats::new(&index, &index_rtxn)
.map_err(|e| Error::from_milli(e, Some(uuid.to_string())))
}
}
}


@ -353,7 +353,7 @@ pub fn snapshot_canceled_by(rtxn: &RoTxn, db: Database<BEU32, RoaringBitmapCodec
pub fn snapshot_batch(batch: &Batch) -> String {
let mut snap = String::new();
let Batch { uid, details, stats, started_at, finished_at, progress: _ } = batch;
let Batch { uid, details, stats, started_at, finished_at } = batch;
if let Some(finished_at) = finished_at {
assert!(finished_at > started_at);
}

View File

@ -26,7 +26,6 @@ mod index_mapper;
#[cfg(test)]
mod insta_snapshot;
mod lru;
mod processing;
mod utils;
pub mod uuid_codec;
@ -57,12 +56,12 @@ use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128};
use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn};
use meilisearch_types::milli::documents::DocumentsBatchBuilder;
use meilisearch_types::milli::index::IndexEmbeddingConfig;
use meilisearch_types::milli::update::new::indexer::document_changes::Progress;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs};
use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
use meilisearch_types::task_view::TaskView;
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
use processing::ProcessingTasks;
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task, TaskProgress};
use rayon::current_num_threads;
use rayon::prelude::{IntoParallelIterator, ParallelIterator};
use roaring::RoaringBitmap;
@ -73,8 +72,7 @@ use utils::{filter_out_references_to_newer_tasks, keep_ids_within_datetimes, map
use uuid::Uuid;
use crate::index_mapper::IndexMapper;
use crate::processing::{AtomicTaskStep, BatchProgress};
use crate::utils::{check_index_swap_validity, clamp_to_page_size};
use crate::utils::{check_index_swap_validity, clamp_to_page_size, ProcessingBatch};
pub(crate) type BEI128 = I128<BE>;
@ -165,6 +163,48 @@ impl Query {
}
}
#[derive(Debug, Clone)]
pub struct ProcessingTasks {
batch: Option<ProcessingBatch>,
/// The list of task ids that are currently running.
processing: RoaringBitmap,
/// The progress on processing tasks
progress: Option<TaskProgress>,
}
impl ProcessingTasks {
/// Creates an empty `ProcessingTasks` struct.
fn new() -> ProcessingTasks {
ProcessingTasks { batch: None, processing: RoaringBitmap::new(), progress: None }
}
/// Stores the currently processing tasks and the date and time at which processing started.
fn start_processing(&mut self, processing_batch: ProcessingBatch, processing: RoaringBitmap) {
self.batch = Some(processing_batch);
self.processing = processing;
}
fn update_progress(&mut self, progress: Progress) -> TaskProgress {
self.progress.get_or_insert_with(TaskProgress::default).update(progress)
}
/// Set the processing tasks to an empty list
fn stop_processing(&mut self) -> Self {
self.progress = None;
Self {
batch: std::mem::take(&mut self.batch),
processing: std::mem::take(&mut self.processing),
progress: None,
}
}
/// Returns `true` if at least one of the currently processing tasks must be stopped.
fn must_cancel_processing_tasks(&self, canceled_tasks: &RoaringBitmap) -> bool {
!self.processing.is_disjoint(canceled_tasks)
}
}
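The cancelation check above boils down to a bitmap-disjointness test. A small hedged sketch with the roaring crate (the task ids are invented):

use roaring::RoaringBitmap;

fn main() {
    // Tasks currently being processed by the scheduler.
    let processing: RoaringBitmap = (10..15).collect();
    // Tasks targeted by a cancelation request.
    let canceled: RoaringBitmap = [3, 12, 40].into_iter().collect();

    // Same shape as `must_cancel_processing_tasks`: cancel as soon as the sets overlap.
    assert!(!processing.is_disjoint(&canceled));
}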
#[derive(Default, Clone, Debug)]
struct MustStopProcessing(Arc<AtomicBool>);
@ -773,7 +813,7 @@ impl IndexScheduler {
let mut batch_tasks = RoaringBitmap::new();
for batch_uid in batch_uids {
if processing_batch.as_ref().map_or(false, |batch| batch.uid == *batch_uid) {
batch_tasks |= &*processing_tasks;
batch_tasks |= &processing_tasks;
} else {
batch_tasks |= self.tasks_in_batch(rtxn, *batch_uid)?;
}
@ -787,13 +827,13 @@ impl IndexScheduler {
match status {
// special case for Processing tasks
Status::Processing => {
status_tasks |= &*processing_tasks;
status_tasks |= &processing_tasks;
}
status => status_tasks |= &self.get_status(rtxn, *status)?,
};
}
if !status.contains(&Status::Processing) {
tasks -= &*processing_tasks;
tasks -= &processing_tasks;
}
tasks &= status_tasks;
}
@ -842,7 +882,7 @@ impl IndexScheduler {
// Once we have filtered the two subsets, we put them back together and assign it back to `tasks`.
tasks = {
let (mut filtered_non_processing_tasks, mut filtered_processing_tasks) =
(&tasks - &*processing_tasks, &tasks & &*processing_tasks);
(&tasks - &processing_tasks, &tasks & &processing_tasks);
// special case for Processing tasks
// A closure that clears the filtered_processing_tasks if their started_at date falls outside the given bounds
@ -1050,7 +1090,7 @@ impl IndexScheduler {
// Once we have filtered the two subsets, we put them back together and assign it back to `batches`.
batches = {
let (mut filtered_non_processing_batches, mut filtered_processing_batches) =
(&batches - &*processing.processing, &batches & &*processing.processing);
(&batches - &processing.processing, &batches & &processing.processing);
// special case for Processing batches
// A closure that clears the filtered_processing_batches if their started_at date falls outside the given bounds
@ -1566,8 +1606,7 @@ impl IndexScheduler {
// We reset the must_stop flag to be sure that we don't stop processing tasks
self.must_stop_processing.reset();
let progress = self
.processing_tasks
self.processing_tasks
.write()
.unwrap()
// We can clone the processing batch here because we don't want its modification to affect the view of the processing batches
@ -1580,12 +1619,11 @@ impl IndexScheduler {
let res = {
let cloned_index_scheduler = self.private_clone();
let processing_batch = &mut processing_batch;
let progress = progress.clone();
std::thread::scope(|s| {
let handle = std::thread::Builder::new()
.name(String::from("batch-operation"))
.spawn_scoped(s, move || {
cloned_index_scheduler.process_batch(batch, processing_batch, progress)
cloned_index_scheduler.process_batch(batch, processing_batch)
})
.unwrap();
handle.join().unwrap_or(Err(Error::ProcessBatchPanicked))
@ -1598,7 +1636,6 @@ impl IndexScheduler {
#[cfg(test)]
self.maybe_fail(tests::FailureLocation::AcquiringWtxn)?;
progress.update_progress(BatchProgress::WritingTasksToDisk);
processing_batch.finished();
let mut wtxn = self.env.write_txn().map_err(Error::HeedTransaction)?;
let mut canceled = RoaringBitmap::new();
@ -1608,15 +1645,12 @@ impl IndexScheduler {
#[cfg(test)]
self.breakpoint(Breakpoint::ProcessBatchSucceeded);
let (task_progress, task_progress_obj) = AtomicTaskStep::new(tasks.len() as u32);
progress.update_progress(task_progress_obj);
let mut success = 0;
let mut failure = 0;
let mut canceled_by = None;
#[allow(unused_variables)]
for (i, mut task) in tasks.into_iter().enumerate() {
task_progress.fetch_add(1, Ordering::Relaxed);
processing_batch.update(&mut task);
if task.status == Status::Canceled {
canceled.insert(task.uid);
@ -1644,10 +1678,9 @@ impl IndexScheduler {
tracing::info!("A batch of tasks was successfully completed with {success} successful tasks and {failure} failed tasks.");
}
// If we have an abortion error we must stop the tick here and re-schedule tasks.
Err(Error::Milli {
error: milli::Error::InternalError(milli::InternalError::AbortedIndexation),
..
})
Err(Error::Milli(milli::Error::InternalError(
milli::InternalError::AbortedIndexation,
)))
| Err(Error::AbortedTask) => {
#[cfg(test)]
self.breakpoint(Breakpoint::AbortedIndexation);
@ -1666,10 +1699,9 @@ impl IndexScheduler {
// 2. close the associated environment
// 3. resize it
// 4. re-schedule tasks
Err(Error::Milli {
error: milli::Error::UserError(milli::UserError::MaxDatabaseSizeReached),
..
}) if index_uid.is_some() => {
Err(Error::Milli(milli::Error::UserError(
milli::UserError::MaxDatabaseSizeReached,
))) if index_uid.is_some() => {
// fixme: add index_uid to match to avoid the unwrap
let index_uid = index_uid.unwrap();
// fixme: handle error more gracefully? not sure when this could happen
@ -1684,12 +1716,8 @@ impl IndexScheduler {
Err(err) => {
#[cfg(test)]
self.breakpoint(Breakpoint::ProcessBatchFailed);
let (task_progress, task_progress_obj) = AtomicTaskStep::new(ids.len() as u32);
progress.update_progress(task_progress_obj);
let error: ResponseError = err.into();
for id in ids.iter() {
task_progress.fetch_add(1, Ordering::Relaxed);
let mut task = self
.get_task(&wtxn, id)
.map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?
@ -1915,7 +1943,6 @@ impl IndexScheduler {
// TODO: consider using a type alias or a struct embedder/template
pub fn embedders(
&self,
index_uid: String,
embedding_configs: Vec<IndexEmbeddingConfig>,
) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs
@ -1926,12 +1953,8 @@ impl IndexScheduler {
config: milli::vector::EmbeddingConfig { embedder_options, prompt, quantized },
..
}| {
let prompt = Arc::new(
prompt
.try_into()
.map_err(meilisearch_types::milli::Error::from)
.map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?,
);
let prompt =
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
// optimistically return existing embedder
{
let embedders = self.embedders.read().unwrap();
@ -1947,9 +1970,7 @@ impl IndexScheduler {
let embedder = Arc::new(
Embedder::new(embedder_options.clone())
.map_err(meilisearch_types::milli::vector::Error::from)
.map_err(|err| {
Error::from_milli(err.into(), Some(index_uid.clone()))
})?,
.map_err(meilisearch_types::milli::Error::from)?,
);
{
let mut embedders = self.embedders.write().unwrap();
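The "optimistically return existing embedder" comment above describes a read-then-write cache. A hedged sketch of that pattern with `RwLock<HashMap<_, _>>`, with simplified types that are not the scheduler's actual embedder cache:

use std::collections::HashMap;
use std::sync::{Arc, RwLock};

fn get_or_create(cache: &RwLock<HashMap<String, Arc<String>>>, key: &str) -> Arc<String> {
    // Optimistically return an existing entry while holding only the read lock.
    {
        let read = cache.read().unwrap();
        if let Some(existing) = read.get(key) {
            return existing.clone();
        }
    }
    // Otherwise build the value (expensive in the real code) and publish it under the write lock.
    let value = Arc::new(format!("embedder for {key}"));
    let mut write = cache.write().unwrap();
    write.entry(key.to_string()).or_insert(value).clone()
}

fn main() {
    let cache = RwLock::new(HashMap::new());
    let a = get_or_create(&cache, "default");
    let b = get_or_create(&cache, "default");
    assert!(Arc::ptr_eq(&a, &b));
}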
@ -6150,7 +6171,7 @@ mod tests {
insta::assert_json_snapshot!(simple_hf_config.embedder_options);
let simple_hf_name = name.clone();
let configs = index_scheduler.embedders("doggos".to_string(), configs).unwrap();
let configs = index_scheduler.embedders(configs).unwrap();
let (hf_embedder, _, _) = configs.get(&simple_hf_name).unwrap();
let beagle_embed =
hf_embedder.embed_one(S("Intel the beagle best doggo"), None).unwrap();

View File

@ -1,316 +0,0 @@
use std::borrow::Cow;
use std::sync::Arc;
use enum_iterator::Sequence;
use meilisearch_types::milli::progress::{AtomicSubStep, NamedStep, Progress, ProgressView, Step};
use meilisearch_types::milli::{make_atomic_progress, make_enum_progress};
use roaring::RoaringBitmap;
use crate::utils::ProcessingBatch;
#[derive(Clone)]
pub struct ProcessingTasks {
pub batch: Option<Arc<ProcessingBatch>>,
/// The list of task ids that are currently running.
pub processing: Arc<RoaringBitmap>,
/// The progress on processing tasks
pub progress: Option<Progress>,
}
impl ProcessingTasks {
/// Creates an empty `ProcessingTasks` struct.
pub fn new() -> ProcessingTasks {
ProcessingTasks { batch: None, processing: Arc::new(RoaringBitmap::new()), progress: None }
}
pub fn get_progress_view(&self) -> Option<ProgressView> {
Some(self.progress.as_ref()?.as_progress_view())
}
/// Stores the currently processing tasks and the date and time at which processing started.
pub fn start_processing(
&mut self,
processing_batch: ProcessingBatch,
processing: RoaringBitmap,
) -> Progress {
self.batch = Some(Arc::new(processing_batch));
self.processing = Arc::new(processing);
let progress = Progress::default();
progress.update_progress(BatchProgress::ProcessingTasks);
self.progress = Some(progress.clone());
progress
}
/// Set the processing tasks to an empty list
pub fn stop_processing(&mut self) -> Self {
self.progress = None;
Self {
batch: std::mem::take(&mut self.batch),
processing: std::mem::take(&mut self.processing),
progress: None,
}
}
/// Returns `true` if at least one of the currently processing tasks must be stopped.
pub fn must_cancel_processing_tasks(&self, canceled_tasks: &RoaringBitmap) -> bool {
!self.processing.is_disjoint(canceled_tasks)
}
}
make_enum_progress! {
pub enum BatchProgress {
ProcessingTasks,
WritingTasksToDisk,
}
}
make_enum_progress! {
pub enum TaskCancelationProgress {
RetrievingTasks,
UpdatingTasks,
}
}
make_enum_progress! {
pub enum TaskDeletionProgress {
DeletingTasksDateTime,
DeletingTasksMetadata,
DeletingTasks,
DeletingBatches,
}
}
make_enum_progress! {
pub enum SnapshotCreationProgress {
StartTheSnapshotCreation,
SnapshotTheIndexScheduler,
SnapshotTheUpdateFiles,
SnapshotTheIndexes,
SnapshotTheApiKeys,
CreateTheTarball,
}
}
make_enum_progress! {
pub enum DumpCreationProgress {
StartTheDumpCreation,
DumpTheApiKeys,
DumpTheTasks,
DumpTheIndexes,
DumpTheExperimentalFeatures,
CompressTheDump,
}
}
make_enum_progress! {
pub enum CreateIndexProgress {
CreatingTheIndex,
}
}
make_enum_progress! {
pub enum UpdateIndexProgress {
UpdatingTheIndex,
}
}
make_enum_progress! {
pub enum DeleteIndexProgress {
DeletingTheIndex,
}
}
make_enum_progress! {
pub enum SwappingTheIndexes {
EnsuringCorrectnessOfTheSwap,
SwappingTheIndexes,
}
}
make_enum_progress! {
pub enum InnerSwappingTwoIndexes {
RetrieveTheTasks,
UpdateTheTasks,
UpdateTheIndexesMetadata,
}
}
make_enum_progress! {
pub enum DocumentOperationProgress {
RetrievingConfig,
ComputingDocumentChanges,
Indexing,
}
}
make_enum_progress! {
pub enum DocumentEditionProgress {
RetrievingConfig,
ComputingDocumentChanges,
Indexing,
}
}
make_enum_progress! {
pub enum DocumentDeletionProgress {
RetrievingConfig,
DeleteDocuments,
Indexing,
}
}
make_enum_progress! {
pub enum SettingsProgress {
RetrievingAndMergingTheSettings,
ApplyTheSettings,
}
}
make_atomic_progress!(Task alias AtomicTaskStep => "task" );
make_atomic_progress!(Document alias AtomicDocumentStep => "document" );
make_atomic_progress!(Batch alias AtomicBatchStep => "batch" );
make_atomic_progress!(UpdateFile alias AtomicUpdateFileStep => "update file" );
pub struct VariableNameStep {
name: String,
current: u32,
total: u32,
}
impl VariableNameStep {
pub fn new(name: impl Into<String>, current: u32, total: u32) -> Self {
Self { name: name.into(), current, total }
}
}
impl Step for VariableNameStep {
fn name(&self) -> Cow<'static, str> {
self.name.clone().into()
}
fn current(&self) -> u32 {
self.current
}
fn total(&self) -> u32 {
self.total
}
}
#[cfg(test)]
mod test {
use std::sync::atomic::Ordering;
use meili_snap::{json_string, snapshot};
use super::*;
#[test]
fn one_level() {
let mut processing = ProcessingTasks::new();
processing.start_processing(ProcessingBatch::new(0), RoaringBitmap::new());
snapshot!(json_string!(processing.get_progress_view()), @r#"
{
"steps": [
{
"currentStep": "processing tasks",
"finished": 0,
"total": 2
}
],
"percentage": 0.0
}
"#);
processing.progress.as_ref().unwrap().update_progress(BatchProgress::WritingTasksToDisk);
snapshot!(json_string!(processing.get_progress_view()), @r#"
{
"steps": [
{
"currentStep": "writing tasks to disk",
"finished": 1,
"total": 2
}
],
"percentage": 50.0
}
"#);
}
#[test]
fn task_progress() {
let mut processing = ProcessingTasks::new();
processing.start_processing(ProcessingBatch::new(0), RoaringBitmap::new());
let (atomic, tasks) = AtomicTaskStep::new(10);
processing.progress.as_ref().unwrap().update_progress(tasks);
snapshot!(json_string!(processing.get_progress_view()), @r#"
{
"steps": [
{
"currentStep": "processing tasks",
"finished": 0,
"total": 2
},
{
"currentStep": "task",
"finished": 0,
"total": 10
}
],
"percentage": 0.0
}
"#);
atomic.fetch_add(6, Ordering::Relaxed);
snapshot!(json_string!(processing.get_progress_view()), @r#"
{
"steps": [
{
"currentStep": "processing tasks",
"finished": 0,
"total": 2
},
{
"currentStep": "task",
"finished": 6,
"total": 10
}
],
"percentage": 30.000002
}
"#);
processing.progress.as_ref().unwrap().update_progress(BatchProgress::WritingTasksToDisk);
snapshot!(json_string!(processing.get_progress_view()), @r#"
{
"steps": [
{
"currentStep": "writing tasks to disk",
"finished": 1,
"total": 2
}
],
"percentage": 50.0
}
"#);
let (atomic, tasks) = AtomicTaskStep::new(5);
processing.progress.as_ref().unwrap().update_progress(tasks);
atomic.fetch_add(4, Ordering::Relaxed);
snapshot!(json_string!(processing.get_progress_view()), @r#"
{
"steps": [
{
"currentStep": "writing tasks to disk",
"finished": 1,
"total": 2
},
{
"currentStep": "task",
"finished": 4,
"total": 5
}
],
"percentage": 90.0
}
"#);
}
}

View File

@ -9,8 +9,8 @@ source: crates/index-scheduler/src/lib.rs
0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set({"catto"}), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }}
1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_document_ids: 1, deleted_documents: Some(1) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }}
3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Invalid type for filter subexpression: expected: String, Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }}
4 {uid: 4, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Attribute `id` is not filterable. Available filterable attributes are: `catto`.\n1:3 id = 2", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: "id = 2", deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("id = 2") }}
3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Invalid type for filter subexpression: expected: String, Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }}
4 {uid: 4, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Attribute `id` is not filterable. Available filterable attributes are: `catto`.\n1:3 id = 2", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: "id = 2", deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("id = 2") }}
5 {uid: 5, batch_uid: 2, status: succeeded, details: { original_filter: "catto EXISTS", deleted_documents: Some(1) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: String("catto EXISTS") }}
----------------------------------------------------------------------
### Status:

View File

@ -134,7 +134,6 @@ impl ProcessingBatch {
pub fn to_batch(&self) -> Batch {
Batch {
uid: self.uid,
progress: None,
details: self.details.clone(),
stats: self.stats.clone(),
started_at: self.started_at,
@ -188,7 +187,6 @@ impl IndexScheduler {
&batch.uid,
&Batch {
uid: batch.uid,
progress: None,
details: batch.details,
stats: batch.stats,
started_at: batch.started_at,
@ -275,9 +273,7 @@ impl IndexScheduler {
.into_iter()
.map(|batch_id| {
if Some(batch_id) == processing.batch.as_ref().map(|batch| batch.uid) {
let mut batch = processing.batch.as_ref().unwrap().to_batch();
batch.progress = processing.get_progress_view();
Ok(batch)
Ok(processing.batch.as_ref().unwrap().to_batch())
} else {
self.get_batch(rtxn, batch_id)
.and_then(|task| task.ok_or(Error::CorruptedTaskQueue))

View File

@ -24,9 +24,8 @@ flate2 = "1.0.30"
fst = "0.4.7"
memmap2 = "0.9.4"
milli = { path = "../milli" }
bumparaw-collections = "0.1.2"
raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
roaring = { version = "0.10.7", features = ["serde"] }
rustc-hash = "2.1.0"
serde = { version = "1.0.204", features = ["derive"] }
serde-cs = "0.2.4"
serde_json = "1.0.120"

View File

@ -1,16 +1,16 @@
use milli::progress::ProgressView;
use serde::Serialize;
use time::{Duration, OffsetDateTime};
use crate::batches::{Batch, BatchId, BatchStats};
use crate::task_view::DetailsView;
use crate::tasks::serialize_duration;
use crate::{
batches::{Batch, BatchId, BatchStats},
task_view::DetailsView,
tasks::serialize_duration,
};
#[derive(Debug, Clone, Serialize)]
#[serde(rename_all = "camelCase")]
pub struct BatchView {
pub uid: BatchId,
pub progress: Option<ProgressView>,
pub details: DetailsView,
pub stats: BatchStats,
#[serde(serialize_with = "serialize_duration", default)]
@ -25,7 +25,6 @@ impl BatchView {
pub fn from_batch(batch: &Batch) -> Self {
Self {
uid: batch.uid,
progress: batch.progress.clone(),
details: batch.details.clone(),
stats: batch.stats.clone(),
duration: batch.finished_at.map(|finished_at| finished_at - batch.started_at),

View File

@ -1,11 +1,12 @@
use std::collections::BTreeMap;
use milli::progress::ProgressView;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::task_view::DetailsView;
use crate::tasks::{Kind, Status};
use crate::{
task_view::DetailsView,
tasks::{Kind, Status},
};
pub type BatchId = u32;
@ -14,8 +15,6 @@ pub type BatchId = u32;
pub struct Batch {
pub uid: BatchId,
#[serde(skip)]
pub progress: Option<ProgressView>,
pub details: DetailsView,
pub stats: BatchStats,

View File

@ -4,11 +4,10 @@ use std::io::{self, BufWriter};
use std::marker::PhantomData;
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use memmap2::Mmap;
use milli::documents::Error;
use milli::Object;
use rustc_hash::FxBuildHasher;
use raw_collections::RawMap;
use serde::de::{SeqAccess, Visitor};
use serde::{Deserialize, Deserializer};
use serde_json::error::Category;
@ -221,7 +220,7 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {
let mut deserializer = serde_json::Deserializer::from_slice(&input);
let res = array_each(&mut deserializer, |obj: &RawValue| {
doc_alloc.reset();
let map = RawMap::from_raw_value_and_hasher(obj, FxBuildHasher, &doc_alloc)?;
let map = RawMap::from_raw_value(obj, &doc_alloc)?;
to_writer(&mut out, &map)
});
let count = match res {
@ -251,25 +250,26 @@ pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {
}
}
/// Reads NDJSON from file and checks it.
pub fn read_ndjson(input: &File) -> Result<u64> {
/// Reads NDJSON from a file and writes it back out as NDJSON, checking it along the way.
pub fn read_ndjson(input: &File, output: impl io::Write) -> Result<u64> {
// We memory map to be able to deserialize into a RawMap that
// does not allocate when possible and only materializes the first/top level.
let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
let mut output = BufWriter::new(output);
let mut bump = Bump::with_capacity(1024 * 1024);
let mut count = 0;
for result in serde_json::Deserializer::from_slice(&input).into_iter() {
bump.reset();
match result {
Ok(raw) => {
count += 1;
result
.and_then(|raw: &RawValue| {
// try to deserialize as a map
RawMap::from_raw_value_and_hasher(raw, FxBuildHasher, &bump)
.map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?;
count += 1;
}
Err(e) => return Err(DocumentFormatError::from((PayloadType::Ndjson, e))),
}
let map = RawMap::from_raw_value(raw, &bump)?;
to_writer(&mut output, &map)
})
.map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?;
}
Ok(count)
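As a rough, standalone sketch of the counting loop above, using serde_json's streaming deserializer with an owned `Value` instead of the bump-allocated `RawMap` (so the allocation strategy differs from the real function):

fn count_ndjson(input: &[u8]) -> serde_json::Result<u64> {
    let mut count = 0;
    // One JSON value per iteration; a syntax error aborts with its position, as above.
    for document in serde_json::Deserializer::from_slice(input).into_iter::<serde_json::Value>() {
        document?;
        count += 1;
    }
    Ok(count)
}

fn main() {
    let payload = b"{\"id\": 1, \"name\": \"kefir\"}\n{\"id\": 2, \"name\": \"echo\"}\n";
    assert_eq!(count_ndjson(payload).unwrap(), 2);
}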

View File

@ -550,7 +550,7 @@ impl fmt::Display for deserr_codes::InvalidSimilarId {
"the value of `id` is invalid. \
A document identifier can be of type integer or string, \
only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \
and can not be more than 511 bytes."
and can not be more than 512 bytes."
)
}
}
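For reference, the constraint stated in this message reduces to a simple byte check. A hedged sketch of the rule for string identifiers only (the integer form is left out; the 512-byte limit follows the updated message above):

fn is_valid_document_id(id: &str) -> bool {
    id.len() <= 512
        && id.bytes().all(|byte| byte.is_ascii_alphanumeric() || byte == b'-' || byte == b'_')
}

fn main() {
    assert!(is_valid_document_id("kefir_42-A"));
    assert!(!is_valid_document_id("foo & bar")); // rejected: space and `&`
    assert!(!is_valid_document_id(&"a".repeat(600))); // rejected: longer than 512 bytes
}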

View File

@ -4,6 +4,7 @@ use std::fmt::{Display, Write};
use std::str::FromStr;
use enum_iterator::Sequence;
use milli::update::new::indexer::document_changes::Progress;
use milli::update::IndexDocumentsMethod;
use milli::Object;
use roaring::RoaringBitmap;
@ -40,6 +41,62 @@ pub struct Task {
pub kind: KindWithContent,
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct TaskProgress {
pub current_step: &'static str,
pub finished_steps: u16,
pub total_steps: u16,
pub finished_substeps: Option<u32>,
pub total_substeps: Option<u32>,
}
impl Default for TaskProgress {
fn default() -> Self {
Self::new()
}
}
impl TaskProgress {
pub fn new() -> Self {
Self {
current_step: "start",
finished_steps: 0,
total_steps: 1,
finished_substeps: None,
total_substeps: None,
}
}
pub fn update(&mut self, progress: Progress) -> TaskProgress {
if self.finished_steps > progress.finished_steps {
return *self;
}
if self.current_step != progress.step_name {
self.current_step = progress.step_name
}
self.total_steps = progress.total_steps;
if self.finished_steps < progress.finished_steps {
self.finished_substeps = None;
self.total_substeps = None;
}
self.finished_steps = progress.finished_steps;
if let Some((finished_substeps, total_substeps)) = progress.finished_total_substep {
if let Some(task_finished_substeps) = self.finished_substeps {
if task_finished_substeps > finished_substeps {
return *self;
}
}
self.finished_substeps = Some(finished_substeps);
self.total_substeps = Some(total_substeps);
}
*self
}
}
impl Task {
pub fn index_uid(&self) -> Option<&str> {
use KindWithContent::*;

View File

@ -4,7 +4,6 @@ use byte_unit::{Byte, UnitType};
use meilisearch_types::document_formats::{DocumentFormatError, PayloadType};
use meilisearch_types::error::{Code, ErrorCode, ResponseError};
use meilisearch_types::index_uid::{IndexUid, IndexUidFormatError};
use meilisearch_types::milli;
use meilisearch_types::milli::OrderBy;
use serde_json::Value;
use tokio::task::JoinError;
@ -63,11 +62,8 @@ pub enum MeilisearchHttpError {
HeedError(#[from] meilisearch_types::heed::Error),
#[error(transparent)]
IndexScheduler(#[from] index_scheduler::Error),
#[error("{}", match .index_name {
Some(name) if !name.is_empty() => format!("Index `{}`: {error}", name),
_ => format!("{error}")
})]
Milli { error: milli::Error, index_name: Option<String> },
#[error(transparent)]
Milli(#[from] meilisearch_types::milli::Error),
#[error(transparent)]
Payload(#[from] PayloadError),
#[error(transparent)]
@ -80,12 +76,6 @@ pub enum MeilisearchHttpError {
MissingSearchHybrid,
}
impl MeilisearchHttpError {
pub(crate) fn from_milli(error: milli::Error, index_name: Option<String>) -> Self {
Self::Milli { error, index_name }
}
}
impl ErrorCode for MeilisearchHttpError {
fn error_code(&self) -> Code {
match self {
@ -105,7 +95,7 @@ impl ErrorCode for MeilisearchHttpError {
MeilisearchHttpError::SerdeJson(_) => Code::Internal,
MeilisearchHttpError::HeedError(_) => Code::Internal,
MeilisearchHttpError::IndexScheduler(e) => e.error_code(),
MeilisearchHttpError::Milli { error, .. } => error.error_code(),
MeilisearchHttpError::Milli(e) => e.error_code(),
MeilisearchHttpError::Payload(e) => e.error_code(),
MeilisearchHttpError::FileStore(_) => Code::Internal,
MeilisearchHttpError::DocumentFormat(e) => e.error_code(),

View File

@ -395,7 +395,6 @@ fn import_dump(
for index_reader in dump_reader.indexes()? {
let mut index_reader = index_reader?;
let metadata = index_reader.metadata();
let uid = metadata.uid.clone();
tracing::info!("Importing index `{}`.", metadata.uid);
let date = Some((metadata.created_at, metadata.updated_at));
@ -433,7 +432,7 @@ fn import_dump(
let reader = DocumentsBatchReader::from_reader(reader)?;
let embedder_configs = index.embedding_configs(&wtxn)?;
let embedders = index_scheduler.embedders(uid, embedder_configs)?;
let embedders = index_scheduler.embedders(embedder_configs)?;
let builder = milli::update::IndexDocuments::new(
&mut wtxn,

View File

@ -129,11 +129,6 @@ async fn try_main() -> anyhow::Result<()> {
print_launch_resume(&opt, analytics.clone(), config_read_from);
tokio::spawn(async move {
tokio::signal::ctrl_c().await.unwrap();
std::process::exit(130);
});
run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?;
Ok(())

View File

@ -654,8 +654,9 @@ impl Opt {
#[derive(Debug, Default, Clone, Parser, Deserialize)]
pub struct IndexerOpts {
/// Sets the maximum amount of RAM Meilisearch can use when indexing. By default, Meilisearch
/// uses no more than two thirds of available memory.
/// Specifies the maximum resident memory that Meilisearch can use for indexing.
/// By default, Meilisearch limits the RAM usage to 5% of the total available memory.
/// Note that the underlying store utilizes memory-mapping and makes use of the rest.
#[clap(long, env = MEILI_MAX_INDEXING_MEMORY, default_value_t)]
#[serde(default)]
pub max_indexing_memory: MaxMemory,
@ -714,7 +715,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
}
}
/// A type used to detect the max memory available and use 2/3 of it.
/// A type used to detect the max resident memory available and use 5% of it.
#[derive(Debug, Clone, Copy, Deserialize, Serialize)]
pub struct MaxMemory(Option<Byte>);
@ -728,7 +729,7 @@ impl FromStr for MaxMemory {
impl Default for MaxMemory {
fn default() -> MaxMemory {
MaxMemory(total_memory_bytes().map(|bytes| bytes * 2 / 3).map(Byte::from_u64))
MaxMemory(total_memory_bytes().map(|bytes| bytes * 5 / 100).map(Byte::from_u64))
}
}
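To make the change of default concrete, a small hedged arithmetic sketch assuming a hypothetical machine with 32 GiB of RAM (the real value comes from `total_memory_bytes()`):

fn main() {
    // Hypothetical machine with 32 GiB of RAM.
    let total_bytes: u64 = 32 * 1024 * 1024 * 1024;
    // New default: 5% of the total; previous default: two thirds of it.
    let new_default = total_bytes * 5 / 100;
    let old_default = total_bytes * 2 / 3;
    println!("new default: {:.2} GiB", new_default as f64 / (1u64 << 30) as f64); // ~1.60 GiB
    println!("old default: {:.2} GiB", old_default as f64 / (1u64 << 30) as f64); // ~21.33 GiB
}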

View File

@ -1,18 +1,18 @@
use actix_web::web::{self, Data};
use actix_web::HttpResponse;
use actix_web::{
web::{self, Data},
HttpResponse,
};
use deserr::actix_web::AwebQueryParameter;
use index_scheduler::{IndexScheduler, Query};
use meilisearch_types::batch_view::BatchView;
use meilisearch_types::batches::BatchId;
use meilisearch_types::deserr::DeserrQueryParamError;
use meilisearch_types::error::ResponseError;
use meilisearch_types::keys::actions;
use meilisearch_types::{
batch_view::BatchView, batches::BatchId, deserr::DeserrQueryParamError, error::ResponseError,
keys::actions,
};
use serde::Serialize;
use super::tasks::TasksFilterQuery;
use super::ActionPolicy;
use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::extractors::{authentication::GuardedData, sequential_extractor::SeqHandler};
use super::{tasks::TasksFilterQuery, ActionPolicy};
pub fn configure(cfg: &mut web::ServiceConfig) {
cfg.service(web::resource("").route(web::get().to(SeqHandler(get_batches))))

View File

@ -1,5 +1,5 @@
use std::collections::HashSet;
use std::io::{ErrorKind, Seek as _};
use std::io::ErrorKind;
use std::marker::PhantomData;
use actix_web::http::header::CONTENT_TYPE;
@ -572,7 +572,7 @@ async fn document_addition(
index_uid: IndexUid,
primary_key: Option<String>,
csv_delimiter: Option<u8>,
body: Payload,
mut body: Payload,
method: IndexDocumentsMethod,
task_id: Option<TaskId>,
dry_run: bool,
@ -609,60 +609,54 @@ async fn document_addition(
};
let (uuid, mut update_file) = index_scheduler.create_update_file(dry_run)?;
let documents_count = match format {
PayloadType::Ndjson => {
let (path, file) = update_file.into_parts();
let file = match file {
Some(file) => {
let (file, path) = file.into_parts();
let mut file = copy_body_to_file(file, body, format).await?;
file.rewind().map_err(|e| {
index_scheduler::Error::FileStore(file_store::Error::IoError(e))
})?;
Some(tempfile::NamedTempFile::from_parts(file, path))
}
None => None,
};
let documents_count = tokio::task::spawn_blocking(move || {
let documents_count = file.as_ref().map_or(Ok(0), |ntf| {
read_ndjson(ntf.as_file()).map_err(MeilisearchHttpError::DocumentFormat)
})?;
let update_file = file_store::File::from_parts(path, file);
update_file.persist()?;
Ok(documents_count)
})
.await?;
Ok(documents_count)
}
PayloadType::Json | PayloadType::Csv { delimiter: _ } => {
let temp_file = match tempfile() {
Ok(file) => file,
Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))),
};
let read_file = copy_body_to_file(temp_file, body, format).await?;
tokio::task::spawn_blocking(move || {
let documents_count = match format {
PayloadType::Json => read_json(&read_file, &mut update_file)?,
PayloadType::Csv { delimiter } => {
read_csv(&read_file, &mut update_file, delimiter)?
}
PayloadType::Ndjson => {
unreachable!("We already wrote the user content into the update file")
}
};
// we NEED to persist the file here because we moved the `update_file` into another task.
update_file.persist()?;
Ok(documents_count)
})
.await
}
let temp_file = match tempfile() {
Ok(file) => file,
Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))),
};
let async_file = File::from_std(temp_file);
let mut buffer = BufWriter::new(async_file);
let mut buffer_write_size: usize = 0;
while let Some(result) = body.next().await {
let byte = result?;
if byte.is_empty() && buffer_write_size == 0 {
return Err(MeilisearchHttpError::MissingPayload(format));
}
match buffer.write_all(&byte).await {
Ok(()) => buffer_write_size += 1,
Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))),
}
}
if let Err(e) = buffer.flush().await {
return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e))));
}
if buffer_write_size == 0 {
return Err(MeilisearchHttpError::MissingPayload(format));
}
if let Err(e) = buffer.seek(std::io::SeekFrom::Start(0)).await {
return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e))));
}
let read_file = buffer.into_inner().into_std().await;
let documents_count = tokio::task::spawn_blocking(move || {
let documents_count = match format {
PayloadType::Json => read_json(&read_file, &mut update_file)?,
PayloadType::Csv { delimiter } => read_csv(&read_file, &mut update_file, delimiter)?,
PayloadType::Ndjson => read_ndjson(&read_file, &mut update_file)?,
};
// we NEED to persist the file here because we moved the `update_file` into another task.
update_file.persist()?;
Ok(documents_count)
})
.await;
let documents_count = match documents_count {
Ok(Ok(documents_count)) => documents_count,
// in this case the file cannot possibly have been persisted.
@ -709,39 +703,6 @@ async fn document_addition(
Ok(task.into())
}
async fn copy_body_to_file(
output: std::fs::File,
mut body: Payload,
format: PayloadType,
) -> Result<std::fs::File, MeilisearchHttpError> {
let async_file = File::from_std(output);
let mut buffer = BufWriter::new(async_file);
let mut buffer_write_size: usize = 0;
while let Some(result) = body.next().await {
let byte = result?;
if byte.is_empty() && buffer_write_size == 0 {
return Err(MeilisearchHttpError::MissingPayload(format));
}
match buffer.write_all(&byte).await {
Ok(()) => buffer_write_size += 1,
Err(e) => return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e)))),
}
}
if let Err(e) = buffer.flush().await {
return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e))));
}
if buffer_write_size == 0 {
return Err(MeilisearchHttpError::MissingPayload(format));
}
if let Err(e) = buffer.seek(std::io::SeekFrom::Start(0)).await {
return Err(MeilisearchHttpError::Payload(ReceivePayload(Box::new(e))));
}
let read_file = buffer.into_inner().into_std().await;
Ok(read_file)
}
pub async fn delete_documents_batch(
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
index_uid: web::Path<String>,

View File

@ -185,8 +185,7 @@ pub async fn search(
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features();
let search_kind =
search_kind(&search_query, &index_scheduler, index_uid.to_string(), &index, features)?;
let search_kind = search_kind(&search_query, &index_scheduler, &index, features)?;
let permit = search_queue.try_get_search_permit().await?;
let search_result = tokio::task::spawn_blocking(move || {
perform_facet_search(

View File

@ -5,7 +5,7 @@ use actix_web::web::Data;
use actix_web::{web, HttpRequest, HttpResponse};
use deserr::actix_web::{AwebJson, AwebQueryParameter};
use deserr::{DeserializeError, Deserr, ValuePointerRef};
use index_scheduler::{Error, IndexScheduler};
use index_scheduler::IndexScheduler;
use meilisearch_types::deserr::query_params::Param;
use meilisearch_types::deserr::{immutable_field_error, DeserrJsonError, DeserrQueryParamError};
use meilisearch_types::error::deserr_codes::*;
@ -107,10 +107,7 @@ pub async fn list_indexes(
if !filters.is_index_authorized(uid) {
return Ok(None);
}
Ok(Some(
IndexView::new(uid.to_string(), index)
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?,
))
Ok(Some(IndexView::new(uid.to_string(), index)?))
})?;
// Won't cause all indexes to be opened because IndexView doesn't keep the `Index` opened.
let indexes: Vec<IndexView> = indexes.into_iter().flatten().collect();

View File

@ -243,19 +243,11 @@ pub async fn search_with_url_query(
let index = index_scheduler.index(&index_uid)?;
let features = index_scheduler.features();
let search_kind =
search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index, features)?;
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?;
let permit = search_queue.try_get_search_permit().await?;
let search_result = tokio::task::spawn_blocking(move || {
perform_search(
index_uid.to_string(),
&index,
query,
search_kind,
retrieve_vector,
index_scheduler.features(),
)
perform_search(&index, query, search_kind, retrieve_vector, index_scheduler.features())
})
.await;
permit.drop().await;
@ -295,20 +287,12 @@ pub async fn search_with_post(
let features = index_scheduler.features();
let search_kind =
search_kind(&query, index_scheduler.get_ref(), index_uid.to_string(), &index, features)?;
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
let permit = search_queue.try_get_search_permit().await?;
let search_result = tokio::task::spawn_blocking(move || {
perform_search(
index_uid.to_string(),
&index,
query,
search_kind,
retrieve_vectors,
index_scheduler.features(),
)
perform_search(&index, query, search_kind, retrieve_vectors, index_scheduler.features())
})
.await;
permit.drop().await;
@ -330,7 +314,6 @@ pub async fn search_with_post(
pub fn search_kind(
query: &SearchQuery,
index_scheduler: &IndexScheduler,
index_uid: String,
index: &milli::Index,
features: RoFeatures,
) -> Result<SearchKind, ResponseError> {
@ -349,7 +332,7 @@ pub fn search_kind(
(None, _, None) => Ok(SearchKind::KeywordOnly),
// hybrid.semantic_ratio == 1.0 => vector
(_, Some(HybridQuery { semantic_ratio, embedder }), v) if **semantic_ratio == 1.0 => {
SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len()))
SearchKind::semantic(index_scheduler, index, embedder, v.map(|v| v.len()))
}
// hybrid.semantic_ratio == 0.0 => keyword
(_, Some(HybridQuery { semantic_ratio, embedder: _ }), _) if **semantic_ratio == 0.0 => {
@ -357,14 +340,13 @@ pub fn search_kind(
}
// no query, hybrid, vector => semantic
(None, Some(HybridQuery { semantic_ratio: _, embedder }), Some(v)) => {
SearchKind::semantic(index_scheduler, index_uid, index, embedder, Some(v.len()))
SearchKind::semantic(index_scheduler, index, embedder, Some(v.len()))
}
// query, no hybrid, no vector => keyword
(Some(_), None, None) => Ok(SearchKind::KeywordOnly),
// query, hybrid, maybe vector => hybrid
(Some(_), Some(HybridQuery { semantic_ratio, embedder }), v) => SearchKind::hybrid(
index_scheduler,
index_uid,
index,
embedder,
**semantic_ratio,

View File

@ -103,13 +103,8 @@ async fn similar(
let index = index_scheduler.index(&index_uid)?;
let (embedder_name, embedder, quantized) = SearchKind::embedder(
&index_scheduler,
index_uid.to_string(),
&index,
&query.embedder,
None,
)?;
let (embedder_name, embedder, quantized) =
SearchKind::embedder(&index_scheduler, &index, &query.embedder, None)?;
tokio::task::spawn_blocking(move || {
perform_similar(

View File

@ -125,28 +125,14 @@ pub async fn multi_search_with_post(
})
.with_index(query_index)?;
let index_uid_str = index_uid.to_string();
let search_kind = search_kind(
&query,
index_scheduler.get_ref(),
index_uid_str.clone(),
&index,
features,
)
.with_index(query_index)?;
let search_kind =
search_kind(&query, index_scheduler.get_ref(), &index, features)
.with_index(query_index)?;
let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)
.with_index(query_index)?;
let search_result = tokio::task::spawn_blocking(move || {
perform_search(
index_uid_str.clone(),
&index,
query,
search_kind,
retrieve_vector,
features,
)
perform_search(&index, query, search_kind, retrieve_vector, features)
})
.await
.with_index(query_index)?;

View File

@ -560,8 +560,7 @@ pub fn perform_federated_search(
// use an immediately invoked lambda to capture the result without returning from the function
let res: Result<(), ResponseError> = (|| {
let search_kind =
search_kind(&query, index_scheduler, index_uid.to_string(), &index, features)?;
let search_kind = search_kind(&query, index_scheduler, &index, features)?;
let canonicalization_kind = match (&search_kind, &query.q) {
(SearchKind::SemanticOnly { .. }, _) => {
@ -637,8 +636,7 @@ pub fn perform_federated_search(
search.offset(0);
search.limit(required_hit_count);
let (result, _semantic_hit_count) =
super::search_from_kind(index_uid.to_string(), search_kind, search)?;
let (result, _semantic_hit_count) = super::search_from_kind(search_kind, search)?;
let format = AttributesFormat {
attributes_to_retrieve: query.attributes_to_retrieve,
retrieve_vectors,
@ -672,10 +670,7 @@ pub fn perform_federated_search(
let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer);
let hit_maker =
HitMaker::new(&index, &rtxn, format, formatter_builder).map_err(|e| {
MeilisearchHttpError::from_milli(e, Some(index_uid.to_string()))
})?;
let hit_maker = HitMaker::new(&index, &rtxn, format, formatter_builder)?;
results_by_query.push(SearchResultByQuery {
federation_options,

View File

@ -19,9 +19,7 @@ use meilisearch_types::locales::Locale;
use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
use meilisearch_types::milli::vector::Embedder;
use meilisearch_types::milli::{
FacetValueHit, InternalError, OrderBy, SearchForFacetValues, TimeBudget,
};
use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget};
use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
use meilisearch_types::{milli, Document};
use milli::tokenizer::{Language, TokenizerBuilder};
@ -283,38 +281,35 @@ pub enum SearchKind {
impl SearchKind {
pub(crate) fn semantic(
index_scheduler: &index_scheduler::IndexScheduler,
index_uid: String,
index: &Index,
embedder_name: &str,
vector_len: Option<usize>,
) -> Result<Self, ResponseError> {
let (embedder_name, embedder, quantized) =
Self::embedder(index_scheduler, index_uid, index, embedder_name, vector_len)?;
Self::embedder(index_scheduler, index, embedder_name, vector_len)?;
Ok(Self::SemanticOnly { embedder_name, embedder, quantized })
}
pub(crate) fn hybrid(
index_scheduler: &index_scheduler::IndexScheduler,
index_uid: String,
index: &Index,
embedder_name: &str,
semantic_ratio: f32,
vector_len: Option<usize>,
) -> Result<Self, ResponseError> {
let (embedder_name, embedder, quantized) =
Self::embedder(index_scheduler, index_uid, index, embedder_name, vector_len)?;
Self::embedder(index_scheduler, index, embedder_name, vector_len)?;
Ok(Self::Hybrid { embedder_name, embedder, quantized, semantic_ratio })
}
pub(crate) fn embedder(
index_scheduler: &index_scheduler::IndexScheduler,
index_uid: String,
index: &Index,
embedder_name: &str,
vector_len: Option<usize>,
) -> Result<(String, Arc<Embedder>, bool), ResponseError> {
let embedder_configs = index.embedding_configs(&index.read_txn()?)?;
let embedders = index_scheduler.embedders(index_uid, embedder_configs)?;
let embedders = index_scheduler.embedders(embedder_configs)?;
let (embedder, _, quantized) = embedders
.get(embedder_name)
@ -895,7 +890,6 @@ fn prepare_search<'t>(
}
pub fn perform_search(
index_uid: String,
index: &Index,
query: SearchQuery,
search_kind: SearchKind,
@ -922,7 +916,7 @@ pub fn perform_search(
used_negative_operator,
},
semantic_hit_count,
) = search_from_kind(index_uid, search_kind, search)?;
) = search_from_kind(search_kind, search)?;
let SearchQuery {
q,
@ -1075,27 +1069,17 @@ fn compute_facet_distribution_stats<S: AsRef<str>>(
}
pub fn search_from_kind(
index_uid: String,
search_kind: SearchKind,
search: milli::Search<'_>,
) -> Result<(milli::SearchResult, Option<u32>), MeilisearchHttpError> {
let (milli_result, semantic_hit_count) = match &search_kind {
SearchKind::KeywordOnly => {
let results = search
.execute()
.map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())))?;
(results, None)
}
SearchKind::KeywordOnly => (search.execute()?, None),
SearchKind::SemanticOnly { .. } => {
let results = search
.execute()
.map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid.to_string())))?;
let results = search.execute()?;
let semantic_hit_count = results.document_scores.len() as u32;
(results, Some(semantic_hit_count))
}
SearchKind::Hybrid { semantic_ratio, .. } => search
.execute_hybrid(*semantic_ratio)
.map_err(|e| MeilisearchHttpError::from_milli(e, Some(index_uid)))?,
SearchKind::Hybrid { semantic_ratio, .. } => search.execute_hybrid(*semantic_ratio)?,
};
Ok((milli_result, semantic_hit_count))
}
@ -1197,7 +1181,7 @@ impl<'a> HitMaker<'a> {
rtxn: &'a RoTxn<'a>,
format: AttributesFormat,
mut formatter_builder: MatcherBuilder<'a>,
) -> milli::Result<Self> {
) -> Result<Self, MeilisearchHttpError> {
formatter_builder.crop_marker(format.crop_marker);
formatter_builder.highlight_prefix(format.highlight_pre_tag);
formatter_builder.highlight_suffix(format.highlight_post_tag);
@ -1292,7 +1276,11 @@ impl<'a> HitMaker<'a> {
})
}
pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> {
pub fn make_hit(
&self,
id: u32,
score: &[ScoreDetails],
) -> Result<SearchHit, MeilisearchHttpError> {
let (_, obkv) =
self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?;
@ -1335,10 +1323,7 @@ impl<'a> HitMaker<'a> {
.is_some_and(|conf| conf.user_provided.contains(id));
let embeddings =
ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
vectors.insert(
name,
serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?,
);
vectors.insert(name, serde_json::to_value(embeddings)?);
}
document.insert("_vectors".into(), vectors.into());
}
@ -1384,7 +1369,7 @@ fn make_hits<'a>(
format: AttributesFormat,
matching_words: milli::MatchingWords,
documents_ids_scores: impl Iterator<Item = (u32, &'a Vec<ScoreDetails>)> + 'a,
) -> milli::Result<Vec<SearchHit>> {
) -> Result<Vec<SearchHit>, MeilisearchHttpError> {
let mut documents = Vec::new();
let dictionary = index.dictionary(rtxn)?;
@ -1712,12 +1697,12 @@ fn make_document(
displayed_attributes: &BTreeSet<FieldId>,
field_ids_map: &FieldsIdsMap,
obkv: &obkv::KvReaderU16,
) -> milli::Result<Document> {
) -> Result<Document, MeilisearchHttpError> {
let mut document = serde_json::Map::new();
// recreate the original json
for (key, value) in obkv.iter() {
let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?;
let value = serde_json::from_slice(value)?;
let key = field_ids_map.name(key).expect("Missing field name").to_string();
document.insert(key, value);
@ -1742,7 +1727,7 @@ fn format_fields(
displayable_ids: &BTreeSet<FieldId>,
locales: Option<&[Language]>,
localized_attributes: &[LocalizedAttributesRule],
) -> milli::Result<(Option<MatchesPosition>, Document)> {
) -> Result<(Option<MatchesPosition>, Document), MeilisearchHttpError> {
let mut matches_position = compute_matches.then(BTreeMap::new);
let mut document = document.clone();
@ -1920,7 +1905,7 @@ fn parse_filter_array(arr: &[Value]) -> Result<Option<Filter>, MeilisearchHttpEr
}
}
Filter::from_array(ands).map_err(|e| MeilisearchHttpError::from_milli(e, None))
Ok(Filter::from_array(ands)?)
}
#[cfg(test)]

View File

@ -284,7 +284,6 @@ async fn test_summarized_document_addition_or_update() {
@r#"
{
"uid": 0,
"progress": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 1
@ -315,7 +314,6 @@ async fn test_summarized_document_addition_or_update() {
@r#"
{
"uid": 1,
"progress": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 1
@ -351,7 +349,6 @@ async fn test_summarized_delete_documents_by_batch() {
@r#"
{
"uid": 0,
"progress": null,
"details": {
"providedIds": 3,
"deletedDocuments": 0
@ -383,7 +380,6 @@ async fn test_summarized_delete_documents_by_batch() {
@r#"
{
"uid": 2,
"progress": null,
"details": {
"providedIds": 1,
"deletedDocuments": 0
@ -420,7 +416,6 @@ async fn test_summarized_delete_documents_by_filter() {
@r#"
{
"uid": 0,
"progress": null,
"details": {
"providedIds": 0,
"deletedDocuments": 0,
@ -453,7 +448,6 @@ async fn test_summarized_delete_documents_by_filter() {
@r#"
{
"uid": 2,
"progress": null,
"details": {
"providedIds": 0,
"deletedDocuments": 0,
@ -486,7 +480,6 @@ async fn test_summarized_delete_documents_by_filter() {
@r#"
{
"uid": 4,
"progress": null,
"details": {
"providedIds": 0,
"deletedDocuments": 0,
@ -523,7 +516,6 @@ async fn test_summarized_delete_document_by_id() {
@r#"
{
"uid": 0,
"progress": null,
"details": {
"providedIds": 1,
"deletedDocuments": 0
@ -555,7 +547,6 @@ async fn test_summarized_delete_document_by_id() {
@r#"
{
"uid": 2,
"progress": null,
"details": {
"providedIds": 1,
"deletedDocuments": 0
@ -603,7 +594,6 @@ async fn test_summarized_settings_update() {
@r#"
{
"uid": 0,
"progress": null,
"details": {
"displayedAttributes": [
"doggos",
@ -648,7 +638,6 @@ async fn test_summarized_index_creation() {
@r#"
{
"uid": 0,
"progress": null,
"details": {},
"stats": {
"totalNbTasks": 1,
@ -676,7 +665,6 @@ async fn test_summarized_index_creation() {
@r#"
{
"uid": 1,
"progress": null,
"details": {
"primaryKey": "doggos"
},
@ -821,7 +809,6 @@ async fn test_summarized_index_update() {
@r#"
{
"uid": 0,
"progress": null,
"details": {},
"stats": {
"totalNbTasks": 1,
@ -849,7 +836,6 @@ async fn test_summarized_index_update() {
@r#"
{
"uid": 1,
"progress": null,
"details": {
"primaryKey": "bones"
},
@ -882,7 +868,6 @@ async fn test_summarized_index_update() {
@r#"
{
"uid": 3,
"progress": null,
"details": {},
"stats": {
"totalNbTasks": 1,
@ -910,7 +895,6 @@ async fn test_summarized_index_update() {
@r#"
{
"uid": 4,
"progress": null,
"details": {
"primaryKey": "bones"
},
@ -948,7 +932,6 @@ async fn test_summarized_index_swap() {
@r#"
{
"uid": 0,
"progress": null,
"details": {
"swaps": [
{
@ -989,7 +972,6 @@ async fn test_summarized_index_swap() {
@r#"
{
"uid": 3,
"progress": null,
"details": {
"swaps": [
{
@ -1032,7 +1014,6 @@ async fn test_summarized_batch_cancelation() {
@r#"
{
"uid": 1,
"progress": null,
"details": {
"matchedTasks": 1,
"canceledTasks": 0,
@ -1070,7 +1051,6 @@ async fn test_summarized_batch_deletion() {
@r#"
{
"uid": 1,
"progress": null,
"details": {
"matchedTasks": 1,
"deletedTasks": 1,
@ -1104,7 +1084,6 @@ async fn test_summarized_dump_creation() {
@r#"
{
"uid": 0,
"progress": null,
"details": {
"dumpUid": "[dumpUid]"
},

View File

@ -1264,18 +1264,15 @@ async fn error_add_documents_bad_document_id() {
let server = Server::new().await;
let index = server.index("test");
index.create(Some("docid")).await;
// unsupported characters
let documents = json!([
{
"docid": "foo & bar",
"content": "foobar"
}
]);
let (value, _code) = index.add_documents(documents, None).await;
index.wait_task(value.uid()).await;
let (response, code) = index.get_task(value.uid()).await;
index.add_documents(documents, None).await;
index.wait_task(1).await;
let (response, code) = index.get_task(1).await;
snapshot!(code, @"200 OK");
snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@r###"
@ -1291,81 +1288,7 @@ async fn error_add_documents_bad_document_id() {
"indexedDocuments": 0
},
"error": {
"message": "Document identifier `\"foo & bar\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.",
"code": "invalid_document_id",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_id"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// More than 512 bytes
let documents = json!([
{
"docid": "a".repeat(600),
"content": "foobar"
}
]);
let (value, _code) = index.add_documents(documents, None).await;
index.wait_task(value.uid()).await;
let (response, code) = index.get_task(value.uid()).await;
snapshot!(code, @"200 OK");
snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@r###"
{
"uid": 2,
"batchUid": 2,
"indexUid": "test",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Document identifier `\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.",
"code": "invalid_document_id",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_id"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Exactly 512 bytes
let documents = json!([
{
"docid": "a".repeat(512),
"content": "foobar"
}
]);
let (value, _code) = index.add_documents(documents, None).await;
index.wait_task(value.uid()).await;
let (response, code) = index.get_task(value.uid()).await;
snapshot!(code, @"200 OK");
snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
@r###"
{
"uid": 3,
"batchUid": 3,
"indexUid": "test",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Document identifier `\"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.",
"message": "Document identifier `\"foo & bar\"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.",
"code": "invalid_document_id",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_id"
@ -1758,7 +1681,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: The `_geo` field in the document with the id: `\"11\"` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.",
"message": "The `_geo` field in the document with the id: `\"11\"` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `\"foobar\"`.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -1796,7 +1719,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.",
"message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -1834,7 +1757,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.",
"message": "Could not find latitude nor longitude in the document with the id: `\"11\"`. Was expecting `_geo.lat` and `_geo.lng` fields.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -1872,7 +1795,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.",
"message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -1910,7 +1833,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.",
"message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -1948,7 +1871,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.",
"message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -1986,7 +1909,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.",
"message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -2024,7 +1947,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `false` and `true`.",
"message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `false` and `true`.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -2062,7 +1985,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.",
"message": "Could not find longitude in the document with the id: `\"11\"`. Was expecting a `_geo.lng` field.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -2100,7 +2023,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.",
"message": "Could not find latitude in the document with the id: `\"11\"`. Was expecting a `_geo.lat` field.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -2138,7 +2061,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.",
"message": "Could not parse latitude nor longitude in the document with the id: `\"11\"`. Was expecting finite numbers but instead got `\"doggo\"` and `\"doggo\"`.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -2176,7 +2099,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: The `_geo` field in the document with the id: `\"11\"` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.",
"message": "The `_geo` field in the document with the id: `\"11\"` contains the following unexpected fields: `{\"doggo\":\"are the best\"}`.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -2215,7 +2138,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not parse longitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.",
"message": "Could not parse longitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -2252,7 +2175,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not parse latitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.",
"message": "Could not parse latitude in the document with the id: `\"12\"`. Was expecting a finite number but instead got `null`.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -2289,7 +2212,7 @@ async fn add_documents_invalid_geo_field() {
"indexedDocuments": 0
},
"error": {
"message": "Index `test`: Could not parse latitude nor longitude in the document with the id: `\"13\"`. Was expecting finite numbers but instead got `null` and `null`.",
"message": "Could not parse latitude nor longitude in the document with the id: `\"13\"`. Was expecting finite numbers but instead got `null` and `null`.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
@ -2356,7 +2279,7 @@ async fn add_invalid_geo_and_then_settings() {
]
},
"error": {
"message": "Index `test`: Could not parse latitude in the document with the id: `\"11\"`. Was expecting a finite number but instead got `null`.",
"message": "Could not parse latitude in the document with the id: `\"11\"`. Was expecting a finite number but instead got `null`.",
"code": "invalid_document_geo_field",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"

View File

@ -604,7 +604,7 @@ async fn delete_document_by_filter() {
"originalFilter": "\"doggo = bernese\""
},
"error": {
"message": "Index `EMPTY_INDEX`: Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese",
"message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese",
"code": "invalid_document_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_filter"
@ -636,7 +636,7 @@ async fn delete_document_by_filter() {
"originalFilter": "\"catto = jorts\""
},
"error": {
"message": "Index `SHARED_DOCUMENTS`: Attribute `catto` is not filterable. Available filterable attributes are: `id`, `title`.\n1:6 catto = jorts",
"message": "Attribute `catto` is not filterable. Available filterable attributes are: `id`, `title`.\n1:6 catto = jorts",
"code": "invalid_document_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_filter"

View File

@ -172,7 +172,7 @@ async fn error_update_documents_bad_document_id() {
assert_eq!(
response["error"]["message"],
json!(
r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes."#
r#"Document identifier `"foo & bar"` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes."#
)
);
assert_eq!(response["error"]["code"], json!("invalid_document_id"));

View File

@ -95,7 +95,7 @@ async fn error_update_existing_primary_key() {
let response = index.wait_task(2).await;
let expected_response = json!({
"message": "Index `test`: Index already has a primary key: `id`.",
"message": "Index already has a primary key: `id`.",
"code": "index_primary_key_already_exists",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#index_primary_key_already_exists"

View File

@ -711,7 +711,7 @@ async fn filter_invalid_attribute_array() {
index.wait_task(task.uid()).await;
let expected_response = json!({
"message": format!("Index `{}`: Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", index.uid),
"message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass",
"code": "invalid_search_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
@ -733,7 +733,7 @@ async fn filter_invalid_attribute_string() {
index.wait_task(task.uid()).await;
let expected_response = json!({
"message": format!("Index `{}`: Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass", index.uid),
"message": "Attribute `many` is not filterable. Available filterable attributes are: `title`.\n1:5 many = Glass",
"code": "invalid_search_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
@ -940,7 +940,7 @@ async fn sort_unsortable_attribute() {
index.wait_task(response.uid()).await.succeeded();
let expected_response = json!({
"message": format!("Index `{}`: Attribute `title` is not sortable. Available sortable attributes are: `id`.", index.uid),
"message": "Attribute `title` is not sortable. Available sortable attributes are: `id`.",
"code": "invalid_search_sort",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_sort"
@ -998,7 +998,7 @@ async fn sort_unset_ranking_rule() {
index.wait_task(response.uid()).await.succeeded();
let expected_response = json!({
"message": format!("Index `{}`: You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.", index.uid),
"message": "You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.",
"code": "invalid_search_sort",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_sort"
@ -1024,18 +1024,19 @@ async fn search_on_unknown_field() {
index.update_settings_searchable_attributes(json!(["id", "title"])).await;
index.wait_task(response.uid()).await.succeeded();
let expected_response = json!({
"message": format!("Index `{}`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", index.uid),
"code": "invalid_search_attributes_to_search_on",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on"
});
index
.search(
json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown"]}),
|response, code| {
assert_eq!(response, expected_response);
assert_eq!(code, 400);
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.",
"code": "invalid_search_attributes_to_search_on",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on"
}
"###);
},
)
.await;
@ -1049,18 +1050,19 @@ async fn search_on_unknown_field_plus_joker() {
index.update_settings_searchable_attributes(json!(["id", "title"])).await;
index.wait_task(response.uid()).await.succeeded();
let expected_response = json!({
"message": format!("Index `{}`: Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.", index.uid),
"code": "invalid_search_attributes_to_search_on",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on"
});
index
.search(
json!({"q": "Captain Marvel", "attributesToSearchOn": ["*", "unknown"]}),
|response, code| {
assert_eq!(response, expected_response);
assert_eq!(code, 400);
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.",
"code": "invalid_search_attributes_to_search_on",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on"
}
"###);
},
)
.await;
@ -1069,8 +1071,15 @@ async fn search_on_unknown_field_plus_joker() {
.search(
json!({"q": "Captain Marvel", "attributesToSearchOn": ["unknown", "*"]}),
|response, code| {
assert_eq!(response, expected_response);
assert_eq!(code, 400);
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Attribute `unknown` is not searchable. Available searchable attributes are: `id, title`.",
"code": "invalid_search_attributes_to_search_on",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_search_on"
}
"###);
},
)
.await;
@ -1083,44 +1092,47 @@ async fn distinct_at_search_time() {
let (task, _) = index.create(None).await;
index.wait_task(task.uid()).await.succeeded();
let expected_response = json!({
"message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.", index.uid),
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
});
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
assert_eq!(response, expected_response);
assert_eq!(code, 400);
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await;
index.wait_task(task.uid()).await;
let expected_response = json!({
"message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.", index.uid),
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
});
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
assert_eq!(response, expected_response);
assert_eq!(code, 400);
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await;
index.wait_task(task.uid()).await;
let expected_response = json!({
"message": format!("Index `{}`: Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, <..hidden-attributes>`.", index.uid),
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
});
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
assert_eq!(response, expected_response);
assert_eq!(code, 400);
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, <..hidden-attributes>`.",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": true})).await;

View File

@ -57,116 +57,6 @@ async fn simple_facet_search() {
assert_eq!(response["facetHits"].as_array().unwrap().len(), 1);
}
#[actix_rt::test]
async fn simple_facet_search_on_movies() {
let server = Server::new().await;
let index = server.index("test");
let documents = json!([
{
"id": 1,
"title": "Carol",
"genres": [
"Romance",
"Drama"
],
"color": [
"red"
],
"platforms": [
"MacOS",
"Linux",
"Windows"
]
},
{
"id": 2,
"title": "Wonder Woman",
"genres": [
"Action",
"Adventure"
],
"color": [
"green"
],
"platforms": [
"MacOS"
]
},
{
"id": 3,
"title": "Life of Pi",
"genres": [
"Adventure",
"Drama"
],
"color": [
"blue"
],
"platforms": [
"Windows"
]
},
{
"id": 4,
"title": "Mad Max: Fury Road",
"genres": [
"Adventure",
"Science Fiction"
],
"color": [
"red"
],
"platforms": [
"MacOS",
"Linux"
]
},
{
"id": 5,
"title": "Moana",
"genres": [
"Fantasy",
"Action"
],
"color": [
"red"
],
"platforms": [
"Windows"
]
},
{
"id": 6,
"title": "Philadelphia",
"genres": [
"Drama"
],
"color": [
"blue"
],
"platforms": [
"MacOS",
"Linux",
"Windows"
]
}
]);
let (response, code) =
index.update_settings_filterable_attributes(json!(["genres", "color"])).await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await;
let (response, _code) = index.add_documents(documents, None).await;
index.wait_task(response.uid()).await;
let (response, code) =
index.facet_search(json!({"facetQuery": "", "facetName": "genres", "q": "" })).await;
assert_eq!(code, 200, "{}", response);
snapshot!(response["facetHits"], @r###"[{"value":"Action","count":2},{"value":"Adventure","count":3},{"value":"Drama","count":3},{"value":"Fantasy","count":1},{"value":"Romance","count":1},{"value":"Science Fiction","count":1}]"###);
}
#[actix_rt::test]
async fn advanced_facet_search() {
let server = Server::new().await;

View File

@ -1070,7 +1070,7 @@ async fn federation_one_query_error() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Inside `.queries[1]`: Index `nested`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto",
"message": "Inside `.queries[1]`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto",
"code": "invalid_search_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
@ -1102,7 +1102,7 @@ async fn federation_one_query_sort_error() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.",
"message": "Inside `.queries[1]`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.",
"code": "invalid_search_sort",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_sort"
@ -1166,7 +1166,7 @@ async fn federation_multiple_query_errors() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Inside `.queries[0]`: Index `test`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto",
"message": "Inside `.queries[0]`: Attribute `title` is not filterable. This index does not have configured filterable attributes.\n1:6 title = toto",
"code": "invalid_search_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
@ -1198,7 +1198,7 @@ async fn federation_multiple_query_sort_errors() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Inside `.queries[0]`: Index `test`: Attribute `title` is not sortable. This index does not have configured sortable attributes.",
"message": "Inside `.queries[0]`: Attribute `title` is not sortable. This index does not have configured sortable attributes.",
"code": "invalid_search_sort",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_sort"
@ -1231,7 +1231,7 @@ async fn federation_multiple_query_errors_interleaved() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not filterable. This index does not have configured filterable attributes.\n1:7 doggos IN [intel, kefir]",
"message": "Inside `.queries[1]`: Attribute `doggos` is not filterable. This index does not have configured filterable attributes.\n1:7 doggos IN [intel, kefir]",
"code": "invalid_search_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
@ -1264,7 +1264,7 @@ async fn federation_multiple_query_sort_errors_interleaved() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Inside `.queries[1]`: Index `nested`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.",
"message": "Inside `.queries[1]`: Attribute `doggos` is not sortable. This index does not have configured sortable attributes.",
"code": "invalid_search_sort",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_sort"

View File

@ -79,7 +79,7 @@ async fn similar_bad_id() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.",
"message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.",
"code": "invalid_similar_id",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_id"
@ -172,7 +172,7 @@ async fn similar_invalid_id() {
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 511 bytes.",
"message": "Invalid value at `.id`: the value of `id` is invalid. A document identifier can be of type integer or string, only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), and can not be more than 512 bytes.",
"code": "invalid_similar_id",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_id"

View File

@ -448,7 +448,7 @@ async fn test_summarized_delete_documents_by_filter() {
"originalFilter": "\"doggo = bernese\""
},
"error": {
"message": "Index `test`: Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese",
"message": "Attribute `doggo` is not filterable. This index does not have configured filterable attributes.\n1:6 doggo = bernese",
"code": "invalid_document_filter",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_filter"

View File

@ -318,7 +318,7 @@ async fn try_to_disable_binary_quantization() {
}
},
"error": {
"message": "Index `doggo`: `.embedders.manual.binaryQuantized`: Cannot disable the binary quantization.\n - Note: Binary quantization is a lossy operation that cannot be reverted.\n - Hint: Add a new embedder that is non-quantized and regenerate the vectors.",
"message": "`.embedders.manual.binaryQuantized`: Cannot disable the binary quantization.\n - Note: Binary quantization is a lossy operation that cannot be reverted.\n - Hint: Add a new embedder that is non-quantized and regenerate the vectors.",
"code": "invalid_settings_embedders",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"

View File

@ -250,7 +250,7 @@ async fn user_provided_embeddings_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`",
"message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@ -280,7 +280,7 @@ async fn user_provided_embeddings_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`",
"message": "Bad embedder configuration in the document with id: `0`. Missing field `._vectors.manual.regenerate`\n - note: `._vectors.manual` must be an array of floats, an array of arrays of floats, or an object with field `regenerate`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@ -311,7 +311,7 @@ async fn user_provided_embeddings_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Could not parse `._vectors.manual.regenerate`: invalid type: string \"yes please\", expected a boolean at line 1 column 26",
"message": "Bad embedder configuration in the document with id: `0`. Could not parse `._vectors.manual.regenerate`: invalid type: string \"yes please\", expected a boolean at line 1 column 26",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@ -340,7 +340,7 @@ async fn user_provided_embeddings_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings`: expected null or an array, but found a boolean: `true`",
"message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings`: expected null or an array, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@ -369,7 +369,7 @@ async fn user_provided_embeddings_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`",
"message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@ -398,7 +398,7 @@ async fn user_provided_embeddings_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`",
"message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@ -440,7 +440,7 @@ async fn user_provided_embeddings_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`",
"message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@ -469,7 +469,7 @@ async fn user_provided_embeddings_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected an array, but found a number: `0.3`",
"message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[1]`: expected an array, but found a number: `0.3`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@ -498,7 +498,7 @@ async fn user_provided_embeddings_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`",
"message": "Bad embedder configuration in the document with id: `0`. Invalid value type at `._vectors.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
@ -539,7 +539,7 @@ async fn user_provided_vectors_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `40` and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`",
"message": "While embedding documents for embedder `manual`: no vectors provided for document `40` and at least 4 other document(s)\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: opt-out for a document with `_vectors.manual: null`",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -569,7 +569,7 @@ async fn user_provided_vectors_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).",
"message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vector` by `_vectors` in 1 document(s).",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -599,7 +599,7 @@ async fn user_provided_vectors_error() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).",
"message": "While embedding documents for embedder `manual`: no vectors provided for document `42`\n- Note: `manual` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.manual`.\n- Hint: try replacing `_vectors.manaul` by `_vectors.manual` in 1 document(s).",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"

View File

@ -713,7 +713,7 @@ async fn bad_api_key() {
}
},
"error": {
"message": "Index `doggo`: While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"Incorrect API key provided: Bearer doggo. You can find your API key at https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":\"invalid_api_key\"}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables",
"message": "While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"Incorrect API key provided: Bearer doggo. You can find your API key at https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":\"invalid_api_key\"}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -757,7 +757,7 @@ async fn bad_api_key() {
}
},
"error": {
"message": "Index `doggo`: While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. You can obtain an API key from https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":null}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables",
"message": "While embedding documents for embedder `default`: user error: could not authenticate against OpenAI server\n - server replied with `{\"error\":{\"message\":\"You didn't provide an API key. You need to provide your API key in an Authorization header using Bearer auth (i.e. Authorization: Bearer YOUR_KEY), or as the password field (with blank username) if you're accessing the API from your browser and are prompted for a username and password. You can obtain an API key from https://platform.openai.com/account/api-keys.\",\"type\":\"invalid_request_error\",\"param\":null,\"code\":null}}`\n - Hint: Check the `apiKey` parameter in the embedder configuration, and the `MEILI_OPENAI_API_KEY` and `OPENAI_API_KEY` environment variables",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"

View File

@ -985,7 +985,7 @@ async fn bad_settings() {
}
},
"error": {
"message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting a single \"{{embedding}}\", expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence",
"message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting a single \"{{embedding}}\", expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1025,7 +1025,7 @@ async fn bad_settings() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: While embedding documents for embedder `rest`: runtime error: was expecting embeddings of dimension `2`, got embeddings of dimensions `3`",
"message": "While embedding documents for embedder `rest`: runtime error: was expecting embeddings of dimension `2`, got embeddings of dimensions `3`",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1178,7 +1178,7 @@ async fn server_returns_bad_request() {
}
},
"error": {
"message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"test\\\", expected struct MultipleRequest at line 1 column 6\"}`",
"message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"test\\\", expected struct MultipleRequest at line 1 column 6\"}`",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1247,7 +1247,7 @@ async fn server_returns_bad_request() {
"indexedDocuments": 0
},
"error": {
"message": "Index `doggo`: While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`",
"message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1306,7 +1306,7 @@ async fn server_returns_bad_response() {
}
},
"error": {
"message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting the array of \"{{embedding}}\"s, configuration expects `response` to be an array with at least 1 item(s) but server sent an object with 1 field(s)",
"message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting the array of \"{{embedding}}\"s, configuration expects `response` to be an array with at least 1 item(s) but server sent an object with 1 field(s)",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1362,7 +1362,7 @@ async fn server_returns_bad_response() {
}
},
"error": {
"message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting item #0 from the array of \"{{embedding}}\"s, expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence",
"message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response`, while extracting item #0 from the array of \"{{embedding}}\"s, expected `response` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected a sequence",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1414,7 +1414,7 @@ async fn server_returns_bad_response() {
}
},
"error": {
"message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output`, while extracting a single \"{{embedding}}\", expected `output` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected f32",
"message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output`, while extracting a single \"{{embedding}}\", expected `output` to be an array of numbers, but failed to parse server response:\n - invalid type: map, expected f32",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1478,7 +1478,7 @@ async fn server_returns_bad_response() {
}
},
"error": {
"message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.embedding`, while extracting item #0 from the array of \"{{embedding}}\"s, configuration expects `embedding` to be an object with key `data` but server sent an array of size 3",
"message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.embedding`, while extracting item #0 from the array of \"{{embedding}}\"s, configuration expects `embedding` to be an object with key `data` but server sent an array of size 3",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1542,7 +1542,7 @@ async fn server_returns_bad_response() {
}
},
"error": {
"message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output[0]`, while extracting a single \"{{embedding}}\", configuration expects key \"embeddings\", which is missing in response\n - Hint: item #0 inside `output` has key `embedding`, did you mean `response.output[0].embedding` in embedder configuration?",
"message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with runtime error: error extracting embeddings from the response:\n - in `response.output[0]`, while extracting a single \"{{embedding}}\", configuration expects key \"embeddings\", which is missing in response\n - Hint: item #0 inside `output` has key `embedding`, did you mean `response.output[0].embedding` in embedder configuration?",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1908,7 +1908,7 @@ async fn server_custom_header() {
}
},
"error": {
"message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration",
"message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"missing header 'my-nonstandard-auth'\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -1951,7 +1951,7 @@ async fn server_custom_header() {
}
},
"error": {
"message": "Index `doggo`: Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration",
"message": "Error while generating embeddings: runtime error: could not determine model dimensions:\n - test embedding failed with user error: could not authenticate against embedding server\n - server replied with `{\"error\":\"thou shall not pass, Balrog\"}`\n - Hint: Check the `apiKey` parameter in the embedder configuration",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"
@ -2099,7 +2099,7 @@ async fn searchable_reindex() {
]
},
"error": {
"message": "Index `doggo`: While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`",
"message": "While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`",
"code": "vector_embedding_error",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#vector_embedding_error"

View File

@ -10,15 +10,12 @@ license.workspace = true
[dependencies]
anyhow = "1.0.86"
arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" }
clap = { version = "4.5.9", features = ["derive"] }
dump = { path = "../dump" }
file-store = { path = "../file-store" }
indexmap = {version = "2.7.0", features = ["serde"]}
meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
serde = { version = "1.0.209", features = ["derive"] }
serde_json = {version = "1.0.133", features = ["preserve_order"]}
tempfile = "3.14.0"
time = { version = "0.3.36", features = ["formatting", "parsing", "alloc"] }
uuid = { version = "1.10.0", features = ["v4"], default-features = false }
arroy_v04_to_v05 = { package = "arroy", git = "https://github.com/meilisearch/arroy/", tag = "DO-NOT-DELETE-upgrade-v04-to-v05" }

View File

@ -73,7 +73,7 @@ enum Command {
///
/// Supported upgrade paths:
///
/// - v1.9.x -> v1.10.x -> v1.11.x -> v1.12.x
/// - v1.9.x -> v1.10.x -> v1.11.x
OfflineUpgrade {
#[arg(long)]
target_version: String,

View File

@ -1,14 +1,13 @@
mod v1_10;
mod v1_11;
mod v1_12;
mod v1_9;
use std::path::{Path, PathBuf};
use anyhow::{bail, Context};
use meilisearch_types::versioning::create_version_file;
use v1_10::v1_9_to_v1_10;
use v1_12::v1_11_to_v1_12;
use crate::upgrade::v1_11::v1_10_to_v1_11;
@ -23,7 +22,6 @@ impl OfflineUpgrade {
let upgrade_list = [
(v1_9_to_v1_10 as fn(&Path) -> Result<(), anyhow::Error>, "1", "10", "0"),
(v1_10_to_v1_11, "1", "11", "0"),
(v1_11_to_v1_12, "1", "12", "0"),
];
let (current_major, current_minor, current_patch) = &self.current_version;
@ -35,7 +33,6 @@ impl OfflineUpgrade {
) {
("1", "9", _) => 0,
("1", "10", _) => 1,
("1", "11", _) => 2,
_ => {
bail!("Unsupported current version {current_major}.{current_minor}.{current_patch}. Can only upgrade from v1.9 and v1.10")
}
@ -46,7 +43,6 @@ impl OfflineUpgrade {
let ends_at = match (target_major.as_str(), target_minor.as_str(), target_patch.as_str()) {
("1", "10", _) => 0,
("1", "11", _) => 1,
("1", "12", _) => 2,
(major, _, _) if major.starts_with('v') => {
bail!("Target version must not starts with a `v`. Instead of writing `v1.9.0` write `1.9.0` for example.")
}
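This hunk is where the upgrade dispatch gains or loses its v1.12 entry: the current version selects the first migration to run, the target version selects the last one, and every migration in between runs in order. A minimal standalone sketch of that slicing logic (the function names and version tuples are placeholders, not the real migrations):
use std::path::Path;

fn v1_9_to_v1_10(_db_path: &Path) -> anyhow::Result<()> { Ok(()) }
fn v1_10_to_v1_11(_db_path: &Path) -> anyhow::Result<()> { Ok(()) }

fn offline_upgrade(db_path: &Path, current: (u32, u32), target: (u32, u32)) -> anyhow::Result<()> {
    // Each entry upgrades the database *to* the version named next to it.
    let upgrade_list: [(fn(&Path) -> anyhow::Result<()>, &str); 2] =
        [(v1_9_to_v1_10, "1.10.0"), (v1_10_to_v1_11, "1.11.0")];

    // The current version picks the first migration to run, the target the last one.
    let starts_at = match current {
        (1, 9) => 0,
        (1, 10) => 1,
        _ => anyhow::bail!("unsupported current version"),
    };
    let ends_at = match target {
        (1, 10) => 0,
        (1, 11) => 1,
        _ => anyhow::bail!("unsupported target version"),
    };

    for (upgrade, to_version) in &upgrade_list[starts_at..=ends_at] {
        println!("upgrading to v{to_version}");
        upgrade(db_path)?;
    }
    Ok(())
}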

View File

@ -1,13 +1,18 @@
use anyhow::bail;
use std::path::Path;
use anyhow::{bail, Context};
use meilisearch_types::heed::types::{SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified};
use meilisearch_types::milli::index::{db_name, main_key};
use anyhow::Context;
use meilisearch_types::{
heed::{
types::{SerdeJson, Str},
Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified,
},
milli::index::{db_name, main_key},
};
use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
use super::v1_9;
use crate::uuid_codec::UuidCodec;
use crate::{try_opening_database, try_opening_poly_database};
pub type FieldDistribution = std::collections::BTreeMap<String, u64>;

View File

@ -7,12 +7,12 @@
use std::path::Path;
use anyhow::Context;
use meilisearch_types::heed::types::Str;
use meilisearch_types::heed::{Database, EnvOpenOptions};
use meilisearch_types::milli::index::db_name;
use meilisearch_types::{
heed::{types::Str, Database, EnvOpenOptions},
milli::index::db_name,
};
use crate::uuid_codec::UuidCodec;
use crate::{try_opening_database, try_opening_poly_database};
use crate::{try_opening_database, try_opening_poly_database, uuid_codec::UuidCodec};
pub fn v1_10_to_v1_11(db_path: &Path) -> anyhow::Result<()> {
println!("Upgrading from v1.10.0 to v1.11.0");

View File

@ -1,79 +0,0 @@
//! The breaking changes that happened between the v1.11 and the v1.12 are:
//! - The new indexer changed the update files format from OBKV to ndjson. https://github.com/meilisearch/meilisearch/pull/4900
use std::io::BufWriter;
use std::path::Path;
use anyhow::Context;
use file_store::FileStore;
use indexmap::IndexMap;
use meilisearch_types::milli::documents::DocumentsBatchReader;
use serde_json::value::RawValue;
use tempfile::NamedTempFile;
pub fn v1_11_to_v1_12(db_path: &Path) -> anyhow::Result<()> {
println!("Upgrading from v1.11.0 to v1.12.0");
convert_update_files(db_path)?;
Ok(())
}
/// Convert the update files from OBKV to ndjson format.
///
/// 1) List all the update files using the file store.
/// 2) For each update file, read the update file into a DocumentsBatchReader.
/// 3) For each document in the update file, convert the document to a JSON object.
/// 4) Write the JSON object to a tmp file in the update files directory.
/// 5) Persist the tmp file replacing the old update file.
fn convert_update_files(db_path: &Path) -> anyhow::Result<()> {
let update_files_dir_path = db_path.join("update_files");
let file_store = FileStore::new(&update_files_dir_path).with_context(|| {
format!("while creating file store for update files dir {update_files_dir_path:?}")
})?;
for uuid in file_store.all_uuids().context("while retrieving uuids from file store")? {
let uuid = uuid.context("while retrieving uuid from file store")?;
let update_file_path = file_store.get_update_path(uuid);
let update_file = file_store
.get_update(uuid)
.with_context(|| format!("while getting update file for uuid {uuid:?}"))?;
let mut file =
NamedTempFile::new_in(&update_files_dir_path).map(BufWriter::new).with_context(
|| format!("while creating bufwriter for update file {update_file_path:?}"),
)?;
let reader = DocumentsBatchReader::from_reader(update_file).with_context(|| {
format!("while creating documents batch reader for update file {update_file_path:?}")
})?;
let (mut cursor, index) = reader.into_cursor_and_fields_index();
while let Some(document) = cursor.next_document().with_context(|| {
format!(
"while reading documents from batch reader for update file {update_file_path:?}"
)
})? {
let mut json_document = IndexMap::new();
for (fid, value) in document {
let field_name = index
.name(fid)
.with_context(|| format!("while getting field name for fid {fid} for update file {update_file_path:?}"))?;
let value: &RawValue = serde_json::from_slice(value)?;
json_document.insert(field_name, value);
}
serde_json::to_writer(&mut file, &json_document)?;
}
let file = file.into_inner().map_err(|e| e.into_error()).context(format!(
"while flushing update file bufwriter for update file {update_file_path:?}"
))?;
let _ = file
// atomically replace the obkv file with the rewritten NDJSON file
.persist(&update_file_path)
.with_context(|| format!("while persisting update file {update_file_path:?}"))?;
}
Ok(())
}
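For reference, the deleted converter above turns each OBKV document into one JSON object per line. Assuming an update file holding two documents with `id` and `title` fields, the rewritten ndjson payload would look roughly like:
{"id":1,"title":"Carol"}
{"id":2,"title":"Wonder Woman"}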

View File

@ -91,8 +91,8 @@ ureq = { version = "2.10.0", features = ["json"] }
url = "2.5.2"
rayon-par-bridge = "0.1.0"
hashbrown = "0.15.0"
raw-collections = { git = "https://github.com/meilisearch/raw-collections.git", version = "0.1.0" }
bumpalo = "3.16.0"
bumparaw-collections = "0.1.2"
thread_local = "1.1.8"
allocator-api2 = "0.2.18"
rustc-hash = "2.0.0"

View File

@ -280,7 +280,7 @@ fn starts_with(selector: &str, key: &str) -> bool {
pub fn validate_document_id_str(document_id: &str) -> Option<&str> {
if document_id.is_empty()
|| document_id.len() >= 512
|| document_id.len() > 512
|| !document_id.chars().all(|c| c.is_ascii_alphanumeric() || c == '-' || c == '_')
{
None
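// The only functional change in this hunk is the boundary: `>= 512` rejects a
// 512-byte identifier (matching the "511 bytes" error text), while `> 512`
// accepts it (matching the "512 bytes" wording). A quick check with the same
// id the tests build via `"a".repeat(512)`:
//     let id = "a".repeat(512);
//     assert!(id.len() >= 512);   // caught by the stricter check
//     assert!(!(id.len() > 512)); // allowed by the looser check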

View File

@ -114,7 +114,7 @@ pub enum UserError {
"Document identifier `{}` is invalid. \
A document identifier can be of type integer or string, \
only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_), \
and can not be more than 511 bytes.", .document_id.to_string()
and can not be more than 512 bytes.", .document_id.to_string()
)]
InvalidDocumentId { document_id: Value },
#[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))]

View File

@ -1734,7 +1734,6 @@ pub(crate) mod tests {
use crate::error::{Error, InternalError};
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::progress::Progress;
use crate::update::new::indexer;
use crate::update::settings::InnerIndexSettings;
use crate::update::{
@ -1811,7 +1810,7 @@ pub(crate) mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)?;
if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
@ -1830,7 +1829,7 @@ pub(crate) mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
})
.unwrap()?;
@ -1902,7 +1901,7 @@ pub(crate) mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)?;
if let Some(error) = operation_stats.into_iter().find_map(|stat| stat.error) {
@ -1921,7 +1920,7 @@ pub(crate) mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
})
.unwrap()?;
@ -1983,7 +1982,7 @@ pub(crate) mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2002,7 +2001,7 @@ pub(crate) mod tests {
&document_changes,
embedders,
&|| should_abort.load(Relaxed),
&Progress::default(),
&|_| (),
)
})
.unwrap()

View File

@ -31,7 +31,6 @@ pub mod vector;
#[macro_use]
pub mod snapshot_tests;
mod fieldids_weights_map;
pub mod progress;
use std::collections::{BTreeMap, HashMap};
use std::convert::{TryFrom, TryInto};

View File

@ -1,152 +0,0 @@
use std::any::TypeId;
use std::borrow::Cow;
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::{Arc, RwLock};
use serde::Serialize;
pub trait Step: 'static + Send + Sync {
fn name(&self) -> Cow<'static, str>;
fn current(&self) -> u32;
fn total(&self) -> u32;
}
#[derive(Clone, Default)]
pub struct Progress {
steps: Arc<RwLock<Vec<(TypeId, Box<dyn Step>)>>>,
}
impl Progress {
pub fn update_progress<P: Step>(&self, sub_progress: P) {
let mut steps = self.steps.write().unwrap();
let step_type = TypeId::of::<P>();
if let Some(idx) = steps.iter().position(|(id, _)| *id == step_type) {
steps.truncate(idx);
}
steps.push((step_type, Box::new(sub_progress)));
}
// TODO: This code should be in meilisearch_types but cannot because milli can't depend on meilisearch_types
pub fn as_progress_view(&self) -> ProgressView {
let steps = self.steps.read().unwrap();
let mut percentage = 0.0;
let mut prev_factors = 1.0;
let mut step_view = Vec::with_capacity(steps.len());
for (_, step) in steps.iter() {
prev_factors *= step.total() as f32;
percentage += step.current() as f32 / prev_factors;
step_view.push(ProgressStepView {
current_step: step.name(),
finished: step.current(),
total: step.total(),
});
}
ProgressView { steps: step_view, percentage: percentage * 100.0 }
}
}
/// This trait lets you use the AtomicSubStep defined right below.
/// The name must be a const that never changes, but that can't be enforced by the type system because it would make the trait non object-safe.
/// By forcing the Default trait + the &'static str we make it harder to misuse the trait.
pub trait NamedStep: 'static + Send + Sync + Default {
fn name(&self) -> &'static str;
}
/// Structure to quickly define steps that need fast, lockless updates of their current value.
/// You can use this struct if:
/// - The name of the step doesn't change
/// - The total number of steps doesn't change
pub struct AtomicSubStep<Name: NamedStep> {
unit_name: Name,
current: Arc<AtomicU32>,
total: u32,
}
impl<Name: NamedStep> AtomicSubStep<Name> {
pub fn new(total: u32) -> (Arc<AtomicU32>, Self) {
let current = Arc::new(AtomicU32::new(0));
(current.clone(), Self { current, total, unit_name: Name::default() })
}
}
impl<Name: NamedStep> Step for AtomicSubStep<Name> {
fn name(&self) -> Cow<'static, str> {
self.unit_name.name().into()
}
fn current(&self) -> u32 {
self.current.load(Ordering::Relaxed)
}
fn total(&self) -> u32 {
self.total
}
}
#[macro_export]
macro_rules! make_enum_progress {
($visibility:vis enum $name:ident { $($variant:ident,)+ }) => {
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)]
#[allow(clippy::enum_variant_names)]
$visibility enum $name {
$($variant),+
}
impl Step for $name {
fn name(&self) -> Cow<'static, str> {
use convert_case::Casing;
match self {
$(
$name::$variant => stringify!($variant).from_case(convert_case::Case::Camel).to_case(convert_case::Case::Lower).into()
),+
}
}
fn current(&self) -> u32 {
*self as u32
}
fn total(&self) -> u32 {
Self::CARDINALITY as u32
}
}
};
}
#[macro_export]
macro_rules! make_atomic_progress {
($struct_name:ident alias $atomic_struct_name:ident => $step_name:literal) => {
#[derive(Default, Debug, Clone, Copy)]
pub struct $struct_name {}
impl NamedStep for $struct_name {
fn name(&self) -> &'static str {
$step_name
}
}
pub type $atomic_struct_name = AtomicSubStep<$struct_name>;
};
}
make_atomic_progress!(Document alias AtomicDocumentStep => "document" );
make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" );
#[derive(Debug, Serialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct ProgressView {
pub steps: Vec<ProgressStepView>,
pub percentage: f32,
}
#[derive(Debug, Serialize, Clone)]
#[serde(rename_all = "camelCase")]
pub struct ProgressStepView {
pub current_step: Cow<'static, str>,
pub finished: u32,
pub total: u32,
}
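
For reference while reading the removal above, here is a hedged sketch of how the nested percentage in `as_progress_view` combines steps. The `Outer`/`Inner` steps are hypothetical, made up for the example; `Progress` and `Step` are the items shown in this file:

```rust
use std::borrow::Cow;

// Hypothetical outer step: the second step out of four.
struct Outer;
impl Step for Outer {
    fn name(&self) -> Cow<'static, str> { "outer".into() }
    fn current(&self) -> u32 { 1 }
    fn total(&self) -> u32 { 4 }
}

// Hypothetical inner step: two sub-steps out of ten are done.
struct Inner;
impl Step for Inner {
    fn name(&self) -> Cow<'static, str> { "inner".into() }
    fn current(&self) -> u32 { 2 }
    fn total(&self) -> u32 { 10 }
}

fn example() {
    let progress = Progress::default();
    progress.update_progress(Outer);
    progress.update_progress(Inner);
    // percentage = 1/4 + 2/(4 * 10) = 0.30, reported as 30%
    let view = progress.as_progress_view();
    assert!((view.percentage - 30.0).abs() < 1e-3);
}
```

Each nested step only contributes within the slice of progress left by its parent, which is why `prev_factors` multiplies the totals of all enclosing steps before dividing the current sub-step by it.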

View File

@ -3,13 +3,12 @@ use std::collections::BTreeMap;
use std::fmt::{self, Debug};
use bumpalo::Bump;
use bumparaw_collections::{RawMap, RawVec, Value};
use liquid::model::{
ArrayView, DisplayCow, KString, KStringCow, ObjectRender, ObjectSource, ScalarCow, State,
Value as LiquidValue,
};
use liquid::{ObjectView, ValueView};
use rustc_hash::FxBuildHasher;
use raw_collections::{RawMap, RawVec};
use serde_json::value::RawValue;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
@ -196,7 +195,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc
}
impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> {
fn as_debug(&self) -> &dyn Debug {
fn as_debug(&self) -> &dyn fmt::Debug {
self
}
fn render(&self) -> liquid::model::DisplayCow<'_> {
@ -244,13 +243,14 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc,
}
}
#[derive(Debug)]
struct ParseableValue<'doc> {
value: Value<'doc, FxBuildHasher>,
value: raw_collections::Value<'doc>,
}
impl<'doc> ParseableValue<'doc> {
pub fn new(value: &'doc RawValue, doc_alloc: &'doc Bump) -> Self {
let value = Value::from_raw_value_and_hasher(value, FxBuildHasher, doc_alloc).unwrap();
let value = raw_collections::Value::from_raw_value(value, doc_alloc).unwrap();
Self { value }
}
@ -260,19 +260,19 @@ impl<'doc> ParseableValue<'doc> {
}
// transparent newtype for implementing ValueView
#[derive(Debug)]
#[repr(transparent)]
struct ParseableMap<'doc>(RawMap<'doc, FxBuildHasher>);
#[derive(Debug)]
struct ParseableMap<'doc>(RawMap<'doc>);
// transparent newtype for implementing ValueView
#[derive(Debug)]
#[repr(transparent)]
#[derive(Debug)]
struct ParseableArray<'doc>(RawVec<'doc>);
impl<'doc> ParseableMap<'doc> {
pub fn as_parseable<'a>(map: &'a RawMap<'doc, FxBuildHasher>) -> &'a ParseableMap<'doc> {
pub fn as_parseable<'a>(map: &'a RawMap<'doc>) -> &'a ParseableMap<'doc> {
// SAFETY: repr(transparent)
unsafe { &*(map as *const RawMap<FxBuildHasher> as *const Self) }
unsafe { &*(map as *const RawMap as *const Self) }
}
}
@ -447,9 +447,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn render(&self) -> DisplayCow<'_> {
use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
use raw_collections::value::Number;
use raw_collections::Value;
match &self.value {
Value::Null => LiquidValue::Nil.render(),
Value::Bool(v) => v.render(),
@ -465,9 +464,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn source(&self) -> DisplayCow<'_> {
use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
use raw_collections::value::Number;
use raw_collections::Value;
match &self.value {
Value::Null => LiquidValue::Nil.source(),
Value::Bool(v) => ValueView::source(v),
@ -483,9 +481,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn type_name(&self) -> &'static str {
use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
use raw_collections::value::Number;
use raw_collections::Value;
match &self.value {
Value::Null => LiquidValue::Nil.type_name(),
Value::Bool(v) => v.type_name(),
@ -501,8 +498,7 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn query_state(&self, state: State) -> bool {
use bumparaw_collections::Value;
use raw_collections::Value;
match &self.value {
Value::Null => ValueView::query_state(&LiquidValue::Nil, state),
Value::Bool(v) => ValueView::query_state(v, state),
@ -519,8 +515,7 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn to_kstr(&self) -> KStringCow<'_> {
use bumparaw_collections::Value;
use raw_collections::Value;
match &self.value {
Value::Null => ValueView::to_kstr(&LiquidValue::Nil),
Value::Bool(v) => ValueView::to_kstr(v),
@ -532,14 +527,12 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn to_value(&self) -> LiquidValue {
use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
use raw_collections::Value;
match &self.value {
Value::Null => LiquidValue::Nil,
Value::Bool(v) => LiquidValue::Scalar(liquid::model::ScalarCow::new(*v)),
Value::Number(number) => match number {
Number::PosInt(number) => {
raw_collections::value::Number::PosInt(number) => {
let number: i64 = match (*number).try_into() {
Ok(number) => number,
Err(_) => {
@ -548,8 +541,12 @@ impl<'doc> ValueView for ParseableValue<'doc> {
};
LiquidValue::Scalar(ScalarCow::new(number))
}
Number::NegInt(number) => LiquidValue::Scalar(ScalarCow::new(*number)),
Number::Finite(number) => LiquidValue::Scalar(ScalarCow::new(*number)),
raw_collections::value::Number::NegInt(number) => {
LiquidValue::Scalar(ScalarCow::new(*number))
}
raw_collections::value::Number::Finite(number) => {
LiquidValue::Scalar(ScalarCow::new(*number))
}
},
Value::String(s) => LiquidValue::Scalar(liquid::model::ScalarCow::new(s.to_string())),
Value::Array(raw_vec) => ParseableArray::as_parseable(raw_vec).to_value(),
@ -558,9 +555,8 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn as_scalar(&self) -> Option<liquid::model::ScalarCow<'_>> {
use bumparaw_collections::value::Number;
use bumparaw_collections::Value;
use raw_collections::value::Number;
use raw_collections::Value;
match &self.value {
Value::Bool(v) => Some(liquid::model::ScalarCow::new(*v)),
Value::Number(number) => match number {
@ -580,41 +576,34 @@ impl<'doc> ValueView for ParseableValue<'doc> {
}
fn is_scalar(&self) -> bool {
use bumparaw_collections::Value;
use raw_collections::Value;
matches!(&self.value, Value::Bool(_) | Value::Number(_) | Value::String(_))
}
fn as_array(&self) -> Option<&dyn liquid::model::ArrayView> {
if let Value::Array(array) = &self.value {
if let raw_collections::Value::Array(array) = &self.value {
return Some(ParseableArray::as_parseable(array) as _);
}
None
}
fn is_array(&self) -> bool {
matches!(&self.value, bumparaw_collections::Value::Array(_))
matches!(&self.value, raw_collections::Value::Array(_))
}
fn as_object(&self) -> Option<&dyn ObjectView> {
if let Value::Object(object) = &self.value {
if let raw_collections::Value::Object(object) = &self.value {
return Some(ParseableMap::as_parseable(object) as _);
}
None
}
fn is_object(&self) -> bool {
matches!(&self.value, bumparaw_collections::Value::Object(_))
matches!(&self.value, raw_collections::Value::Object(_))
}
fn is_nil(&self) -> bool {
matches!(&self.value, bumparaw_collections::Value::Null)
}
}
impl Debug for ParseableValue<'_> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("ParseableValue").field("value", &self.value).finish()
matches!(&self.value, raw_collections::Value::Null)
}
}

View File

@ -38,16 +38,6 @@ pub struct RenderPromptError {
pub fault: FaultSource,
}
impl RenderPromptError {
pub(crate) fn missing_context_with_external_docid(
external_docid: String,
inner: liquid::Error,
) -> RenderPromptError {
Self {
kind: RenderPromptErrorKind::MissingContextWithExternalDocid(external_docid, inner),
fault: FaultSource::User,
}
}
pub(crate) fn missing_context(inner: liquid::Error) -> RenderPromptError {
Self { kind: RenderPromptErrorKind::MissingContext(inner), fault: FaultSource::User }
}
@ -57,8 +47,6 @@ impl RenderPromptError {
pub enum RenderPromptErrorKind {
#[error("missing field in document: {0}")]
MissingContext(liquid::Error),
#[error("missing field in document `{0}`: {1}")]
MissingContextWithExternalDocid(String, liquid::Error),
}
impl From<RenderPromptError> for crate::Error {

View File

@ -119,7 +119,6 @@ impl Prompt {
'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents
>(
&self,
external_docid: &str,
document: impl crate::update::new::document::Document<'a> + Debug,
field_id_map: &RefCell<GlobalFieldsIdsMap>,
doc_alloc: &'doc Bump,
@ -131,12 +130,9 @@ impl Prompt {
self.max_bytes.unwrap_or_else(default_max_bytes).get(),
doc_alloc,
);
self.template.render_to(&mut rendered, &context).map_err(|liquid_error| {
RenderPromptError::missing_context_with_external_docid(
external_docid.to_owned(),
liquid_error,
)
})?;
self.template
.render_to(&mut rendered, &context)
.map_err(RenderPromptError::missing_context)?;
Ok(std::str::from_utf8(rendered.into_bump_slice())
.expect("render can only write UTF-8 because all inputs and processing preserve utf-8"))
}

View File

@ -5,7 +5,6 @@ use bumpalo::Bump;
use heed::EnvOpenOptions;
use maplit::{btreemap, hashset};
use crate::progress::Progress;
use crate::update::new::indexer;
use crate::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use crate::vector::EmbeddingConfigs;
@ -73,7 +72,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -92,7 +91,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();

View File

@ -1,4 +1,4 @@
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use rayon::{ThreadPool, ThreadPoolBuilder};
@ -9,8 +9,6 @@ use thiserror::Error;
#[derive(Debug)]
pub struct ThreadPoolNoAbort {
thread_pool: ThreadPool,
/// The number of active operations.
active_operations: AtomicUsize,
/// Set to true if the thread pool catches a panic.
pool_catched_panic: Arc<AtomicBool>,
}
@ -21,9 +19,7 @@ impl ThreadPoolNoAbort {
OP: FnOnce() -> R + Send,
R: Send,
{
self.active_operations.fetch_add(1, Ordering::Relaxed);
let output = self.thread_pool.install(op);
self.active_operations.fetch_sub(1, Ordering::Relaxed);
// While resetting the pool panic catcher, we return an error if we caught one.
if self.pool_catched_panic.swap(false, Ordering::SeqCst) {
Err(PanicCatched)
@ -35,11 +31,6 @@ impl ThreadPoolNoAbort {
pub fn current_num_threads(&self) -> usize {
self.thread_pool.current_num_threads()
}
/// The number of active operations.
pub fn active_operations(&self) -> usize {
self.active_operations.load(Ordering::Relaxed)
}
}
#[derive(Error, Debug)]
@ -73,10 +64,6 @@ impl ThreadPoolNoAbortBuilder {
let catched_panic = pool_catched_panic.clone();
move |_result| catched_panic.store(true, Ordering::SeqCst)
});
Ok(ThreadPoolNoAbort {
thread_pool: self.0.build()?,
active_operations: AtomicUsize::new(0),
pool_catched_panic,
})
Ok(ThreadPoolNoAbort { thread_pool: self.0.build()?, pool_catched_panic })
}
}

View File

@ -766,7 +766,6 @@ mod tests {
use crate::documents::mmap_from_objects;
use crate::index::tests::TempIndex;
use crate::index::IndexEmbeddingConfig;
use crate::progress::Progress;
use crate::search::TermsMatchingStrategy;
use crate::update::new::indexer;
use crate::update::Setting;
@ -1965,7 +1964,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2149,7 +2148,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2164,7 +2163,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();
@ -2211,7 +2210,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2226,7 +2225,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();
@ -2264,7 +2263,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2279,7 +2278,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();
@ -2316,7 +2315,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2331,7 +2330,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();
@ -2370,7 +2369,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2385,7 +2384,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();
@ -2429,7 +2428,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2444,7 +2443,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();
@ -2481,7 +2480,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2496,7 +2495,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();
@ -2533,7 +2532,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2548,7 +2547,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();
@ -2727,7 +2726,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2742,7 +2741,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();
@ -2786,7 +2785,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2801,7 +2800,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();
@ -2842,7 +2841,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -2857,7 +2856,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();
wtxn.commit().unwrap();

View File

@ -1,8 +1,8 @@
use std::collections::{BTreeMap, BTreeSet};
use bumparaw_collections::RawMap;
use either::Either;
use heed::RoTxn;
use rustc_hash::FxBuildHasher;
use raw_collections::RawMap;
use serde_json::value::RawValue;
use super::vector_document::VectorDocument;
@ -210,29 +210,34 @@ impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d>
for MergedDocument<'d, 'doc, 't, Mapper>
{
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'d str, &'d RawValue)>> {
let mut new_doc_it = self.new_doc.iter_top_level_fields();
let mut db_it = self.db.iter().flat_map(|db| db.iter_top_level_fields());
let mut seen_fields = BTreeSet::new();
match &self.db {
Some(db) => {
let mut new_doc_it = self.new_doc.iter_top_level_fields();
let mut db_it = db.iter_top_level_fields();
let mut seen_fields = BTreeSet::new();
std::iter::from_fn(move || {
if let Some(next) = new_doc_it.next() {
if let Ok((name, _)) = next {
seen_fields.insert(name);
}
return Some(next);
}
loop {
match db_it.next()? {
Ok((name, value)) => {
if seen_fields.contains(name) {
continue;
Either::Left(std::iter::from_fn(move || {
if let Some(next) = new_doc_it.next() {
if let Ok((name, _)) = next {
seen_fields.insert(name);
}
return Some(Ok((name, value)));
return Some(next);
}
Err(err) => return Some(Err(err)),
}
loop {
match db_it.next()? {
Ok((name, value)) => {
if seen_fields.contains(name) {
continue;
}
return Some(Ok((name, value)));
}
Err(err) => return Some(Err(err)),
}
}
}))
}
})
None => Either::Right(self.new_doc.iter_top_level_fields()),
}
}
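
The merge rule implemented above is simple: fields from the new document version are yielded first and shadow any field with the same name coming from the database version. A self-contained sketch of the same pattern with plain iterators (the `(name, value)` pairs are hypothetical stand-ins for the raw fields):

```rust
use std::collections::BTreeSet;

fn example() {
    // Hypothetical field lists: the updated version first, then what the DB already holds.
    let new_doc = vec![("title", "new title"), ("tags", "[\"a\",\"b\"]")];
    let db_doc = vec![("title", "old title"), ("price", "10")];

    let mut new_it = new_doc.into_iter();
    let mut db_it = db_doc.into_iter();
    let mut seen = BTreeSet::new();

    let merged: Vec<_> = std::iter::from_fn(move || {
        // Yield the new document's fields first, remembering their names.
        if let Some((name, value)) = new_it.next() {
            seen.insert(name);
            return Some((name, value));
        }
        // Then fall back to the DB fields, skipping anything the new document overrode.
        db_it.find(|(name, _)| !seen.contains(name))
    })
    .collect();

    assert_eq!(
        merged,
        vec![("title", "new title"), ("tags", "[\"a\",\"b\"]"), ("price", "10")]
    );
}
```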
fn vectors_field(&self) -> Result<Option<&'d RawValue>> {
@ -386,12 +391,12 @@ pub type Entry<'doc> = (&'doc str, &'doc RawValue);
#[derive(Debug)]
pub struct Versions<'doc> {
data: RawMap<'doc, FxBuildHasher>,
data: RawMap<'doc>,
}
impl<'doc> Versions<'doc> {
pub fn multiple(
mut versions: impl Iterator<Item = Result<RawMap<'doc, FxBuildHasher>>>,
mut versions: impl Iterator<Item = Result<RawMap<'doc>>>,
) -> Result<Option<Self>> {
let Some(data) = versions.next() else { return Ok(None) };
let mut data = data?;
@ -404,7 +409,7 @@ impl<'doc> Versions<'doc> {
Ok(Some(Self::single(data)))
}
pub fn single(version: RawMap<'doc, FxBuildHasher>) -> Self {
pub fn single(version: RawMap<'doc>) -> Self {
Self { data: version }
}

View File

@ -69,12 +69,12 @@ use std::io::BufReader;
use std::{io, iter, mem};
use bumpalo::Bump;
use bumparaw_collections::bbbul::{BitPacker, BitPacker4x};
use bumparaw_collections::map::FrozenMap;
use bumparaw_collections::{Bbbul, FrozenBbbul};
use grenad::ReaderCursor;
use hashbrown::hash_map::RawEntryMut;
use hashbrown::HashMap;
use raw_collections::bbbul::{BitPacker, BitPacker4x};
use raw_collections::map::FrozenMap;
use raw_collections::{Bbbul, FrozenBbbul};
use roaring::RoaringBitmap;
use rustc_hash::FxBuildHasher;
@ -177,12 +177,12 @@ impl<'extractor> BalancedCaches<'extractor> {
Ok(())
}
pub fn freeze(&mut self, source_id: usize) -> Result<Vec<FrozenCache<'_, 'extractor>>> {
pub fn freeze(&mut self) -> Result<Vec<FrozenCache<'_, 'extractor>>> {
match &mut self.caches {
InnerCaches::Normal(NormalCaches { caches }) => caches
.iter_mut()
.enumerate()
.map(|(bucket_id, map)| {
.map(|(bucket, map)| {
// safety: we are transmuting the Bbbul into a FrozenBbbul
// that has the same size.
let map = unsafe {
@ -201,19 +201,14 @@ impl<'extractor> BalancedCaches<'extractor> {
>,
>(map)
};
Ok(FrozenCache {
source_id,
bucket_id,
cache: FrozenMap::new(map),
spilled: Vec::new(),
})
Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() })
})
.collect(),
InnerCaches::Spilling(SpillingCaches { caches, spilled_entries, .. }) => caches
.iter_mut()
.zip(mem::take(spilled_entries))
.enumerate()
.map(|(bucket_id, (map, sorter))| {
.map(|(bucket, (map, sorter))| {
let spilled = sorter
.into_reader_cursors()?
.into_iter()
@ -239,7 +234,7 @@ impl<'extractor> BalancedCaches<'extractor> {
>,
>(map)
};
Ok(FrozenCache { source_id, bucket_id, cache: FrozenMap::new(map), spilled })
Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled })
})
.collect(),
}
@ -445,8 +440,7 @@ fn spill_entry_to_sorter(
}
pub struct FrozenCache<'a, 'extractor> {
bucket_id: usize,
source_id: usize,
bucket: usize,
cache: FrozenMap<
'a,
'extractor,
@ -463,9 +457,9 @@ pub fn transpose_and_freeze_caches<'a, 'extractor>(
let width = caches.first().map(BalancedCaches::buckets).unwrap_or(0);
let mut bucket_caches: Vec<_> = iter::repeat_with(Vec::new).take(width).collect();
for (thread_index, thread_cache) in caches.iter_mut().enumerate() {
for frozen in thread_cache.freeze(thread_index)? {
bucket_caches[frozen.bucket_id].push(frozen);
for thread_cache in caches {
for frozen in thread_cache.freeze()? {
bucket_caches[frozen.bucket].push(frozen);
}
}
@ -483,16 +477,21 @@ where
F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>,
{
let mut maps = Vec::new();
let mut heap = BinaryHeap::new();
let mut readers = Vec::new();
let mut current_bucket = None;
for FrozenCache { source_id, bucket_id, cache, spilled } in frozen {
assert_eq!(*current_bucket.get_or_insert(bucket_id), bucket_id);
maps.push((source_id, cache));
for reader in spilled {
let mut cursor = reader.into_cursor()?;
if cursor.move_on_next()?.is_some() {
heap.push(Entry { cursor, source_id });
}
for FrozenCache { bucket, cache, ref mut spilled } in frozen {
assert_eq!(*current_bucket.get_or_insert(bucket), bucket);
maps.push(cache);
readers.append(spilled);
}
// First manage the spilled entries by looking into the HashMaps,
// merge them and mark them as dummy.
let mut heap = BinaryHeap::new();
for (source_index, source) in readers.into_iter().enumerate() {
let mut cursor = source.into_cursor()?;
if cursor.move_on_next()?.is_some() {
heap.push(Entry { cursor, source_index });
}
}
@ -509,29 +508,25 @@ where
let mut output = DelAddRoaringBitmap::from_bytes(first_value)?;
while let Some(mut entry) = heap.peek_mut() {
if let Some((key, value)) = entry.cursor.current() {
if first_key != key {
if let Some((key, _value)) = entry.cursor.current() {
if first_key == key {
let new = DelAddRoaringBitmap::from_bytes(first_value)?;
output = output.merge(new);
// When we are done with the current value of this entry, move the cursor
// forward and let the heap reorganize itself (on drop)
if entry.cursor.move_on_next()?.is_none() {
PeekMut::pop(entry);
}
} else {
break;
}
let new = DelAddRoaringBitmap::from_bytes(value)?;
output = output.merge(new);
// When we are done with the current value of this entry, move the cursor
// forward and let the heap reorganize itself (on drop)
if entry.cursor.move_on_next()?.is_none() {
PeekMut::pop(entry);
}
}
}
// Once we have merged all of the spilled bitmaps we must also
// fetch the matching entries from the non-spilled caches (the HashMaps).
for (source_id, map) in maps.iter_mut() {
debug_assert!(
!(map.get(first_key).is_some() && first_entry.source_id == *source_id),
"A thread should not have spilled a key that has been inserted in the cache"
);
if first_entry.source_id != *source_id {
for (map_index, map) in maps.iter_mut().enumerate() {
if first_entry.source_index != map_index {
if let Some(new) = map.get_mut(first_key) {
output.union_and_clear_bbbul(new);
}
@ -543,12 +538,12 @@ where
// Don't forget to put the first entry back into the heap.
if first_entry.cursor.move_on_next()?.is_some() {
heap.push(first_entry);
heap.push(first_entry)
}
}
// Then manage the content of the HashMap entries that weren't taken (mem::take).
while let Some((_, mut map)) = maps.pop() {
while let Some(mut map) = maps.pop() {
// Make sure we don't try to work with entries already handled by the spilled readers above
let mut ordered_entries: Vec<_> =
map.iter_mut().filter(|(_, bbbul)| !bbbul.is_empty()).collect();
@ -558,7 +553,7 @@ where
let mut output = DelAddRoaringBitmap::empty();
output.union_and_clear_bbbul(bbbul);
for (_, rhs) in maps.iter_mut() {
for rhs in maps.iter_mut() {
if let Some(new) = rhs.get_mut(key) {
output.union_and_clear_bbbul(new);
}
@ -574,14 +569,14 @@ where
struct Entry<R> {
cursor: ReaderCursor<R>,
source_id: usize,
source_index: usize,
}
impl<R> Ord for Entry<R> {
fn cmp(&self, other: &Entry<R>) -> Ordering {
let skey = self.cursor.current().map(|(k, _)| k);
let okey = other.cursor.current().map(|(k, _)| k);
skey.cmp(&okey).then(self.source_id.cmp(&other.source_id)).reverse()
skey.cmp(&okey).then(self.source_index.cmp(&other.source_index)).reverse()
}
}
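
The `.reverse()` at the end of this `Ord` implementation is what turns `BinaryHeap` (a max-heap) into a min-heap over the cursors' current keys, so the merge loop above always pops the smallest key next. A stripped-down sketch of the same trick, with a hypothetical `HeapEntry` holding a plain `Vec<u8>` key instead of a grenad cursor:

```rust
use std::cmp::Ordering;
use std::collections::BinaryHeap;

// Hypothetical stand-in for a reader cursor: just its current key and an id.
#[derive(PartialEq, Eq)]
struct HeapEntry {
    key: Vec<u8>,
    source_index: usize,
}

impl Ord for HeapEntry {
    fn cmp(&self, other: &Self) -> Ordering {
        // Compare keys, tie-break on the source, then reverse the whole ordering
        // so the max-heap pops the *smallest* key first.
        self.key
            .cmp(&other.key)
            .then(self.source_index.cmp(&other.source_index))
            .reverse()
    }
}

impl PartialOrd for HeapEntry {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

fn example() {
    let mut heap = BinaryHeap::new();
    heap.push(HeapEntry { key: b"zebra".to_vec(), source_index: 0 });
    heap.push(HeapEntry { key: b"apple".to_vec(), source_index: 1 });
    // The smallest key comes out first, exactly like the merge loop above.
    assert_eq!(heap.pop().unwrap().key, b"apple".to_vec());
}
```

Deriving `PartialEq`/`Eq` on the key and source pair stays consistent with the custom ordering, which `BinaryHeap` relies on.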

View File

@ -16,10 +16,10 @@ use crate::update::del_add::DelAdd;
use crate::update::new::channel::FieldIdDocidFacetSender;
use crate::update::new::extract::perm_json_p;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::steps::Step;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
@ -373,16 +373,26 @@ fn truncate_str(s: &str) -> &str {
impl FacetedDocidsExtractor {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
pub fn run_extraction<
'pl,
'fid,
'indexer,
'index,
'extractor,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
sender: &FieldIdDocidFacetSender,
step: IndexingStep,
step: Step,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let index = indexing_context.index;
let rtxn = index.read_txn()?;

View File

@ -15,22 +15,23 @@ pub use geo::*;
pub use searchable::*;
pub use vectors::EmbeddingExtractor;
use super::indexer::document_changes::{DocumentChanges, IndexingContext};
use super::steps::IndexingStep;
use super::indexer::document_changes::{DocumentChanges, IndexingContext, Progress};
use super::steps::Step;
use super::thread_local::{FullySend, ThreadLocal};
use crate::update::GrenadParameters;
use crate::Result;
pub trait DocidsExtractor {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: IndexingStep,
step: Step,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync;
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync;
}
/// TODO move in permissive json pointer

View File

@ -11,10 +11,10 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::extract::perm_json_p::contained_in;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::steps::Step;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
@ -28,7 +28,7 @@ pub struct WordDocidsBalancedCaches<'extractor> {
exact_word_docids: BalancedCaches<'extractor>,
word_position_docids: BalancedCaches<'extractor>,
fid_word_count_docids: BalancedCaches<'extractor>,
fid_word_count: HashMap<FieldId, (Option<usize>, Option<usize>)>,
fid_word_count: HashMap<FieldId, (usize, usize)>,
current_docid: Option<DocumentId>,
}
@ -85,8 +85,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
self.fid_word_count
.entry(field_id)
.and_modify(|(_current_count, new_count)| *new_count.get_or_insert(0) += 1)
.or_insert((None, Some(1)));
.and_modify(|(_current_count, new_count)| *new_count += 1)
.or_insert((0, 1));
self.current_docid = Some(docid);
Ok(())
@ -130,8 +130,8 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
self.fid_word_count
.entry(field_id)
.and_modify(|(current_count, _new_count)| *current_count.get_or_insert(0) += 1)
.or_insert((Some(1), None));
.and_modify(|(current_count, _new_count)| *current_count += 1)
.or_insert((1, 0));
self.current_docid = Some(docid);
@ -141,18 +141,14 @@ impl<'extractor> WordDocidsBalancedCaches<'extractor> {
fn flush_fid_word_count(&mut self, buffer: &mut BumpVec<u8>) -> Result<()> {
for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
if current_count != new_count {
if let Some(current_count) =
current_count.filter(|current_count| *current_count <= MAX_COUNTED_WORDS)
{
if current_count <= MAX_COUNTED_WORDS {
buffer.clear();
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.push(current_count as u8);
self.fid_word_count_docids
.insert_del_u32(buffer, self.current_docid.unwrap())?;
}
if let Some(new_count) =
new_count.filter(|new_count| *new_count <= MAX_COUNTED_WORDS)
{
if new_count <= MAX_COUNTED_WORDS {
buffer.clear();
buffer.extend_from_slice(&fid.to_be_bytes());
buffer.push(new_count as u8);
@ -239,15 +235,25 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
pub struct WordDocidsExtractors;
impl WordDocidsExtractors {
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
pub fn run_extraction<
'pl,
'fid,
'indexer,
'index,
'extractor,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: IndexingStep,
step: Step,
) -> Result<WordDocidsCaches<'extractor>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let index = indexing_context.index;
let rtxn = index.read_txn()?;

View File

@ -14,9 +14,9 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer};
use super::cache::BalancedCaches;
use super::DocidsExtractor;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
};
use crate::update::new::steps::IndexingStep;
use crate::update::new::steps::Step;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
@ -56,15 +56,16 @@ impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
}
pub trait SearchableExtractor: Sized + Sync {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: IndexingStep,
step: Step,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let rtxn = indexing_context.index.read_txn()?;
let stop_words = indexing_context.index.stop_words(&rtxn)?;
@ -133,15 +134,16 @@ pub trait SearchableExtractor: Sized + Sync {
}
impl<T: SearchableExtractor> DocidsExtractor for T {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: IndexingStep,
step: Step,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
Self::run_extraction(
grenad_parameters,

View File

@ -176,10 +176,9 @@ pub fn tokenizer_builder<'a>(
#[cfg(test)]
mod test {
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use charabia::TokenizerBuilder;
use meili_snap::snapshot;
use rustc_hash::FxBuildHasher;
use raw_collections::RawMap;
use serde_json::json;
use serde_json::value::RawValue;
@ -235,7 +234,7 @@ mod test {
let bump = Bump::new();
let document: &RawValue = serde_json::from_str(&document).unwrap();
let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, &bump).unwrap();
let document = RawMap::from_raw_value(document, &bump).unwrap();
let document = Versions::single(document);
let document = DocumentFromVersions::new(&document);

View File

@ -130,7 +130,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
);
} else if new_vectors.regenerate {
let new_rendered = prompt.render_document(
update.external_document_id(),
update.current(
&context.rtxn,
context.index,
@ -140,7 +139,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
&context.doc_alloc,
)?;
let old_rendered = prompt.render_document(
update.external_document_id(),
update.merged(
&context.rtxn,
context.index,
@ -160,7 +158,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
}
} else if old_vectors.regenerate {
let old_rendered = prompt.render_document(
update.external_document_id(),
update.current(
&context.rtxn,
context.index,
@ -170,7 +167,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
&context.doc_alloc,
)?;
let new_rendered = prompt.render_document(
update.external_document_id(),
update.merged(
&context.rtxn,
context.index,
@ -220,7 +216,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
);
} else if new_vectors.regenerate {
let rendered = prompt.render_document(
insertion.external_document_id(),
insertion.inserted(),
context.new_fields_ids_map,
&context.doc_alloc,
@ -234,7 +229,6 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
}
} else {
let rendered = prompt.render_document(
insertion.external_document_id(),
insertion.inserted(),
context.new_fields_ids_map,
&context.doc_alloc,

View File

@ -103,8 +103,6 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")]
pub fn merge_and_write(self, index: &Index, wtxn: &mut RwTxn, rtxn: &RoTxn) -> Result<()> {
tracing::trace!("merge facet strings for facet search: {:?}", self.registered_facets);
let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?;
let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString);
builder.extend(reader);
@ -120,15 +118,12 @@ impl<'indexer> FacetSearchBuilder<'indexer> {
BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?;
if current_field_id != Some(field_id) {
if let (Some(current_field_id), Some(fst_merger_builder)) =
(current_field_id, fst_merger_builder)
{
if let Some(fst_merger_builder) = fst_merger_builder {
let mmap = fst_merger_builder.build(&mut callback)?;
index.facet_id_string_fst.remap_data_type::<Bytes>().put(
wtxn,
&current_field_id,
&mmap,
)?;
index
.facet_id_string_fst
.remap_data_type::<Bytes>()
.put(wtxn, &field_id, &mmap)?;
}
fst = index.facet_id_string_fst.get(rtxn, &field_id)?;

View File

@ -1,8 +1,6 @@
use std::ops::ControlFlow;
use bumpalo::Bump;
use bumparaw_collections::RawVec;
use rustc_hash::FxBuildHasher;
use serde::de::{DeserializeSeed, Deserializer as _, Visitor};
use serde_json::value::RawValue;
@ -362,7 +360,7 @@ impl<'a> DeserrRawValue<'a> {
}
pub struct DeserrRawVec<'a> {
vec: RawVec<'a>,
vec: raw_collections::RawVec<'a>,
alloc: &'a Bump,
}
@ -381,7 +379,7 @@ impl<'a> deserr::Sequence for DeserrRawVec<'a> {
}
pub struct DeserrRawVecIter<'a> {
it: bumparaw_collections::vec::iter::IntoIter<'a>,
it: raw_collections::vec::iter::IntoIter<'a>,
alloc: &'a Bump,
}
@ -395,7 +393,7 @@ impl<'a> Iterator for DeserrRawVecIter<'a> {
}
pub struct DeserrRawMap<'a> {
map: bumparaw_collections::RawMap<'a, FxBuildHasher>,
map: raw_collections::RawMap<'a>,
alloc: &'a Bump,
}
@ -418,7 +416,7 @@ impl<'a> deserr::Map for DeserrRawMap<'a> {
}
pub struct DeserrRawMapIter<'a> {
it: bumparaw_collections::map::iter::IntoIter<'a>,
it: raw_collections::map::iter::IntoIter<'a>,
alloc: &'a Bump,
}
@ -617,7 +615,7 @@ impl<'de> Visitor<'de> for DeserrRawValueVisitor<'de> {
where
A: serde::de::SeqAccess<'de>,
{
let mut raw_vec = RawVec::new_in(self.alloc);
let mut raw_vec = raw_collections::RawVec::new_in(self.alloc);
while let Some(next) = seq.next_element()? {
raw_vec.push(next);
}

View File

@ -1,5 +1,4 @@
use std::cell::{Cell, RefCell};
use std::sync::atomic::Ordering;
use std::sync::{Arc, RwLock};
use bumpalo::Bump;
@ -8,9 +7,8 @@ use rayon::iter::IndexedParallelIterator;
use super::super::document_change::DocumentChange;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::progress::{AtomicDocumentStep, Progress};
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::steps::Step;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result};
@ -135,8 +133,10 @@ pub struct IndexingContext<
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
> where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
pub index: &'index Index,
pub db_fields_ids_map: &'indexer FieldsIdsMap,
@ -144,7 +144,7 @@ pub struct IndexingContext<
pub doc_allocs: &'indexer ThreadLocal<FullySend<Cell<Bump>>>,
pub fields_ids_map_store: &'indexer ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
pub must_stop_processing: &'indexer MSP,
pub progress: &'indexer Progress,
pub send_progress: &'indexer SP,
}
impl<
@ -152,15 +152,18 @@ impl<
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
> Copy
for IndexingContext<
'fid, // invariant lifetime of fields ids map
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
}
@ -169,15 +172,18 @@ impl<
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
> Clone
for IndexingContext<
'fid, // invariant lifetime of fields ids map
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
fn clone(&self) -> Self {
*self
@ -196,6 +202,7 @@ pub fn extract<
EX,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
document_changes: &DC,
extractor: &EX,
@ -206,18 +213,18 @@ pub fn extract<
doc_allocs,
fields_ids_map_store,
must_stop_processing,
progress,
}: IndexingContext<'fid, 'indexer, 'index, MSP>,
send_progress,
}: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
datastore: &'data ThreadLocal<EX::Data>,
step: IndexingStep,
step: Step,
) -> Result<()>
where
EX: Extractor<'extractor>,
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
tracing::trace!("We are resetting the extractor allocators");
progress.update_progress(step);
// Clean up and reuse the extractor allocs
for extractor_alloc in extractor_allocs.iter_mut() {
tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes());
@ -225,11 +232,9 @@ where
}
let total_documents = document_changes.len() as u32;
let (step, progress_step) = AtomicDocumentStep::new(total_documents);
progress.update_progress(progress_step);
let pi = document_changes.iter(CHUNK_SIZE);
pi.try_arc_for_each_try_init(
pi.enumerate().try_arc_for_each_try_init(
|| {
DocumentChangeContext::new(
index,
@ -242,10 +247,13 @@ where
move |index_alloc| extractor.init_data(index_alloc),
)
},
|context, items| {
|context, (finished_documents, items)| {
if (must_stop_processing)() {
return Err(Arc::new(InternalError::AbortedIndexation.into()));
}
let finished_documents = (finished_documents * CHUNK_SIZE) as u32;
(send_progress)(Progress::from_step_substep(step, finished_documents, total_documents));
// Clean up and reuse the document-specific allocator
context.doc_alloc.reset();
@ -256,7 +264,6 @@ where
});
let res = extractor.process(changes, context).map_err(Arc::new);
step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);
// send the doc_alloc back into the pool
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
@ -264,7 +271,32 @@ where
res
},
)?;
step.store(total_documents, Ordering::Relaxed);
(send_progress)(Progress::from_step_substep(step, total_documents, total_documents));
Ok(())
}
pub struct Progress {
pub finished_steps: u16,
pub total_steps: u16,
pub step_name: &'static str,
pub finished_total_substep: Option<(u32, u32)>,
}
impl Progress {
pub fn from_step(step: Step) -> Self {
Self {
finished_steps: step.finished_steps(),
total_steps: Step::total_steps(),
step_name: step.name(),
finished_total_substep: None,
}
}
pub fn from_step_substep(step: Step, finished_substep: u32, total_substep: u32) -> Self {
Self {
finished_total_substep: Some((finished_substep, total_substep)),
..Progress::from_step(step)
}
}
}
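
The `send_progress` closures threaded through the indexer (`SP: Fn(Progress) + Sync`) are plain sinks for these snapshots. A hedged sketch of such a consumer, purely illustrative and not what Meilisearch actually reports:

```rust
// Hypothetical progress sink matching the `SP: Fn(Progress) + Sync` bound used
// throughout this module.
fn log_progress(progress: Progress) {
    match progress.finished_total_substep {
        Some((done, total)) => println!(
            "[{}/{}] {}: {done}/{total}",
            progress.finished_steps, progress.total_steps, progress.step_name
        ),
        None => println!(
            "[{}/{}] {}",
            progress.finished_steps, progress.total_steps, progress.step_name
        ),
    }
}

// It could then be passed as `&log_progress` wherever a `send_progress`
// argument is expected.
```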

View File

@ -92,12 +92,11 @@ mod test {
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::index::tests::TempIndex;
use crate::progress::Progress;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, Extractor, IndexingContext,
};
use crate::update::new::indexer::DocumentDeletion;
use crate::update::new::steps::IndexingStep;
use crate::update::new::steps::Step;
use crate::update::new::thread_local::{MostlySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::DocumentId;
@ -165,7 +164,7 @@ mod test {
doc_allocs: &doc_allocs,
fields_ids_map_store: &fields_ids_map_store,
must_stop_processing: &(|| false),
progress: &Progress::default(),
send_progress: &(|_progress| {}),
};
for _ in 0..3 {
@ -177,7 +176,7 @@ mod test {
context,
&mut extractor_allocs,
&datastore,
IndexingStep::ExtractingDocuments,
Step::ExtractingDocuments,
)
.unwrap();

View File

@ -1,23 +1,19 @@
use std::sync::atomic::Ordering;
use bumpalo::collections::CollectIn;
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use hashbrown::hash_map::Entry;
use heed::RoTxn;
use memmap2::Mmap;
use raw_collections::RawMap;
use rayon::slice::ParallelSlice;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue;
use serde_json::Deserializer;
use super::super::document_change::DocumentChange;
use super::document_changes::{DocumentChangeContext, DocumentChanges};
use super::document_changes::{DocumentChangeContext, DocumentChanges, Progress};
use super::retrieve_or_guess_primary_key;
use crate::documents::PrimaryKey;
use crate::progress::{AtomicPayloadStep, Progress};
use crate::update::new::document::Versions;
use crate::update::new::steps::IndexingStep;
use crate::update::new::steps::Step;
use crate::update::new::thread_local::MostlySend;
use crate::update::new::{Deletion, Insertion, Update};
use crate::update::{AvailableIds, IndexDocumentsMethod};
@ -48,7 +44,7 @@ impl<'pl> DocumentOperation<'pl> {
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")]
pub fn into_changes<MSP>(
pub fn into_changes<MSP, SP>(
self,
indexer: &'pl Bump,
index: &Index,
@ -56,12 +52,12 @@ impl<'pl> DocumentOperation<'pl> {
primary_key_from_op: Option<&'pl str>,
new_fields_ids_map: &mut FieldsIdsMap,
must_stop_processing: &MSP,
progress: Progress,
send_progress: &SP,
) -> Result<(DocumentOperationChanges<'pl>, Vec<PayloadStats>, Option<PrimaryKey<'pl>>)>
where
MSP: Fn() -> bool,
SP: Fn(Progress),
{
progress.update_progress(IndexingStep::PreparingPayloads);
let Self { operations, method } = self;
let documents_ids = index.documents_ids(rtxn)?;
@ -71,14 +67,16 @@ impl<'pl> DocumentOperation<'pl> {
let mut primary_key = None;
let payload_count = operations.len();
let (step, progress_step) = AtomicPayloadStep::new(payload_count as u32);
progress.update_progress(progress_step);
for (payload_index, operation) in operations.into_iter().enumerate() {
if must_stop_processing() {
return Err(InternalError::AbortedIndexation.into());
}
step.store(payload_index as u32, Ordering::Relaxed);
send_progress(Progress::from_step_substep(
Step::PreparingPayloads,
payload_index as u32,
payload_count as u32,
));
let mut bytes = 0;
let result = match operation {
@ -119,7 +117,12 @@ impl<'pl> DocumentOperation<'pl> {
};
operations_stats.push(PayloadStats { document_count, bytes, error });
}
step.store(payload_count as u32, Ordering::Relaxed);
send_progress(Progress::from_step_substep(
Step::PreparingPayloads,
payload_count as u32,
payload_count as u32,
));
// TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> =
@ -163,9 +166,8 @@ fn extract_addition_payload_changes<'r, 'pl: 'r>(
// Only guess the primary key if it is the first document
let retrieved_primary_key = if previous_offset == 0 {
let doc = RawMap::from_raw_value_and_hasher(doc, FxBuildHasher, indexer)
.map(Some)
.map_err(UserError::SerdeJson)?;
let doc =
RawMap::from_raw_value(doc, indexer).map(Some).map_err(UserError::SerdeJson)?;
let result = retrieve_or_guess_primary_key(
rtxn,
@ -543,9 +545,8 @@ impl MergeChanges for MergeDocumentForReplacement {
match operations.last() {
Some(InnerDocOp::Addition(DocumentOffset { content })) => {
let document = serde_json::from_slice(content).unwrap();
let document =
RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
.map_err(UserError::SerdeJson)?;
if is_new {
Ok(Some(DocumentChange::Insertion(Insertion::create(
@ -631,9 +632,8 @@ impl MergeChanges for MergeDocumentForUpdates {
}
};
let document = serde_json::from_slice(content).unwrap();
let document =
RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
.map_err(UserError::SerdeJson)?;
Some(Versions::single(document))
}
@ -647,9 +647,8 @@ impl MergeChanges for MergeDocumentForUpdates {
};
let document = serde_json::from_slice(content).unwrap();
let document =
RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
.map_err(UserError::SerdeJson)?;
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
.map_err(UserError::SerdeJson)?;
Ok(document)
});
Versions::multiple(versions)?

View File

@ -4,8 +4,7 @@ use std::sync::{OnceLock, RwLock};
use std::thread::{self, Builder};
use big_s::S;
use bumparaw_collections::RawMap;
use document_changes::{extract, DocumentChanges, IndexingContext};
use document_changes::{extract, DocumentChanges, IndexingContext, Progress};
pub use document_deletion::DocumentDeletion;
pub use document_operation::{DocumentOperation, PayloadStats};
use hashbrown::HashMap;
@ -14,7 +13,7 @@ use heed::{RoTxn, RwTxn};
use itertools::{merge_join_by, EitherOrBoth};
pub use partial_dump::PartialDump;
use rand::SeedableRng as _;
use rustc_hash::FxBuildHasher;
use raw_collections::RawMap;
use time::OffsetDateTime;
pub use update_by_function::UpdateByFunction;
@ -22,7 +21,7 @@ use super::channel::*;
use super::extract::*;
use super::facet_search_builder::FacetSearchBuilder;
use super::merger::FacetFieldIdsDelta;
use super::steps::IndexingStep;
use super::steps::Step;
use super::thread_local::ThreadLocal;
use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder};
use super::words_prefix_docids::{
@ -33,7 +32,6 @@ use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
use crate::facet::FacetType;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
use crate::progress::Progress;
use crate::proximity::ProximityPrecision;
use crate::update::del_add::DelAdd;
use crate::update::new::extract::EmbeddingExtractor;
@ -61,7 +59,7 @@ mod update_by_function;
///
/// TODO return stats
#[allow(clippy::too_many_arguments)] // clippy: 😝
pub fn index<'pl, 'indexer, 'index, DC, MSP>(
pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>(
wtxn: &mut RwTxn,
index: &'index Index,
pool: &ThreadPoolNoAbort,
@ -72,24 +70,16 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>(
document_changes: &DC,
embedders: EmbeddingConfigs,
must_stop_processing: &'indexer MSP,
progress: &'indexer Progress,
send_progress: &'indexer SP,
) -> Result<()>
where
DC: DocumentChanges<'pl>,
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let mut bbbuffers = Vec::new();
let finished_extraction = AtomicBool::new(false);
// We reduce the actual memory used to 5%. The reason we do this here and not in Meilisearch
// is that we still use the old indexer for the settings, and it is highly impacted by the
// max memory. So we keep the change here and will remove it once we use the new
// indexer to also index settings. Related to #5125 and #5141.
let grenad_parameters = GrenadParameters {
max_memory: grenad_parameters.max_memory.map(|mm| mm * 5 / 100),
..grenad_parameters
};
// We compute and remove the allocated BBQueues buffers capacity from the indexing memory.
let minimum_capacity = 50 * 1024 * 1024 * pool.current_num_threads(); // 50 MiB per thread
let (grenad_parameters, total_bbbuffer_capacity) = grenad_parameters.max_memory.map_or(
@ -125,7 +115,7 @@ where
doc_allocs: &doc_allocs,
fields_ids_map_store: &fields_ids_map_store,
must_stop_processing,
progress,
send_progress,
};
let mut index_embeddings = index.embedding_configs(wtxn)?;
@ -159,7 +149,7 @@ where
indexing_context,
&mut extractor_allocs,
&datastore,
IndexingStep::ExtractingDocuments,
Step::ExtractingDocuments,
)?;
}
{
@ -191,7 +181,7 @@ where
indexing_context,
&mut extractor_allocs,
&extractor_sender.field_id_docid_facet_sender(),
IndexingStep::ExtractingFacets
Step::ExtractingFacets
)?
};
@ -224,7 +214,7 @@ where
document_changes,
indexing_context,
&mut extractor_allocs,
IndexingStep::ExtractingWords
Step::ExtractingWords
)?
};
@ -302,7 +292,7 @@ where
document_changes,
indexing_context,
&mut extractor_allocs,
IndexingStep::ExtractingWordProximity,
Step::ExtractingWordProximity,
)?
};
@ -338,7 +328,7 @@ where
indexing_context,
&mut extractor_allocs,
&datastore,
IndexingStep::ExtractingEmbeddings,
Step::ExtractingEmbeddings,
)?;
}
{
@ -371,7 +361,7 @@ where
indexing_context,
&mut extractor_allocs,
&datastore,
IndexingStep::WritingGeoPoints
Step::WritingGeoPoints
)?;
}
@ -383,7 +373,9 @@ where
&indexing_context.must_stop_processing,
)?;
}
indexing_context.progress.update_progress(IndexingStep::WritingToDatabase);
(indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase));
finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed);
Result::Ok((facet_field_ids_delta, index_embeddings))
@ -483,7 +475,7 @@ where
)?;
}
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
(indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors));
let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?;
@ -496,7 +488,10 @@ where
break 'vectors;
}
indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase);
(indexing_context.send_progress)(Progress::from_step(
Step::WritingEmbeddingsToDatabase,
));
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers {
let dimensions = *dimensions;
@ -512,19 +507,21 @@ where
index.put_embedding_configs(wtxn, index_embeddings)?;
}
indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets);
(indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets));
if index.facet_search(wtxn)? {
compute_facet_search_database(index, wtxn, global_fields_ids_map)?;
}
compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
indexing_context.progress.update_progress(IndexingStep::PostProcessingWords);
(indexing_context.send_progress)(Progress::from_step(Step::PostProcessingWords));
if let Some(prefix_delta) = compute_word_fst(index, wtxn)? {
compute_prefix_database(index, wtxn, prefix_delta, grenad_parameters)?;
}
indexing_context.progress.update_progress(IndexingStep::Finalizing);
(indexing_context.send_progress)(Progress::from_step(Step::Finalizing));
Ok(()) as Result<_>
})?;
@ -770,7 +767,7 @@ pub fn retrieve_or_guess_primary_key<'a>(
index: &Index,
new_fields_ids_map: &mut FieldsIdsMap,
primary_key_from_op: Option<&'a str>,
first_document: Option<RawMap<'a, FxBuildHasher>>,
first_document: Option<RawMap<'a>>,
) -> Result<StdResult<(PrimaryKey<'a>, bool), UserError>> {
// make sure that we have a declared primary key, either fetching it from the index or attempting to guess it.

View File

@ -1,8 +1,6 @@
use std::ops::DerefMut;
use bumparaw_collections::RawMap;
use rayon::iter::IndexedParallelIterator;
use rustc_hash::FxBuildHasher;
use serde_json::value::RawValue;
use super::document_changes::{DocumentChangeContext, DocumentChanges};
@ -77,7 +75,7 @@ where
self.primary_key.extract_fields_and_docid(document, fields_ids_map, doc_alloc)?;
let external_document_id = external_document_id.to_de();
let document = RawMap::from_raw_value_and_hasher(document, FxBuildHasher, doc_alloc)
let document = raw_collections::RawMap::from_raw_value(document, doc_alloc)
.map_err(InternalError::SerdeJson)?;
let insertion = Insertion::create(docid, external_document_id, Versions::single(document));
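As a reference for the call above: `RawMap::from_raw_value` parses a raw JSON object into a map whose storage lives in a bump arena. A rough sketch, assuming the `raw_collections` git dependency pinned in the lockfile, the `from_raw_value(raw, &Bump)` signature visible in this hunk, and a `serde_json::Error` error type as the `InternalError::SerdeJson` mapping suggests (the rest of the `RawMap` API is not shown here):

use bumpalo::Bump;
use raw_collections::RawMap;
use serde_json::value::RawValue;

fn parse_document<'bump>(
    raw: &'bump RawValue,
    doc_alloc: &'bump Bump,
) -> Result<RawMap<'bump>, serde_json::Error> {
    // Keys and values are copied into `doc_alloc`, so the map borrows from the
    // arena instead of making per-field heap allocations.
    RawMap::from_raw_value(raw, doc_alloc)
}

fn main() -> Result<(), serde_json::Error> {
    let doc_alloc = Bump::new();
    let raw: &RawValue = serde_json::from_str(r#"{"id": 1, "title": "Shazam!"}"#)?;
    let _document = parse_document(raw, &doc_alloc)?;
    Ok(())
}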

View File

@ -1,9 +1,8 @@
use bumparaw_collections::RawMap;
use raw_collections::RawMap;
use rayon::iter::IndexedParallelIterator;
use rayon::slice::ParallelSlice as _;
use rhai::{Dynamic, Engine, OptimizationLevel, Scope, AST};
use roaring::RoaringBitmap;
use rustc_hash::FxBuildHasher;
use super::document_changes::DocumentChangeContext;
use super::DocumentChanges;
@ -161,12 +160,8 @@ impl<'index> DocumentChanges<'index> for UpdateByFunctionChanges<'index> {
if document_id != new_document_id {
Err(Error::UserError(UserError::DocumentEditionCannotModifyPrimaryKey))
} else {
let raw_new_doc = RawMap::from_raw_value_and_hasher(
raw_new_doc,
FxBuildHasher,
doc_alloc,
)
.map_err(InternalError::SerdeJson)?;
let raw_new_doc = RawMap::from_raw_value(raw_new_doc, doc_alloc)
.map_err(InternalError::SerdeJson)?;
Ok(Some(DocumentChange::Update(Update::create(
docid,

View File

@ -235,12 +235,8 @@ fn merge_cbo_bitmaps(
(Some(_current), None, None) => Ok(Operation::Ignore), // but it's strange
(Some(current), None, Some(add)) => Ok(Operation::Write(current | add)),
(Some(current), Some(del), add) => {
debug_assert!(
del.is_subset(&current),
"del is not a subset of current, which must be impossible."
);
let output = match add {
Some(add) => (&current - (&del - &add)) | (add - del),
Some(add) => (&current - del) | add,
None => &current - del,
};
if output.is_empty() {
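For the two merge formulas above: when `del` is a subset of `current` (the invariant the removed `debug_assert!` documents), `(&current - (&del - &add)) | (add - del)` and `(&current - del) | add` produce the same bitmap; they only diverge for ids that appear in both `del` and `add` without ever being in `current`. A small worked example with `roaring` (the values are illustrative):

use roaring::RoaringBitmap;

fn main() {
    let current: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
    let del: RoaringBitmap = [2u32, 3].into_iter().collect(); // subset of `current`
    let add: RoaringBitmap = [3u32, 4].into_iter().collect();

    // Defensive formula: a deletion that is immediately re-added is ignored.
    let del_minus_add = &del - &add;
    let add_minus_del = &add - &del;
    let defensive = &(&current - &del_minus_add) | &add_minus_del;

    // Simpler formula: apply the deletion, then the addition.
    let simple = &(&current - &del) | &add;

    // Both produce {1, 3, 4} as long as the subset invariant holds.
    assert!(del.is_subset(&current));
    assert_eq!(defensive, simple);
}

Dropping the assertion therefore only matters if a caller can ever send a deletion for an id that was never written, which is exactly the case the "which must be impossible" message refers to.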

View File

@ -1,12 +1,8 @@
use std::borrow::Cow;
use enum_iterator::Sequence;
use crate::progress::Step;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)]
#[repr(u8)]
pub enum IndexingStep {
#[repr(u16)]
pub enum Step {
PreparingPayloads,
ExtractingDocuments,
ExtractingFacets,
@ -22,31 +18,30 @@ pub enum IndexingStep {
Finalizing,
}
impl Step for IndexingStep {
fn name(&self) -> Cow<'static, str> {
impl Step {
pub fn name(&self) -> &'static str {
match self {
IndexingStep::PreparingPayloads => "preparing update file",
IndexingStep::ExtractingDocuments => "extracting documents",
IndexingStep::ExtractingFacets => "extracting facets",
IndexingStep::ExtractingWords => "extracting words",
IndexingStep::ExtractingWordProximity => "extracting word proximity",
IndexingStep::ExtractingEmbeddings => "extracting embeddings",
IndexingStep::WritingGeoPoints => "writing geo points",
IndexingStep::WritingToDatabase => "writing to database",
IndexingStep::WaitingForExtractors => "waiting for extractors",
IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database",
IndexingStep::PostProcessingFacets => "post-processing facets",
IndexingStep::PostProcessingWords => "post-processing words",
IndexingStep::Finalizing => "finalizing",
Step::PreparingPayloads => "preparing update file",
Step::ExtractingDocuments => "extracting documents",
Step::ExtractingFacets => "extracting facets",
Step::ExtractingWords => "extracting words",
Step::ExtractingWordProximity => "extracting word proximity",
Step::ExtractingEmbeddings => "extracting embeddings",
Step::WritingGeoPoints => "writing geo points",
Step::WritingToDatabase => "writing to database",
Step::WaitingForExtractors => "waiting for extractors",
Step::WritingEmbeddingsToDatabase => "writing embeddings to database",
Step::PostProcessingFacets => "post-processing facets",
Step::PostProcessingWords => "post-processing words",
Step::Finalizing => "finalizing",
}
.into()
}
fn current(&self) -> u32 {
*self as u32
pub fn finished_steps(self) -> u16 {
self as u16
}
fn total(&self) -> u32 {
Self::CARDINALITY as u32
pub const fn total_steps() -> u16 {
Self::CARDINALITY as u16
}
}
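Both sides of this hunk derive the step counters from the enum itself: the discriminant gives the number of finished steps and the variant count gives the total. A condensed sketch of that idea plus a plausible `Progress::from_step` built on top of it (the real code derives `enum_iterator::Sequence` to obtain `CARDINALITY` instead of hard-coding the total, and the `Progress` fields here are assumptions):

#[derive(Debug, Clone, Copy)]
#[repr(u16)]
enum Step {
    ExtractingDocuments, // discriminant 0
    WritingToDatabase,   // discriminant 1
    Finalizing,          // discriminant 2
}

impl Step {
    fn name(self) -> &'static str {
        match self {
            Step::ExtractingDocuments => "extracting documents",
            Step::WritingToDatabase => "writing to database",
            Step::Finalizing => "finalizing",
        }
    }

    fn finished_steps(self) -> u16 {
        // With default discriminants the cast yields 0, 1, 2, ... in declaration order.
        self as u16
    }

    const fn total_steps() -> u16 {
        3 // stands in for `Self::CARDINALITY as u16`
    }
}

struct Progress {
    step_name: &'static str,
    finished_steps: u16,
    total_steps: u16,
}

impl Progress {
    fn from_step(step: Step) -> Self {
        Progress {
            step_name: step.name(),
            finished_steps: step.finished_steps(),
            total_steps: Step::total_steps(),
        }
    }
}

fn main() {
    let p = Progress::from_step(Step::WritingToDatabase);
    println!("{} ({}/{})", p.step_name, p.finished_steps, p.total_steps);
}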

View File

@ -1,10 +1,9 @@
use std::collections::BTreeSet;
use bumpalo::Bump;
use bumparaw_collections::RawMap;
use deserr::{Deserr, IntoValue};
use heed::RoTxn;
use rustc_hash::FxBuildHasher;
use raw_collections::RawMap;
use serde::Serialize;
use serde_json::value::RawValue;
@ -85,7 +84,7 @@ pub struct VectorDocumentFromDb<'t> {
docid: DocumentId,
embedding_config: Vec<IndexEmbeddingConfig>,
index: &'t Index,
vectors_field: Option<RawMap<'t, FxBuildHasher>>,
vectors_field: Option<RawMap<'t>>,
rtxn: &'t RoTxn<'t>,
doc_alloc: &'t Bump,
}
@ -103,10 +102,9 @@ impl<'t> VectorDocumentFromDb<'t> {
};
let vectors = document.vectors_field()?;
let vectors_field = match vectors {
Some(vectors) => Some(
RawMap::from_raw_value_and_hasher(vectors, FxBuildHasher, doc_alloc)
.map_err(InternalError::SerdeJson)?,
),
Some(vectors) => {
Some(RawMap::from_raw_value(vectors, doc_alloc).map_err(InternalError::SerdeJson)?)
}
None => None,
};
@ -222,7 +220,7 @@ fn entry_from_raw_value(
pub struct VectorDocumentFromVersions<'doc> {
external_document_id: &'doc str,
vectors: RawMap<'doc, FxBuildHasher>,
vectors: RawMap<'doc>,
embedders: &'doc EmbeddingConfigs,
}
@ -235,8 +233,8 @@ impl<'doc> VectorDocumentFromVersions<'doc> {
) -> Result<Option<Self>> {
let document = DocumentFromVersions::new(versions);
if let Some(vectors_field) = document.vectors_field()? {
let vectors = RawMap::from_raw_value_and_hasher(vectors_field, FxBuildHasher, bump)
.map_err(UserError::SerdeJson)?;
let vectors =
RawMap::from_raw_value(vectors_field, bump).map_err(UserError::SerdeJson)?;
Ok(Some(Self { external_document_id, vectors, embedders }))
} else {
Ok(None)
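The explicit `match` above (parse the `_vectors` field only when it exists) can also be read as an `Option::map` followed by `transpose`. A short sketch under the same assumptions as the earlier `RawMap` example:

use bumpalo::Bump;
use raw_collections::RawMap;
use serde_json::value::RawValue;

fn parse_vectors<'doc>(
    vectors_field: Option<&'doc RawValue>,
    bump: &'doc Bump,
) -> Result<Option<RawMap<'doc>>, serde_json::Error> {
    // `map` parses only when a `_vectors` field is present; `transpose` turns
    // Option<Result<..>> into Result<Option<..>> so `?` keeps working upstream.
    vectors_field.map(|raw| RawMap::from_raw_value(raw, bump)).transpose()
}

fn main() -> Result<(), serde_json::Error> {
    let bump = Bump::new();
    let raw: &RawValue = serde_json::from_str(r#"{"default": [0.1, 0.2]}"#)?;
    assert!(parse_vectors(Some(raw), &bump)?.is_some());
    assert!(parse_vectors(None, &bump)?.is_none());
    Ok(())
}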

View File

@ -5,7 +5,7 @@ use rayon::slice::ParallelSlice as _;
use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind};
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
use super::{DistributionShift, REQUEST_PARALLELISM};
use super::DistributionShift;
use crate::error::FaultSource;
use crate::vector::Embedding;
use crate::ThreadPoolNoAbort;
@ -113,30 +113,20 @@ impl Embedder {
texts: &[&str],
threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<f32>>, EmbedError> {
if threads.active_operations() >= REQUEST_PARALLELISM {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed(chunk, None))
.collect();
threads
.install(move || {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.par_chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed(chunk, None))
.collect();
let embeddings = embeddings?;
Ok(embeddings.into_iter().flatten().collect())
} else {
threads
.install(move || {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.par_chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed(chunk, None))
.collect();
let embeddings = embeddings?;
Ok(embeddings.into_iter().flatten().collect())
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
let embeddings = embeddings?;
Ok(embeddings.into_iter().flatten().collect())
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
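One side of this hunk short-circuits to sequential, chunked embedding when the thread pool already runs `REQUEST_PARALLELISM` operations, while the other always installs the work into the pool. A rough sketch of that saturation fallback using a plain rayon pool and a dummy `embed_chunk`; `ThreadPoolNoAbort`, `active_operations()` and the `EmbedError` mapping are milli-specific and replaced with simpler stand-ins here:

use rayon::prelude::*;

fn embed_chunk(chunk: &[&str]) -> Result<Vec<Vec<f32>>, String> {
    // Dummy "embedder": one fake vector per input text.
    Ok(chunk.iter().map(|text| vec![text.len() as f32]).collect())
}

fn embed_index_ref(
    texts: &[&str],
    pool: &rayon::ThreadPool,
    pool_busy: bool, // stands in for `threads.active_operations() >= REQUEST_PARALLELISM`
) -> Result<Vec<Vec<f32>>, String> {
    if pool_busy {
        // Pool already saturated with embedding requests: embed on the current
        // thread, chunk by chunk, instead of queueing more parallel work.
        let embeddings: Result<Vec<Vec<Vec<f32>>>, String> =
            texts.chunks(10).map(embed_chunk).collect();
        Ok(embeddings?.into_iter().flatten().collect())
    } else {
        pool.install(|| {
            let embeddings: Result<Vec<Vec<Vec<f32>>>, String> =
                texts.par_chunks(10).map(embed_chunk).collect();
            Ok(embeddings?.into_iter().flatten().collect())
        })
    }
}

fn main() {
    let pool = rayon::ThreadPoolBuilder::new().num_threads(2).build().unwrap();
    let texts = ["a", "bb", "ccc"];
    let out = embed_index_ref(&texts, &pool, false).unwrap();
    assert_eq!(out.len(), 3);
}

The fallback trades some parallelism for the guarantee that embedding requests never pile up behind an already-full pool; removing it means every call goes through `install`, as the other side of the diff does.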
pub fn chunk_count_hint(&self) -> usize {

View File

@ -6,7 +6,7 @@ use rayon::slice::ParallelSlice as _;
use super::error::{EmbedError, NewEmbedderError};
use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions};
use super::{DistributionShift, REQUEST_PARALLELISM};
use super::DistributionShift;
use crate::error::FaultSource;
use crate::vector::error::EmbedErrorKind;
use crate::vector::Embedding;
@ -270,29 +270,20 @@ impl Embedder {
texts: &[&str],
threads: &ThreadPoolNoAbort,
) -> Result<Vec<Vec<f32>>, EmbedError> {
if threads.active_operations() >= REQUEST_PARALLELISM {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed(chunk, None))
.collect();
let embeddings = embeddings?;
Ok(embeddings.into_iter().flatten().collect())
} else {
threads
.install(move || {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.par_chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed(chunk, None))
.collect();
threads
.install(move || {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.par_chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed(chunk, None))
.collect();
let embeddings = embeddings?;
Ok(embeddings.into_iter().flatten().collect())
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
let embeddings = embeddings?;
Ok(embeddings.into_iter().flatten().collect())
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
pub fn chunk_count_hint(&self) -> usize {

View File

@ -203,30 +203,20 @@ impl Embedder {
texts: &[&str],
threads: &ThreadPoolNoAbort,
) -> Result<Vec<Embedding>, EmbedError> {
if threads.active_operations() >= REQUEST_PARALLELISM {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed_ref(chunk, None))
.collect();
threads
.install(move || {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.par_chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed_ref(chunk, None))
.collect();
let embeddings = embeddings?;
Ok(embeddings.into_iter().flatten().collect())
} else {
threads
.install(move || {
let embeddings: Result<Vec<Vec<Embedding>>, _> = texts
.par_chunks(self.prompt_count_in_chunk_hint())
.map(move |chunk| self.embed_ref(chunk, None))
.collect();
let embeddings = embeddings?;
Ok(embeddings.into_iter().flatten().collect())
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
let embeddings = embeddings?;
Ok(embeddings.into_iter().flatten().collect())
})
.map_err(|error| EmbedError {
kind: EmbedErrorKind::PanicInThreadPool(error),
fault: FaultSource::Bug,
})?
}
pub fn chunk_count_hint(&self) -> usize {

View File

@ -3,7 +3,6 @@ use bumpalo::Bump;
use heed::EnvOpenOptions;
use maplit::hashset;
use milli::documents::mmap_from_objects;
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
@ -58,7 +57,7 @@ fn test_facet_distribution_with_no_facet_values() {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -73,7 +72,7 @@ fn test_facet_distribution_with_no_facet_values() {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();

View File

@ -7,7 +7,6 @@ use bumpalo::Bump;
use either::{Either, Left, Right};
use heed::EnvOpenOptions;
use maplit::{btreemap, hashset};
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
@ -91,7 +90,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -110,7 +109,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();

View File

@ -5,7 +5,6 @@ use bumpalo::Bump;
use heed::EnvOpenOptions;
use itertools::Itertools;
use maplit::hashset;
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
@ -327,7 +326,7 @@ fn criteria_ascdesc() {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -342,7 +341,7 @@ fn criteria_ascdesc() {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();

View File

@ -3,7 +3,6 @@ use std::collections::BTreeSet;
use bumpalo::Bump;
use heed::EnvOpenOptions;
use milli::documents::mmap_from_objects;
use milli::progress::Progress;
use milli::update::new::indexer;
use milli::update::{IndexDocumentsMethod, IndexerConfig, Settings};
use milli::vector::EmbeddingConfigs;
@ -136,7 +135,7 @@ fn test_typo_disabled_on_word() {
None,
&mut new_fields_ids_map,
&|| false,
Progress::default(),
&|_progress| (),
)
.unwrap();
@ -151,7 +150,7 @@ fn test_typo_disabled_on_word() {
&document_changes,
embedders,
&|| false,
&Progress::default(),
&|_| (),
)
.unwrap();

View File

@ -1,56 +1,23 @@
use std::collections::BTreeMap;
use std::time::Duration;
use anyhow::{bail, Context as _};
use tokio::process::Command;
use tokio::time;
use super::assets::Asset;
use super::client::Client;
use super::workload::Workload;
pub async fn kill(mut meilisearch: tokio::process::Child) {
let Some(id) = meilisearch.id() else { return };
match Command::new("kill").args(["--signal=TERM", &id.to_string()]).spawn() {
Ok(mut cmd) => {
let Err(error) = cmd.wait().await else { return };
tracing::warn!(
error = &error as &dyn std::error::Error,
"while awaiting the Meilisearch server kill"
);
}
Err(error) => {
tracing::warn!(
error = &error as &dyn std::error::Error,
"while terminating Meilisearch server with a kill -s TERM"
);
if let Err(error) = meilisearch.kill().await {
tracing::warn!(
error = &error as &dyn std::error::Error,
"while terminating Meilisearch server"
)
}
return;
}
};
match time::timeout(Duration::from_secs(5), meilisearch.wait()).await {
Ok(_) => (),
Err(_) => {
if let Err(error) = meilisearch.kill().await {
tracing::warn!(
error = &error as &dyn std::error::Error,
"while terminating Meilisearch server"
)
}
}
if let Err(error) = meilisearch.kill().await {
tracing::warn!(
error = &error as &dyn std::error::Error,
"while terminating Meilisearch server"
)
}
}
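Condensed sketch of the shutdown sequence above: ask the server to exit with SIGTERM via the external `kill` utility, give it a few seconds, then force-kill. Assumes a Unix-like system with `kill` on PATH and a tokio runtime with the `process`, `time` and `macros` features; the `sleep 30` child is only a stand-in for the Meilisearch process:

use std::time::Duration;
use tokio::process::{Child, Command};

async fn terminate_gracefully(mut child: Child) {
    if let Some(id) = child.id() {
        // Best effort: SIGTERM lets the server flush and close its environment.
        let _ = Command::new("kill")
            .arg("--signal=TERM")
            .arg(id.to_string())
            .status()
            .await;
    }
    // If it has not exited after 5 seconds, fall back to a hard kill.
    if tokio::time::timeout(Duration::from_secs(5), child.wait()).await.is_err() {
        let _ = child.kill().await;
    }
}

#[tokio::main]
async fn main() {
    let child = Command::new("sleep").arg("30").spawn().expect("spawn sleep");
    terminate_gracefully(child).await;
}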
#[tracing::instrument]
pub async fn build() -> anyhow::Result<()> {
let mut command = Command::new("cargo");
let mut command = tokio::process::Command::new("cargo");
command.arg("build").arg("--release").arg("-p").arg("meilisearch");
command.kill_on_drop(true);
@ -70,8 +37,17 @@ pub async fn start(
master_key: Option<&str>,
workload: &Workload,
asset_folder: &str,
mut command: Command,
) -> anyhow::Result<tokio::process::Child> {
let mut command = tokio::process::Command::new("cargo");
command
.arg("run")
.arg("--release")
.arg("-p")
.arg("meilisearch")
.arg("--bin")
.arg("meilisearch")
.arg("--");
command.arg("--db-path").arg("./_xtask_benchmark.ms");
if let Some(master_key) = master_key {
command.arg("--master-key").arg(master_key);
@ -110,7 +86,7 @@ async fn wait_for_health(
return Ok(());
}
time::sleep(Duration::from_millis(500)).await;
tokio::time::sleep(std::time::Duration::from_millis(500)).await;
// check whether the Meilisearch instance exited early (cut the wait)
if let Some(exit_code) =
meilisearch.try_wait().context("cannot check Meilisearch server process status")?
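Sketch of the health-polling loop this hunk belongs to, with `reqwest` standing in for the crate's own `Client` type; the URL, retry budget and error messages are illustrative:

use std::time::Duration;

use anyhow::{bail, Context as _};
use tokio::process::Child;

async fn wait_for_health(meilisearch: &mut Child) -> anyhow::Result<()> {
    for _ in 0..100 {
        // Any successful response from /health means the server is up.
        if let Ok(response) = reqwest::get("http://127.0.0.1:7700/health").await {
            if response.status().is_success() {
                return Ok(());
            }
        }
        tokio::time::sleep(Duration::from_millis(500)).await;

        // Cut the wait short if the server already exited.
        if let Some(exit_code) = meilisearch
            .try_wait()
            .context("cannot check Meilisearch server process status")?
        {
            bail!("Meilisearch server exited early with {exit_code}");
        }
    }
    bail!("meilisearch is not responding")
}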

View File

@ -86,12 +86,6 @@ pub struct BenchDeriveArgs {
/// The maximum time in seconds we allow for fetching the task queue before timing out.
#[arg(long, default_value_t = 60)]
tasks_queue_timeout_secs: u64,
/// The path to the binary to run.
///
/// If unspecified, runs `cargo run` after building Meilisearch with `cargo build`.
#[arg(long)]
binary_path: Option<PathBuf>,
}
pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> {
@ -145,7 +139,7 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> {
rt.block_on(async {
dashboard_client.send_machine_info(&env).await?;
let commit_message = build_info.commit_msg.unwrap_or_default().split('\n').next().unwrap();
let commit_message = build_info.commit_msg.context("missing commit message")?.split('\n').next().unwrap();
let max_workloads = args.workload_file.len();
let reason: Option<&str> = args.reason.as_deref();
let invocation_uuid = dashboard_client.create_invocation(build_info.clone(), commit_message, env, max_workloads, reason).await?;
@ -176,7 +170,6 @@ pub fn run(args: BenchDeriveArgs) -> anyhow::Result<()> {
args.master_key.as_deref(),
workload,
&args,
args.binary_path.as_deref(),
)
.await?;
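The `--binary-path` flag removed above lets the bench harness run a prebuilt binary instead of building and running through cargo. A sketch of how such a flag can drive the command selection, using clap v4 with the derive feature; the struct name and helper are illustrative, only the flag and the cargo arguments mirror the diff:

use std::path::{Path, PathBuf};

use clap::Parser;
use tokio::process::Command;

#[derive(Parser)]
struct BenchArgs {
    /// The path to the binary to run.
    ///
    /// If unspecified, runs `cargo run` after building Meilisearch with `cargo build`.
    #[arg(long)]
    binary_path: Option<PathBuf>,
}

fn run_command(binary_path: Option<&Path>) -> Command {
    match binary_path {
        // A prebuilt binary skips the compile step entirely.
        Some(binary_path) => Command::new(binary_path),
        // Otherwise fall back to running the workspace binary through cargo.
        None => {
            let mut command = Command::new("cargo");
            command.args(["run", "--release", "-p", "meilisearch", "--bin", "meilisearch", "--"]);
            command
        }
    }
}

fn main() {
    let args = BenchArgs::parse();
    let _command = run_command(args.binary_path.as_deref());
}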

View File

@ -1,7 +1,6 @@
use std::collections::BTreeMap;
use std::fs::File;
use std::io::{Seek as _, Write as _};
use std::path::Path;
use anyhow::{bail, Context as _};
use futures_util::TryStreamExt as _;
@ -86,13 +85,13 @@ pub async fn execute(
master_key: Option<&str>,
workload: Workload,
args: &BenchDeriveArgs,
binary_path: Option<&Path>,
) -> anyhow::Result<()> {
assets::fetch_assets(assets_client, &workload.assets, &args.asset_folder).await?;
let workload_uuid = dashboard_client.create_workload(invocation_uuid, &workload).await?;
let mut tasks = Vec::new();
for i in 0..workload.run_count {
tasks.push(
execute_run(
@ -103,7 +102,6 @@ pub async fn execute(
master_key,
&workload,
args,
binary_path,
i,
)
.await?,
@ -111,6 +109,7 @@ pub async fn execute(
}
let mut reports = Vec::with_capacity(workload.run_count as usize);
for task in tasks {
reports.push(
task.await
@ -134,31 +133,13 @@ async fn execute_run(
master_key: Option<&str>,
workload: &Workload,
args: &BenchDeriveArgs,
binary_path: Option<&Path>,
run_number: u16,
) -> anyhow::Result<tokio::task::JoinHandle<anyhow::Result<std::fs::File>>> {
meili_process::delete_db();
let run_command = match binary_path {
Some(binary_path) => tokio::process::Command::new(binary_path),
None => {
meili_process::build().await?;
let mut command = tokio::process::Command::new("cargo");
command
.arg("run")
.arg("--release")
.arg("-p")
.arg("meilisearch")
.arg("--bin")
.arg("meilisearch")
.arg("--");
command
}
};
meili_process::build().await?;
let meilisearch =
meili_process::start(meili_client, master_key, workload, &args.asset_folder, run_command)
.await?;
meili_process::start(meili_client, master_key, workload, &args.asset_folder).await?;
let processor = run_commands(
dashboard_client,