Compare commits

...

11 Commits

SHA1 Message Date
af2b722fed Make the CI happy about the never type 2024-09-16 11:40:26 +02:00
8cb7001755 Expose an experimental parameter to control the generation of prefix dbs 2024-09-16 10:57:52 +02:00
882663bf7f Merge #4891
4891: Update version for the next release (v1.9.1) in Cargo.toml r=dureuill a=meili-bot

⚠️ This PR is automatically generated. Check that the new version is the expected one and that Cargo.lock has been updated before merging.

Co-authored-by: dureuill <dureuill@users.noreply.github.com>
2024-08-27 16:04:18 +00:00
3234f63c00 Update version for the next release (v1.9.1) in Cargo.toml 2024-08-27 16:02:43 +00:00
9fff081043 Merge #4889
4889: When `retrieveVectors` is true, retrieve `_vectors.embedder` even if … r=Kerollmops a=dureuill

…there are no vectors for that embedder


backports a bug fix from v1.10.0: 82647bcded

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2024-08-27 15:14:58 +00:00
575b7b7a0b Fix tests 2024-08-27 17:14:10 +02:00
6287f5b204 Remove unexecuted test 2024-08-27 16:54:33 +02:00
5dac8e7168 Allow fuzzing cfg 2024-08-27 16:43:44 +02:00
e669af1e49 CI: Add ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION workaround to keep using Ubuntu 18.04 2024-08-27 16:29:45 +02:00
0e0e29459c Update time 2024-08-27 16:27:05 +02:00
c25f7e3450 When retrieveVectors is true, retrieve _vectors.embedder even if there are no vectors for that embedder 2024-08-27 16:26:41 +02:00
18 changed files with 220 additions and 167 deletions

View File

@@ -1,4 +1,6 @@
name: Look for flaky tests
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
on:
workflow_dispatch:
schedule:

View File

@@ -1,5 +1,6 @@
name: Run the indexing fuzzer
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
on:
push:
branches:

View File

@@ -15,6 +15,8 @@ jobs:
debian:
name: Publish debian package
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
needs: check-version
container:

View File

@@ -35,6 +35,8 @@ jobs:
publish-linux:
name: Publish binary for Linux
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
needs: check-version
container:
# Use ubuntu-18.04 to compile with glibc 2.27
@@ -132,6 +134,8 @@ jobs:
name: Publish binary for aarch64
runs-on: ubuntu-latest
needs: check-version
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27
image: ubuntu:18.04

View File

@@ -21,6 +21,8 @@ jobs:
test-linux:
name: Tests on ubuntu-18.04
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which is the production expectation
image: ubuntu:18.04
@@ -77,6 +79,8 @@ jobs:
test-all-features:
name: Tests almost all features
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which is the production expectation
image: ubuntu:18.04
@@ -100,6 +104,8 @@ jobs:
test-disabled-tokenization:
name: Test disabled tokenization
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
container:
image: ubuntu:18.04
@@ -127,6 +133,8 @@ jobs:
# We run tests in debug also, to make sure that the debug_assertions are hit
test-debug:
name: Run tests in debug
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which is the production expectation

Cargo.lock generated (42 changed lines)
View File

@@ -503,7 +503,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]]
name = "benchmarks"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"anyhow",
"bytes",
@@ -648,7 +648,7 @@ dependencies = [
[[package]]
name = "build-info"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"anyhow",
"time",
@@ -1579,7 +1579,7 @@ dependencies = [
[[package]]
name = "dump"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"anyhow",
"big_s",
@@ -1804,7 +1804,7 @@ dependencies = [
[[package]]
name = "file-store"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"faux",
"tempfile",
@@ -1827,7 +1827,7 @@ dependencies = [
[[package]]
name = "filter-parser"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"insta",
"nom",
@@ -1847,7 +1847,7 @@ dependencies = [
[[package]]
name = "flatten-serde-json"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"criterion",
"serde_json",
@@ -1965,7 +1965,7 @@ dependencies = [
[[package]]
name = "fuzzers"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"arbitrary",
"clap",
@@ -2452,7 +2452,7 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d"
[[package]]
name = "index-scheduler"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"anyhow",
"arroy",
@@ -2649,7 +2649,7 @@ dependencies = [
[[package]]
name = "json-depth-checker"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"criterion",
"serde_json",
@@ -3257,7 +3257,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "meili-snap"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"insta",
"md5",
@@ -3266,7 +3266,7 @@ dependencies = [
[[package]]
name = "meilisearch"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"actix-cors",
"actix-http",
@@ -3358,7 +3358,7 @@ dependencies = [
[[package]]
name = "meilisearch-auth"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"base64 0.21.7",
"enum-iterator",
@@ -3377,7 +3377,7 @@ dependencies = [
[[package]]
name = "meilisearch-types"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"actix-web",
"anyhow",
@@ -3407,7 +3407,7 @@ dependencies = [
[[package]]
name = "meilitool"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"anyhow",
"clap",
@@ -3446,7 +3446,7 @@ dependencies = [
[[package]]
name = "milli"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"arroy",
"big_s",
@@ -3886,7 +3886,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "permissive-json-pointer"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"big_s",
"serde_json",
@@ -5098,9 +5098,9 @@ dependencies = [
[[package]]
name = "time"
version = "0.3.34"
version = "0.3.36"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749"
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
dependencies = [
"deranged",
"itoa",
@ -5121,9 +5121,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.17"
version = "0.2.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774"
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
dependencies = [
"num-conv",
"time-core",
@@ -6042,7 +6042,7 @@ dependencies = [
[[package]]
name = "xtask"
version = "1.9.0"
version = "1.9.1"
dependencies = [
"anyhow",
"build-info",

View File

@@ -22,7 +22,7 @@ members = [
]
[workspace.package]
version = "1.9.0"
version = "1.9.1"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",

View File

@@ -897,91 +897,95 @@ impl IndexScheduler {
dump_tasks.flush()?;
// 3. Dump the indexes
self.index_mapper.try_for_each_index(&rtxn, |uid, index| -> Result<()> {
let rtxn = index.read_txn()?;
let metadata = IndexMetadata {
uid: uid.to_owned(),
primary_key: index.primary_key(&rtxn)?.map(String::from),
created_at: index.created_at(&rtxn)?,
updated_at: index.updated_at(&rtxn)?,
};
let mut index_dumper = dump.create_index(uid, &metadata)?;
let () =
self.index_mapper.try_for_each_index(&rtxn, |uid, index| -> Result<()> {
let rtxn = index.read_txn()?;
let metadata = IndexMetadata {
uid: uid.to_owned(),
primary_key: index.primary_key(&rtxn)?.map(String::from),
created_at: index.created_at(&rtxn)?,
updated_at: index.updated_at(&rtxn)?,
};
let mut index_dumper = dump.create_index(uid, &metadata)?;
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index.embedding_configs(&rtxn)?;
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index.embedding_configs(&rtxn)?;
// 3.1. Dump the documents
for ret in index.all_documents(&rtxn)? {
if self.must_stop_processing.get() {
return Err(Error::AbortedTask);
}
let (id, doc) = ret?;
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
'inject_vectors: {
let embeddings = index.embeddings(&rtxn, id)?;
if embeddings.is_empty() {
break 'inject_vectors;
// 3.1. Dump the documents
for ret in index.all_documents(&rtxn)? {
if self.must_stop_processing.get() {
return Err(Error::AbortedTask);
}
let vectors = document
.entry(RESERVED_VECTORS_FIELD_NAME.to_owned())
.or_insert(serde_json::Value::Object(Default::default()));
let (id, doc) = ret?;
let serde_json::Value::Object(vectors) = vectors else {
return Err(milli::Error::UserError(
milli::UserError::InvalidVectorsMapType {
document_id: {
if let Ok(Some(Ok(index))) = index
.external_id_of(&rtxn, std::iter::once(id))
.map(|it| it.into_iter().next())
{
index
} else {
format!("internal docid={id}")
}
let mut document =
milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
'inject_vectors: {
let embeddings = index.embeddings(&rtxn, id)?;
if embeddings.is_empty() {
break 'inject_vectors;
}
let vectors = document
.entry(RESERVED_VECTORS_FIELD_NAME.to_owned())
.or_insert(serde_json::Value::Object(Default::default()));
let serde_json::Value::Object(vectors) = vectors else {
return Err(milli::Error::UserError(
milli::UserError::InvalidVectorsMapType {
document_id: {
if let Ok(Some(Ok(index))) = index
.external_id_of(&rtxn, std::iter::once(id))
.map(|it| it.into_iter().next())
{
index
} else {
format!("internal docid={id}")
}
},
value: vectors.clone(),
},
value: vectors.clone(),
},
)
.into());
};
for (embedder_name, embeddings) in embeddings {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == embedder_name)
.is_some_and(|conf| conf.user_provided.contains(id));
let embeddings = ExplicitVectors {
embeddings: Some(
VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
),
regenerate: !user_provided,
)
.into());
};
vectors.insert(
embedder_name,
serde_json::to_value(embeddings).unwrap(),
);
for (embedder_name, embeddings) in embeddings {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == embedder_name)
.is_some_and(|conf| conf.user_provided.contains(id));
let embeddings = ExplicitVectors {
embeddings: Some(
VectorOrArrayOfVectors::from_array_of_vectors(
embeddings,
),
),
regenerate: !user_provided,
};
vectors.insert(
embedder_name,
serde_json::to_value(embeddings).unwrap(),
);
}
}
index_dumper.push_document(&document)?;
}
index_dumper.push_document(&document)?;
}
// 3.2. Dump the settings
let settings = meilisearch_types::settings::settings(
index,
&rtxn,
meilisearch_types::settings::SecretPolicy::RevealSecrets,
)?;
index_dumper.settings(&settings)?;
Ok(())
})?;
// 3.2. Dump the settings
let settings = meilisearch_types::settings::settings(
index,
&rtxn,
meilisearch_types::settings::SecretPolicy::RevealSecrets,
)?;
index_dumper.settings(&settings)?;
Ok(())
})?;
// 4. Dump experimental feature settings
let features = self.features().runtime_features();
@@ -1288,7 +1292,11 @@ impl IndexScheduler {
}
}
let config = IndexDocumentsConfig { update_method: method, ..Default::default() };
let config = IndexDocumentsConfig {
update_method: method,
compute_prefix_databases: self.compute_prefix_databases,
..Default::default()
};
let embedder_configs = index.embedding_configs(index_wtxn)?;
// TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense)
@@ -1398,6 +1406,7 @@ impl IndexScheduler {
let deleted_documents = delete_document_by_filter(
index_wtxn,
filter,
self.compute_prefix_databases,
self.index_mapper.indexer_config(),
self.must_stop_processing.clone(),
index,
@@ -1638,6 +1647,7 @@ impl IndexScheduler {
fn delete_document_by_filter<'a>(
wtxn: &mut RwTxn<'a>,
filter: &serde_json::Value,
compute_prefix_databases: bool,
indexer_config: &IndexerConfig,
must_stop_processing: MustStopProcessing,
index: &'a Index,
@@ -1653,6 +1663,7 @@ fn delete_document_by_filter<'a>(
let config = IndexDocumentsConfig {
update_method: IndexDocumentsMethod::ReplaceDocuments,
compute_prefix_databases,
..Default::default()
};
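
Both task paths in this file now forward the scheduler's flag by hand. A hedged sketch of the shape shared by the two call sites (types from `milli::update`; the surrounding plumbing is elided):

use milli::update::{IndexDocumentsConfig, IndexDocumentsMethod};

// Hedged sketch, not the literal patch: the flag must be forwarded
// explicitly, because `IndexDocumentsConfig::default()` sets
// `compute_prefix_databases` to `true` (see the milli diff at the end
// of this comparison).
fn indexing_config(
    method: IndexDocumentsMethod,
    compute_prefix_databases: bool,
) -> IndexDocumentsConfig {
    IndexDocumentsConfig {
        update_method: method,
        compute_prefix_databases,
        ..Default::default()
    }
}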

View File

@@ -32,6 +32,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
features: _,
max_number_of_tasks: _,
max_number_of_batched_tasks: _,
compute_prefix_databases: _,
wake_up: _,
dumps_path: _,
snapshots_path: _,
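
The `compute_prefix_databases: _` line keeps the snapshot function's destructuring exhaustive. A minimal sketch of why that pattern matters, using a hypothetical two-field struct rather than the real scheduler:

// Exhaustive destructuring (no `..`): adding a field to the struct turns
// this into a compile error until the field is listed, so nothing can be
// added to `IndexScheduler` without deciding how it shows up in snapshots.
struct Scheduler {
    autobatching_enabled: bool,
    compute_prefix_databases: bool,
}

fn snapshot_scheduler(scheduler: &Scheduler) -> String {
    let Scheduler { autobatching_enabled: _, compute_prefix_databases: _ } = scheduler;
    String::new() // the real code then formats the fields it cares about
}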

View File

@@ -276,6 +276,8 @@ pub struct IndexSchedulerOptions {
pub max_number_of_batched_tasks: usize,
/// The experimental features enabled for this instance.
pub instance_features: InstanceTogglableFeatures,
/// An experimental option to control the generation of prefix databases.
pub compute_prefix_databases: bool,
}
/// Structure which holds meilisearch's indexes and schedules the tasks
@@ -283,19 +285,13 @@ pub struct IndexSchedulerOptions {
pub struct IndexScheduler {
/// The LMDB environment which the DBs are associated with.
pub(crate) env: Env,
/// A boolean that can be set to true to stop the currently processing tasks.
pub(crate) must_stop_processing: MustStopProcessing,
/// The list of tasks currently processing
pub(crate) processing_tasks: Arc<RwLock<ProcessingTasks>>,
/// The list of files referenced by the tasks
pub(crate) file_store: FileStore,
// The main database, it contains all the tasks accessible by their Id.
pub(crate) file_store: FileStore, // The main database, it contains all the tasks accessible by their Id.
pub(crate) all_tasks: Database<BEU32, SerdeJson<Task>>,
/// All the tasks ids grouped by their status.
// TODO we should not be able to serialize a `Status::Processing` in this database.
pub(crate) status: Database<SerdeBincode<Status>, RoaringBitmapCodec>,
@@ -303,58 +299,43 @@ pub struct IndexScheduler {
pub(crate) kind: Database<SerdeBincode<Kind>, RoaringBitmapCodec>,
/// Store the tasks associated to an index.
pub(crate) index_tasks: Database<Str, RoaringBitmapCodec>,
/// Store the tasks that were canceled by a task uid
pub(crate) canceled_by: Database<BEU32, RoaringBitmapCodec>,
/// Store the task ids of tasks which were enqueued at a specific date
pub(crate) enqueued_at: Database<BEI128, CboRoaringBitmapCodec>,
/// Store the task ids of finished tasks which started being processed at a specific date
pub(crate) started_at: Database<BEI128, CboRoaringBitmapCodec>,
/// Store the task ids of tasks which finished at a specific date
pub(crate) finished_at: Database<BEI128, CboRoaringBitmapCodec>,
/// In charge of creating, opening, storing and returning indexes.
pub(crate) index_mapper: IndexMapper,
/// In charge of fetching and setting the status of experimental features.
features: features::FeatureData,
/// Get a signal when a batch needs to be processed.
pub(crate) wake_up: Arc<SignalEvent>,
/// Whether auto-batching is enabled or not.
pub(crate) autobatching_enabled: bool,
/// Whether we should automatically cleanup the task queue or not.
pub(crate) cleanup_enabled: bool,
/// The max number of tasks allowed before the scheduler starts to delete
/// the finished tasks automatically.
pub(crate) max_number_of_tasks: usize,
/// The maximum number of tasks that will be batched together.
pub(crate) max_number_of_batched_tasks: usize,
/// Control whether we must generate the prefix databases or not.
pub(crate) compute_prefix_databases: bool,
/// The webhook url we should send tasks to after processing every batch.
pub(crate) webhook_url: Option<String>,
/// The Authorization header to send to the webhook URL.
pub(crate) webhook_authorization_header: Option<String>,
/// The path used to create the dumps.
pub(crate) dumps_path: PathBuf,
/// The path used to create the snapshots.
pub(crate) snapshots_path: PathBuf,
/// The path to the folder containing the auth LMDB env.
pub(crate) auth_path: PathBuf,
/// The path to the version file of Meilisearch.
pub(crate) version_file_path: PathBuf,
embedders: Arc<RwLock<HashMap<EmbedderOptions, Arc<Embedder>>>>,
// ================= test
@@ -364,13 +345,11 @@ pub struct IndexScheduler {
/// See [self.breakpoint()](`IndexScheduler::breakpoint`) for an explanation.
#[cfg(test)]
test_breakpoint_sdr: crossbeam::channel::Sender<(Breakpoint, bool)>,
/// A list of planned failures within the [`tick`](IndexScheduler::tick) method of the index scheduler.
///
/// The first field is the iteration index and the second field identifies a location in the code.
#[cfg(test)]
planned_failures: Vec<(usize, tests::FailureLocation)>,
/// A counter that is incremented before every call to [`tick`](IndexScheduler::tick)
#[cfg(test)]
run_loop_iteration: Arc<RwLock<usize>>,
@@ -397,6 +376,7 @@ impl IndexScheduler {
cleanup_enabled: self.cleanup_enabled,
max_number_of_tasks: self.max_number_of_tasks,
max_number_of_batched_tasks: self.max_number_of_batched_tasks,
compute_prefix_databases: self.compute_prefix_databases,
snapshots_path: self.snapshots_path.clone(),
dumps_path: self.dumps_path.clone(),
auth_path: self.auth_path.clone(),
@@ -499,6 +479,7 @@ impl IndexScheduler {
cleanup_enabled: options.cleanup_enabled,
max_number_of_tasks: options.max_number_of_tasks,
max_number_of_batched_tasks: options.max_number_of_batched_tasks,
compute_prefix_databases: options.compute_prefix_databases,
dumps_path: options.dumps_path,
snapshots_path: options.snapshots_path,
auth_path: options.auth_path,
@@ -1819,6 +1800,7 @@ mod tests {
max_number_of_tasks: 1_000_000,
max_number_of_batched_tasks: usize::MAX,
instance_features: Default::default(),
compute_prefix_databases: true,
};
configuration(&mut options);
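
For reference, a hedged sketch of opting out at construction time; `base_options()` is a hypothetical helper standing in for the full configuration above, not part of the diff:

fn no_prefix_options() -> IndexSchedulerOptions {
    IndexSchedulerOptions {
        // Experimental: skip prefix-database generation entirely.
        compute_prefix_databases: false,
        ..base_options() // hypothetical helper filling in the remaining fields
    }
}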

View File

@@ -256,6 +256,7 @@ struct Infos {
experimental_enable_logs_route: bool,
experimental_reduce_indexing_memory_usage: bool,
experimental_max_number_of_batched_tasks: usize,
experimental_disable_prefix_db: bool,
gpu_enabled: bool,
db_path: bool,
import_dump: bool,
@@ -298,6 +299,7 @@ impl From<Opt> for Infos {
experimental_enable_logs_route,
experimental_reduce_indexing_memory_usage,
experimental_max_number_of_batched_tasks,
experimental_disable_prefix_db,
http_addr,
master_key: _,
env,
@@ -347,6 +349,7 @@ impl From<Opt> for Infos {
experimental_replication_parameters,
experimental_enable_logs_route,
experimental_reduce_indexing_memory_usage,
experimental_disable_prefix_db,
gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(),
db_path: db_path != PathBuf::from("./data.ms"),
import_dump: import_dump.is_some(),

View File

@@ -311,6 +311,7 @@ fn open_or_create_database_unchecked(
index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
index_count: DEFAULT_INDEX_COUNT,
instance_features,
compute_prefix_databases: !opt.experimental_disable_prefix_db,
})?)
};

View File

@@ -60,6 +60,7 @@ const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
"MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str =
"MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS";
const MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB: &str = "MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB";
const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
const DEFAULT_DB_PATH: &str = "./data.ms";
@@ -389,6 +390,11 @@ pub struct Opt {
#[serde(default = "default_limit_batched_tasks")]
pub experimental_max_number_of_batched_tasks: usize,
/// Experimentally disable the prefix database, see: <https://github.com/orgs/meilisearch/discussions>
#[clap(long, env = MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB)]
#[serde(default)]
pub experimental_disable_prefix_db: bool,
#[serde(flatten)]
#[clap(flatten)]
pub indexer_options: IndexerOpts,
@@ -489,6 +495,7 @@ impl Opt {
experimental_enable_logs_route,
experimental_replication_parameters,
experimental_reduce_indexing_memory_usage,
experimental_disable_prefix_db,
} = self;
export_to_env_if_not_present(MEILI_DB_PATH, db_path);
export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr);
@@ -518,6 +525,10 @@ impl Opt {
MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS,
experimental_max_number_of_batched_tasks.to_string(),
);
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB,
experimental_disable_prefix_db.to_string(),
);
if let Some(ssl_cert_path) = ssl_cert_path {
export_to_env_if_not_present(MEILI_SSL_CERT_PATH, ssl_cert_path);
}
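
A hedged sketch of the resulting user-facing surface; it assumes `Opt` derives `clap::Parser` (which the `#[clap(...)]` attributes imply) and that this sits in a test next to `Opt`:

use clap::Parser;

#[test]
fn experimental_disable_prefix_db_flag_parses() {
    // CLI: meilisearch --experimental-disable-prefix-db
    // Env: MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB=true
    let opt = Opt::parse_from(["meilisearch", "--experimental-disable-prefix-db"]);
    assert!(opt.experimental_disable_prefix_db);
    // lib.rs above then inverts the flag for the scheduler:
    // compute_prefix_databases = !opt.experimental_disable_prefix_db
}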

View File

@@ -644,7 +644,12 @@ async fn get_document_with_vectors() {
{
"id": 1,
"name": "echo",
"_vectors": {}
"_vectors": {
"manual": {
"embeddings": [],
"regenerate": false
}
}
}
],
"offset": 0,
@@ -700,7 +705,12 @@ async fn get_document_with_vectors() {
},
{
"name": "echo",
"_vectors": {}
"_vectors": {
"manual": {
"embeddings": [],
"regenerate": false
}
}
}
],
"offset": 0,

View File

@@ -119,7 +119,12 @@ async fn add_remove_user_provided() {
{
"id": 1,
"name": "echo",
"_vectors": {}
"_vectors": {
"manual": {
"embeddings": [],
"regenerate": false
}
}
}
],
"offset": 0,
@@ -141,7 +146,12 @@ async fn add_remove_user_provided() {
{
"id": 1,
"name": "echo",
"_vectors": {}
"_vectors": {
"manual": {
"embeddings": [],
"regenerate": false
}
}
}
],
"offset": 0,
@@ -577,7 +587,12 @@ async fn add_remove_one_vector_4588() {
{
"id": 0,
"name": "kefir",
"_vectors": {}
"_vectors": {
"manual": {
"embeddings": [],
"regenerate": false
}
}
}
],
"offset": 0,

View File

@@ -141,3 +141,6 @@ swedish-recomposition = ["charabia/swedish-recomposition"]
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
cuda = ["candle-core/cuda"]
[lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(fuzzing)'] }
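
cargo-fuzz passes `--cfg fuzzing` on the command line rather than through a Cargo feature, so without the `check-cfg` entry above a recent rustc warns that the cfg is unexpected. A minimal sketch of the kind of gate this keeps warning-free (the function is illustrative, not from the codebase):

#[cfg(fuzzing)]
pub fn assert_expensive_invariants() {
    // heavy checks compiled only into fuzzing builds
}

#[cfg(not(fuzzing))]
pub fn assert_expensive_invariants() {
    // no-op everywhere else
}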

View File

@@ -1230,6 +1230,11 @@ impl Index {
)
}
/// Deletes the FST which is the words prefixes dictionary of the engine.
pub fn delete_words_prefixes_fst(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.remap_key_type::<Str>().delete(wtxn, main_key::WORDS_PREFIXES_FST_KEY)
}
/// Returns the FST which is the words prefixes dictionary of the engine.
pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
match self.main.remap_types::<Str, Bytes>().get(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? {
@@ -1660,9 +1665,7 @@ impl Index {
}
}
if !embeddings.is_empty() {
res.insert(embedder_name.to_owned(), embeddings);
}
res.insert(embedder_name.to_owned(), embeddings);
}
Ok(res)
}
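
Two changes here back the commits above: `delete_words_prefixes_fst` supports the disabled-prefix path (used in the next file), and dropping the `is_empty` guard makes `embeddings` report every configured embedder, empty or not. A hedged usage sketch of the new helper (transaction plumbing assumed; `index` is a `milli::Index`):

fn clear_prefix_fst(index: &milli::Index) -> milli::Result<()> {
    // Delete the prefix FST inside a write transaction; `delete` returns
    // whether the key existed, which we can ignore here.
    let mut wtxn = index.write_txn()?;
    index.delete_words_prefixes_fst(&mut wtxn)?;
    wtxn.commit()?;
    Ok(())
}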

View File

@@ -85,7 +85,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
embedders: EmbeddingConfigs,
}
#[derive(Default, Debug, Clone)]
#[derive(Debug, Clone)]
pub struct IndexDocumentsConfig {
pub words_prefix_threshold: Option<u32>,
pub max_prefix_length: Option<usize>,
@@ -93,6 +93,21 @@ pub struct IndexDocumentsConfig {
pub words_positions_min_level_size: Option<NonZeroU32>,
pub update_method: IndexDocumentsMethod,
pub autogenerate_docids: bool,
pub compute_prefix_databases: bool,
}
impl Default for IndexDocumentsConfig {
fn default() -> Self {
Self {
words_prefix_threshold: Default::default(),
max_prefix_length: Default::default(),
words_positions_level_group_size: Default::default(),
words_positions_min_level_size: Default::default(),
update_method: Default::default(),
autogenerate_docids: Default::default(),
compute_prefix_databases: true,
}
}
}
impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA>
@@ -558,12 +573,20 @@ where
.map_err(InternalError::from)??;
}
self.execute_prefix_databases(
word_docids.map(MergerBuilder::build),
exact_word_docids.map(MergerBuilder::build),
word_position_docids.map(MergerBuilder::build),
word_fid_docids.map(MergerBuilder::build),
)?;
if self.config.compute_prefix_databases {
self.execute_prefix_databases(
word_docids.map(MergerBuilder::build),
exact_word_docids.map(MergerBuilder::build),
word_position_docids.map(MergerBuilder::build),
word_fid_docids.map(MergerBuilder::build),
)?;
} else {
self.index.delete_words_prefixes_fst(self.wtxn)?;
self.index.word_prefix_docids.clear(self.wtxn)?;
self.index.exact_word_prefix_docids.clear(self.wtxn)?;
self.index.word_prefix_position_docids.clear(self.wtxn)?;
self.index.word_prefix_fid_docids.clear(self.wtxn)?;
}
Ok(number_of_documents)
}
@@ -2180,33 +2203,6 @@ mod tests {
index.add_documents(doc1).unwrap();
}
#[cfg(feature = "default")]
#[test]
fn store_detected_script_and_language_per_document_during_indexing() {
use charabia::{Language, Script};
let index = TempIndex::new();
index
.add_documents(documents!([
{ "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
{ "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
{ "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
{ "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" },
{ "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" },
{ "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" },
]))
.unwrap();
let rtxn = index.read_txn().unwrap();
let key_jpn = (Script::Cj, Language::Jpn);
let key_cmn = (Script::Cj, Language::Cmn);
let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap();
let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap();
let expected_cj_jpn_docids = [3].iter().collect();
assert_eq!(cj_jpn_docs, expected_cj_jpn_docids);
let expected_cj_cmn_docids = [1, 5].iter().collect();
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
}
#[test]
fn add_and_delete_documents_in_single_transform() {
let mut index = TempIndex::new();