Fix the tests

update the description of the cli argument
Expose a new flag to limit the number of batched tasks
2025-12-02 18:55:36 +00:00 · 2023-09-14 16:29:03 +02:00 · 2023-09-14 15:50:36 +02:00 · 2023-09-14 15:48:32 +02:00
19 changed files with 63 additions and 382 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -469,7 +469,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"

 [[package]]
 name = "benchmarks"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "anyhow",
 "bytes",
@@ -1199,7 +1199,7 @@ dependencies = [

 [[package]]
 name = "dump"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "anyhow",
 "big_s",
@@ -1413,7 +1413,7 @@ dependencies = [

 [[package]]
 name = "file-store"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "faux",
 "tempfile",
@@ -1435,7 +1435,7 @@ dependencies = [

 [[package]]
 name = "filter-parser"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "insta",
 "nom",
@@ -1454,7 +1454,7 @@ dependencies = [

 [[package]]
 name = "flatten-serde-json"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "criterion",
 "serde_json",
@@ -1572,7 +1572,7 @@ dependencies = [

 [[package]]
 name = "fuzzers"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "arbitrary",
 "clap",
@@ -1894,7 +1894,7 @@ dependencies = [

 [[package]]
 name = "index-scheduler"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "anyhow",
 "big_s",
@@ -2081,7 +2081,7 @@ dependencies = [

 [[package]]
 name = "json-depth-checker"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "criterion",
 "serde_json",
@@ -2493,7 +2493,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "meili-snap"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "insta",
 "md5",
@@ -2502,7 +2502,7 @@ dependencies = [

 [[package]]
 name = "meilisearch"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "actix-cors",
 "actix-http",
@@ -2591,7 +2591,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-auth"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "base64 0.21.2",
 "enum-iterator",
@@ -2610,7 +2610,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-types"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "actix-web",
 "anyhow",
@@ -2664,7 +2664,7 @@ dependencies = [

 [[package]]
 name = "milli"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "big_s",
 "bimap",
@@ -2994,7 +2994,7 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"

 [[package]]
 name = "permissive-json-pointer"
-version = "1.3.1"
+version = "1.3.0"
 dependencies = [
 "big_s",
 "serde_json",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ members = [
 ]

 [workspace.package]
-version = "1.3.1"
+version = "1.3.0"
 authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
 description = "Meilisearch HTTP server"
 homepage = "https://meilisearch.com"
--- a/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-10.snap
+++ b/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-10.snap
@@ -1,24 +0,0 @@
---
-source: dump/src/reader/mod.rs
-expression: spells.settings().unwrap()
---
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [],
-  "sortableAttributes": [],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness"
-  ],
-  "stopWords": [],
-  "synonyms": {},
-  "distinctAttribute": null
-}
--- a/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-4.snap
+++ b/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-4.snap
@@ -1,38 +0,0 @@
---
-source: dump/src/reader/mod.rs
-expression: products.settings().unwrap()
---
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [],
-  "sortableAttributes": [],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness"
-  ],
-  "stopWords": [],
-  "synonyms": {
-    "android": [
-      "phone",
-      "smartphone"
-    ],
-    "iphone": [
-      "phone",
-      "smartphone"
-    ],
-    "phone": [
-      "android",
-      "iphone",
-      "smartphone"
-    ]
-  },
-  "distinctAttribute": null
-}
--- a/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-7.snap
+++ b/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-7.snap
@@ -1,31 +0,0 @@
---
-source: dump/src/reader/mod.rs
-expression: movies.settings().unwrap()
---
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [
-    "genres",
-    "id"
-  ],
-  "sortableAttributes": [
-    "genres",
-    "id"
-  ],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness",
-    "release_date:asc"
-  ],
-  "stopWords": [],
-  "synonyms": {},
-  "distinctAttribute": null
-}
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -534,7 +534,9 @@ impl IndexScheduler {
        let index_tasks = self.index_tasks(rtxn, index_name)? & enqueued;

        // If autobatching is disabled we only take one task at a time.
-        let tasks_limit = if self.autobatching_enabled { usize::MAX } else { 1 };
+        // Otherwise, we take at most `maximum_number_of_batched_tasks` tasks per batch.
+        let tasks_limit =
+            if self.autobatching_enabled { self.maximum_number_of_batched_tasks } else { 1 };

        let enqueued = index_tasks
            .into_iter()
--- a/index-scheduler/src/insta_snapshot.rs
+++ b/index-scheduler/src/insta_snapshot.rs
@@ -15,6 +15,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {

    let IndexScheduler {
        autobatching_enabled,
+        maximum_number_of_batched_tasks: _,
        must_stop_processing: _,
        processing_tasks,
        file_store,
--- a/index-scheduler/src/lib.rs
+++ b/index-scheduler/src/lib.rs
@@ -253,6 +253,9 @@ pub struct IndexSchedulerOptions {
    /// Set to `true` iff the index scheduler is allowed to automatically
    /// batch tasks together, to process multiple tasks at once.
    pub autobatching_enabled: bool,
+    /// When autobatching is enabled, the maximum number of tasks
+    /// that the autobatcher is allowed to batch together at once.
+    pub maximum_number_of_batched_tasks: usize,
    /// The maximum number of tasks stored in the task queue before starting
    /// to auto schedule task deletions.
    pub max_number_of_tasks: usize,
@@ -310,6 +313,9 @@ pub struct IndexScheduler {
    /// Whether auto-batching is enabled or not.
    pub(crate) autobatching_enabled: bool,

+    /// The maximum number of tasks that will be batched together.
+    pub(crate) maximum_number_of_batched_tasks: usize,
+
    /// The max number of tasks allowed before the scheduler starts to delete
    /// the finished tasks automatically.
    pub(crate) max_number_of_tasks: usize,
@@ -363,6 +369,7 @@ impl IndexScheduler {
            index_mapper: self.index_mapper.clone(),
            wake_up: self.wake_up.clone(),
            autobatching_enabled: self.autobatching_enabled,
+            maximum_number_of_batched_tasks: self.maximum_number_of_batched_tasks,
            max_number_of_tasks: self.max_number_of_tasks,
            snapshots_path: self.snapshots_path.clone(),
            dumps_path: self.dumps_path.clone(),
@@ -458,6 +465,7 @@ impl IndexScheduler {
            // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things
            wake_up: Arc::new(SignalEvent::auto(true)),
            autobatching_enabled: options.autobatching_enabled,
+            maximum_number_of_batched_tasks: options.maximum_number_of_batched_tasks,
            max_number_of_tasks: options.max_number_of_tasks,
            dumps_path: options.dumps_path,
            snapshots_path: options.snapshots_path,
@@ -790,19 +798,10 @@ impl IndexScheduler {

        let mut res = BTreeMap::new();

-        let processing_tasks = { self.processing_tasks.read().unwrap().processing.len() };
-
        res.insert(
            "statuses".to_string(),
            enum_iterator::all::<Status>()
-                .map(|s| {
-                    let tasks = self.get_status(&rtxn, s)?.len();
-                    match s {
-                        Status::Enqueued => Ok((s.to_string(), tasks - processing_tasks)),
-                        Status::Processing => Ok((s.to_string(), processing_tasks)),
-                        s => Ok((s.to_string(), tasks)),
-                    }
-                })
+                .map(|s| Ok((s.to_string(), self.get_status(&rtxn, s)?.len())))
                .collect::<Result<BTreeMap<String, u64>>>()?,
        );
        res.insert(
@@ -1587,6 +1586,7 @@ mod tests {
                index_count: 5,
                indexer_config,
                autobatching_enabled: true,
+                maximum_number_of_batched_tasks: usize::MAX,
                max_number_of_tasks: 1_000_000,
                instance_features: Default::default(),
            };
@@ -4138,154 +4138,4 @@ mod tests {
        snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "everything_has_been_processed");
        drop(rtxn);
    }
-
-    #[test]
-    fn basic_get_stats() {
-        let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
-
-        let kind = index_creation_task("catto", "mouse");
-        let _task = index_scheduler.register(kind).unwrap();
-        let kind = index_creation_task("doggo", "sheep");
-        let _task = index_scheduler.register(kind).unwrap();
-        let kind = index_creation_task("whalo", "fish");
-        let _task = index_scheduler.register(kind).unwrap();
-
-        snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
-        {
-          "indexes": {
-            "catto": 1,
-            "doggo": 1,
-            "whalo": 1
-          },
-          "statuses": {
-            "canceled": 0,
-            "enqueued": 3,
-            "failed": 0,
-            "processing": 0,
-            "succeeded": 0
-          },
-          "types": {
-            "documentAdditionOrUpdate": 0,
-            "documentDeletion": 0,
-            "dumpCreation": 0,
-            "indexCreation": 3,
-            "indexDeletion": 0,
-            "indexSwap": 0,
-            "indexUpdate": 0,
-            "settingsUpdate": 0,
-            "snapshotCreation": 0,
-            "taskCancelation": 0,
-            "taskDeletion": 0
-          }
-        }
-        "###);
-
-        handle.advance_till([Start, BatchCreated]);
-        snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
-        {
-          "indexes": {
-            "catto": 1,
-            "doggo": 1,
-            "whalo": 1
-          },
-          "statuses": {
-            "canceled": 0,
-            "enqueued": 2,
-            "failed": 0,
-            "processing": 1,
-            "succeeded": 0
-          },
-          "types": {
-            "documentAdditionOrUpdate": 0,
-            "documentDeletion": 0,
-            "dumpCreation": 0,
-            "indexCreation": 3,
-            "indexDeletion": 0,
-            "indexSwap": 0,
-            "indexUpdate": 0,
-            "settingsUpdate": 0,
-            "snapshotCreation": 0,
-            "taskCancelation": 0,
-            "taskDeletion": 0
-          }
-        }
-        "###);
-
-        handle.advance_till([
-            InsideProcessBatch,
-            InsideProcessBatch,
-            ProcessBatchSucceeded,
-            AfterProcessing,
-            Start,
-            BatchCreated,
-        ]);
-        snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
-        {
-          "indexes": {
-            "catto": 1,
-            "doggo": 1,
-            "whalo": 1
-          },
-          "statuses": {
-            "canceled": 0,
-            "enqueued": 1,
-            "failed": 0,
-            "processing": 1,
-            "succeeded": 1
-          },
-          "types": {
-            "documentAdditionOrUpdate": 0,
-            "documentDeletion": 0,
-            "dumpCreation": 0,
-            "indexCreation": 3,
-            "indexDeletion": 0,
-            "indexSwap": 0,
-            "indexUpdate": 0,
-            "settingsUpdate": 0,
-            "snapshotCreation": 0,
-            "taskCancelation": 0,
-            "taskDeletion": 0
-          }
-        }
-        "###);
-
-        // now we make one more batch, the started_at field of the new tasks will be past `second_start_time`
-        handle.advance_till([
-            InsideProcessBatch,
-            InsideProcessBatch,
-            ProcessBatchSucceeded,
-            AfterProcessing,
-            Start,
-            BatchCreated,
-        ]);
-        snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
-        {
-          "indexes": {
-            "catto": 1,
-            "doggo": 1,
-            "whalo": 1
-          },
-          "statuses": {
-            "canceled": 0,
-            "enqueued": 0,
-            "failed": 0,
-            "processing": 1,
-            "succeeded": 2
-          },
-          "types": {
-            "documentAdditionOrUpdate": 0,
-            "documentDeletion": 0,
-            "dumpCreation": 0,
-            "indexCreation": 3,
-            "indexDeletion": 0,
-            "indexSwap": 0,
-            "indexUpdate": 0,
-            "settingsUpdate": 0,
-            "snapshotCreation": 0,
-            "taskCancelation": 0,
-            "taskDeletion": 0
-          }
-        }
-        "###);
-    }
 }
--- a/meili-snap/src/lib.rs
+++ b/meili-snap/src/lib.rs
@@ -167,9 +167,7 @@ macro_rules! snapshot {
        let (settings, snap_name, _) = $crate::default_snapshot_settings_for_test(test_name, Some(&snap_name));
        settings.bind(|| {
            let snap = format!("{}", $value);
-            insta::allow_duplicates! {
-                meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
-            }
+            meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
        });
    };
    ($value:expr, @$inline:literal) => {
@@ -178,9 +176,7 @@ macro_rules! snapshot {
        let (settings, _, _) = $crate::default_snapshot_settings_for_test("", Some("_dummy_argument"));
        settings.bind(|| {
            let snap = format!("{}", $value);
-            insta::allow_duplicates! {
-                meili_snap::insta::assert_snapshot!(snap, @$inline);
-            }
+            meili_snap::insta::assert_snapshot!(snap, @$inline);
        });
    };
    ($value:expr) => {
@@ -198,9 +194,7 @@ macro_rules! snapshot {
        let (settings, snap_name, _) = $crate::default_snapshot_settings_for_test(test_name, None);
        settings.bind(|| {
            let snap = format!("{}", $value);
-            insta::allow_duplicates! {
-                meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
-            }
+            meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
        });
    };
 }
--- a/meilisearch/src/analytics/segment_analytics.rs
+++ b/meilisearch/src/analytics/segment_analytics.rs
@@ -285,6 +285,7 @@ impl From<Opt> for Infos {
            db_path,
            experimental_enable_metrics,
            experimental_reduce_indexing_memory_usage,
+            experimental_limit_batched_tasks: _,
            http_addr,
            master_key: _,
            env,
--- a/meilisearch/src/lib.rs
+++ b/meilisearch/src/lib.rs
@@ -236,6 +236,7 @@ fn open_or_create_database_unchecked(
            enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
            indexer_config: (&opt.indexer_options).try_into()?,
            autobatching_enabled: true,
+            maximum_number_of_batched_tasks: opt.experimental_limit_batched_tasks,
            max_number_of_tasks: 1_000_000,
            index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
            index_count: DEFAULT_INDEX_COUNT,
--- a/meilisearch/src/option.rs
+++ b/meilisearch/src/option.rs
@@ -51,6 +51,7 @@ const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL";
 const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
 const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
    "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
+const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS: &str = "MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS";

 const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
 const DEFAULT_DB_PATH: &str = "./data.ms";
@@ -301,6 +302,11 @@ pub struct Opt {
    #[serde(default)]
    pub experimental_reduce_indexing_memory_usage: bool,

+    /// Experimental limit to the number of tasks per batch
+    #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS, default_value_t = default_limit_batched_tasks())]
+    #[serde(default = "default_limit_batched_tasks")]
+    pub experimental_limit_batched_tasks: usize,
+
    #[serde(flatten)]
    #[clap(flatten)]
    pub indexer_options: IndexerOpts,
@@ -393,7 +399,8 @@ impl Opt {
            #[cfg(all(not(debug_assertions), feature = "analytics"))]
            no_analytics,
            experimental_enable_metrics: enable_metrics_route,
-            experimental_reduce_indexing_memory_usage: reduce_indexing_memory_usage,
+            experimental_reduce_indexing_memory_usage,
+            experimental_limit_batched_tasks,
        } = self;
        export_to_env_if_not_present(MEILI_DB_PATH, db_path);
        export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr);
@@ -437,7 +444,11 @@ impl Opt {
        );
        export_to_env_if_not_present(
            MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE,
-            reduce_indexing_memory_usage.to_string(),
+            experimental_reduce_indexing_memory_usage.to_string(),
+        );
+        export_to_env_if_not_present(
+            MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS,
+            experimental_limit_batched_tasks.to_string(),
        );
        indexer_options.export_to_env();
    }
@@ -739,6 +750,10 @@ fn default_dump_dir() -> PathBuf {
    PathBuf::from(DEFAULT_DUMP_DIR)
 }

+fn default_limit_batched_tasks() -> usize {
+    usize::MAX
+}
+
 /// Indicates if a snapshot was scheduled, and if yes with which interval.
 #[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)]
 pub enum ScheduleSnapshot {
--- a/meilisearch/tests/search/geo.rs
+++ b/meilisearch/tests/search/geo.rs
@@ -1,4 +1,3 @@
-use meili_snap::{json_string, snapshot};
 use once_cell::sync::Lazy;
 use serde_json::{json, Value};

@@ -61,59 +60,3 @@ async fn geo_sort_with_geo_strings() {
        )
        .await;
 }
-
-#[actix_rt::test]
-async fn geo_bounding_box_with_string_and_number() {
-    let server = Server::new().await;
-    let index = server.index("test");
-
-    let documents = DOCUMENTS.clone();
-    index.update_settings_filterable_attributes(json!(["_geo"])).await;
-    index.update_settings_sortable_attributes(json!(["_geo"])).await;
-    index.add_documents(documents, None).await;
-    index.wait_task(2).await;
-
-    index
-        .search(
-            json!({
-                "filter": "_geoBoundingBox([89, 179], [-89, -179])",
-            }),
-            |response, code| {
-                assert_eq!(code, 200, "{}", response);
-                snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
-                {
-                  "hits": [
-                    {
-                      "id": 1,
-                      "name": "Taco Truck",
-                      "address": "444 Salsa Street, Burritoville",
-                      "type": "Mexican",
-                      "rating": 9,
-                      "_geo": {
-                        "lat": 34.0522,
-                        "lng": -118.2437
-                      }
-                    },
-                    {
-                      "id": 2,
-                      "name": "La Bella Italia",
-                      "address": "456 Elm Street, Townsville",
-                      "type": "Italian",
-                      "rating": 9,
-                      "_geo": {
-                        "lat": "45.4777599",
-                        "lng": "9.1967508"
-                      }
-                    }
-                  ],
-                  "query": "",
-                  "processingTimeMs": "[time]",
-                  "limit": 20,
-                  "offset": 0,
-                  "estimatedTotalHits": 2
-                }
-                "###);
-            },
-        )
-        .await;
-}
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1718,11 +1718,11 @@ pub(crate) mod tests {
            .unwrap();
        index
            .add_documents(documents!([
-                { "id": 0, "_geo": { "lat": "0", "lng": "0" } },
-                { "id": 1, "_geo": { "lat": 0, "lng": "-175" } },
-                { "id": 2, "_geo": { "lat": "0", "lng": 175 } },
+                { "id": 0, "_geo": { "lat": 0, "lng": 0 } },
+                { "id": 1, "_geo": { "lat": 0, "lng": -175 } },
+                { "id": 2, "_geo": { "lat": 0, "lng": 175 } },
                { "id": 3, "_geo": { "lat": 85, "lng": 0 } },
-                { "id": 4, "_geo": { "lat": "-85", "lng": "0" } },
+                { "id": 4, "_geo": { "lat": -85, "lng": 0 } },
            ]))
            .unwrap();

--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -97,7 +97,7 @@ const MAX_LMDB_KEY_LENGTH: usize = 500;
 ///
 /// This number is determined by the keys of the different facet databases
 /// and adding a margin of safety.
-pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 32;
+pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;

 /// The maximum length a word can be
 pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -94,7 +94,7 @@ use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValu
 use crate::heed_codec::ByteSliceRefCodec;
 use crate::update::index_documents::create_sorter;
 use crate::update::merge_btreeset_string;
-use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};
+use crate::{BEU16StrCodec, Index, Result, BEU16};

 pub mod bulk;
 pub mod delete;
@@ -191,16 +191,7 @@ impl<'i> FacetsUpdate<'i> {
        for result in database.iter(wtxn)? {
            let (facet_group_key, ()) = result?;
            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
-                let mut normalized_facet = left_bound.normalize(&options);
-                let normalized_truncated_facet: String;
-                if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
-                    normalized_truncated_facet = normalized_facet
-                        .char_indices()
-                        .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
-                        .map(|(_, c)| c)
-                        .collect();
-                    normalized_facet = normalized_truncated_facet.into();
-                }
+                let normalized_facet = left_bound.normalize(&options);
                let set = BTreeSet::from_iter(std::iter::once(left_bound));
                let key = (field_id, normalized_facet.as_ref());
                let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -44,7 +44,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
        if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
            normalised_truncated_value = normalised_value
                .char_indices()
-                .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
+                .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
                .map(|(_, c)| c)
                .collect();
            normalised_value = normalised_truncated_value.as_str();
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -28,13 +28,11 @@ pub struct ExtractedFacetValues {
 ///
 /// Returns the generated grenad reader containing the docid the fid and the orginal value as key
 /// and the normalized value as value extracted from the given chunk of documents.
-/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
 #[logging_timer::time]
 pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
    obkv_documents: grenad::Reader<R>,
    indexer: GrenadParameters,
    faceted_fields: &HashSet<FieldId>,
-    geo_fields_ids: Option<(FieldId, FieldId)>,
 ) -> Result<ExtractedFacetValues> {
    let max_memory = indexer.max_memory_by_thread();

@@ -84,10 +82,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(

                let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;

-                match extract_facet_values(
-                    &value,
-                    geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
-                ) {
+                match extract_facet_values(&value) {
                    FilterableValues::Null => {
                        facet_is_null_docids.entry(field_id).or_default().insert(document);
                    }
@@ -180,13 +175,12 @@ enum FilterableValues {
    Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
 }

-fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
+fn extract_facet_values(value: &Value) -> FilterableValues {
    fn inner_extract_facet_values(
        value: &Value,
        can_recurse: bool,
        output_numbers: &mut Vec<f64>,
        output_strings: &mut Vec<(String, String)>,
-        geo_field: bool,
    ) {
        match value {
            Value::Null => (),
@@ -197,30 +191,13 @@ fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
                }
            }
            Value::String(original) => {
-                // if we're working on a geofield it MUST be something we can parse or else there was an internal error
-                // in the enrich pipeline. But since the enrich pipeline worked, we want to avoid crashing at all costs.
-                if geo_field {
-                    if let Ok(float) = original.parse() {
-                        output_numbers.push(float);
-                    } else {
-                        log::warn!(
-                            "Internal error, could not parse a geofield that has been validated. Please open an issue."
-                        )
-                    }
-                }
                let normalized = crate::normalize_facet(original);
                output_strings.push((normalized, original.clone()));
            }
            Value::Array(values) => {
                if can_recurse {
                    for value in values {
-                        inner_extract_facet_values(
-                            value,
-                            false,
-                            output_numbers,
-                            output_strings,
-                            geo_field,
-                        );
+                        inner_extract_facet_values(value, false, output_numbers, output_strings);
                    }
                }
            }
@@ -236,7 +213,7 @@ fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
        otherwise => {
            let mut numbers = Vec::new();
            let mut strings = Vec::new();
-            inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings, geo_field);
+            inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
            FilterableValues::Values { numbers, strings }
        }
    }
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -366,7 +366,6 @@ fn send_and_extract_flattened_documents_data(
                    flattened_documents_chunk.clone(),
                    indexer,
                    faceted_fields,
-                    geo_fields_ids,
                )?;

                // send docid_fid_facet_numbers_chunk to DB writer
Author	SHA1	Message	Date
Clément Renault	c9cd150ca6	Fix the tests	2023-09-14 16:29:03 +02:00
Tamo	8b218d35b7	update the description of the cli argument	2023-09-14 15:50:36 +02:00
Clément Renault	3b521f6c69	Expose a new flag to limit the number of batched tasks	2023-09-14 15:48:32 +02:00