Grow by 1TB instead of 1MB

Use less of the total budget
Restore budget to what it was
2025-12-13 07:57:02 +00:00 · 2024-06-05 09:22:58 +02:00 · 2024-06-05 09:22:50 +02:00 · 2024-06-05 09:22:39 +02:00 · 2024-06-04 11:10:31 +02:00 · 2024-05-21 16:38:36 +00:00
16 changed files with 224 additions and 386 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -494,7 +494,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
 [[package]]
 name = "benchmarks"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "anyhow",
 "bytes",
@@ -639,7 +639,7 @@ dependencies = [
 [[package]]
 name = "build-info"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "anyhow",
 "time",
@@ -889,9 +889,9 @@ dependencies = [
 [[package]]
 name = "charabia"
-version = "0.8.9"
+version = "0.8.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f6a65052f308636e5d5e1777f0dbc07919f5fbac24b6c8ad3e140472e5520de9"
+checksum = "933f20f2269b24d32fd5503e7b3c268af902190daf8d9d2b73ed2e75d77c00b4"
 dependencies = [
 "aho-corasick",
 "cow-utils",
@@ -1539,7 +1539,7 @@ dependencies = [
 [[package]]
 name = "dump"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "anyhow",
 "big_s",
@@ -1787,7 +1787,7 @@ dependencies = [
 [[package]]
 name = "file-store"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "faux",
 "tempfile",
@@ -1810,7 +1810,7 @@ dependencies = [
 [[package]]
 name = "filter-parser"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "insta",
 "nom",
@@ -1830,7 +1830,7 @@ dependencies = [
 [[package]]
 name = "flatten-serde-json"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "criterion",
 "serde_json",
@@ -1948,7 +1948,7 @@ dependencies = [
 [[package]]
 name = "fuzzers"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "arbitrary",
 "clap",
@@ -2442,7 +2442,7 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d"
 [[package]]
 name = "index-scheduler"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "anyhow",
 "big_s",
@@ -2638,7 +2638,7 @@ dependencies = [
 [[package]]
 name = "json-depth-checker"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "criterion",
 "serde_json",
@@ -3275,7 +3275,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
 [[package]]
 name = "meili-snap"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "insta",
 "md5",
@@ -3284,7 +3284,7 @@ dependencies = [
 [[package]]
 name = "meilisearch"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "actix-cors",
 "actix-http",
@@ -3377,7 +3377,7 @@ dependencies = [
 [[package]]
 name = "meilisearch-auth"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "base64 0.21.7",
 "enum-iterator",
@@ -3396,7 +3396,7 @@ dependencies = [
 [[package]]
 name = "meilisearch-types"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "actix-web",
 "anyhow",
@@ -3426,7 +3426,7 @@ dependencies = [
 [[package]]
 name = "meilitool"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "anyhow",
 "clap",
@@ -3465,7 +3465,7 @@ dependencies = [
 [[package]]
 name = "milli"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "arroy",
 "big_s",
@@ -3906,7 +3906,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
 [[package]]
 name = "permissive-json-pointer"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "big_s",
 "serde_json",
@@ -6074,7 +6074,7 @@ dependencies = [
 [[package]]
 name = "xtask"
-version = "1.8.0"
+version = "1.8.1"
 dependencies = [
 "anyhow",
 "build-info",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,7 +22,7 @@ members = [
 ]
 [workspace.package]
-version = "1.8.0"
+version = "1.8.1"
 authors = [
    "Quentin de Quelen <quentin@dequelen.me>",
    "Clément Renault <clement@meilisearch.com>",
--- a/dump/src/lib.rs
+++ b/dump/src/lib.rs
@@ -12,7 +12,6 @@ use serde::{Deserialize, Serialize};
 use time::OffsetDateTime;
 mod error;
 mod new_writer;
 mod reader;
 mod writer;
--- a/dump/src/new_writer.rs
+++ b/dump/src/new_writer.rs
@@ -1,251 +0,0 @@
 use std::fs::File;
 use std::io::{Read, Seek, Write};
 use std::path::Path;
 use std::result::Result as StdResult;
 use flate2::write::GzEncoder;
 use flate2::Compression;
 use meilisearch_types::milli::documents::{
    obkv_to_object, DocumentsBatchCursor, DocumentsBatchIndex, DocumentsBatchReader,
 };
 use tar::{Builder as TarBuilder, Header};
 use time::OffsetDateTime;
 use uuid::Uuid;
 use crate::{Key, Metadata, Result, TaskId, CURRENT_DUMP_VERSION};
 pub struct DumpWriter<W: Write> {
    tar: TarBuilder<GzEncoder<W>>,
 }
 impl<W: Write> DumpWriter<W> {
    pub fn new(instance_uuid: Option<Uuid>, writer: W) -> Result<Self> {
        /// TODO: should we use a BuffWriter?
        let gz_encoder = GzEncoder::new(writer, Compression::default());
        let mut tar = TarBuilder::new(gz_encoder);
        let mut header = Header::new_gnu();
        // Append metadata into metadata.json.
        let metadata = Metadata {
            dump_version: CURRENT_DUMP_VERSION,
            db_version: env!("CARGO_PKG_VERSION").to_string(),
            dump_date: OffsetDateTime::now_utc(),
        };
        let data = serde_json::to_string(&metadata).unwrap();
        header.set_size(data.len() as u64);
        tar.append_data(&mut header, "metadata.json", data.as_bytes()).unwrap();
        // Append instance uid into instance_uid.uuid.
        if let Some(instance_uuid) = instance_uuid {
            let data = instance_uuid.as_hyphenated().to_string();
            header.set_size(data.len() as u64);
            tar.append_data(&mut header, "instance_uid.uuid", data.as_bytes()).unwrap();
        }
        Ok(Self { tar })
    }
    pub fn dump_keys(&mut self, keys: &[Key]) -> Result<()> {
        let mut buffer = Vec::new();
        for key in keys {
            serde_json::to_writer(&mut buffer, key)?;
            buffer.push(b'\n');
        }
        let mut header = Header::new_gnu();
        header.set_path("keys.jsonl");
        header.set_size(buffer.len() as u64);
        self.tar.append(&mut header, buffer.as_slice())?;
        Ok(())
    }
    pub fn create_tasks(&mut self) -> Result<FileWriter<W>> {
        FileWriter::new(&mut self.tar, "tasks/queue.jsonl")
    }
    pub fn dump_update_file<R: Read + Seek>(
        &mut self,
        task_uid: TaskId,
        update_file: DocumentsBatchReader<R>,
    ) -> Result<()> {
        let path = format!("tasks/update_files/{}.jsonl", task_uid);
        let mut fw = FileWriter::new(&mut self.tar, path)?;
        let mut serializer = UpdateFileSerializer::new(update_file);
        fw.calculate_len(SerializerIteratorReader::new(&mut serializer))?;
        serializer.reset();
        fw.write_data(SerializerIteratorReader::new(&mut serializer))
    }
 }
 /// A source of entries that are serialized one at a time into a byte buffer.
 trait SerializerIterator {
    /// Serializes the next entry into `buffer`.
    ///
    /// Returns `Ok(true)` when an entry was produced and `Ok(false)` when there is
    /// nothing more to serialize.
    fn next_serialize_into(&mut self, buffer: &mut Vec<u8>) -> StdResult<bool, std::io::Error>;
 }
 /// Adapter exposing a [`SerializerIterator`] as a [`Read`] byte stream.
 struct SerializerIteratorReader<'i, I: SerializerIterator> {
    iterator: &'i mut I,
    /// Bytes of the current entry that have not been handed to the caller yet.
    buffer: Vec<u8>,
 }
 impl<'i, I: SerializerIterator> SerializerIteratorReader<'i, I> {
    /// Wraps `iterator`, starting with an empty pending buffer.
    fn new(iterator: &'i mut I) -> Self {
        Self { iterator, buffer: Vec::new() }
    }
 }
 impl<I: SerializerIterator> Read for SerializerIteratorReader<'_, I> {
    /// Packs as many serialized entries as fit into `buf`; an entry that does not fit
    /// entirely is split and its tail is kept for the next call.
    fn read(&mut self, buf: &mut [u8]) -> StdResult<usize, std::io::Error> {
        let mut written = 0;
        loop {
            // refill the pending buffer with a new entry, stop when the iterator is exhausted.
            if self.buffer.is_empty() && !self.iterator.next_serialize_into(&mut self.buffer)? {
                return Ok(written);
            }
            let free = &mut buf[written..];
            let pending = self.buffer.len();
            if pending > free.len() {
                // the entry is larger than the space left: fill `buf` to the brim
                // and keep the rest of the entry for the next read.
                let take = free.len();
                free.copy_from_slice(&self.buffer[..take]);
                self.buffer.drain(..take);
                return Ok(buf.len());
            }
            // the whole entry fits: copy it and move on to the next one.
            free[..pending].copy_from_slice(&self.buffer);
            written += pending;
            self.buffer.clear();
        }
    }
 }
 struct UpdateFileSerializer<R: Read> {
    cursor: DocumentsBatchCursor<R>,
    documents_batch_index: DocumentsBatchIndex,
 }
 impl<R: Read + Seek> SerializerIterator for UpdateFileSerializer<R> {
    fn next_serialize_into(&mut self, buffer: &mut Vec<u8>) -> StdResult<bool, std::io::Error> {
        /// TODO: don't unwrap, original version: `cursor.next_document().map_err(milli::Error::from)?`
        match self.cursor.next_document().unwrap() {
            Some(doc) => {
                /// TODO: don't unwrap
                let json_value = obkv_to_object(&doc, &self.documents_batch_index).unwrap();
                serde_json::to_writer(&mut *buffer, &json_value)?;
                buffer.push(b'\n');
                Ok(true)
            }
            None => Ok(false),
        }
    }
 }
 impl<R: Read + Seek> UpdateFileSerializer<R> {
    fn new(reader: DocumentsBatchReader<R>) -> Self {
        let (cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
        Self { cursor, documents_batch_index }
    }
    /// Resets the cursor to be able to read from the start again.
    pub fn reset(&mut self) {
        self.cursor.reset();
    }
 }
 /// Writes one archive entry whose size has to be computed before its data is appended.
 pub struct FileWriter<'a, W: Write> {
    /// Header of the entry: the path is set on creation, the size and checksum right
    /// before the data is written.
    header: Header,
    tar: &'a mut TarBuilder<GzEncoder<W>>,
    /// Size recorded by `calculate_len`; `None` until it has been called.
    size: Option<u64>,
 }
 impl<'a, W: Write> FileWriter<'a, W> {
    /// Prepares an entry located at `path` in the archive.
    ///
    /// # Errors
    ///
    /// Returns an error when `path` cannot be stored in the entry header.
    pub(crate) fn new<P: AsRef<Path>>(
        tar: &'a mut TarBuilder<GzEncoder<W>>,
        path: P,
    ) -> Result<Self> {
        let mut header = Header::new_gnu();
        // Do not ignore the result: a rejected path would leave an entry without a name.
        header.set_path(path)?;
        Ok(Self { header, tar, size: None })
    }
    /// Consumes `reader` entirely to count its bytes and records that count as the
    /// size of the entry.
    pub fn calculate_len<R: Read>(&mut self, mut reader: R) -> Result<u64> {
        let mut calculator = SizeCalculatorWriter::new();
        std::io::copy(&mut reader, &mut calculator)?;
        let size = calculator.into_inner();
        self.size = Some(size);
        Ok(size)
    }
    /// Appends the entry, reading its content from `reader`.
    ///
    /// # Panics
    ///
    /// Panics when `calculate_len` has not been called first, or when `reader` does
    /// not yield exactly the number of bytes that was pre-calculated.
    pub fn write_data<R: Read>(mut self, reader: R) -> Result<()> {
        let expected_size =
            self.size.expect("calculate_len must be called before writing the data.");
        self.header.set_size(expected_size);
        // `Builder::append` writes the header as it is, so the checksum must be
        // computed once the path and the size are final.
        self.header.set_cksum();
        let mut scr = SizeCalculatorReader::new(reader);
        self.tar.append(&mut self.header, &mut scr)?;
        assert_eq!(
            expected_size,
            scr.into_inner(),
            "Provided data size is different from the pre-calculated size."
        );
        Ok(())
    }
 }
 /// A writer that discards the data it receives and only counts the bytes.
 struct SizeCalculatorWriter {
    /// Total number of bytes written so far.
    size: usize,
 }
 impl SizeCalculatorWriter {
    /// Creates a counter starting at zero.
    fn new() -> Self {
        Self { size: 0 }
    }
    /// Returns the total number of bytes that were written.
    fn into_inner(self) -> u64 {
        self.size as u64
    }
 }
 impl Write for SizeCalculatorWriter {
    /// Counts `buf` as written and reports it as fully consumed.
    fn write(&mut self, buf: &[u8]) -> StdResult<usize, std::io::Error> {
        self.size += buf.len();
        // `Write::write` must return the number of bytes consumed by THIS call, not the
        // running total: returning more than `buf.len()` breaks `write_all`/`io::copy`
        // as soon as a second chunk is written.
        Ok(buf.len())
    }
    /// Nothing is buffered, so there is nothing to flush.
    fn flush(&mut self) -> std::result::Result<(), std::io::Error> {
        Ok(())
    }
 }
 /// A reader that forwards to an inner reader while counting the bytes it yields.
 struct SizeCalculatorReader<R: Read> {
    /// Total number of bytes read from `reader` so far.
    size: usize,
    reader: R,
 }
 impl<R: Read> SizeCalculatorReader<R> {
    /// Wraps `reader` with a counter starting at zero.
    fn new(reader: R) -> Self {
        Self { reader, size: 0 }
    }
    /// Returns the total number of bytes that went through this reader.
    fn into_inner(self) -> u64 {
        let Self { size, .. } = self;
        size as u64
    }
 }
 impl<R: Read> Read for SizeCalculatorReader<R> {
    /// Reads from the inner reader and adds the number of bytes read to the total.
    fn read(&mut self, buf: &mut [u8]) -> StdResult<usize, std::io::Error> {
        self.reader.read(buf).map(|read| {
            self.size += read;
            read
        })
    }
 }
--- a/index-scheduler/src/lib.rs
+++ b/index-scheduler/src/lib.rs
@@ -567,16 +567,16 @@ impl IndexScheduler {
        tracing::debug!("index budget: {budget}B");
        let mut index_count = budget / base_map_size;
-        if index_count < 2 {
+        if index_count < 3 {
            // take a bit less than half than the budget to make sure we can always afford to open an index
            let map_size = (budget * 2) / 5;
            // single index of max budget
            tracing::debug!("1 index of {map_size}B can be opened simultaneously.");
            return IndexBudget { map_size, index_count: 1, task_db_size };
        }
-        // give us some space for an additional index when the cache is already full
+        // give us some space for additional indexes when the cache is already full
-        // decrement is OK because index_count >= 2.
+        // decrement is OK because index_count >= 3.
-        index_count -= 1;
+        index_count -= 2;
        if index_count > max_index_count {
            index_count = max_index_count;
        }
@@ -1834,7 +1834,7 @@ mod tests {
                task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose.
                index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose.
                enable_mdb_writemap: false,
-                index_growth_amount: 1000 * 1000, // 1 MB
+                index_growth_amount: 1000 * 1000 * 1000 * 1000, // 1 TB
                index_count: 5,
                indexer_config,
                autobatching_enabled: true,
--- a/meilisearch-types/Cargo.toml
+++ b/meilisearch-types/Cargo.toml
@@ -57,3 +57,5 @@ greek = ["milli/greek"]
 khmer = ["milli/khmer"]
 # allow vietnamese specialized tokenization
 vietnamese = ["milli/vietnamese"]
 # force swedish character recomposition
 swedish-recomposition = ["milli/swedish-recomposition"]
--- a/meilisearch/Cargo.toml
+++ b/meilisearch/Cargo.toml
@@ -156,6 +156,7 @@ thai = ["meilisearch-types/thai"]
 greek = ["meilisearch-types/greek"]
 khmer = ["meilisearch-types/khmer"]
 vietnamese = ["meilisearch-types/vietnamese"]
 swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
 [package.metadata.mini-dashboard]
 assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"
--- a/meilisearch/src/routes/mod.rs
+++ b/meilisearch/src/routes/mod.rs
@@ -367,12 +367,6 @@ async fn get_version(
    })
 }
 /// Serializable pair of optional `private` and `public` key strings.
 #[derive(Serialize)]
 struct KeysResponse {
    private: Option<String>,
    public: Option<String>,
 }
 pub async fn get_health(
    index_scheduler: Data<IndexScheduler>,
    auth_controller: Data<AuthController>,
--- a/meilisearch/tests/search/geo.rs
+++ b/meilisearch/tests/search/geo.rs
@@ -117,3 +117,69 @@ async fn geo_bounding_box_with_string_and_number() {
        )
        .await;
 }
 /// Regression test for issue #4640: `_geo` is made filterable and then sortable after
 /// the documents were added, and the result of a search sorted by `_geoPoint` is
 /// compared against an inline snapshot.
 #[actix_rt::test]
 async fn bug_4640() {
    // https://github.com/meilisearch/meilisearch/issues/4640
    let server = Server::new().await;
    let index = server.index("test");
    let documents = DOCUMENTS.clone();
    index.add_documents(documents, None).await;
    index.update_settings_filterable_attributes(json!(["_geo"])).await;
    // Only this last settings task is awaited explicitly.
    // NOTE(review): presumably tasks are processed in order, so the earlier ones are
    // finished as well by then — confirm.
    let (ret, _code) = index.update_settings_sortable_attributes(json!(["_geo"])).await;
    index.wait_task(ret.uid()).await;
    // Sort the document with the second one first
    index
        .search(
            json!({
                "sort": ["_geoPoint(45.4777599, 9.1967508):asc"],
            }),
            |response, code| {
                assert_eq!(code, 200, "{}", response);
                // `processingTimeMs` is redacted so the snapshot stays stable between runs.
                snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
                {
                  "hits": [
                    {
                      "id": 2,
                      "name": "La Bella Italia",
                      "address": "456 Elm Street, Townsville",
                      "type": "Italian",
                      "rating": 9,
                      "_geo": {
                        "lat": "45.4777599",
                        "lng": "9.1967508"
                      }
                    },
                    {
                      "id": 1,
                      "name": "Taco Truck",
                      "address": "444 Salsa Street, Burritoville",
                      "type": "Mexican",
                      "rating": 9,
                      "_geo": {
                        "lat": 34.0522,
                        "lng": -118.2437
                      },
                      "_geoDistance": 9714063
                    },
                    {
                      "id": 3,
                      "name": "Crêpe Truck",
                      "address": "2 Billig Avenue, Rouenville",
                      "type": "French",
                      "rating": 10
                    }
                  ],
                  "query": "",
                  "processingTimeMs": "[time]",
                  "limit": 20,
                  "offset": 0,
                  "estimatedTotalHits": 3
                }
                "###);
            },
        )
        .await;
 }
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -17,7 +17,7 @@ bincode = "1.3.3"
 bstr = "1.9.0"
 bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] }
 byteorder = "1.5.0"
-charabia = { version = "0.8.9", default-features = false }
+charabia = { version = "0.8.10", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.11"
 deserr = "0.6.1"
@@ -136,7 +136,11 @@ greek = ["charabia/greek"]
 # allow khmer specialized tokenization
 khmer = ["charabia/khmer"]
 # allow vietnamese specialized tokenization
 vietnamese = ["charabia/vietnamese"]
 # force swedish character recomposition
 swedish-recomposition = ["charabia/swedish-recomposition"]
 # allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
 cuda = ["candle-core/cuda"]
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -45,7 +45,6 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
    obkv_documents: grenad::Reader<R>,
    indexer: GrenadParameters,
    settings_diff: &InnerIndexSettingsDiff,
    geo_fields_ids: Option<(FieldId, FieldId)>,
 ) -> Result<ExtractedFacetValues> {
    puffin::profile_function!();
@@ -127,12 +126,18 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                    add_exists.insert(document);
                }
-                let geo_support =
+                let del_geo_support = settings_diff
-                    geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
+                    .old
                    .geo_fields_ids
                    .map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
                let add_geo_support = settings_diff
                    .new
                    .geo_fields_ids
                    .map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
                let del_filterable_values =
-                    del_value.map(|value| extract_facet_values(&value, geo_support));
+                    del_value.map(|value| extract_facet_values(&value, del_geo_support));
                let add_filterable_values =
-                    add_value.map(|value| extract_facet_values(&value, geo_support));
+                    add_value.map(|value| extract_facet_values(&value, add_geo_support));
                // Those closures are just here to simplify things a bit.
                let mut insert_numbers_diff = |del_numbers, add_numbers| {
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -8,6 +8,7 @@ use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::GeoError;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::extract_finite_float_from_value;
 use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
 use crate::{FieldId, InternalError, Result};
 /// Extracts the geographical coordinates contained in each document under the `_geo` field.
@@ -18,7 +19,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
    obkv_documents: grenad::Reader<R>,
    indexer: GrenadParameters,
    primary_key_id: FieldId,
-    (lat_fid, lng_fid): (FieldId, FieldId),
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
    puffin::profile_function!();
@@ -40,47 +41,27 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
            serde_json::from_slice(document_id).unwrap()
        };
-        // first we get the two fields
+        // extract old version
-        match (obkv.get(lat_fid), obkv.get(lng_fid)) {
+        let del_lat_lng =
-            (Some(lat), Some(lng)) => {
+            extract_lat_lng(&obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
-                let deladd_lat_obkv = KvReaderDelAdd::new(lat);
+        // extract new version
-                let deladd_lng_obkv = KvReaderDelAdd::new(lng);
+        let add_lat_lng =
            extract_lat_lng(&obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
-                // then we extract the values
+        if del_lat_lng != add_lat_lng {
-                let del_lat_lng = deladd_lat_obkv
+            let mut obkv = KvWriterDelAdd::memory();
-                    .get(DelAdd::Deletion)
+            if let Some([lat, lng]) = del_lat_lng {
-                    .zip(deladd_lng_obkv.get(DelAdd::Deletion))
+                #[allow(clippy::drop_non_drop)]
-                    .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
+                let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
-                    .transpose()?;
+                obkv.insert(DelAdd::Deletion, bytes)?;
                let add_lat_lng = deladd_lat_obkv
                    .get(DelAdd::Addition)
                    .zip(deladd_lng_obkv.get(DelAdd::Addition))
                    .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
                    .transpose()?;
                if del_lat_lng != add_lat_lng {
                    let mut obkv = KvWriterDelAdd::memory();
                    if let Some([lat, lng]) = del_lat_lng {
                        #[allow(clippy::drop_non_drop)]
                        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
                        obkv.insert(DelAdd::Deletion, bytes)?;
                    }
                    if let Some([lat, lng]) = add_lat_lng {
                        #[allow(clippy::drop_non_drop)]
                        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
                        obkv.insert(DelAdd::Addition, bytes)?;
                    }
                    let bytes = obkv.into_inner()?;
                    writer.insert(docid_bytes, bytes)?;
                }
            }
-            (None, Some(_)) => {
+            if let Some([lat, lng]) = add_lat_lng {
-                return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
+                #[allow(clippy::drop_non_drop)]
                let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
                obkv.insert(DelAdd::Addition, bytes)?;
            }
-            (Some(_), None) => {
+            let bytes = obkv.into_inner()?;
-                return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
+            writer.insert(docid_bytes, bytes)?;
            }
            (None, None) => (),
        }
    }
@@ -88,16 +69,37 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
 }
 /// Extract the finite floats lat and lng from two bytes slices.
-fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
+fn extract_lat_lng(
-    let lat = extract_finite_float_from_value(
+    document: &obkv::KvReader<FieldId>,
-        serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
+    settings: &InnerIndexSettings,
-    )
+    deladd: DelAdd,
-    .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
+    document_id: impl Fn() -> Value,
 ) -> Result<Option<[f64; 2]>> {
    match settings.geo_fields_ids {
        Some((lat_fid, lng_fid)) => {
            let lat = document.get(lat_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
            let lng = document.get(lng_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
            let (lat, lng) = match (lat, lng) {
                (Some(lat), Some(lng)) => (lat, lng),
                (Some(_), None) => {
                    return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
                }
                (None, Some(_)) => {
                    return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
                }
                (None, None) => return Ok(None),
            };
            let lat = extract_finite_float_from_value(
                serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
            )
            .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
-    let lng = extract_finite_float_from_value(
+            let lng = extract_finite_float_from_value(
-        serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
+                serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
-    )
+            )
-    .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
+            .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
-
+            Ok(Some([lat, lng]))
-    Ok([lat, lng])
+        }
        None => Ok(None),
    }
 }
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -43,7 +43,6 @@ pub(crate) fn data_from_obkv_documents(
    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
    primary_key_id: FieldId,
    geo_fields_ids: Option<(FieldId, FieldId)>,
    settings_diff: Arc<InnerIndexSettingsDiff>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<()> {
@@ -72,7 +71,6 @@ pub(crate) fn data_from_obkv_documents(
                        indexer,
                        lmdb_writer_sx.clone(),
                        primary_key_id,
                        geo_fields_ids,
                        settings_diff.clone(),
                        max_positions_per_attributes,
                    )
@@ -300,7 +298,6 @@ fn send_and_extract_flattened_documents_data(
    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
    primary_key_id: FieldId,
    geo_fields_ids: Option<(FieldId, FieldId)>,
    settings_diff: Arc<InnerIndexSettingsDiff>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<(
@@ -310,12 +307,13 @@ fn send_and_extract_flattened_documents_data(
    let flattened_documents_chunk =
        flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
-    if let Some(geo_fields_ids) = geo_fields_ids {
+    if settings_diff.run_geo_indexing() {
        let documents_chunk_cloned = flattened_documents_chunk.clone();
        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
        let settings_diff = settings_diff.clone();
        rayon::spawn(move || {
            let result =
-                extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids);
+                extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, &settings_diff);
            let _ = match result {
                Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
@@ -354,7 +352,6 @@ fn send_and_extract_flattened_documents_data(
                    flattened_documents_chunk.clone(),
                    indexer,
                    &settings_diff,
                    geo_fields_ids,
                )?;
                // send fid_docid_facet_numbers_chunk to DB writer
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -324,28 +324,6 @@ where
        // get the primary key field id
        let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
        // get the fid of the `_geo.lat` and `_geo.lng` fields.
        let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
        // self.index.fields_ids_map($a)? ==>> field_id_map
        let geo_fields_ids = match field_id_map.id("_geo") {
            Some(gfid) => {
                let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
                let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
                // if `_geo` is faceted then we get the `lat` and `lng`
                if is_sortable || is_filterable {
                    let field_ids = field_id_map
                        .insert("_geo.lat")
                        .zip(field_id_map.insert("_geo.lng"))
                        .ok_or(UserError::AttributeLimitReached)?;
                    Some(field_ids)
                } else {
                    None
                }
            }
            None => None,
        };
        let pool_params = GrenadParameters {
            chunk_compression_type: self.indexer_config.chunk_compression_type,
            chunk_compression_level: self.indexer_config.chunk_compression_level,
@@ -412,7 +390,6 @@ where
                        pool_params,
                        lmdb_writer_sx.clone(),
                        primary_key_id,
                        geo_fields_ids,
                        settings_diff.clone(),
                        max_positions_per_attributes,
                    )
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1161,6 +1161,11 @@ impl InnerIndexSettingsDiff {
    pub fn settings_update_only(&self) -> bool {
        self.settings_update_only
    }
    /// Tells whether the geo extraction has to run: either the geo field ids differ
    /// between the old and the new settings, or this is not a settings-only update and
    /// the new settings define geo field ids.
    pub fn run_geo_indexing(&self) -> bool {
        let geo_fields_changed = self.old.geo_fields_ids != self.new.geo_fields_ids;
        let indexing_documents_with_geo =
            !self.settings_update_only && self.new.geo_fields_ids.is_some();
        geo_fields_changed || indexing_documents_with_geo
    }
 }
 #[derive(Clone)]
@@ -1177,6 +1182,7 @@ pub(crate) struct InnerIndexSettings {
    pub proximity_precision: ProximityPrecision,
    pub embedding_configs: EmbeddingConfigs,
    pub existing_fields: HashSet<String>,
    pub geo_fields_ids: Option<(FieldId, FieldId)>,
 }
 impl InnerIndexSettings {
@@ -1185,7 +1191,7 @@ impl InnerIndexSettings {
        let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
        let allowed_separators = index.allowed_separators(rtxn)?;
        let dictionary = index.dictionary(rtxn)?;
-        let fields_ids_map = index.fields_ids_map(rtxn)?;
+        let mut fields_ids_map = index.fields_ids_map(rtxn)?;
        let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?;
        let user_defined_searchable_fields =
            user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
@@ -1200,6 +1206,24 @@ impl InnerIndexSettings {
            .into_iter()
            .filter_map(|(field, count)| (count != 0).then_some(field))
            .collect();
        // index.fields_ids_map($a)? ==>> fields_ids_map
        let geo_fields_ids = match fields_ids_map.id("_geo") {
            Some(gfid) => {
                let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid);
                let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid);
                // if `_geo` is faceted then we get the `lat` and `lng`
                if is_sortable || is_filterable {
                    let field_ids = fields_ids_map
                        .insert("_geo.lat")
                        .zip(fields_ids_map.insert("_geo.lng"))
                        .ok_or(UserError::AttributeLimitReached)?;
                    Some(field_ids)
                } else {
                    None
                }
            }
            None => None,
        };
        Ok(Self {
            stop_words,
@@ -1214,6 +1238,7 @@ impl InnerIndexSettings {
            proximity_precision,
            embedding_configs,
            existing_fields,
            geo_fields_ids,
        })
    }
--- a/milli/src/vector/settings.rs
+++ b/milli/src/vector/settings.rs
@@ -301,10 +301,14 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
    fn from(value: EmbeddingConfig) -> Self {
        let EmbeddingConfig { embedder_options, prompt } = value;
        match embedder_options {
-            super::EmbedderOptions::HuggingFace(options) => Self {
+            super::EmbedderOptions::HuggingFace(super::hf::EmbedderOptions {
                model,
                revision,
                distribution,
            }) => Self {
                source: Setting::Set(EmbedderSource::HuggingFace),
-                model: Setting::Set(options.model),
+                model: Setting::Set(model),
-                revision: options.revision.map(Setting::Set).unwrap_or_default(),
+                revision: revision.map(Setting::Set).unwrap_or_default(),
                api_key: Setting::NotSet,
                dimensions: Setting::NotSet,
                document_template: Setting::Set(prompt.template),
@@ -314,14 +318,19 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
                path_to_embeddings: Setting::NotSet,
                embedding_object: Setting::NotSet,
                input_type: Setting::NotSet,
-                distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
+                distribution: distribution.map(Setting::Set).unwrap_or_default(),
            },
-            super::EmbedderOptions::OpenAi(options) => Self {
+            super::EmbedderOptions::OpenAi(super::openai::EmbedderOptions {
                api_key,
                embedding_model,
                dimensions,
                distribution,
            }) => Self {
                source: Setting::Set(EmbedderSource::OpenAi),
-                model: Setting::Set(options.embedding_model.name().to_owned()),
+                model: Setting::Set(embedding_model.name().to_owned()),
                revision: Setting::NotSet,
-                api_key: options.api_key.map(Setting::Set).unwrap_or_default(),
+                api_key: api_key.map(Setting::Set).unwrap_or_default(),
-                dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(),
+                dimensions: dimensions.map(Setting::Set).unwrap_or_default(),
                document_template: Setting::Set(prompt.template),
                url: Setting::NotSet,
                query: Setting::NotSet,
@@ -329,29 +338,37 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
                path_to_embeddings: Setting::NotSet,
                embedding_object: Setting::NotSet,
                input_type: Setting::NotSet,
-                distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
+                distribution: distribution.map(Setting::Set).unwrap_or_default(),
            },
-            super::EmbedderOptions::Ollama(options) => Self {
+            super::EmbedderOptions::Ollama(super::ollama::EmbedderOptions {
                embedding_model,
                url,
                api_key,
                distribution,
            }) => Self {
                source: Setting::Set(EmbedderSource::Ollama),
-                model: Setting::Set(options.embedding_model.to_owned()),
+                model: Setting::Set(embedding_model),
                revision: Setting::NotSet,
-                api_key: options.api_key.map(Setting::Set).unwrap_or_default(),
+                api_key: api_key.map(Setting::Set).unwrap_or_default(),
                dimensions: Setting::NotSet,
                document_template: Setting::Set(prompt.template),
-                url: Setting::NotSet,
+                url: url.map(Setting::Set).unwrap_or_default(),
                query: Setting::NotSet,
                input_field: Setting::NotSet,
                path_to_embeddings: Setting::NotSet,
                embedding_object: Setting::NotSet,
                input_type: Setting::NotSet,
-                distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
+                distribution: distribution.map(Setting::Set).unwrap_or_default(),
            },
-            super::EmbedderOptions::UserProvided(options) => Self {
+            super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions {
                dimensions,
                distribution,
            }) => Self {
                source: Setting::Set(EmbedderSource::UserProvided),
                model: Setting::NotSet,
                revision: Setting::NotSet,
                api_key: Setting::NotSet,
-                dimensions: Setting::Set(options.dimensions),
+                dimensions: Setting::Set(dimensions),
                document_template: Setting::NotSet,
                url: Setting::NotSet,
                query: Setting::NotSet,
@@ -359,7 +376,7 @@ impl From<EmbeddingConfig> for EmbeddingSettings {
                path_to_embeddings: Setting::NotSet,
                embedding_object: Setting::NotSet,
                input_type: Setting::NotSet,
-                distribution: options.distribution.map(Setting::Set).unwrap_or_default(),
+                distribution: distribution.map(Setting::Set).unwrap_or_default(),
            },
            super::EmbedderOptions::Rest(super::rest::EmbedderOptions {
                api_key,
Author	SHA1	Message	Date
Louis Dureuil	ee99196c92	Grow by 1TB instead of 1MB	2024-06-05 09:22:58 +02:00
Louis Dureuil	aeef2bae33	Use less of the total budget	2024-06-05 09:22:50 +02:00
Louis Dureuil	7f7d2d0449	Restore budget to what it was	2024-06-05 09:22:39 +02:00
Louis Dureuil	e100292417	Adjust default budget to relieve stress on virtual memory space	2024-06-04 11:10:31 +02:00
meili-bors[bot]	ba75d23bfe	Merge #4648 4648: Update version for the next release (v1.8.1) in Cargo.toml r=ManyTheFish a=meili-bot ⚠️ This PR is automatically generated. Check the new version is the expected one and Cargo.lock has been updated before merging. Co-authored-by: ManyTheFish <ManyTheFish@users.noreply.github.com>	2024-05-21 16:38:36 +00:00
ManyTheFish	7fbb3bf8e8	Update version for the next release (v1.8.1) in Cargo.toml	2024-05-21 15:13:03 +00:00
meili-bors[bot]	9066a446a3	Merge #4642 4642: Index the _geo fields when changing the setting while there is already documents in the DB r=ManyTheFish a=irevoire # Pull Request ## Related issue Fixes https://github.com/meilisearch/meilisearch/issues/4640 Fixes https://github.com/meilisearch/meilisearch/issues/4628 ## What does this PR do? - Add an integration test that first indexes the document and then changes the settings - Fix `extract_geo_point` by detecting if the `_geo` field has been faceted in this setting change and index all documents Co-authored-by: Tamo <tamo@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com>	2024-05-21 13:16:11 +00:00
ManyTheFish	f762307838	Fix clippy	2024-05-21 13:44:20 +02:00
ManyTheFish	3e94a90722	Fixes	2024-05-21 13:39:46 +02:00
ManyTheFish	fc7e817221	Index geo points based on the settings differences	2024-05-20 12:27:26 +02:00
Tamo	0f78703b85	add a test reproducing the bug	2024-05-20 10:58:08 +02:00
meili-bors[bot]	c668043c4f	Merge #4617 4617: Destructure `EmbedderOptions` so we don't miss some options r=dureuill a=dureuill # Pull Request ## Related issue #4595 was caused by the code not destructuring the embedder options. ## What does this PR do? This PR adds the missing `url` parameter for ollama, and makes sure similar issue cannot happen in the future Co-authored-by: Louis Dureuil <louis@meilisearch.com>	2024-05-02 14:55:32 +00:00
Louis Dureuil	5a305bfdea	Remove unused struct	2024-05-02 16:14:37 +02:00
Louis Dureuil	f4dd73ec8c	Destructure EmbedderOptions so we don't miss some options	2024-05-02 15:39:36 +02:00
meili-bors[bot]	66dce4600d	Merge #4603 4603: Update charabia v0.8.10 r=Kerollmops a=ManyTheFish - Update Charabia v0.8.10 - Add `swedish-recomposition` as an optional feature flag Co-authored-by: ManyTheFish <many@meilisearch.com>	2024-04-30 13:04:02 +00:00
ManyTheFish	fe51ceca6d	Update lock file	2024-04-30 14:33:37 +02:00
ManyTheFish	88174b8ae4	Update charabia v0.8.10	2024-04-30 14:30:23 +02:00