Compare commits

..

12 Commits

Author SHA1 Message Date
Tamo
c8d8522279 update segment and reqwest to the latest version 2024-05-13 10:25:29 +02:00
meili-bors[bot]
95fcd17373 Merge #4622
4622: Bump Rustls to non-vulnerable versions r=Kerollmops a=Kerollmops

This PR fixes #4599 by bumping the Rustls dependency to v0.21.12 and [ureq to v2.9.7](https://github.com/algesten/ureq/blob/main/CHANGELOG.md#297) (which itself bumps rustls to v0.22.4).

Co-authored-by: Clément Renault <clement@meilisearch.com>
2024-05-07 09:47:30 +00:00
Clément Renault
ac4bc143c4 Bump ureq to v2.9.7 2024-05-07 10:39:38 +02:00
Clément Renault
f33a1282f8 Bump Rustls to v0.21.12 2024-05-07 10:31:39 +02:00
meili-bors[bot]
4d5971f343 Merge #4621
4621: Bring back changes from v1.8.0 into main r=curquiza a=curquiza



Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
2024-05-06 13:46:39 +00:00
meili-bors[bot]
ecb5c506b3 Merge #4619
4619: Use http path pattern instead of full path in metrics r=irevoire a=gh2k

# Pull Request

## Related issue

Fixes #3983 

## What does this PR do?

- This records only the HTTP pattern in metrics instead of the full path

An alternative solution was proposed in #4145, but it doesn't really fix the root cause of the issue. The problem I'm experiencing on my end is that, by using the full path, the number of labels is far too high to be useful. It is normal practice to use the path with variable placeholders instead of the fully-expanded path.

The example given in the ticket was endpoints under `/tasks`, but this can also be a very significant problem under `/indexes/{index-uid}/documents`, e.g.:
<img width="1510" alt="Screenshot 2024-05-03 at 12 14 36" src="https://github.com/meilisearch/meilisearch/assets/6530014/1df2ec19-5f69-4164-90d2-f65c59f9b544">

This patch replaces the fully-expanded path with the matched pattern.

The linked PR also mentions paths under other routes, e.g. `/static`, but this feels like a separate concern and these can be stripped out at the Prometheus end by filters if they are unwanted. The most important thing is to make the paths usable so that we can still get stats on e.g. the number of document deletes we see.
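
As an illustration (not the exact code from this PR), here is a minimal sketch of the idea using actix-web's `ServiceRequest::match_pattern()`, which the middleware change further down relies on; the `metric_path` helper name is hypothetical:

```rust
use actix_web::dev::ServiceRequest;

/// Prefer the matched route pattern (e.g. "/indexes/{index-uid}/documents")
/// over the raw request path, so that metric label cardinality stays bounded.
/// Falls back to the raw path when no pattern matched (hypothetical helper).
fn metric_path(req: &ServiceRequest) -> String {
    req.match_pattern().unwrap_or_else(|| req.path().to_owned())
}
```

The actual middleware change in this PR (see the diff further down) uses the same fallback: the raw path is only recorded when no route pattern is available.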

## PR checklist

Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Simon Detheridge <s@sd.ai>
Co-authored-by: Tamo <tamo@meilisearch.com>
2024-05-06 09:37:32 +00:00
Tamo
3698aef66b fix warning 2024-05-06 11:36:37 +02:00
Simon Detheridge
7f5ab3cef5 Use http path pattern instead of full path in metrics 2024-05-03 12:29:31 +01:00
meili-bors[bot]
248e22005a Merge #4582
4582: Fix some typos in comments r=curquiza a=writegr

# Pull Request

## Related issue

No

## What does this PR do?

 fix some typos in comments

## PR checklist
Please check if your PR fulfills the following requirements:
- [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [ ] Have you read the contributing guidelines?
- [ ] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: writegr <wellweek@outlook.com>
2024-04-18 07:07:33 +00:00
writegr
ab43a8a949 chore: fix some typos in comments
Signed-off-by: writegr <wellweek@outlook.com>
2024-04-18 14:12:52 +08:00
meili-bors[bot]
4089dd04a5 Merge #4568
4568: Fix some typos in comments r=curquiza a=yudrywet

# Pull Request

## Related issue
No

## What does this PR do?
fix some typos in comments

## PR checklist
Please check if your PR fulfills the following requirements:
- [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [ ] Have you read the contributing guidelines?
- [ ] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: yudrywet <yudeyao@yeah.net>
2024-04-15 08:12:43 +00:00
yudrywet
cf864a1c2e chore: fix some typos in comments
Signed-off-by: yudrywet <yudeyao@yeah.net>
2024-04-14 20:11:34 +08:00
36 changed files with 455 additions and 458 deletions


@@ -1,6 +1,4 @@
name: Look for flaky tests
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
on:
workflow_dispatch:
schedule:


@@ -1,6 +1,5 @@
name: Run the indexing fuzzer
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
on:
push:
branches:


@@ -15,8 +15,6 @@ jobs:
debian:
name: Publish debian packagge
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
needs: check-version
container:


@@ -35,8 +35,6 @@ jobs:
publish-linux:
name: Publish binary for Linux
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
needs: check-version
container:
# Use ubuntu-18.04 to compile with glibc 2.27
@@ -134,8 +132,6 @@ jobs:
name: Publish binary for aarch64
runs-on: ubuntu-latest
needs: check-version
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27
image: ubuntu:18.04


@@ -21,8 +21,6 @@ jobs:
test-linux:
name: Tests on ubuntu-18.04
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
image: ubuntu:18.04
@@ -79,8 +77,6 @@ jobs:
test-all-features:
name: Tests almost all features
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
image: ubuntu:18.04
@@ -104,8 +100,6 @@ jobs:
test-disabled-tokenization:
name: Test disabled tokenization
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
container:
image: ubuntu:18.04
@@ -133,8 +127,6 @@ jobs:
# We run tests in debug also, to make sure that the debug_assertions are hit
test-debug:
name: Run tests in debug
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations

Cargo.lock (generated), 458 changed lines: file diff suppressed because it is too large.


@@ -22,7 +22,7 @@ members = [
]
[workspace.package]
version = "1.8.4"
version = "1.8.0"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",


@@ -17,7 +17,7 @@ RUN set -eux; \
if [ "$apkArch" = "aarch64" ]; then \
export JEMALLOC_SYS_WITH_LG_PAGE=16; \
fi && \
cargo build --release -p meilisearch -p meilitool --features "swedish-recomposition"
cargo build --release -p meilisearch -p meilitool
# Run
FROM alpine:3.16


@@ -152,7 +152,6 @@ impl Settings<Unchecked> {
}
#[derive(Debug, Clone, Deserialize)]
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[cfg_attr(test, derive(serde::Serialize))]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]


@@ -182,7 +182,6 @@ impl Settings<Unchecked> {
}
}
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[derive(Debug, Clone, Deserialize)]
#[cfg_attr(test, derive(serde::Serialize))]
#[serde(deny_unknown_fields)]


@@ -200,7 +200,6 @@ impl std::ops::Deref for IndexUid {
}
}
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[derive(Debug)]
#[cfg_attr(test, derive(serde::Serialize))]
#[cfg_attr(test, serde(rename_all = "camelCase"))]


@@ -568,7 +568,7 @@ pub mod tests {
insta::assert_display_snapshot!(p(r"title = 'foo\\\\'"), @r#"{title} = {foo\\}"#);
insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\'"), @r#"{title} = {foo\\\}"#);
insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\\\'"), @r#"{title} = {foo\\\\}"#);
// but it also works with other sequencies
// but it also works with other sequences
insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
}


@@ -37,7 +37,7 @@ time = { version = "0.3.31", features = [
"macros",
] }
tracing = "0.1.40"
ureq = "2.9.1"
ureq = "2.9.7"
uuid = { version = "1.6.1", features = ["serde", "v4"] }
[dev-dependencies]


@@ -13,7 +13,7 @@ We can combine the two tasks in a single batch:
1. import documents X and Y
Processing this batch is functionally equivalent to processing the two
tasks individally, but should be much faster since we are only performing
tasks individually, but should be much faster since we are only performing
one indexing operation.
*/
@@ -914,34 +914,8 @@ impl IndexScheduler {
if self.must_stop_processing.get() {
return Err(Error::AbortedTask);
}
let (id, doc) = ret?;
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
'inject_vectors: {
let embeddings = index.embeddings(&rtxn, id)?;
if embeddings.is_empty() {
break 'inject_vectors;
}
let vectors = document
.entry("_vectors".to_owned())
.or_insert(serde_json::Value::Object(Default::default()));
let serde_json::Value::Object(vectors) = vectors else {
break 'inject_vectors;
};
for (embedder_name, embeddings) in embeddings {
vectors.entry(embedder_name).or_insert_with(|| {
serde_json::json!({
"embeddings": embeddings,
"regenerate": true
})
});
}
}
let (_id, doc) = ret?;
let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
index_dumper.push_document(&document)?;
}


@@ -26,7 +26,7 @@ pub type DeserrQueryParamError<C = BadRequest> = DeserrError<DeserrQueryParam, C
/// A request deserialization error.
///
/// The first generic paramater is a marker type describing the format of the request: either json (e.g. [`DeserrJson`] or [`DeserrQueryParam`]).
/// The first generic parameter is a marker type describing the format of the request: either json (e.g. [`DeserrJson`] or [`DeserrQueryParam`]).
/// The second generic parameter is the default error code for the deserialization error, in case it is not given.
pub struct DeserrError<Format, C: Default + ErrorCode> {
pub msg: String,


@@ -71,13 +71,13 @@ puffin = { version = "0.16.0", features = ["serialization"] }
rand = "0.8.5"
rayon = "1.8.0"
regex = "1.10.2"
reqwest = { version = "0.11.23", features = [
reqwest = { version = "0.12.4", features = [
"rustls-tls",
"json",
], default-features = false }
rustls = "0.21.6"
rustls = "0.21.12"
rustls-pemfile = "1.0.2"
segment = { version = "0.2.3", optional = true }
segment = { version = "0.2.4", optional = true }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = { version = "1.0.111", features = ["preserve_order"] }
sha2 = "0.10.8"


@@ -419,41 +419,7 @@ fn import_dump(
let file = tempfile::tempfile()?;
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file));
for document in index_reader.documents()? {
let mut document = document?;
'remove_injected_vectors: {
let Some(vectors) = document.get_mut("_vectors") else {
break 'remove_injected_vectors;
};
let Some(vectors) = vectors.as_object_mut() else { break 'remove_injected_vectors };
vectors.retain(|_embedder, embedding_object| {
// don't touch values that aren't objects
let Some(embedding_object) = embedding_object.as_object() else {
return true;
};
let mut has_regenerate_true = false;
for (field, value) in embedding_object {
match (field.as_str(), value) {
// detected regenerate : true
// if we don't have any superfluous field, we'll remove the entire entry
("regenerate", serde_json::Value::Bool(true)) => {
has_regenerate_true = true;
}
// ignore embeddings
("embeddings", _) => continue,
// any other field: immediately retain the entry
_ => return true,
}
}
// retain the entry unless it has regenerate: true
!has_regenerate_true
})
}
builder.append_json_object(&document)?;
builder.append_json_object(&document?)?;
}
// This flush the content of the batch builder.


@@ -59,10 +59,12 @@ where
let request_path = req.path();
let is_registered_resource = req.resource_map().has_resource(request_path);
if is_registered_resource {
let request_pattern = req.match_pattern();
let metric_path = request_pattern.as_ref().map_or(request_path, String::as_str);
let request_method = req.method().to_string();
histogram_timer = Some(
crate::metrics::MEILISEARCH_HTTP_RESPONSE_TIME_SECONDS
.with_label_values(&[&request_method, request_path])
.with_label_values(&[&request_method, metric_path])
.start_timer(),
);
}


@@ -40,9 +40,8 @@ pub struct Permit {
impl Drop for Permit {
fn drop(&mut self) {
let sender = self.sender.clone();
// if the channel is closed then the whole instance is down
std::mem::drop(tokio::spawn(async move { sender.send(()).await }));
let _ = futures::executor::block_on(self.sender.send(()));
}
}


@@ -117,69 +117,3 @@ async fn geo_bounding_box_with_string_and_number() {
)
.await;
}
#[actix_rt::test]
async fn bug_4640() {
// https://github.com/meilisearch/meilisearch/issues/4640
let server = Server::new().await;
let index = server.index("test");
let documents = DOCUMENTS.clone();
index.add_documents(documents, None).await;
index.update_settings_filterable_attributes(json!(["_geo"])).await;
let (ret, _code) = index.update_settings_sortable_attributes(json!(["_geo"])).await;
index.wait_task(ret.uid()).await;
// Sort the document with the second one first
index
.search(
json!({
"sort": ["_geoPoint(45.4777599, 9.1967508):asc"],
}),
|response, code| {
assert_eq!(code, 200, "{}", response);
snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
{
"hits": [
{
"id": 2,
"name": "La Bella Italia",
"address": "456 Elm Street, Townsville",
"type": "Italian",
"rating": 9,
"_geo": {
"lat": "45.4777599",
"lng": "9.1967508"
}
},
{
"id": 1,
"name": "Taco Truck",
"address": "444 Salsa Street, Burritoville",
"type": "Mexican",
"rating": 9,
"_geo": {
"lat": 34.0522,
"lng": -118.2437
},
"_geoDistance": 9714063
},
{
"id": 3,
"name": "Crêpe Truck",
"address": "2 Billig Avenue, Rouenville",
"type": "French",
"rating": 10
}
],
"query": "",
"processingTimeMs": "[time]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 3
}
"###);
},
)
.await;
}


@@ -129,7 +129,7 @@ fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
}
}
eprintln!("Sucessfully deleted {count} content files from disk!");
eprintln!("Successfully deleted {count} content files from disk!");
Ok(())
}


@@ -74,10 +74,10 @@ csv = "1.3.0"
candle-core = { version = "0.4.1" }
candle-transformers = { version = "0.4.1" }
candle-nn = { version = "0.4.1" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default_features = false, features = [
"onig",
] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [
"online",
] }
tiktoken-rs = "0.5.8"
@@ -85,7 +85,7 @@ liquid = "0.26.4"
arroy = "0.2.0"
rand = "0.8.5"
tracing = "0.1.40"
ureq = { version = "2.9.6", features = ["json"] }
ureq = { version = "2.9.7", features = ["json"] }
url = "2.5.0"
[dev-dependencies]


@@ -203,7 +203,7 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) {
"string" => (field_name, AllowedType::String),
"boolean" => (field_name, AllowedType::Boolean),
"number" => (field_name, AllowedType::Number),
// if the pattern isn't reconized, we keep the whole field.
// if the pattern isn't recognized, we keep the whole field.
_otherwise => (header, AllowedType::String),
},
None => (header, AllowedType::String),


@@ -22,7 +22,7 @@ use crate::heed_codec::{
};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::vector::{Embedding, EmbeddingConfig};
use crate::vector::EmbeddingConfig;
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
@@ -1516,42 +1516,6 @@ impl Index {
.unwrap_or_default())
}
pub fn embeddings(
&self,
rtxn: &RoTxn<'_>,
docid: DocumentId,
) -> Result<BTreeMap<String, Vec<Embedding>>> {
let mut res = BTreeMap::new();
for row in self.embedder_category_id.iter(rtxn)? {
let (embedder_name, embedder_id) = row?;
let embedder_id = (embedder_id as u16) << 8;
let mut embeddings = Vec::new();
'vectors: for i in 0..=u8::MAX {
let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata => Ok(None),
e => Err(e),
})
.transpose();
let Some(reader) = reader else {
break 'vectors;
};
let embedding = reader?.item_vector(rtxn, docid)?;
if let Some(embedding) = embedding {
embeddings.push(embedding)
} else {
break 'vectors;
}
}
res.insert(embedder_name.to_owned(), embeddings);
}
Ok(res)
}
pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
}


@@ -42,7 +42,7 @@ fn facet_number_values<'a>(
}
/// Define the strategy used by the geo sort.
/// The paramater represents the cache size, and, in the case of the Dynamic strategy,
/// The parameter represents the cache size, and, in the case of the Dynamic strategy,
/// the point where we move from using the iterative strategy to the rtree.
#[derive(Debug, Clone, Copy)]
pub enum Strategy {


@@ -22,7 +22,7 @@ pub enum SearchEvents {
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
RankingRuleEndIteration { ranking_rule_idx: usize },
RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
ExtendResults { new: Vec<u32> },
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
@@ -123,9 +123,12 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
&mut self,
ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<QueryGraph>,
_universe: &RoaringBitmap,
universe: &RoaringBitmap,
) {
self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx });
self.events.push(SearchEvents::RankingRuleEndIteration {
ranking_rule_idx,
universe_len: universe.len(),
});
self.location.pop();
}
fn add_to_results(&mut self, docids: &[u32]) {
@@ -323,7 +326,7 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_skip_bucket(bucket_len)?;
}
SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => {
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_end_iteration()?;
}


@@ -134,7 +134,7 @@ impl<'t> Matcher<'t, '_> {
for (token_position, word_position, word) in words_positions {
partial = match partial.match_token(word) {
// token matches the partial match, but the match is not full,
// we temporarly save the current token then we try to match the next one.
// we temporarily save the current token then we try to match the next one.
Some(MatchType::Partial(partial)) => {
potential_matches.push((token_position, word_position, partial.char_len()));
partial
@@ -722,7 +722,7 @@ mod tests {
@"…void void void void void split the world void void"
);
// Text containing matches with diferent density.
// Text containing matches with different density.
let text = "split void the void void world void void void void void void void void void void split the world void void";
let mut matcher = builder.build(text);
// crop should return 10 last words with a marker at the start.


@@ -119,7 +119,7 @@ pub fn located_query_terms_from_tokens(
if let Some(located_query_term) = phrase.build(ctx) {
// as we are evaluating a negative operator we put the phrase
// in the negative one *but* we don't reset the negative operator
// as we are immediatly starting a new negative phrase.
// as we are immediately starting a new negative phrase.
if negative_phrase {
negative_phrases.push(located_query_term);
} else {


@@ -499,7 +499,7 @@ impl FacetsUpdateIncrementalInner {
ModificationResult::Expand | ModificationResult::Reduce { .. }
)
{
// if any modification occured, insert it in the database.
// if any modification occurred, insert it in the database.
self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
Ok(insertion_key_modification)
} else {


@@ -37,7 +37,7 @@ pub struct ExtractedFacetValues {
/// Extracts the facet values of each faceted field of each document.
///
/// Returns the generated grenad reader containing the docid the fid and the orginal value as key
/// Returns the generated grenad reader containing the docid the fid and the original value as key
/// and the normalized value as value extracted from the given chunk of documents.
/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
@@ -45,6 +45,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
geo_fields_ids: Option<(FieldId, FieldId)>,
) -> Result<ExtractedFacetValues> {
puffin::profile_function!();
@@ -126,18 +127,12 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
add_exists.insert(document);
}
let del_geo_support = settings_diff
.old
.geo_fields_ids
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let add_geo_support = settings_diff
.new
.geo_fields_ids
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let geo_support =
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
let del_filterable_values =
del_value.map(|value| extract_facet_values(&value, del_geo_support));
del_value.map(|value| extract_facet_values(&value, geo_support));
let add_filterable_values =
add_value.map(|value| extract_facet_values(&value, add_geo_support));
add_value.map(|value| extract_facet_values(&value, geo_support));
// Those closures are just here to simplify things a bit.
let mut insert_numbers_diff = |del_numbers, add_numbers| {


@@ -8,7 +8,6 @@ use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::GeoError;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::extract_finite_float_from_value;
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::{FieldId, InternalError, Result};
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
@@ -19,7 +18,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
primary_key_id: FieldId,
settings_diff: &InnerIndexSettingsDiff,
(lat_fid, lng_fid): (FieldId, FieldId),
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
@@ -41,27 +40,47 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
serde_json::from_slice(document_id).unwrap()
};
// extract old version
let del_lat_lng =
extract_lat_lng(&obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
// extract new version
let add_lat_lng =
extract_lat_lng(&obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
// first we get the two fields
match (obkv.get(lat_fid), obkv.get(lng_fid)) {
(Some(lat), Some(lng)) => {
let deladd_lat_obkv = KvReaderDelAdd::new(lat);
let deladd_lng_obkv = KvReaderDelAdd::new(lng);
if del_lat_lng != add_lat_lng {
let mut obkv = KvWriterDelAdd::memory();
if let Some([lat, lng]) = del_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Deletion, bytes)?;
// then we extract the values
let del_lat_lng = deladd_lat_obkv
.get(DelAdd::Deletion)
.zip(deladd_lng_obkv.get(DelAdd::Deletion))
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
.transpose()?;
let add_lat_lng = deladd_lat_obkv
.get(DelAdd::Addition)
.zip(deladd_lng_obkv.get(DelAdd::Addition))
.map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
.transpose()?;
if del_lat_lng != add_lat_lng {
let mut obkv = KvWriterDelAdd::memory();
if let Some([lat, lng]) = del_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Deletion, bytes)?;
}
if let Some([lat, lng]) = add_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Addition, bytes)?;
}
let bytes = obkv.into_inner()?;
writer.insert(docid_bytes, bytes)?;
}
}
if let Some([lat, lng]) = add_lat_lng {
#[allow(clippy::drop_non_drop)]
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
obkv.insert(DelAdd::Addition, bytes)?;
(None, Some(_)) => {
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
}
let bytes = obkv.into_inner()?;
writer.insert(docid_bytes, bytes)?;
(Some(_), None) => {
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
}
(None, None) => (),
}
}
@@ -69,37 +88,16 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
}
/// Extract the finite floats lat and lng from two bytes slices.
fn extract_lat_lng(
document: &obkv::KvReader<FieldId>,
settings: &InnerIndexSettings,
deladd: DelAdd,
document_id: impl Fn() -> Value,
) -> Result<Option<[f64; 2]>> {
match settings.geo_fields_ids {
Some((lat_fid, lng_fid)) => {
let lat = document.get(lat_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
let lng = document.get(lng_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
let (lat, lng) = match (lat, lng) {
(Some(lat), Some(lng)) => (lat, lng),
(Some(_), None) => {
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
}
(None, Some(_)) => {
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
}
(None, None) => return Ok(None),
};
let lat = extract_finite_float_from_value(
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
)
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
let lat = extract_finite_float_from_value(
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
)
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
let lng = extract_finite_float_from_value(
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
)
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
Ok(Some([lat, lng]))
}
None => Ok(None),
}
let lng = extract_finite_float_from_value(
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
)
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
Ok([lat, lng])
}


@@ -11,7 +11,7 @@ mod extract_word_position_docids;
use std::fs::File;
use std::io::BufReader;
use std::sync::{Arc, OnceLock};
use std::sync::Arc;
use crossbeam_channel::Sender;
use rayon::prelude::*;
@@ -31,7 +31,7 @@ use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
/// Extract data for each databases from obkv documents in parallel.
/// Send data in grenad file over provided Sender.
@@ -43,6 +43,7 @@ pub(crate) fn data_from_obkv_documents(
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>,
) -> Result<()> {
@@ -71,6 +72,7 @@ pub(crate) fn data_from_obkv_documents(
indexer,
lmdb_writer_sx.clone(),
primary_key_id,
geo_fields_ids,
settings_diff.clone(),
max_positions_per_attributes,
)
@@ -213,18 +215,6 @@ fn run_extraction_task<FE, FS, M>(
})
}
fn request_threads() -> &'static ThreadPoolNoAbort {
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
REQUEST_THREADS.get_or_init(|| {
ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()
.unwrap()
})
}
/// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents
fn send_original_documents_data(
@@ -239,6 +229,11 @@ fn send_original_documents_data(
let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let request_threads = ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
let settings_diff = settings_diff.clone();
rayon::spawn(move || {
@@ -256,7 +251,7 @@ fn send_original_documents_data(
prompts,
indexer,
embedder.clone(),
request_threads(),
&request_threads,
) {
Ok(results) => Some(results),
Err(error) => {
@@ -305,6 +300,7 @@ fn send_and_extract_flattened_documents_data(
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>,
) -> Result<(
@@ -314,13 +310,12 @@ fn send_and_extract_flattened_documents_data(
let flattened_documents_chunk =
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
if settings_diff.run_geo_indexing() {
if let Some(geo_fields_ids) = geo_fields_ids {
let documents_chunk_cloned = flattened_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let settings_diff = settings_diff.clone();
rayon::spawn(move || {
let result =
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, &settings_diff);
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, geo_fields_ids);
let _ = match result {
Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
@@ -359,6 +354,7 @@ fn send_and_extract_flattened_documents_data(
flattened_documents_chunk.clone(),
indexer,
&settings_diff,
geo_fields_ids,
)?;
// send fid_docid_facet_numbers_chunk to DB writer


@@ -324,6 +324,28 @@ where
// get the primary key field id
let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap();
// get the fid of the `_geo.lat` and `_geo.lng` fields.
let mut field_id_map = self.index.fields_ids_map(self.wtxn)?;
// self.index.fields_ids_map($a)? ==>> field_id_map
let geo_fields_ids = match field_id_map.id("_geo") {
Some(gfid) => {
let is_sortable = self.index.sortable_fields_ids(self.wtxn)?.contains(&gfid);
let is_filterable = self.index.filterable_fields_ids(self.wtxn)?.contains(&gfid);
// if `_geo` is faceted then we get the `lat` and `lng`
if is_sortable || is_filterable {
let field_ids = field_id_map
.insert("_geo.lat")
.zip(field_id_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
} else {
None
}
}
None => None,
};
let pool_params = GrenadParameters {
chunk_compression_type: self.indexer_config.chunk_compression_type,
chunk_compression_level: self.indexer_config.chunk_compression_level,
@@ -390,6 +412,7 @@ where
pool_params,
lmdb_writer_sx.clone(),
primary_key_id,
geo_fields_ids,
settings_diff.clone(),
max_positions_per_attributes,
)


@@ -48,6 +48,7 @@ pub struct Transform<'a, 'i> {
fields_ids_map: FieldsIdsMap,
indexer_settings: &'a IndexerConfig,
pub autogenerate_docids: bool,
pub index_documents_method: IndexDocumentsMethod,
available_documents_ids: AvailableDocumentsIds,
@@ -101,7 +102,7 @@ impl<'a, 'i> Transform<'a, 'i> {
index: &'i Index,
indexer_settings: &'a IndexerConfig,
index_documents_method: IndexDocumentsMethod,
_autogenerate_docids: bool,
autogenerate_docids: bool,
) -> Result<Self> {
// We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch.
@@ -135,6 +136,7 @@ impl<'a, 'i> Transform<'a, 'i> {
index,
fields_ids_map: index.fields_ids_map(wtxn)?,
indexer_settings,
autogenerate_docids,
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
original_sorter,
flattened_sorter,


@@ -1161,11 +1161,6 @@ impl InnerIndexSettingsDiff {
pub fn settings_update_only(&self) -> bool {
self.settings_update_only
}
pub fn run_geo_indexing(&self) -> bool {
self.old.geo_fields_ids != self.new.geo_fields_ids
|| (!self.settings_update_only && self.new.geo_fields_ids.is_some())
}
}
#[derive(Clone)]
@@ -1182,7 +1177,6 @@ pub(crate) struct InnerIndexSettings {
pub proximity_precision: ProximityPrecision,
pub embedding_configs: EmbeddingConfigs,
pub existing_fields: HashSet<String>,
pub geo_fields_ids: Option<(FieldId, FieldId)>,
}
impl InnerIndexSettings {
@@ -1191,7 +1185,7 @@ impl InnerIndexSettings {
let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap());
let allowed_separators = index.allowed_separators(rtxn)?;
let dictionary = index.dictionary(rtxn)?;
let mut fields_ids_map = index.fields_ids_map(rtxn)?;
let fields_ids_map = index.fields_ids_map(rtxn)?;
let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?;
let user_defined_searchable_fields =
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
@@ -1206,24 +1200,6 @@ impl InnerIndexSettings {
.into_iter()
.filter_map(|(field, count)| (count != 0).then_some(field))
.collect();
// index.fields_ids_map($a)? ==>> fields_ids_map
let geo_fields_ids = match fields_ids_map.id("_geo") {
Some(gfid) => {
let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid);
let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid);
// if `_geo` is faceted then we get the `lat` and `lng`
if is_sortable || is_filterable {
let field_ids = fields_ids_map
.insert("_geo.lat")
.zip(fields_ids_map.insert("_geo.lng"))
.ok_or(UserError::AttributeLimitReached)?;
Some(field_ids)
} else {
None
}
}
None => None,
};
Ok(Self {
stop_words,
@@ -1238,7 +1214,6 @@ impl InnerIndexSettings {
proximity_precision,
embedding_configs,
existing_fields,
geo_fields_ids,
})
}


@@ -21,7 +21,7 @@ reqwest = { version = "0.11.23", features = [
"stream",
"json",
"rustls-tls",
], default-features = false }
], default_features = false }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = "1.0.111"
sha2 = "0.10.8"