Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-07-20 13:30:38 +00:00
Compare commits
9 Commits
prototype-… ... prototype-…
SHA1:
2036014e4d
486d78ab1d
b34df0a6a1
e4d82a2fe1
d256f77645
d33886f891
c76345dc46
a1edb6652a
ca03ca0b84
Cargo.lock (generated, 5 changed lines)
@@ -2627,6 +2627,7 @@ dependencies = [
  "num_cpus",
  "obkv",
  "once_cell",
+ "ordered-float",
  "parking_lot",
  "permissive-json-pointer",
  "pin-project-lite",
@@ -2976,9 +2977,9 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5"

 [[package]]
 name = "ordered-float"
-version = "3.6.0"
+version = "3.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13a384337e997e6860ffbaa83708b2ef329fd8c54cb67a5f64d421e0f943254f"
+checksum = "2fc2dbde8f8a79f2102cc474ceb0ad68e3b80b85289ea62389b60e66777e4213"
 dependencies = [
  "num-traits",
 ]
@@ -218,6 +218,7 @@ MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentFilter          , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentGeoField        , InvalidRequest , BAD_REQUEST ;
 InvalidVectorDimensions        , InvalidRequest , BAD_REQUEST ;
+InvalidVectorsType             , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentId              , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentLimit           , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentOffset          , InvalidRequest , BAD_REQUEST ;
@@ -334,6 +335,7 @@ impl ErrorCode for milli::Error {
             UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
             UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
             UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
+            UserError::InvalidVectorsType { .. } => Code::InvalidVectorsType,
             UserError::SortError(_) => Code::InvalidSearchSort,
             UserError::InvalidMinTypoWordLenSetting(_, _) => {
                 Code::InvalidSettingsTypoTolerance
@@ -48,6 +48,7 @@ mime = "0.3.17"
 num_cpus = "1.15.0"
 obkv = "0.2.0"
 once_cell = "1.17.1"
+ordered-float = "3.7.0"
 parking_lot = "0.12.1"
 permissive-json-pointer = { path = "../permissive-json-pointer" }
 pin-project-lite = "0.2.9"
@@ -9,13 +9,15 @@ use meilisearch_auth::IndexSearchRules;
 use meilisearch_types::deserr::DeserrJsonError;
 use meilisearch_types::error::deserr_codes::*;
 use meilisearch_types::index_uid::IndexUid;
+use meilisearch_types::milli::dot_product_similarity;
 use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
 use meilisearch_types::{milli, Document};
 use milli::tokenizer::TokenizerBuilder;
 use milli::{
     AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds, MatcherBuilder,
-    SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
+    SortError, TermsMatchingStrategy, VectorOrArrayOfVectors, DEFAULT_VALUES_PER_FACET,
 };
+use ordered_float::OrderedFloat;
 use regex::Regex;
 use serde::Serialize;
 use serde_json::{json, Value};
@@ -215,6 +217,8 @@ pub struct SearchHit {
 pub struct SearchResult {
     pub hits: Vec<SearchHit>,
     pub query: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub vector: Option<Vec<f32>>,
     pub processing_time_ms: u128,
     #[serde(flatten)]
     pub hits_info: HitsInfo,
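Two serde attributes shape the JSON response here: `skip_serializing_if` drops the new `vector` field entirely for non-vector queries, while `flatten` splices the pagination fields into the top level. A minimal standalone sketch of the same pattern (the `HitsInfo` stand-in below is simplified for illustration):

```rust
use serde::Serialize;

#[derive(Serialize)]
struct HitsInfo {
    // stands in for the real pagination fields
    total_hits: u64,
}

#[derive(Serialize)]
struct SearchResult {
    query: String,
    // omitted from the JSON entirely when `None`, so text-only
    // searches carry no `vector` key at all
    #[serde(skip_serializing_if = "Option::is_none")]
    vector: Option<Vec<f32>>,
    // serialized as if its fields were declared on SearchResult itself
    #[serde(flatten)]
    hits_info: HitsInfo,
}

fn main() {
    let result =
        SearchResult { query: "hello".into(), vector: None, hits_info: HitsInfo { total_hits: 0 } };
    // prints: {"query":"hello","total_hits":0}
    println!("{}", serde_json::to_string(&result).unwrap());
}
```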
@@ -399,7 +403,6 @@ pub fn perform_search(
     formatter_builder.highlight_suffix(query.highlight_post_tag);

     let mut documents = Vec::new();
-
     let documents_iter = index.documents(&rtxn, documents_ids)?;

     for (_id, obkv) in documents_iter {
@@ -426,6 +429,12 @@ pub fn perform_search(
             insert_geo_distance(sort, &mut document);
         }

+        if let Some(vector) = query.vector.as_ref() {
+            if let Some(vectors) = extract_field("_vectors", &fields_ids_map, obkv)? {
+                insert_semantic_similarity(vector, vectors, &mut document);
+            }
+        }
+
         let hit = SearchHit { document, formatted, matches_position };
         documents.push(hit);
     }
@@ -475,7 +484,8 @@ pub fn perform_search(
     let result = SearchResult {
         hits: documents,
         hits_info,
-        query: query.q.clone().unwrap_or_default(),
+        query: query.q.unwrap_or_default(),
+        vector: query.vector,
         processing_time_ms: before_search.elapsed().as_millis(),
         facet_distribution,
         facet_stats,
@@ -499,6 +509,20 @@ fn insert_geo_distance(sorts: &[String], document: &mut Document) {
     }
 }

+fn insert_semantic_similarity(query: &[f32], vectors: Value, document: &mut Document) {
+    let vectors =
+        match serde_json::from_value(vectors).map(VectorOrArrayOfVectors::into_array_of_vectors) {
+            Ok(vectors) => vectors,
+            Err(_) => return,
+        };
+    let similarity = vectors
+        .into_iter()
+        .map(|v| OrderedFloat(dot_product_similarity(query, &v)))
+        .max()
+        .map(OrderedFloat::into_inner);
+    document.insert("_semanticSimilarity".to_string(), json!(similarity));
+}
+
 fn compute_formatted_options(
     attr_to_highlight: &HashSet<String>,
     attr_to_crop: &[String],
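A document may store several embeddings under `_vectors`, so the hit is scored by its best-matching one: the maximum dot product over all stored vectors, wrapped in `OrderedFloat` because `f32` has no total order. A standalone sketch of that scoring step, with the meilisearch-specific types stripped away:

```rust
use ordered_float::OrderedFloat;

fn dot_product_similarity(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(a, b)| a * b).sum()
}

fn main() {
    let query = [1.0_f32, 0.0];
    // a document with two stored vectors
    let vectors = vec![vec![0.0_f32, 1.0], vec![0.8, 0.6]];
    // f32 has no total order, so wrap each score in OrderedFloat to use `.max()`
    let similarity = vectors
        .iter()
        .map(|v| OrderedFloat(dot_product_similarity(&query, v)))
        .max()
        .map(OrderedFloat::into_inner);
    assert_eq!(similarity, Some(0.8)); // the second stored vector matches best
}
```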
@@ -626,6 +650,22 @@ fn make_document(
     Ok(document)
 }

+/// Extracts the JSON value under the specified field name.
+/// Nested objects are not supported.
+fn extract_field(
+    field_name: &str,
+    field_ids_map: &FieldsIdsMap,
+    obkv: obkv::KvReaderU16,
+) -> Result<Option<serde_json::Value>, MeilisearchHttpError> {
+    match field_ids_map.id(field_name) {
+        Some(fid) => match obkv.get(fid) {
+            Some(value) => Ok(serde_json::from_slice(value).map(Some)?),
+            None => Ok(None),
+        },
+        None => Ok(None),
+    }
+}
+
 fn format_fields<A: AsRef<[u8]>>(
     document: &Document,
     field_ids_map: &FieldsIdsMap,
@@ -12,13 +12,18 @@ impl Metric<Vec<f32>> for DotProduct {
     //
     // Following <https://docs.rs/space/0.17.0/space/trait.Metric.html>.
     fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
-        let dist: f32 = a.iter().zip(b).map(|(a, b)| a * b).sum();
-        let dist = 1.0 - dist;
+        let dist = 1.0 - dot_product_similarity(a, b);
         debug_assert!(!dist.is_nan());
         dist.to_bits()
     }
 }

+/// Returns the dot product similarity score, which will be between 0.0 and 1.0
+/// if both vectors are normalized. The higher the score, the more similar the vectors.
+pub fn dot_product_similarity(a: &[f32], b: &[f32]) -> f32 {
+    a.iter().zip(b).map(|(a, b)| a * b).sum()
+}
+
 #[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
 pub struct Euclidean;

@@ -26,9 +31,14 @@ impl Metric<Vec<f32>> for Euclidean {
     type Unit = u32;

     fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
-        let squared: f32 = a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum();
-        let dist = squared.sqrt();
+        let dist = euclidean_squared_distance(a, b).sqrt();
         debug_assert!(!dist.is_nan());
         dist.to_bits()
     }
 }

+/// Returns the squared Euclidean distance between both vectors, which will be
+/// between 0.0 and +inf. The smaller the distance, the nearer the vectors.
+pub fn euclidean_squared_distance(a: &[f32], b: &[f32]) -> f32 {
+    a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum()
+}
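Both metrics return `dist.to_bits()` because `space::Metric` wants an unsigned-integer unit. This is sound as long as the distance stays non-negative: for non-negative finite `f32` values, the IEEE-754 bit pattern grows monotonically with the value, so comparing the bits orders candidates exactly like comparing the floats. A small standalone sketch of that property:

```rust
// For non-negative finite f32 values, the raw bit pattern increases
// monotonically with the value, so integer comparison of `to_bits()`
// agrees with float comparison of the distances themselves.
fn main() {
    let distances = [0.0_f32, 0.25, 0.5, 1.0, 2.0];
    for pair in distances.windows(2) {
        assert!(pair[0].to_bits() < pair[1].to_bits());
    }
    println!("bit order matches value order for non-negative floats");
}
```

With both index-time and query-time vectors normalized (see `normalize_vector` below), the dot product stays bounded, keeping `1.0 - dot` non-negative in practice.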
@@ -112,6 +112,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).
     InvalidGeoField(#[from] GeoError),
     #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
     InvalidVectorDimensions { expected: usize, found: usize },
+    #[error("The `_vectors` field in the document with the id: `{document_id}` is not an array. Was expecting an array of floats or an array of arrays of floats but instead got `{value}`.")]
+    InvalidVectorsType { document_id: Value, value: Value },
     #[error("{0}")]
     InvalidFilter(String),
     #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
@@ -30,6 +30,7 @@ use std::convert::{TryFrom, TryInto};
 use std::hash::BuildHasherDefault;

 use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
+pub use distance::{dot_product_similarity, euclidean_squared_distance};
 pub use filter_parser::{Condition, FilterCondition, Span, Token};
 use fxhash::{FxHasher32, FxHasher64};
 pub use grenad::CompressionType;
@@ -284,6 +285,35 @@ pub fn normalize_facet(original: &str) -> String {
     CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase()
 }

+/// Represents either a single vector or an array of multiple vectors.
+#[derive(serde::Serialize, serde::Deserialize, Debug)]
+#[serde(transparent)]
+pub struct VectorOrArrayOfVectors {
+    #[serde(with = "either::serde_untagged")]
+    inner: either::Either<Vec<f32>, Vec<Vec<f32>>>,
+}
+
+impl VectorOrArrayOfVectors {
+    pub fn into_array_of_vectors(self) -> Vec<Vec<f32>> {
+        match self.inner {
+            either::Either::Left(vector) => vec![vector],
+            either::Either::Right(vectors) => vectors,
+        }
+    }
+}
+
+/// Normalize a vector by dividing each dimension by its length.
+pub fn normalize_vector(mut vector: Vec<f32>) -> Vec<f32> {
+    let squared: f32 = vector.iter().map(|x| x * x).sum();
+    let length = squared.sqrt();
+    if length <= f32::EPSILON {
+        vector
+    } else {
+        vector.iter_mut().for_each(|x| *x /= length);
+        vector
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use serde_json::json;
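`#[serde(transparent)]` combined with `either::serde_untagged` is what lets `_vectors` hold either a single embedding or a list of embeddings with no wrapper key. A minimal standalone sketch of the same deserialization behavior (assuming the `either` crate with its serde support, as the diff itself uses):

```rust
use either::Either;

#[derive(serde::Deserialize, Debug)]
#[serde(transparent)]
struct VectorOrArrayOfVectors {
    #[serde(with = "either::serde_untagged")]
    inner: Either<Vec<f32>, Vec<Vec<f32>>>,
}

fn main() {
    // a single embedding deserializes into the `Left` variant...
    let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.9]").unwrap();
    assert!(matches!(one.inner, Either::Left(_)));

    // ...and a list of embeddings into the `Right` variant
    let many: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.9], [1.0, 0.0]]").unwrap();
    assert!(matches!(many.inner, Either::Right(_)));
}
```

The untagged deserializer simply tries `Vec<f32>` first and falls back to `Vec<Vec<f32>>`, which is why no explicit tag is needed in documents.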
@@ -48,7 +48,8 @@ use self::graph_based_ranking_rule::Words;
 use self::interner::Interned;
 use crate::search::new::distinct::apply_distinct_rule;
 use crate::{
-    AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, BEU32,
+    normalize_vector, AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy,
+    UserError, BEU32,
 };

 /// A structure used throughout the execution of a search query.
@@ -440,7 +441,8 @@ pub fn execute_search(
         let hnsw = ctx.index.vector_hnsw(ctx.txn)?.unwrap_or_default();
         let ef = hnsw.len().min(100);
         let mut dest = vec![Neighbor { index: 0, distance: 0 }; ef];
-        let neighbors = hnsw.nearest(vector, ef, &mut searcher, &mut dest[..]);
+        let vector = normalize_vector(vector.clone());
+        let neighbors = hnsw.nearest(&vector, ef, &mut searcher, &mut dest[..]);

         let mut docids = Vec::new();
         for Neighbor { index, distance: _ } in neighbors.iter() {
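Normalizing the query vector before the HNSW lookup mirrors the normalization applied to indexed vectors in the `typed_chunk.rs` change further down. With both sides unit-length, the dot product equals the cosine of the angle between them, so the `DotProduct` metric's `1.0 - dot` distance ranks neighbors by direction rather than magnitude. A standalone sketch of that effect:

```rust
// After normalization the dot product equals the cosine of the angle
// between the vectors, regardless of their original magnitudes.
fn normalize_vector(mut v: Vec<f32>) -> Vec<f32> {
    let length = v.iter().map(|x| x * x).sum::<f32>().sqrt();
    if length > f32::EPSILON {
        v.iter_mut().for_each(|x| *x /= length);
    }
    v
}

fn dot(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(a, b)| a * b).sum()
}

fn main() {
    // same direction, very different magnitudes
    let query = normalize_vector(vec![10.0, 0.0]);
    let stored = normalize_vector(vec![0.001, 0.0]);
    assert!((dot(&query, &stored) - 1.0).abs() < 1e-6); // cosine = 1.0
}
```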
@@ -1,20 +1,23 @@
+use std::convert::TryFrom;
 use std::fs::File;
 use std::io;

 use bytemuck::cast_slice;
-use serde_json::from_slice;
+use serde_json::{from_slice, Value};

 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
-use crate::{FieldId, InternalError, Result};
+use crate::error::UserError;
+use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};

-/// Extracts the embedding vector contained in each document under the `_vector` field.
+/// Extracts the embedding vector contained in each document under the `_vectors` field.
 ///
 /// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
 #[logging_timer::time]
 pub fn extract_vector_points<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
-    vector_fid: FieldId,
+    primary_key_id: FieldId,
+    vectors_fid: FieldId,
 ) -> Result<grenad::Reader<File>> {
     let mut writer = create_writer(
         indexer.chunk_compression_type,
@@ -26,14 +29,40 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
     while let Some((docid_bytes, value)) = cursor.move_on_next()? {
         let obkv = obkv::KvReader::new(value);

-        // first we get the _vector field
-        if let Some(vector) = obkv.get(vector_fid) {
-            // try to extract the vector
-            let vector: Vec<f32> = from_slice(vector).map_err(InternalError::SerdeJson).unwrap();
-            let bytes = cast_slice(&vector);
-            writer.insert(docid_bytes, bytes)?;
+        // since we only need the primary key when we throw an error, we create this getter to
+        // lazily get it when needed
+        let document_id = || -> Value {
+            let document_id = obkv.get(primary_key_id).unwrap();
+            serde_json::from_slice(document_id).unwrap()
+        };
+
+        // first we retrieve the _vectors field
+        if let Some(vectors) = obkv.get(vectors_fid) {
+            // extract the vectors
+            let vectors = match from_slice(vectors) {
+                Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors),
+                Err(_) => {
+                    return Err(UserError::InvalidVectorsType {
+                        document_id: document_id(),
+                        value: from_slice(vectors).map_err(InternalError::SerdeJson)?,
+                    }
+                    .into())
+                }
+            };
+
+            for (i, vector) in vectors.into_iter().enumerate() {
+                match u16::try_from(i) {
+                    Ok(i) => {
+                        let mut key = docid_bytes.to_vec();
+                        key.extend_from_slice(&i.to_ne_bytes());
+                        let bytes = cast_slice(&vector);
+                        writer.insert(key, bytes)?;
+                    }
+                    Err(_) => continue,
+                }
+            }
         }
-        // else => the _vector object was `null`, there is nothing to do
+        // else => the `_vectors` object was `null`, there is nothing to do
     }

     writer_into_reader(writer)
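Since a document can now carry several vectors, each grenad key is the document id followed by the vector's `u16` index in native-endian bytes; the `typed_chunk.rs` change below peels the leading docid bytes back off with `try_split_array_at`. A standalone sketch of that key round trip (the helper here is a hand-rolled stand-in for milli's, whose exact signature may differ):

```rust
// hand-rolled equivalent of milli's `try_split_array_at` helper
fn try_split_array_at<const N: usize>(slice: &[u8]) -> Option<([u8; N], &[u8])> {
    if slice.len() < N {
        return None;
    }
    let (head, tail) = slice.split_at(N);
    Some((head.try_into().unwrap(), tail))
}

fn main() {
    let docid: u32 = 42;
    let vector_index: u16 = 3; // this document's fourth vector

    // build the key: big-endian docid followed by the native-endian vector index
    let mut key = docid.to_be_bytes().to_vec();
    key.extend_from_slice(&vector_index.to_ne_bytes());

    // later, recover the docid by splitting the 4 leading bytes back off
    let (left, _index) = try_split_array_at::<4>(&key).unwrap();
    assert_eq!(u32::from_be_bytes(left), 42);
}
```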
@@ -47,7 +47,7 @@ pub(crate) fn data_from_obkv_documents(
     faceted_fields: HashSet<FieldId>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    vector_field_id: Option<FieldId>,
+    vectors_field_id: Option<FieldId>,
     stop_words: Option<fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
     exact_attributes: HashSet<FieldId>,
@@ -72,7 +72,7 @@ pub(crate) fn data_from_obkv_documents(
             &faceted_fields,
             primary_key_id,
             geo_fields_ids,
-            vector_field_id,
+            vectors_field_id,
             &stop_words,
             max_positions_per_attributes,
         )
@@ -283,7 +283,7 @@ fn send_and_extract_flattened_documents_data(
     faceted_fields: &HashSet<FieldId>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    vector_field_id: Option<FieldId>,
+    vectors_field_id: Option<FieldId>,
     stop_words: &Option<fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<(
@@ -312,11 +312,16 @@ fn send_and_extract_flattened_documents_data(
         });
     }

-    if let Some(vector_field_id) = vector_field_id {
+    if let Some(vectors_field_id) = vectors_field_id {
         let documents_chunk_cloned = flattened_documents_chunk.clone();
         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
         rayon::spawn(move || {
-            let result = extract_vector_points(documents_chunk_cloned, indexer, vector_field_id);
+            let result = extract_vector_points(
+                documents_chunk_cloned,
+                indexer,
+                primary_key_id,
+                vectors_field_id,
+            );
             let _ = match result {
                 Ok(vector_points) => {
                     lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
@@ -304,8 +304,8 @@ where
             }
             None => None,
         };
-        // get the fid of the `_vector` field.
-        let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector");
+        // get the fid of the `_vectors` field.
+        let vectors_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vectors");

         let stop_words = self.index.stop_words(self.wtxn)?;
         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
@@ -342,7 +342,7 @@ where
             faceted_fields,
             primary_key_id,
             geo_fields_ids,
-            vector_field_id,
+            vectors_field_id,
             stop_words,
             max_positions_per_attributes,
             exact_attributes,
@@ -20,8 +20,11 @@ use super::{ClonableMmap, MergeFn};
 use crate::error::UserError;
 use crate::facet::FacetType;
 use crate::update::facet::FacetsUpdate;
-use crate::update::index_documents::helpers::as_cloneable_grenad;
-use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
+use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
+use crate::{
+    lat_lng_to_xyz, normalize_vector, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result,
+    BEU32,
+};

 pub(crate) enum TypedChunk {
     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
@@ -241,7 +244,8 @@ pub(crate) fn write_typed_chunk_into_index(
             let mut cursor = vector_points.into_cursor()?;
             while let Some((key, value)) = cursor.move_on_next()? {
                 // convert the key back to a u32 (4 bytes)
-                let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
+                let (left, _index) = try_split_array_at(key).unwrap();
+                let docid = DocumentId::from_be_bytes(left);
                 // convert the vector back to a Vec<f32>
                 let vector: Vec<f32> = pod_collect_to_vec(value);

@@ -252,6 +256,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 return Err(UserError::InvalidVectorDimensions { expected, found })?;
             }

+            let vector = normalize_vector(vector);
             let vector_id = hnsw.insert(vector, &mut searcher) as u32;
             index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
         }