Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-07-20 13:30:38 +00:00
Compare commits
9 Commits
prototype-… ... prototype-…
SHA1:
2036014e4d
486d78ab1d
b34df0a6a1
e4d82a2fe1
d256f77645
d33886f891
c76345dc46
a1edb6652a
ca03ca0b84
Cargo.lock (generated, 5 changed lines)
@@ -2627,6 +2627,7 @@ dependencies = [
  "num_cpus",
  "obkv",
  "once_cell",
+ "ordered-float",
  "parking_lot",
  "permissive-json-pointer",
  "pin-project-lite",
@@ -2976,9 +2977,9 @@ checksum = "624a8340c38c1b80fd549087862da4ba43e08858af025b236e509b6649fc13d5"

 [[package]]
 name = "ordered-float"
-version = "3.6.0"
+version = "3.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "13a384337e997e6860ffbaa83708b2ef329fd8c54cb67a5f64d421e0f943254f"
+checksum = "2fc2dbde8f8a79f2102cc474ceb0ad68e3b80b85289ea62389b60e66777e4213"
 dependencies = [
  "num-traits",
 ]
@@ -218,6 +218,7 @@ MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentFilter          , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentGeoField        , InvalidRequest , BAD_REQUEST ;
 InvalidVectorDimensions        , InvalidRequest , BAD_REQUEST ;
+InvalidVectorsType             , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentId              , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentLimit           , InvalidRequest , BAD_REQUEST ;
 InvalidDocumentOffset          , InvalidRequest , BAD_REQUEST ;
@@ -334,6 +335,7 @@ impl ErrorCode for milli::Error {
             UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
             UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
             UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
+            UserError::InvalidVectorsType { .. } => Code::InvalidVectorsType,
             UserError::SortError(_) => Code::InvalidSearchSort,
             UserError::InvalidMinTypoWordLenSetting(_, _) => {
                 Code::InvalidSettingsTypoTolerance
@@ -48,6 +48,7 @@ mime = "0.3.17"
 num_cpus = "1.15.0"
 obkv = "0.2.0"
 once_cell = "1.17.1"
+ordered-float = "3.7.0"
 parking_lot = "0.12.1"
 permissive-json-pointer = { path = "../permissive-json-pointer" }
 pin-project-lite = "0.2.9"
@@ -9,13 +9,15 @@ use meilisearch_auth::IndexSearchRules;
 use meilisearch_types::deserr::DeserrJsonError;
 use meilisearch_types::error::deserr_codes::*;
 use meilisearch_types::index_uid::IndexUid;
+use meilisearch_types::milli::dot_product_similarity;
 use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
 use meilisearch_types::{milli, Document};
 use milli::tokenizer::TokenizerBuilder;
 use milli::{
     AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds, MatcherBuilder,
-    SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
+    SortError, TermsMatchingStrategy, VectorOrArrayOfVectors, DEFAULT_VALUES_PER_FACET,
 };
+use ordered_float::OrderedFloat;
 use regex::Regex;
 use serde::Serialize;
 use serde_json::{json, Value};
@@ -215,6 +217,8 @@ pub struct SearchHit {
 pub struct SearchResult {
     pub hits: Vec<SearchHit>,
     pub query: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub vector: Option<Vec<f32>>,
     pub processing_time_ms: u128,
     #[serde(flatten)]
     pub hits_info: HitsInfo,
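Two serde attributes shape the JSON response here: `skip_serializing_if` drops the new `vector` field entirely for non-vector queries, while `flatten` splices the pagination fields into the top level. A minimal standalone sketch of the same pattern (the `HitsInfo` stand-in below is simplified for illustration):

```rust
use serde::Serialize;

#[derive(Serialize)]
struct HitsInfo {
    // stands in for the real pagination fields
    total_hits: u64,
}

#[derive(Serialize)]
struct SearchResult {
    query: String,
    // omitted from the JSON entirely when `None`, so text-only
    // searches carry no `vector` key at all
    #[serde(skip_serializing_if = "Option::is_none")]
    vector: Option<Vec<f32>>,
    // serialized as if its fields were declared on SearchResult itself
    #[serde(flatten)]
    hits_info: HitsInfo,
}

fn main() {
    let result =
        SearchResult { query: "hello".into(), vector: None, hits_info: HitsInfo { total_hits: 0 } };
    // prints: {"query":"hello","total_hits":0}
    println!("{}", serde_json::to_string(&result).unwrap());
}
```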
@@ -399,7 +403,6 @@ pub fn perform_search(
     formatter_builder.highlight_suffix(query.highlight_post_tag);

     let mut documents = Vec::new();
-
     let documents_iter = index.documents(&rtxn, documents_ids)?;

     for (_id, obkv) in documents_iter {
@@ -426,6 +429,12 @@ pub fn perform_search(
             insert_geo_distance(sort, &mut document);
         }

+        if let Some(vector) = query.vector.as_ref() {
+            if let Some(vectors) = extract_field("_vectors", &fields_ids_map, obkv)? {
+                insert_semantic_similarity(vector, vectors, &mut document);
+            }
+        }
+
         let hit = SearchHit { document, formatted, matches_position };
         documents.push(hit);
     }
@@ -475,7 +484,8 @@ pub fn perform_search(
     let result = SearchResult {
         hits: documents,
         hits_info,
-        query: query.q.clone().unwrap_or_default(),
+        query: query.q.unwrap_or_default(),
+        vector: query.vector,
         processing_time_ms: before_search.elapsed().as_millis(),
         facet_distribution,
         facet_stats,
@@ -499,6 +509,20 @@ fn insert_geo_distance(sorts: &[String], document: &mut Document) {
     }
 }

+fn insert_semantic_similarity(query: &[f32], vectors: Value, document: &mut Document) {
+    let vectors =
+        match serde_json::from_value(vectors).map(VectorOrArrayOfVectors::into_array_of_vectors) {
+            Ok(vectors) => vectors,
+            Err(_) => return,
+        };
+    let similarity = vectors
+        .into_iter()
+        .map(|v| OrderedFloat(dot_product_similarity(query, &v)))
+        .max()
+        .map(OrderedFloat::into_inner);
+    document.insert("_semanticSimilarity".to_string(), json!(similarity));
+}
+
 fn compute_formatted_options(
     attr_to_highlight: &HashSet<String>,
     attr_to_crop: &[String],
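A document may store several embeddings under `_vectors`, so the hit is scored by its best-matching one: the maximum dot product over all stored vectors, wrapped in `OrderedFloat` because `f32` has no total order. A standalone sketch of that scoring step, with the meilisearch-specific types stripped away:

```rust
use ordered_float::OrderedFloat;

fn dot_product_similarity(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(a, b)| a * b).sum()
}

fn main() {
    let query = [1.0_f32, 0.0];
    // a document with two stored vectors
    let vectors = vec![vec![0.0_f32, 1.0], vec![0.8, 0.6]];
    // f32 has no total order, so wrap each score in OrderedFloat to use `.max()`
    let similarity = vectors
        .iter()
        .map(|v| OrderedFloat(dot_product_similarity(&query, v)))
        .max()
        .map(OrderedFloat::into_inner);
    assert_eq!(similarity, Some(0.8)); // the second stored vector matches best
}
```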
@@ -626,6 +650,22 @@ fn make_document(
     Ok(document)
 }

+/// Extracts the JSON value under the specified field name.
+/// Nested objects are not supported.
+fn extract_field(
+    field_name: &str,
+    field_ids_map: &FieldsIdsMap,
+    obkv: obkv::KvReaderU16,
+) -> Result<Option<serde_json::Value>, MeilisearchHttpError> {
+    match field_ids_map.id(field_name) {
+        Some(fid) => match obkv.get(fid) {
+            Some(value) => Ok(serde_json::from_slice(value).map(Some)?),
+            None => Ok(None),
+        },
+        None => Ok(None),
+    }
+}
+
 fn format_fields<A: AsRef<[u8]>>(
     document: &Document,
     field_ids_map: &FieldsIdsMap,
@@ -12,13 +12,18 @@ impl Metric<Vec<f32>> for DotProduct {
     //
     // Following <https://docs.rs/space/0.17.0/space/trait.Metric.html>.
     fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
-        let dist: f32 = a.iter().zip(b).map(|(a, b)| a * b).sum();
-        let dist = 1.0 - dist;
+        let dist = 1.0 - dot_product_similarity(a, b);
         debug_assert!(!dist.is_nan());
         dist.to_bits()
     }
 }

+/// Returns the dot product similarity score, which will be between 0.0 and 1.0
+/// if both vectors are normalized. The higher the score, the more similar the vectors.
+pub fn dot_product_similarity(a: &[f32], b: &[f32]) -> f32 {
+    a.iter().zip(b).map(|(a, b)| a * b).sum()
+}
+
 #[derive(Debug, Default, Clone, Copy, Serialize, Deserialize)]
 pub struct Euclidean;

@@ -26,9 +31,14 @@ impl Metric<Vec<f32>> for Euclidean {
     type Unit = u32;

     fn distance(&self, a: &Vec<f32>, b: &Vec<f32>) -> Self::Unit {
-        let squared: f32 = a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum();
-        let dist = squared.sqrt();
+        let dist = euclidean_squared_distance(a, b).sqrt();
         debug_assert!(!dist.is_nan());
         dist.to_bits()
     }
 }

+/// Returns the squared Euclidean distance between both vectors, which will be
+/// between 0.0 and +inf. The smaller the distance, the nearer the vectors.
+pub fn euclidean_squared_distance(a: &[f32], b: &[f32]) -> f32 {
+    a.iter().zip(b).map(|(a, b)| (a - b).powi(2)).sum()
+}
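Both metrics return `dist.to_bits()` because `space::Metric` wants an unsigned-integer unit. This is sound as long as the distance stays non-negative: for non-negative finite `f32` values, the IEEE-754 bit pattern grows monotonically with the value, so comparing the bits orders candidates exactly like comparing the floats. A small standalone sketch of that property:

```rust
// For non-negative finite f32 values, the raw bit pattern increases
// monotonically with the value, so integer comparison of `to_bits()`
// agrees with float comparison of the distances themselves.
fn main() {
    let distances = [0.0_f32, 0.25, 0.5, 1.0, 2.0];
    for pair in distances.windows(2) {
        assert!(pair[0].to_bits() < pair[1].to_bits());
    }
    println!("bit order matches value order for non-negative floats");
}
```

With both index-time and query-time vectors normalized (see `normalize_vector` below), the dot product stays bounded, keeping `1.0 - dot` non-negative in practice.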
@@ -112,6 +112,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).
     InvalidGeoField(#[from] GeoError),
     #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
     InvalidVectorDimensions { expected: usize, found: usize },
+    #[error("The `_vectors` field in the document with the id: `{document_id}` is not an array. Was expecting an array of floats or an array of arrays of floats but instead got `{value}`.")]
+    InvalidVectorsType { document_id: Value, value: Value },
     #[error("{0}")]
     InvalidFilter(String),
     #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
@@ -30,6 +30,7 @@ use std::convert::{TryFrom, TryInto};
 use std::hash::BuildHasherDefault;

 use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer};
+pub use distance::{dot_product_similarity, euclidean_squared_distance};
 pub use filter_parser::{Condition, FilterCondition, Span, Token};
 use fxhash::{FxHasher32, FxHasher64};
 pub use grenad::CompressionType;
@@ -284,6 +285,35 @@ pub fn normalize_facet(original: &str) -> String {
     CompatibilityDecompositionNormalizer.normalize_str(original.trim()).to_lowercase()
 }

+/// Represents either a single vector or an array of multiple vectors.
+#[derive(serde::Serialize, serde::Deserialize, Debug)]
+#[serde(transparent)]
+pub struct VectorOrArrayOfVectors {
+    #[serde(with = "either::serde_untagged")]
+    inner: either::Either<Vec<f32>, Vec<Vec<f32>>>,
+}
+
+impl VectorOrArrayOfVectors {
+    pub fn into_array_of_vectors(self) -> Vec<Vec<f32>> {
+        match self.inner {
+            either::Either::Left(vector) => vec![vector],
+            either::Either::Right(vectors) => vectors,
+        }
+    }
+}
+
+/// Normalize a vector by dividing each dimension by its length.
+pub fn normalize_vector(mut vector: Vec<f32>) -> Vec<f32> {
+    let squared: f32 = vector.iter().map(|x| x * x).sum();
+    let length = squared.sqrt();
+    if length <= f32::EPSILON {
+        vector
+    } else {
+        vector.iter_mut().for_each(|x| *x /= length);
+        vector
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use serde_json::json;
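`#[serde(transparent)]` combined with `either::serde_untagged` is what lets `_vectors` hold either a single embedding or a list of embeddings with no wrapper key. A minimal standalone sketch of the same deserialization behavior (assuming the `either` crate with its serde support, as the diff itself uses):

```rust
use either::Either;

#[derive(serde::Deserialize, Debug)]
#[serde(transparent)]
struct VectorOrArrayOfVectors {
    #[serde(with = "either::serde_untagged")]
    inner: Either<Vec<f32>, Vec<Vec<f32>>>,
}

fn main() {
    // a single embedding deserializes into the `Left` variant...
    let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.9]").unwrap();
    assert!(matches!(one.inner, Either::Left(_)));

    // ...and a list of embeddings into the `Right` variant
    let many: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.9], [1.0, 0.0]]").unwrap();
    assert!(matches!(many.inner, Either::Right(_)));
}
```

The untagged deserializer simply tries `Vec<f32>` first and falls back to `Vec<Vec<f32>>`, which is why no explicit tag is needed in documents.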
@@ -48,7 +48,8 @@ use self::graph_based_ranking_rule::Words;
 use self::interner::Interned;
 use crate::search::new::distinct::apply_distinct_rule;
 use crate::{
-    AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError, BEU32,
+    normalize_vector, AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy,
+    UserError, BEU32,
 };

 /// A structure used throughout the execution of a search query.
@@ -440,7 +441,8 @@ pub fn execute_search(
         let hnsw = ctx.index.vector_hnsw(ctx.txn)?.unwrap_or_default();
         let ef = hnsw.len().min(100);
         let mut dest = vec![Neighbor { index: 0, distance: 0 }; ef];
-        let neighbors = hnsw.nearest(vector, ef, &mut searcher, &mut dest[..]);
+        let vector = normalize_vector(vector.clone());
+        let neighbors = hnsw.nearest(&vector, ef, &mut searcher, &mut dest[..]);

         let mut docids = Vec::new();
         for Neighbor { index, distance: _ } in neighbors.iter() {
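Normalizing the query vector before the HNSW lookup mirrors the normalization applied to indexed vectors in the `typed_chunk.rs` change further down. With both sides unit-length, the dot product equals the cosine of the angle between them, so the `DotProduct` metric's `1.0 - dot` distance ranks neighbors by direction rather than magnitude. A standalone sketch of that effect:

```rust
// After normalization the dot product equals the cosine of the angle
// between the vectors, regardless of their original magnitudes.
fn normalize_vector(mut v: Vec<f32>) -> Vec<f32> {
    let length = v.iter().map(|x| x * x).sum::<f32>().sqrt();
    if length > f32::EPSILON {
        v.iter_mut().for_each(|x| *x /= length);
    }
    v
}

fn dot(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b).map(|(a, b)| a * b).sum()
}

fn main() {
    // same direction, very different magnitudes
    let query = normalize_vector(vec![10.0, 0.0]);
    let stored = normalize_vector(vec![0.001, 0.0]);
    assert!((dot(&query, &stored) - 1.0).abs() < 1e-6); // cosine = 1.0
}
```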
@@ -1,20 +1,23 @@
+use std::convert::TryFrom;
 use std::fs::File;
 use std::io;

 use bytemuck::cast_slice;
-use serde_json::from_slice;
+use serde_json::{from_slice, Value};

 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
-use crate::{FieldId, InternalError, Result};
+use crate::error::UserError;
+use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};

-/// Extracts the embedding vector contained in each document under the `_vector` field.
+/// Extracts the embedding vector contained in each document under the `_vectors` field.
 ///
 /// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
 #[logging_timer::time]
 pub fn extract_vector_points<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
-    vector_fid: FieldId,
+    primary_key_id: FieldId,
+    vectors_fid: FieldId,
 ) -> Result<grenad::Reader<File>> {
     let mut writer = create_writer(
         indexer.chunk_compression_type,
@@ -26,14 +29,40 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
     while let Some((docid_bytes, value)) = cursor.move_on_next()? {
         let obkv = obkv::KvReader::new(value);

-        // first we get the _vector field
-        if let Some(vector) = obkv.get(vector_fid) {
-            // try to extract the vector
-            let vector: Vec<f32> = from_slice(vector).map_err(InternalError::SerdeJson).unwrap();
-            let bytes = cast_slice(&vector);
-            writer.insert(docid_bytes, bytes)?;
+        // since we only need the primary key when we throw an error, we create this getter to
+        // lazily get it when needed
+        let document_id = || -> Value {
+            let document_id = obkv.get(primary_key_id).unwrap();
+            serde_json::from_slice(document_id).unwrap()
+        };
+
+        // first we retrieve the _vectors field
+        if let Some(vectors) = obkv.get(vectors_fid) {
+            // extract the vectors
+            let vectors = match from_slice(vectors) {
+                Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors),
+                Err(_) => {
+                    return Err(UserError::InvalidVectorsType {
+                        document_id: document_id(),
+                        value: from_slice(vectors).map_err(InternalError::SerdeJson)?,
+                    }
+                    .into())
+                }
+            };
+
+            for (i, vector) in vectors.into_iter().enumerate() {
+                match u16::try_from(i) {
+                    Ok(i) => {
+                        let mut key = docid_bytes.to_vec();
+                        key.extend_from_slice(&i.to_ne_bytes());
+                        let bytes = cast_slice(&vector);
+                        writer.insert(key, bytes)?;
+                    }
+                    Err(_) => continue,
+                }
+            }
         }
-        // else => the _vector object was `null`, there is nothing to do
+        // else => the `_vectors` object was `null`, there is nothing to do
     }

     writer_into_reader(writer)
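Since a document can now carry several vectors, each grenad key is the document id followed by the vector's `u16` index in native-endian bytes; the `typed_chunk.rs` change below peels the leading docid bytes back off with `try_split_array_at`. A standalone sketch of that key round trip (the helper here is a hand-rolled stand-in for milli's, whose exact signature may differ):

```rust
// hand-rolled equivalent of milli's `try_split_array_at` helper
fn try_split_array_at<const N: usize>(slice: &[u8]) -> Option<([u8; N], &[u8])> {
    if slice.len() < N {
        return None;
    }
    let (head, tail) = slice.split_at(N);
    Some((head.try_into().unwrap(), tail))
}

fn main() {
    let docid: u32 = 42;
    let vector_index: u16 = 3; // this document's fourth vector

    // build the key: big-endian docid followed by the native-endian vector index
    let mut key = docid.to_be_bytes().to_vec();
    key.extend_from_slice(&vector_index.to_ne_bytes());

    // later, recover the docid by splitting the 4 leading bytes back off
    let (left, _index) = try_split_array_at::<4>(&key).unwrap();
    assert_eq!(u32::from_be_bytes(left), 42);
}
```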
@@ -47,7 +47,7 @@ pub(crate) fn data_from_obkv_documents(
     faceted_fields: HashSet<FieldId>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    vector_field_id: Option<FieldId>,
+    vectors_field_id: Option<FieldId>,
     stop_words: Option<fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
     exact_attributes: HashSet<FieldId>,
@@ -72,7 +72,7 @@ pub(crate) fn data_from_obkv_documents(
             &faceted_fields,
             primary_key_id,
             geo_fields_ids,
-            vector_field_id,
+            vectors_field_id,
             &stop_words,
             max_positions_per_attributes,
         )
@@ -283,7 +283,7 @@ fn send_and_extract_flattened_documents_data(
     faceted_fields: &HashSet<FieldId>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    vector_field_id: Option<FieldId>,
+    vectors_field_id: Option<FieldId>,
     stop_words: &Option<fst::Set<&[u8]>>,
     max_positions_per_attributes: Option<u32>,
 ) -> Result<(
@@ -312,11 +312,16 @@ fn send_and_extract_flattened_documents_data(
         });
     }

-    if let Some(vector_field_id) = vector_field_id {
+    if let Some(vectors_field_id) = vectors_field_id {
         let documents_chunk_cloned = flattened_documents_chunk.clone();
         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
         rayon::spawn(move || {
-            let result = extract_vector_points(documents_chunk_cloned, indexer, vector_field_id);
+            let result = extract_vector_points(
+                documents_chunk_cloned,
+                indexer,
+                primary_key_id,
+                vectors_field_id,
+            );
             let _ = match result {
                 Ok(vector_points) => {
                     lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
@@ -304,8 +304,8 @@ where
             }
             None => None,
         };
-        // get the fid of the `_vector` field.
-        let vector_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vector");
+        // get the fid of the `_vectors` field.
+        let vectors_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vectors");

         let stop_words = self.index.stop_words(self.wtxn)?;
         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;
@@ -342,7 +342,7 @@ where
             faceted_fields,
             primary_key_id,
             geo_fields_ids,
-            vector_field_id,
+            vectors_field_id,
             stop_words,
             max_positions_per_attributes,
             exact_attributes,
@@ -20,8 +20,11 @@ use super::{ClonableMmap, MergeFn};
 use crate::error::UserError;
 use crate::facet::FacetType;
 use crate::update::facet::FacetsUpdate;
-use crate::update::index_documents::helpers::as_cloneable_grenad;
-use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
+use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
+use crate::{
+    lat_lng_to_xyz, normalize_vector, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result,
+    BEU32,
+};

 pub(crate) enum TypedChunk {
     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
@@ -241,7 +244,8 @@ pub(crate) fn write_typed_chunk_into_index(
             let mut cursor = vector_points.into_cursor()?;
             while let Some((key, value)) = cursor.move_on_next()? {
                 // convert the key back to a u32 (4 bytes)
-                let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
+                let (left, _index) = try_split_array_at(key).unwrap();
+                let docid = DocumentId::from_be_bytes(left);
                 // convert the vector back to a Vec<f32>
                 let vector: Vec<f32> = pod_collect_to_vec(value);

@@ -252,6 +256,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 return Err(UserError::InvalidVectorDimensions { expected, found })?;
             }

+            let vector = normalize_vector(vector);
             let vector_id = hnsw.insert(vector, &mut searcher) as u32;
             index.vector_id_docid.put(wtxn, &BEU32::new(vector_id), &BEU32::new(docid))?;
         }