Compare commits


1 commit

Author SHA1 Message Date
9874efc352 WIP 2024-07-04 11:18:45 +02:00
89 changed files with 1833 additions and 5072 deletions

View File

@@ -1,6 +1,4 @@
name: Look for flaky tests
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
on:
workflow_dispatch:
schedule:

View File

@@ -1,6 +1,5 @@
name: Run the indexing fuzzer
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
on:
push:
branches:

View File

@@ -15,8 +15,6 @@ jobs:
debian:
name: Publish debian packagge
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
needs: check-version
container:

View File

@@ -35,8 +35,6 @@ jobs:
publish-linux:
name: Publish binary for Linux
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
needs: check-version
container:
# Use ubuntu-18.04 to compile with glibc 2.27
@@ -134,8 +132,6 @@ jobs:
name: Publish binary for aarch64
runs-on: ubuntu-latest
needs: check-version
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27
image: ubuntu:18.04

View File

@@ -21,8 +21,6 @@ jobs:
test-linux:
name: Tests on ubuntu-18.04
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
image: ubuntu:18.04
@@ -79,8 +77,6 @@ jobs:
test-all-features:
name: Tests almost all features
runs-on: ubuntu-latest
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations
image: ubuntu:18.04
@@ -104,8 +100,6 @@ jobs:
test-disabled-tokenization:
name: Test disabled tokenization
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
container:
image: ubuntu:18.04
@@ -122,7 +116,7 @@ jobs:
override: true
- name: Run cargo tree without default features and check lindera is not present
run: |
if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -qz lindera; then
if cargo tree -f '{p} {f}' -e normal --no-default-features | grep -vqz lindera; then
echo "lindera has been found in the sources and it shouldn't"
exit 1
fi
@@ -133,8 +127,6 @@ jobs:
# We run tests in debug also, to make sure that the debug_assertions are hit
test-debug:
name: Run tests in debug
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
runs-on: ubuntu-latest
container:
# Use ubuntu-18.04 to compile with glibc 2.27, which are the production expectations

Cargo.lock (generated): 72 changed lines
View File

@@ -381,9 +381,9 @@ dependencies = [
[[package]]
name = "arroy"
version = "0.4.0"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2ece9e5347e7fdaaea3181dec7f916677ad5f3fcbac183648ce1924eb4aeef9a"
checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9"
dependencies = [
"bytemuck",
"byteorder",
@@ -503,7 +503,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"
[[package]]
name = "benchmarks"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"anyhow",
"bytes",
@@ -648,7 +648,7 @@ dependencies = [
[[package]]
name = "build-info"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"anyhow",
"time",
@@ -679,9 +679,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
[[package]]
name = "bytemuck"
version = "1.16.1"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15"
dependencies = [
"bytemuck_derive",
]
@@ -1579,7 +1579,7 @@ dependencies = [
[[package]]
name = "dump"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"anyhow",
"big_s",
@@ -1804,7 +1804,7 @@ dependencies = [
[[package]]
name = "file-store"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"faux",
"tempfile",
@@ -1827,7 +1827,7 @@ dependencies = [
[[package]]
name = "filter-parser"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"insta",
"nom",
@@ -1847,7 +1847,7 @@ dependencies = [
[[package]]
name = "flatten-serde-json"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"criterion",
"serde_json",
@@ -1965,7 +1965,7 @@ dependencies = [
[[package]]
name = "fuzzers"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"arbitrary",
"clap",
@@ -2273,9 +2273,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heed"
version = "0.20.2"
version = "0.20.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f60d7cff16094be9627830b399c087a25017e93fb3768b87cd656a68ccb1ebe8"
checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd"
dependencies = [
"bitflags 2.5.0",
"byteorder",
@@ -2452,10 +2452,9 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d"
[[package]]
name = "index-scheduler"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"anyhow",
"arroy",
"big_s",
"bincode",
"crossbeam",
@@ -2466,7 +2465,6 @@ dependencies = [
"file-store",
"flate2",
"insta",
"maplit",
"meili-snap",
"meilisearch-auth",
"meilisearch-types",
@@ -2649,7 +2647,7 @@ dependencies = [
[[package]]
name = "json-depth-checker"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"criterion",
"serde_json",
@@ -3172,9 +3170,9 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da"
[[package]]
name = "lmdb-master-sys"
version = "0.2.1"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a5142795c220effa4c8f4813537bd4c88113a07e45e93100ccb2adc5cec6c7f3"
checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a"
dependencies = [
"cc",
"doxygen-rs",
@@ -3257,7 +3255,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
[[package]]
name = "meili-snap"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"insta",
"md5",
@@ -3266,7 +3264,7 @@ dependencies = [
[[package]]
name = "meilisearch"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"actix-cors",
"actix-http",
@@ -3358,7 +3356,7 @@ dependencies = [
[[package]]
name = "meilisearch-auth"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"base64 0.21.7",
"enum-iterator",
@@ -3377,7 +3375,7 @@ dependencies = [
[[package]]
name = "meilisearch-types"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"actix-web",
"anyhow",
@@ -3407,7 +3405,7 @@ dependencies = [
[[package]]
name = "meilitool"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"anyhow",
"clap",
@@ -3446,7 +3444,7 @@ dependencies = [
[[package]]
name = "milli"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"arroy",
"big_s",
@@ -3886,7 +3884,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "permissive-json-pointer"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"big_s",
"serde_json",
@@ -5053,18 +5051,18 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.61"
version = "1.0.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.61"
version = "1.0.58"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7"
dependencies = [
"proc-macro2",
"quote",
@@ -5098,9 +5096,9 @@ dependencies = [
[[package]]
name = "time"
version = "0.3.36"
version = "0.3.34"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885"
checksum = "c8248b6521bb14bc45b4067159b9b6ad792e2d6d754d6c41fb50e29fefe38749"
dependencies = [
"deranged",
"itoa",
@@ -5121,9 +5119,9 @@ checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3"
[[package]]
name = "time-macros"
version = "0.2.18"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf"
checksum = "7ba3a3ef41e6672a2f0f001392bb5dcd3ff0a9992d618ca761a11c3121547774"
dependencies = [
"num-conv",
"time-core",
@@ -5303,9 +5301,9 @@ dependencies = [
[[package]]
name = "tracing-actix-web"
version = "0.7.11"
version = "0.7.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ee9e39a66d9b615644893ffc1704d2a89b5b315b7fd0228ad3182ca9a306b19"
checksum = "fa069bd1503dd526ee793bb3fce408895136c95fc86d2edb2acf1c646d7f0684"
dependencies = [
"actix-web",
"mutually_exclusive_features",
@@ -6042,7 +6040,7 @@ dependencies = [
[[package]]
name = "xtask"
version = "1.9.1"
version = "1.9.0"
dependencies = [
"anyhow",
"build-info",

View File

@@ -22,7 +22,7 @@ members = [
]
[workspace.package]
version = "1.9.1"
version = "1.9.0"
authors = [
"Quentin de Quelen <quentin@dequelen.me>",
"Clément Renault <clement@meilisearch.com>",

View File

@@ -780,7 +780,7 @@ expression: document
1.3484878540039063
]
],
"regenerate": true
"userProvided": false
}
}
}

View File

@@ -779,7 +779,7 @@ expression: document
1.04031240940094
]
],
"regenerate": true
"userProvided": false
}
}
}

View File

@@ -152,7 +152,6 @@ impl Settings<Unchecked> {
}
#[derive(Debug, Clone, Deserialize)]
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[cfg_attr(test, derive(serde::Serialize))]
#[serde(deny_unknown_fields)]
#[serde(rename_all = "camelCase")]

View File

@@ -182,7 +182,6 @@ impl Settings<Unchecked> {
}
}
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[derive(Debug, Clone, Deserialize)]
#[cfg_attr(test, derive(serde::Serialize))]
#[serde(deny_unknown_fields)]

View File

@@ -200,7 +200,6 @@ impl std::ops::Deref for IndexUid {
}
}
#[allow(dead_code)] // otherwise rustc complains that the fields go unused
#[derive(Debug)]
#[cfg_attr(test, derive(serde::Serialize))]
#[cfg_attr(test, serde(rename_all = "camelCase"))]
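For context on the three dump-compat hunks above: the removed attribute lines gate a Serialize derive to test builds only, so release binaries skip the extra code. A minimal sketch of that cfg_attr pattern, assuming a serde dependency (the struct and field names here are illustrative, not from the source):

    // Serialize is derived only when compiling with cfg(test);
    // release builds never see it.
    #[derive(Debug, Clone)]
    #[cfg_attr(test, derive(serde::Serialize))]
    #[cfg_attr(test, serde(rename_all = "camelCase"))]
    struct ExampleSettings {
        displayed_attributes: Option<Vec<String>>,
    }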

View File

@@ -40,9 +40,7 @@ ureq = "2.9.7"
uuid = { version = "1.6.1", features = ["serde", "v4"] }
[dev-dependencies]
arroy = "0.4.0"
big_s = "1.0.2"
crossbeam = "0.8.4"
insta = { version = "1.34.0", features = ["json", "redactions"] }
maplit = "1.0.2"
meili-snap = { path = "../meili-snap" }

View File

@@ -909,7 +909,6 @@ impl IndexScheduler {
let fields_ids_map = index.fields_ids_map(&rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index.embedding_configs(&rtxn)?;
// 3.1. Dump the documents
for ret in index.all_documents(&rtxn)? {
@@ -952,21 +951,16 @@ impl IndexScheduler {
};
for (embedder_name, embeddings) in embeddings {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == embedder_name)
.is_some_and(|conf| conf.user_provided.contains(id));
let embeddings = ExplicitVectors {
embeddings: Some(
VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
),
regenerate: !user_provided,
};
vectors.insert(
embedder_name,
serde_json::to_value(embeddings).unwrap(),
);
// don't change the entry if it already exists, because it was user-provided
vectors.entry(embedder_name).or_insert_with(|| {
let embeddings = ExplicitVectors {
embeddings: VectorOrArrayOfVectors::from_array_of_vectors(
embeddings,
),
user_provided: false,
};
serde_json::to_value(embeddings).unwrap()
});
}
}
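The replacement side of this hunk swaps insert (which overwrites any existing value) for entry().or_insert_with (which keeps an entry that is already there, here a user-provided vector). A minimal, self-contained sketch of the difference, using serde_json::Map as the code above does:

    use serde_json::{json, Map, Value};

    fn main() {
        let mut vectors: Map<String, Value> = Map::new();
        vectors.insert("default".into(), json!({ "userProvided": true }));

        // `insert` would clobber the user-provided entry;
        // `entry().or_insert_with` only fills the slot when the key is absent.
        vectors
            .entry("default".to_string())
            .or_insert_with(|| json!({ "userProvided": false }));

        assert_eq!(vectors["default"]["userProvided"], json!(true));
    }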
@@ -1288,11 +1282,7 @@ impl IndexScheduler {
}
}
let config = IndexDocumentsConfig {
update_method: method,
compute_prefix_databases: self.compute_prefix_databases,
..Default::default()
};
let config = IndexDocumentsConfig { update_method: method, ..Default::default() };
let embedder_configs = index.embedding_configs(index_wtxn)?;
// TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense)
@@ -1402,7 +1392,6 @@ impl IndexScheduler {
let deleted_documents = delete_document_by_filter(
index_wtxn,
filter,
self.compute_prefix_databases,
self.index_mapper.indexer_config(),
self.must_stop_processing.clone(),
index,
@@ -1643,7 +1632,6 @@ impl IndexScheduler {
fn delete_document_by_filter<'a>(
wtxn: &mut RwTxn<'a>,
filter: &serde_json::Value,
compute_prefix_databases: bool,
indexer_config: &IndexerConfig,
must_stop_processing: MustStopProcessing,
index: &'a Index,
@@ -1659,7 +1647,6 @@ fn delete_document_by_filter<'a>(
let config = IndexDocumentsConfig {
update_method: IndexDocumentsMethod::ReplaceDocuments,
compute_prefix_databases,
..Default::default()
};
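Both sides of these IndexDocumentsConfig hunks rely on Rust's struct-update syntax: fields named explicitly win, and ..Default::default() fills the rest. A self-contained sketch of the idiom (field names here are stand-ins, not the real config struct):

    #[derive(Debug, Default)]
    struct Config {
        update_method: u8,
        compute_prefix_databases: bool,
        autogenerate_docids: bool,
    }

    fn main() {
        // Only `update_method` is overridden; the remaining fields
        // come from the Default impl.
        let config = Config { update_method: 1, ..Default::default() };
        assert!(!config.compute_prefix_databases);
        println!("{config:?}");
    }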

View File

@@ -32,7 +32,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
features: _,
max_number_of_tasks: _,
max_number_of_batched_tasks: _,
compute_prefix_databases: _,
wake_up: _,
dumps_path: _,
snapshots_path: _,

File diff suppressed because it is too large

View File

@@ -6,6 +6,10 @@ expression: doc
"doggo": "kefir",
"breed": "patou",
"_vectors": {
"A_fakerest": {
"embeddings": "[vector]",
"userProvided": true
},
"noise": [
0.1,
0.2,

View File

@@ -6,6 +6,10 @@ expression: doc
"doggo": "Intel",
"breed": "beagle",
"_vectors": {
"A_fakerest": {
"embeddings": "[vector]",
"userProvided": true
},
"noise": [
0.1,
0.2,

View File

@@ -222,7 +222,6 @@ InvalidApiKeyUid , InvalidRequest , BAD_REQUEST ;
InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ;
InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentFields , InvalidRequest , BAD_REQUEST ;
InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQUEST ;
MissingDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ;
InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ;
@@ -241,11 +240,9 @@ InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ;
InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ;
InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ;
InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
InvalidSimilarRetrieveVectors , InvalidRequest , BAD_REQUEST ;
InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ;
InvalidSimilarRankingScoreThreshold , InvalidRequest , BAD_REQUEST ;
InvalidSearchRetrieveVectors , InvalidRequest , BAD_REQUEST ;
InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ;
InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ;
InvalidSearchFacets , InvalidRequest , BAD_REQUEST ;
@@ -273,14 +270,13 @@ InvalidSimilarShowRankingScore , InvalidRequest , BAD_REQUEST ;
InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
InvalidSimilarShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
InvalidSearchSort , InvalidRequest , BAD_REQUEST ;
InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ;
InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ;
InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ;
InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ;
InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ;
InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ;
InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ;
InvalidSettingsSearchCutoffMs , InvalidRequest , BAD_REQUEST ;
InvalidSettingsSearchCutoffMs , InvalidRequest , BAD_REQUEST ;
InvalidSettingsEmbedders , InvalidRequest , BAD_REQUEST ;
InvalidSettingsRankingRules , InvalidRequest , BAD_REQUEST ;
InvalidSettingsSearchableAttributes , InvalidRequest , BAD_REQUEST ;
@@ -385,7 +381,6 @@ impl ErrorCode for milli::Error {
Code::IndexPrimaryKeyMultipleCandidatesFound
}
UserError::PrimaryKeyCannotBeChanged(_) => Code::IndexPrimaryKeyAlreadyExists,
UserError::InvalidDistinctAttribute { .. } => Code::InvalidSearchDistinct,
UserError::SortRankingRuleMissing => Code::InvalidSearchSort,
UserError::InvalidFacetsDistribution { .. } => Code::InvalidSearchFacets,
UserError::InvalidSortableAttribute { .. } => Code::InvalidSearchSort,
@@ -398,8 +393,7 @@ impl ErrorCode for milli::Error {
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
UserError::InvalidVectorsMapType { .. }
| UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType,
UserError::TooManyVectors(_, _) => Code::TooManyVectors,
UserError::SortError(_) => Code::InvalidSearchSort,
UserError::InvalidMinTypoWordLenSetting(_, _) => {

View File

@@ -8,7 +8,6 @@ use std::str::FromStr;
use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
use fst::IntoStreamer;
use milli::index::IndexEmbeddingConfig;
use milli::proximity::ProximityPrecision;
use milli::update::Setting;
use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET};
@@ -673,7 +672,7 @@ pub fn settings(
let embedders: BTreeMap<_, _> = index
.embedding_configs(rtxn)?
.into_iter()
.map(|IndexEmbeddingConfig { name, config, .. }| (name, Setting::Set(config.into())))
.map(|(name, config)| (name, Setting::Set(config.into())))
.collect();
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };

View File

@@ -158,5 +158,5 @@ vietnamese = ["meilisearch-types/vietnamese"]
swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip"
sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe"
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"
sha1 = "e20cc9b390003c6c844f4b8bcc5c5013191a77ff"

View File

@@ -74,8 +74,8 @@ pub enum DocumentDeletionKind {
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum DocumentFetchKind {
PerDocumentId { retrieve_vectors: bool },
Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
PerDocumentId,
Normal { with_filter: bool, limit: usize, offset: usize },
}
pub trait Analytics: Sync + Send {
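The analytics code further down consumes this enum through the matches! macro with struct-like variants. A minimal sketch of that pattern, mirroring the post-change shape of DocumentFetchKind:

    #[derive(Copy, Clone, Debug, PartialEq, Eq)]
    enum DocumentFetchKind {
        PerDocumentId,
        Normal { with_filter: bool, limit: usize, offset: usize },
    }

    fn per_filter(query: &DocumentFetchKind) -> bool {
        // `..` ignores the remaining fields; the pattern matches only
        // when `with_filter` is true.
        matches!(query, DocumentFetchKind::Normal { with_filter: true, .. })
    }

    fn main() {
        let q = DocumentFetchKind::Normal { with_filter: true, limit: 20, offset: 0 };
        assert!(per_filter(&q));
        assert!(!per_filter(&DocumentFetchKind::PerDocumentId));
    }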

View File

@@ -256,7 +256,6 @@ struct Infos {
experimental_enable_logs_route: bool,
experimental_reduce_indexing_memory_usage: bool,
experimental_max_number_of_batched_tasks: usize,
experimental_disable_prefix_db: bool,
gpu_enabled: bool,
db_path: bool,
import_dump: bool,
@@ -299,7 +298,6 @@ impl From<Opt> for Infos {
experimental_enable_logs_route,
experimental_reduce_indexing_memory_usage,
experimental_max_number_of_batched_tasks,
experimental_disable_prefix_db,
http_addr,
master_key: _,
env,
@@ -349,7 +347,6 @@ impl From<Opt> for Infos {
experimental_replication_parameters,
experimental_enable_logs_route,
experimental_reduce_indexing_memory_usage,
experimental_disable_prefix_db,
gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(),
db_path: db_path != PathBuf::from("./data.ms"),
import_dump: import_dump.is_some(),
@@ -600,9 +597,6 @@ pub struct SearchAggregator {
// every time a request has a filter, this field must be incremented by one
sort_total_number_of_criteria: usize,
// distinct
distinct: bool,
// filter
filter_with_geo_radius: bool,
filter_with_geo_bounding_box: bool,
@@ -628,7 +622,6 @@ pub struct SearchAggregator {
// Whether a non-default embedder was specified
embedder: bool,
hybrid: bool,
retrieve_vectors: bool,
// every time a search is done, we increment the counter linked to the used settings
matching_strategy: HashMap<String, usize>,
@@ -669,7 +662,6 @@ impl SearchAggregator {
page,
hits_per_page,
attributes_to_retrieve: _,
retrieve_vectors,
attributes_to_crop: _,
crop_length,
attributes_to_highlight: _,
@@ -678,7 +670,6 @@ impl SearchAggregator {
show_ranking_score_details,
filter,
sort,
distinct,
facets: _,
highlight_pre_tag,
highlight_post_tag,
@@ -701,8 +692,6 @@ impl SearchAggregator {
ret.sort_sum_of_criteria_terms = sort.len();
}
ret.distinct = distinct.is_some();
if let Some(ref filter) = filter {
static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap());
ret.filter_total_number_of_criteria = 1;
@@ -739,7 +728,6 @@ impl SearchAggregator {
if let Some(ref vector) = vector {
ret.max_vector_size = vector.len();
}
ret.retrieve_vectors |= retrieve_vectors;
if query.is_finite_pagination() {
let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT);
@@ -807,7 +795,6 @@ impl SearchAggregator {
sort_with_geo_point,
sort_sum_of_criteria_terms,
sort_total_number_of_criteria,
distinct,
filter_with_geo_radius,
filter_with_geo_bounding_box,
filter_sum_of_criteria_terms,
@@ -816,7 +803,6 @@ impl SearchAggregator {
attributes_to_search_on_total_number_of_uses,
max_terms_number,
max_vector_size,
retrieve_vectors,
matching_strategy,
max_limit,
max_offset,
@@ -865,9 +851,6 @@ impl SearchAggregator {
self.sort_total_number_of_criteria =
self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria);
// distinct
self.distinct |= distinct;
// filter
self.filter_with_geo_radius |= filter_with_geo_radius;
self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box;
@@ -890,7 +873,6 @@ impl SearchAggregator {
// vector
self.max_vector_size = self.max_vector_size.max(max_vector_size);
self.retrieve_vectors |= retrieve_vectors;
self.semantic_ratio |= semantic_ratio;
self.hybrid |= hybrid;
self.embedder |= embedder;
@@ -939,7 +921,6 @@ impl SearchAggregator {
sort_with_geo_point,
sort_sum_of_criteria_terms,
sort_total_number_of_criteria,
distinct,
filter_with_geo_radius,
filter_with_geo_bounding_box,
filter_sum_of_criteria_terms,
@@ -948,7 +929,6 @@ impl SearchAggregator {
attributes_to_search_on_total_number_of_uses,
max_terms_number,
max_vector_size,
retrieve_vectors,
matching_strategy,
max_limit,
max_offset,
@@ -997,7 +977,6 @@ impl SearchAggregator {
"with_geoPoint": sort_with_geo_point,
"avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64),
},
"distinct": distinct,
"filter": {
"with_geoRadius": filter_with_geo_radius,
"with_geoBoundingBox": filter_with_geo_bounding_box,
@@ -1012,7 +991,6 @@ impl SearchAggregator {
},
"vector": {
"max_vector_size": max_vector_size,
"retrieve_vectors": retrieve_vectors,
},
"hybrid": {
"enabled": hybrid,
@@ -1101,7 +1079,6 @@ impl MultiSearchAggregator {
page: _,
hits_per_page: _,
attributes_to_retrieve: _,
retrieve_vectors: _,
attributes_to_crop: _,
crop_length: _,
attributes_to_highlight: _,
@@ -1110,7 +1087,6 @@ impl MultiSearchAggregator {
show_matches_position: _,
filter: _,
sort: _,
distinct: _,
facets: _,
highlight_pre_tag: _,
highlight_post_tag: _,
@@ -1558,9 +1534,6 @@ pub struct DocumentsFetchAggregator {
// if a filter was used
per_filter: bool,
#[serde(rename = "vector.retrieve_vectors")]
retrieve_vectors: bool,
// pagination
#[serde(rename = "pagination.max_limit")]
max_limit: usize,
@@ -1570,21 +1543,18 @@
impl DocumentsFetchAggregator {
pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self {
let (limit, offset, retrieve_vectors) = match query {
DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors),
DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => {
(*limit, *offset, *retrieve_vectors)
}
let (limit, offset) = match query {
DocumentFetchKind::PerDocumentId => (1, 0),
DocumentFetchKind::Normal { limit, offset, .. } => (*limit, *offset),
};
Self {
timestamp: Some(OffsetDateTime::now_utc()),
user_agents: extract_user_agents(request).into_iter().collect(),
total_received: 1,
per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }),
per_document_id: matches!(query, DocumentFetchKind::PerDocumentId),
per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter),
max_limit: limit,
max_offset: offset,
retrieve_vectors,
}
}
@@ -1598,7 +1568,6 @@ impl DocumentsFetchAggregator {
per_filter,
max_limit,
max_offset,
retrieve_vectors,
} = other;
if self.timestamp.is_none() {
@@ -1614,8 +1583,6 @@ impl DocumentsFetchAggregator {
self.max_limit = self.max_limit.max(max_limit);
self.max_offset = self.max_offset.max(max_offset);
self.retrieve_vectors |= retrieve_vectors;
}
pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> {
@@ -1656,7 +1623,6 @@ pub struct SimilarAggregator {
// Whether a non-default embedder was specified
embedder: bool,
retrieve_vectors: bool,
// pagination
max_limit: usize,
@@ -1680,7 +1646,6 @@ impl SimilarAggregator {
offset,
limit,
attributes_to_retrieve: _,
retrieve_vectors,
show_ranking_score,
show_ranking_score_details,
filter,
@@ -1725,7 +1690,6 @@ impl SimilarAggregator {
ret.ranking_score_threshold = ranking_score_threshold.is_some();
ret.embedder = embedder.is_some();
ret.retrieve_vectors = *retrieve_vectors;
ret
}
@@ -1758,7 +1722,6 @@ impl SimilarAggregator {
show_ranking_score_details,
embedder,
ranking_score_threshold,
retrieve_vectors,
} = other;
if self.timestamp.is_none() {
@@ -1788,7 +1751,6 @@ impl SimilarAggregator {
}
self.embedder |= embedder;
self.retrieve_vectors |= retrieve_vectors;
// pagination
self.max_limit = self.max_limit.max(max_limit);
@@ -1823,7 +1785,6 @@ impl SimilarAggregator {
show_ranking_score_details,
embedder,
ranking_score_threshold,
retrieve_vectors,
} = self;
if total_received == 0 {
@@ -1850,9 +1811,6 @@ impl SimilarAggregator {
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
},
"vector": {
"retrieve_vectors": retrieve_vectors,
},
"hybrid": {
"embedder": embedder,
},
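The aggregator hunks in this file all follow one merge discipline: booleans combine with |=, counters with saturating_add, and maxima with max, so folding two snapshots together never panics or drops a flag. A reduced sketch of that pattern (the field set is illustrative, not the full struct):

    #[derive(Debug, Default)]
    struct Aggregator {
        total_received: usize,
        used_filter: bool,
        max_limit: usize,
    }

    impl Aggregator {
        // Fold `other` into `self`, the way the aggregate() methods above do.
        fn aggregate(&mut self, other: Aggregator) {
            self.total_received = self.total_received.saturating_add(other.total_received);
            self.used_filter |= other.used_filter;
            self.max_limit = self.max_limit.max(other.max_limit);
        }
    }

    fn main() {
        let mut acc = Aggregator { total_received: 1, used_filter: false, max_limit: 20 };
        acc.aggregate(Aggregator { total_received: 1, used_filter: true, max_limit: 50 });
        assert_eq!((acc.total_received, acc.used_filter, acc.max_limit), (2, true, 50));
    }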

View File

@@ -311,7 +311,6 @@ fn open_or_create_database_unchecked(
index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
index_count: DEFAULT_INDEX_COUNT,
instance_features,
compute_prefix_databases: !opt.experimental_disable_prefix_db,
})?)
};

View File

@@ -60,7 +60,6 @@ const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
"MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str =
"MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS";
const MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB: &str = "MEILI_EXPERIMENTAL_DISABLE_PREFIXDB";
const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
const DEFAULT_DB_PATH: &str = "./data.ms";
@@ -390,11 +389,6 @@ pub struct Opt {
#[serde(default = "default_limit_batched_tasks")]
pub experimental_max_number_of_batched_tasks: usize,
/// Experimentally disable the prefix database, see: <https://github.com/orgs/meilisearch/discussions>
#[clap(long, env = MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB)]
#[serde(default)]
pub experimental_disable_prefix_db: bool,
#[serde(flatten)]
#[clap(flatten)]
pub indexer_options: IndexerOpts,
@@ -495,7 +489,6 @@ impl Opt {
experimental_enable_logs_route,
experimental_replication_parameters,
experimental_reduce_indexing_memory_usage,
experimental_disable_prefix_db,
} = self;
export_to_env_if_not_present(MEILI_DB_PATH, db_path);
export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr);
@@ -525,10 +518,6 @@ impl Opt {
MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS,
experimental_max_number_of_batched_tasks.to_string(),
);
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_DISABLE_PREFIX_DB,
experimental_disable_prefix_db.to_string(),
);
if let Some(ssl_cert_path) = ssl_cert_path {
export_to_env_if_not_present(MEILI_SSL_CERT_PATH, ssl_cert_path);
}
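export_to_env_if_not_present, used throughout this hunk, is defined elsewhere in the file; the following is a plausible sketch of its behavior, with the signature assumed rather than copied from the source:

    use std::env;
    use std::ffi::OsStr;

    // Seed the process environment from a CLI/config value, but never
    // clobber a variable the user already set themselves.
    fn export_to_env_if_not_present(key: &str, value: impl AsRef<OsStr>) {
        if env::var_os(key).is_none() {
            env::set_var(key, value);
        }
    }

    fn main() {
        export_to_env_if_not_present("MEILI_DB_PATH", "./data.ms");
        assert!(env::var_os("MEILI_DB_PATH").is_some());
    }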

View File

@@ -16,7 +16,6 @@ use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::heed::RoTxn;
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::update::IndexDocumentsMethod;
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
use meilisearch_types::milli::DocumentId;
use meilisearch_types::star_or::OptionStarOrList;
use meilisearch_types::tasks::KindWithContent;
@@ -40,7 +39,7 @@ use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::{
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
};
use crate::search::{parse_filter, RetrieveVectors};
use crate::search::parse_filter;
use crate::Opt;
static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
@@ -95,8 +94,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
pub struct GetDocument {
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
fields: OptionStarOrList<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
retrieve_vectors: Param<bool>,
}
pub async fn get_document(
@@ -110,20 +107,13 @@ pub async fn get_document(
debug!(parameters = ?params, "Get document");
let index_uid = IndexUid::try_from(index_uid)?;
let GetDocument { fields, retrieve_vectors: param_retrieve_vectors } = params.into_inner();
analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req);
let GetDocument { fields } = params.into_inner();
let attributes_to_retrieve = fields.merge_star_and_none();
let features = index_scheduler.features();
let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?;
analytics.get_fetch_documents(
&DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 },
&req,
);
let index = index_scheduler.index(&index_uid)?;
let document =
retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors)?;
let document = retrieve_document(&index, &document_id, attributes_to_retrieve)?;
debug!(returns = ?document, "Get document");
Ok(HttpResponse::Ok().json(document))
}
@@ -163,8 +153,6 @@ pub struct BrowseQueryGet {
limit: Param<usize>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)]
fields: OptionStarOrList<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)]
retrieve_vectors: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)]
filter: Option<String>,
}
@@ -178,8 +166,6 @@ pub struct BrowseQuery {
limit: usize,
#[deserr(default, error = DeserrJsonError<InvalidDocumentFields>)]
fields: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidDocumentRetrieveVectors>)]
retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)]
filter: Option<Value>,
}
@@ -199,7 +185,6 @@ pub async fn documents_by_query_post(
with_filter: body.filter.is_some(),
limit: body.limit,
offset: body.offset,
retrieve_vectors: body.retrieve_vectors,
},
&req,
);
@@ -216,7 +201,7 @@ pub async fn get_documents(
) -> Result<HttpResponse, ResponseError> {
debug!(parameters = ?params, "Get documents GET");
let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner();
let BrowseQueryGet { limit, offset, fields, filter } = params.into_inner();
let filter = match filter {
Some(f) => match serde_json::from_str(&f) {
@@ -230,7 +215,6 @@ pub async fn get_documents(
offset: offset.0,
limit: limit.0,
fields: fields.merge_star_and_none(),
retrieve_vectors: retrieve_vectors.0,
filter,
};
@@ -239,7 +223,6 @@ pub async fn get_documents(
with_filter: query.filter.is_some(),
limit: query.limit,
offset: query.offset,
retrieve_vectors: query.retrieve_vectors,
},
&req,
);
@@ -253,14 +236,10 @@ fn documents_by_query(
query: BrowseQuery,
) -> Result<HttpResponse, ResponseError> {
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query;
let features = index_scheduler.features();
let retrieve_vectors = RetrieveVectors::new(retrieve_vectors, features)?;
let BrowseQuery { offset, limit, fields, filter } = query;
let index = index_scheduler.index(&index_uid)?;
let (total, documents) =
retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?;
let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?;
let ret = PaginationView::new(offset, limit, total as usize, documents);
@@ -600,44 +579,13 @@ fn some_documents<'a, 't: 'a>(
index: &'a Index,
rtxn: &'t RoTxn,
doc_ids: impl IntoIterator<Item = DocumentId> + 'a,
retrieve_vectors: RetrieveVectors,
) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> {
let fields_ids_map = index.fields_ids_map(rtxn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let embedding_configs = index.embedding_configs(rtxn)?;
Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| {
ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> {
let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
match retrieve_vectors {
RetrieveVectors::Ignore => {}
RetrieveVectors::Hide => {
document.remove("_vectors");
}
RetrieveVectors::Retrieve => {
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in index.embeddings(rtxn, key)? {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(key));
let embeddings = ExplicitVectors {
embeddings: Some(vector.into()),
regenerate: !user_provided,
};
vectors.insert(
name,
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,
);
}
document.insert("_vectors".into(), vectors.into());
}
}
Ok(document)
ret.map_err(ResponseError::from).and_then(|(_key, document)| -> Result<_, ResponseError> {
Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?)
})
}))
}
@@ -648,7 +596,6 @@ fn retrieve_documents<S: AsRef<str>>(
limit: usize,
filter: Option<Value>,
attributes_to_retrieve: Option<Vec<S>>,
retrieve_vectors: RetrieveVectors,
) -> Result<(u64, Vec<Document>), ResponseError> {
let rtxn = index.read_txn()?;
let filter = &filter;
@@ -673,57 +620,53 @@ fn retrieve_documents<S: AsRef<str>>(
let (it, number_of_documents) = {
let number_of_documents = candidates.len();
(
some_documents(
index,
&rtxn,
candidates.into_iter().skip(offset).take(limit),
retrieve_vectors,
)?,
some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?,
number_of_documents,
)
};
let documents: Vec<_> = it
let documents: Result<Vec<_>, ResponseError> = it
.map(|document| {
Ok(match &attributes_to_retrieve {
Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
&document?,
attributes_to_retrieve.iter().map(|s| s.as_ref()).chain(
(retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"),
),
attributes_to_retrieve.iter().map(|s| s.as_ref()),
),
None => document?,
})
})
.collect::<Result<_, ResponseError>>()?;
.collect();
Ok((number_of_documents, documents))
Ok((number_of_documents, documents?))
}
fn retrieve_document<S: AsRef<str>>(
index: &Index,
doc_id: &str,
attributes_to_retrieve: Option<Vec<S>>,
retrieve_vectors: RetrieveVectors,
) -> Result<Document, ResponseError> {
let txn = index.read_txn()?;
let fields_ids_map = index.fields_ids_map(&txn)?;
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
let internal_id = index
.external_documents_ids()
.get(&txn, doc_id)?
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
let document = some_documents(index, &txn, Some(internal_id), retrieve_vectors)?
let document = index
.documents(&txn, std::iter::once(internal_id))?
.into_iter()
.next()
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))??;
.map(|(_, d)| d)
.ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
let document = meilisearch_types::milli::obkv_to_json(&all_fields, &fields_ids_map, document)?;
let document = match &attributes_to_retrieve {
Some(attributes_to_retrieve) => permissive_json_pointer::select_values(
&document,
attributes_to_retrieve
.iter()
.map(|s| s.as_ref())
.chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")),
attributes_to_retrieve.iter().map(|s| s.as_ref()),
),
None => document,
};
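The removed .chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")) above is a compact way to append one optional field name to the selection. A standalone sketch of the same iterator trick, using only the standard library:

    // Option<T> is an iterator of zero or one items, so `chain` appends the
    // extra attribute only when the condition holds.
    fn selected_fields<'a>(attrs: &[&'a str], retrieve_vectors: bool) -> Vec<&'a str> {
        attrs
            .iter()
            .copied()
            .chain(retrieve_vectors.then_some("_vectors"))
            .collect()
    }

    fn main() {
        assert_eq!(selected_fields(&["title"], true), ["title", "_vectors"]);
        assert_eq!(selected_fields(&["title"], false), ["title"]);
    }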

View File

@@ -115,7 +115,6 @@ impl From<FacetSearchQuery> for SearchQuery {
page: None,
hits_per_page: None,
attributes_to_retrieve: None,
retrieve_vectors: false,
attributes_to_crop: None,
crop_length: DEFAULT_CROP_LENGTH(),
attributes_to_highlight: None,
@@ -124,7 +123,6 @@
show_ranking_score_details: false,
filter,
sort: None,
distinct: None,
facets: None,
highlight_pre_tag: DEFAULT_HIGHLIGHT_PRE_TAG(),
highlight_post_tag: DEFAULT_HIGHLIGHT_POST_TAG(),

View File

@@ -20,9 +20,9 @@ use crate::extractors::sequential_extractor::SeqHandler;
use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
use crate::search::{
add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH,
DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
};
use crate::search_queue::SearchQueue;
@@ -51,8 +51,6 @@ pub struct SearchQueryGet {
hits_per_page: Option<Param<usize>>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToRetrieve>)]
attributes_to_retrieve: Option<CS<String>>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchRetrieveVectors>)]
retrieve_vectors: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToCrop>)]
attributes_to_crop: Option<CS<String>>,
#[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError<InvalidSearchCropLength>)]
@@ -63,8 +61,6 @@ pub struct SearchQueryGet {
filter: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchSort>)]
sort: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchDistinct>)]
distinct: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchShowMatchesPosition>)]
show_matches_position: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchShowRankingScore>)]
@@ -157,13 +153,11 @@ impl From<SearchQueryGet> for SearchQuery {
page: other.page.as_deref().copied(),
hits_per_page: other.hits_per_page.as_deref().copied(),
attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()),
retrieve_vectors: other.retrieve_vectors.0,
attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()),
crop_length: other.crop_length.0,
attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()),
filter,
sort: other.sort.map(|attr| fix_sort_query_parameters(&attr)),
distinct: other.distinct,
show_matches_position: other.show_matches_position.0,
show_ranking_score: other.show_ranking_score.0,
show_ranking_score_details: other.show_ranking_score_details.0,
@@ -228,12 +222,10 @@ pub async fn search_with_url_query(
let features = index_scheduler.features();
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?;
let _permit = search_queue.try_get_search_permit().await?;
let search_result = tokio::task::spawn_blocking(move || {
perform_search(&index, query, search_kind, retrieve_vector)
})
.await?;
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?;
if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result);
}
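Both variants of this handler wrap perform_search in tokio::task::spawn_blocking: the search is CPU-bound and must not stall the async executor's worker threads. A minimal, self-contained sketch of that pattern, assuming a tokio runtime (the closure body is a stand-in for the real search call):

    #[tokio::main]
    async fn main() -> Result<(), tokio::task::JoinError> {
        // CPU-heavy work runs on the blocking thread pool;
        // the async task simply awaits its completion.
        let search_result = tokio::task::spawn_blocking(move || {
            (0..1_000_000u64).sum::<u64>() // stand-in for perform_search(..)
        })
        .await?;
        println!("result: {search_result}");
        Ok(())
    }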
@@ -270,13 +262,10 @@ pub async fn search_with_post(
let features = index_scheduler.features();
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?;
let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
let _permit = search_queue.try_get_search_permit().await?;
let search_result = tokio::task::spawn_blocking(move || {
perform_search(&index, query, search_kind, retrieve_vectors)
})
.await?;
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?;
if let Ok(ref search_result) = search_result {
aggregate.succeed(search_result);
if search_result.degraded {
@@ -298,10 +287,11 @@ pub fn search_kind(
features: RoFeatures,
) -> Result<SearchKind, ResponseError> {
if query.vector.is_some() {
features.check_vector("Passing `vector` as a parameter")?;
features.check_vector("Passing `vector` as a query parameter")?;
}
if query.hybrid.is_some() {
features.check_vector("Passing `hybrid` as a parameter")?;
features.check_vector("Passing `hybrid` as a query parameter")?;
}
// regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing

View File

@@ -4,7 +4,11 @@ use deserr::actix_web::{AwebJson, AwebQueryParameter};
use index_scheduler::IndexScheduler;
use meilisearch_types::deserr::query_params::Param;
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
use meilisearch_types::error::deserr_codes::*;
use meilisearch_types::error::deserr_codes::{
InvalidEmbedder, InvalidSimilarAttributesToRetrieve, InvalidSimilarFilter, InvalidSimilarId,
InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarRankingScoreThreshold,
InvalidSimilarShowRankingScore, InvalidSimilarShowRankingScoreDetails,
};
use meilisearch_types::error::{ErrorCode as _, ResponseError};
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::keys::actions;
@@ -17,8 +21,8 @@ use crate::analytics::{Analytics, SimilarAggregator};
use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::search::{
add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind,
SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
add_search_rules, perform_similar, RankingScoreThresholdSimilar, SearchKind, SimilarQuery,
SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
};
pub fn configure(cfg: &mut web::ServiceConfig) {
@@ -93,8 +97,6 @@ async fn similar(
features.check_vector("Using the similar API")?;
let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?;
// Tenant token search_rules.
if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) {
add_search_rules(&mut query.filter, search_rules);
@@ -105,10 +107,8 @@ async fn similar(
let (embedder_name, embedder) =
SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?;
tokio::task::spawn_blocking(move || {
perform_similar(&index, query, embedder_name, embedder, retrieve_vectors)
})
.await?
tokio::task::spawn_blocking(move || perform_similar(&index, query, embedder_name, embedder))
.await?
}
#[derive(Debug, deserr::Deserr)]
@@ -122,8 +122,6 @@ pub struct SimilarQueryGet {
limit: Param<usize>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarAttributesToRetrieve>)]
attributes_to_retrieve: Option<CS<String>>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarRetrieveVectors>)]
retrieve_vectors: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarFilter>)]
filter: Option<String>,
#[deserr(default, error = DeserrQueryParamError<InvalidSimilarShowRankingScore>)]
@@ -158,7 +156,6 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
offset,
limit,
attributes_to_retrieve,
retrieve_vectors,
filter,
show_ranking_score,
show_ranking_score_details,
@@ -183,7 +180,6 @@ impl TryFrom<SimilarQueryGet> for SimilarQuery {
filter,
embedder,
attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()),
retrieve_vectors: retrieve_vectors.0,
show_ranking_score: show_ranking_score.0,
show_ranking_score_details: show_ranking_score_details.0,
ranking_score_threshold: ranking_score_threshold.map(|x| x.0),

View File

@@ -15,7 +15,7 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData};
use crate::extractors::sequential_extractor::SeqHandler;
use crate::routes::indexes::search::search_kind;
use crate::search::{
add_search_rules, perform_search, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex,
add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex,
};
use crate::search_queue::SearchQueue;
@@ -83,14 +83,11 @@ pub async fn multi_search_with_post(
let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)
.with_index(query_index)?;
let retrieve_vector =
RetrieveVectors::new(query.retrieve_vectors, features).with_index(query_index)?;
let search_result = tokio::task::spawn_blocking(move || {
perform_search(&index, query, search_kind, retrieve_vector)
})
.await
.with_index(query_index)?;
let search_result =
tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind))
.await
.with_index(query_index)?;
search_results.push(SearchResultWithIndex {
index_uid: index_uid.into_inner(),

View File

@@ -15,7 +15,6 @@ use meilisearch_types::error::{Code, ResponseError};
use meilisearch_types::heed::RoTxn;
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors;
use meilisearch_types::milli::vector::Embedder;
use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget};
use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
@@ -60,8 +59,6 @@ pub struct SearchQuery {
pub hits_per_page: Option<usize>,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
pub attributes_to_retrieve: Option<BTreeSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)]
pub retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
pub attributes_to_crop: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
@@ -78,8 +75,6 @@ pub struct SearchQuery {
pub filter: Option<Value>,
#[deserr(default, error = DeserrJsonError<InvalidSearchSort>)]
pub sort: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchDistinct>)]
pub distinct: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)]
pub facets: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())]
@@ -146,7 +141,6 @@ impl fmt::Debug for SearchQuery {
page,
hits_per_page,
attributes_to_retrieve,
retrieve_vectors,
attributes_to_crop,
crop_length,
attributes_to_highlight,
@@ -155,7 +149,6 @@ impl fmt::Debug for SearchQuery {
show_ranking_score_details,
filter,
sort,
distinct,
facets,
highlight_pre_tag,
highlight_post_tag,
@@ -180,9 +173,6 @@ impl fmt::Debug for SearchQuery {
if let Some(q) = q {
debug.field("q", &q);
}
if *retrieve_vectors {
debug.field("retrieve_vectors", &retrieve_vectors);
}
if let Some(v) = vector {
if v.len() < 10 {
debug.field("vector", &v);
@@ -205,9 +195,6 @@ impl fmt::Debug for SearchQuery {
if let Some(sort) = sort {
debug.field("sort", &sort);
}
if let Some(distinct) = distinct {
debug.field("distinct", &distinct);
}
if let Some(facets) = facets {
debug.field("facets", &facets);
}
@@ -383,8 +370,6 @@ pub struct SearchQueryWithIndex {
pub hits_per_page: Option<usize>,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToRetrieve>)]
pub attributes_to_retrieve: Option<BTreeSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchRetrieveVectors>)]
pub retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidSearchAttributesToCrop>)]
pub attributes_to_crop: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchCropLength>, default = DEFAULT_CROP_LENGTH())]
@@ -401,8 +386,6 @@ pub struct SearchQueryWithIndex {
pub filter: Option<Value>,
#[deserr(default, error = DeserrJsonError<InvalidSearchSort>)]
pub sort: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchDistinct>)]
pub distinct: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)]
pub facets: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())]
@@ -430,7 +413,6 @@ impl SearchQueryWithIndex {
page,
hits_per_page,
attributes_to_retrieve,
retrieve_vectors,
attributes_to_crop,
crop_length,
attributes_to_highlight,
@@ -439,7 +421,6 @@ impl SearchQueryWithIndex {
show_matches_position,
filter,
sort,
distinct,
facets,
highlight_pre_tag,
highlight_post_tag,
@@ -459,7 +440,6 @@ impl SearchQueryWithIndex {
page,
hits_per_page,
attributes_to_retrieve,
retrieve_vectors,
attributes_to_crop,
crop_length,
attributes_to_highlight,
@@ -468,7 +448,6 @@ impl SearchQueryWithIndex {
show_matches_position,
filter,
sort,
distinct,
facets,
highlight_pre_tag,
highlight_post_tag,
@@ -499,8 +478,6 @@ pub struct SimilarQuery {
pub embedder: Option<String>,
#[deserr(default, error = DeserrJsonError<InvalidSimilarAttributesToRetrieve>)]
pub attributes_to_retrieve: Option<BTreeSet<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSimilarRetrieveVectors>)]
pub retrieve_vectors: bool,
#[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScore>, default)]
pub show_ranking_score: bool,
#[deserr(default, error = DeserrJsonError<InvalidSimilarShowRankingScoreDetails>, default)]
@@ -739,10 +716,6 @@ fn prepare_search<'t>(
search.ranking_score_threshold(ranking_score_threshold.0);
}
if let Some(distinct) = &query.distinct {
search.distinct(distinct.clone());
}
match search_kind {
SearchKind::KeywordOnly => {
if let Some(q) = &query.q {
@@ -837,7 +810,6 @@ pub fn perform_search(
index: &Index,
query: SearchQuery,
search_kind: SearchKind,
retrieve_vectors: RetrieveVectors,
) -> Result<SearchResult, MeilisearchHttpError> {
let before_search = Instant::now();
let rtxn = index.read_txn()?;
@@ -875,8 +847,6 @@ pub fn perform_search(
page,
hits_per_page,
attributes_to_retrieve,
// use the enum passed as parameter
retrieve_vectors: _,
attributes_to_crop,
crop_length,
attributes_to_highlight,
@@ -896,12 +866,10 @@ pub fn perform_search(
matching_strategy: _,
attributes_to_search_on: _,
filter: _,
distinct: _,
} = query;
let format = AttributesFormat {
attributes_to_retrieve,
retrieve_vectors,
attributes_to_highlight,
attributes_to_crop,
crop_length,
@@ -985,7 +953,6 @@ pub fn perform_search(
struct AttributesFormat {
attributes_to_retrieve: Option<BTreeSet<String>>,
retrieve_vectors: RetrieveVectors,
attributes_to_highlight: Option<HashSet<String>>,
attributes_to_crop: Option<Vec<String>>,
crop_length: usize,
@@ -998,36 +965,6 @@ struct AttributesFormat {
show_ranking_score_details: bool,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RetrieveVectors {
/// Do not touch the `_vectors` field
///
/// this is the behavior when the vectorStore feature is disabled
Ignore,
/// Remove the `_vectors` field
///
/// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `false`
Hide,
/// Retrieve vectors from the DB and merge them into the `_vectors` field
///
/// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `true`
Retrieve,
}
impl RetrieveVectors {
pub fn new(
retrieve_vector: bool,
features: index_scheduler::RoFeatures,
) -> Result<Self, index_scheduler::Error> {
match (retrieve_vector, features.check_vector("Passing `retrieveVectors` as a parameter")) {
(true, Ok(())) => Ok(Self::Retrieve),
(true, Err(error)) => Err(error),
(false, Ok(())) => Ok(Self::Hide),
(false, Err(_)) => Ok(Self::Ignore),
}
}
}
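The removed RetrieveVectors::new encodes a small truth table: the request flag and the vector-store feature check together select one of three behaviors, and only asking for vectors with the feature disabled is an error. A reduced, self-contained sketch of the same decision, with the feature check modeled here as a plain Result (the real check returns the scheduler's error type):

    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum RetrieveVectors {
        Ignore,   // feature disabled, flag false: leave `_vectors` untouched
        Hide,     // feature enabled, flag false: strip `_vectors`
        Retrieve, // feature enabled, flag true: merge vectors into `_vectors`
    }

    fn new(retrieve_vector: bool, feature_check: Result<(), String>) -> Result<RetrieveVectors, String> {
        match (retrieve_vector, feature_check) {
            (true, Ok(())) => Ok(RetrieveVectors::Retrieve),
            (true, Err(error)) => Err(error), // vectors requested without the feature
            (false, Ok(())) => Ok(RetrieveVectors::Hide),
            (false, Err(_)) => Ok(RetrieveVectors::Ignore),
        }
    }

    fn main() {
        assert_eq!(new(false, Err("disabled".into())), Ok(RetrieveVectors::Ignore));
        assert_eq!(new(true, Ok(())), Ok(RetrieveVectors::Retrieve));
    }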
fn make_hits(
index: &Index,
rtxn: &RoTxn<'_>,
@@ -1037,32 +974,10 @@ fn make_hits(
document_scores: Vec<Vec<ScoreDetails>>,
) -> Result<Vec<SearchHit>, MeilisearchHttpError> {
let fields_ids_map = index.fields_ids_map(rtxn).unwrap();
let displayed_ids =
index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::<BTreeSet<_>>());
let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
(None, _) => false,
// displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
(Some(_), None) => true,
// displayed_ids is a finit list, so hide if `_vectors` is not part of it
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
};
let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors {
if vectors_is_hidden {
RetrieveVectors::Hide
} else {
RetrieveVectors::Retrieve
}
} else {
format.retrieve_vectors
};
let displayed_ids =
displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
let displayed_ids = index
.displayed_fields_ids(rtxn)?
.map(|fields| fields.into_iter().collect::<BTreeSet<_>>())
.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect());
let fids = |attrs: &BTreeSet<String>| {
let mut ids = BTreeSet::new();
for attr in attrs {
@@ -1085,7 +1000,6 @@ fn make_hits(
.intersection(&displayed_ids)
.cloned()
.collect();
let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default();
let attr_to_crop = format.attributes_to_crop.unwrap_or_default();
let formatted_options = compute_formatted_options(
@@ -1119,48 +1033,18 @@ fn make_hits(
formatter_builder.highlight_prefix(format.highlight_pre_tag);
formatter_builder.highlight_suffix(format.highlight_post_tag);
let mut documents = Vec::new();
let embedding_configs = index.embedding_configs(rtxn)?;
let documents_iter = index.documents(rtxn, documents_ids)?;
for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
for ((_id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
// First generate a document with all the displayed fields
let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?;
let add_vectors_fid =
vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve);
// select the attributes to retrieve
let attributes_to_retrieve = to_retrieve_ids
.iter()
// skip the vectors_fid if RetrieveVectors::Hide
.filter(|fid| match vectors_fid {
Some(vectors_fid) => {
!(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid)
}
None => true,
})
// need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve`
.chain(add_vectors_fid.iter())
.map(|&fid| fields_ids_map.name(fid).expect("Missing field name"));
let mut document =
permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve);
if retrieve_vectors == RetrieveVectors::Retrieve {
let mut vectors = match document.remove("_vectors") {
Some(Value::Object(map)) => map,
_ => Default::default(),
};
for (name, vector) in index.embeddings(rtxn, id)? {
let user_provided = embedding_configs
.iter()
.find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(id));
let embeddings =
ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
vectors.insert(name, serde_json::to_value(embeddings)?);
}
document.insert("_vectors".into(), vectors.into());
}
let (matches_position, formatted) = format_fields(
&displayed_document,
&fields_ids_map,
@@ -1230,7 +1114,6 @@ pub fn perform_similar(
query: SimilarQuery,
embedder_name: String,
embedder: Arc<Embedder>,
retrieve_vectors: RetrieveVectors,
) -> Result<SimilarResult, ResponseError> {
let before_search = Instant::now();
let rtxn = index.read_txn()?;
@@ -1242,7 +1125,6 @@ pub fn perform_similar(
filter: _,
embedder: _,
attributes_to_retrieve,
retrieve_vectors: _,
show_ranking_score,
show_ranking_score_details,
ranking_score_threshold,
@@ -1289,7 +1171,6 @@ pub fn perform_similar(
let format = AttributesFormat {
attributes_to_retrieve,
retrieve_vectors,
attributes_to_highlight: None,
attributes_to_crop: None,
crop_length: DEFAULT_CROP_LENGTH(),

View File

@@ -182,10 +182,14 @@ impl Index<'_> {
self.service.get(url).await
}
pub async fn get_document(&self, id: u64, options: Option<Value>) -> (Value, StatusCode) {
pub async fn get_document(
&self,
id: u64,
options: Option<GetDocumentOptions>,
) -> (Value, StatusCode) {
let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id);
if let Some(options) = options {
write!(url, "?{}", yaup::to_string(&options).unwrap()).unwrap();
if let Some(fields) = options.and_then(|o| o.fields) {
let _ = write!(url, "?fields={}", fields.join(","));
}
self.service.get(url).await
}
@@ -201,11 +205,18 @@
}
pub async fn get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) {
let url = format!(
"/indexes/{}/documents?{}",
urlencode(self.uid.as_ref()),
yaup::to_string(&options).unwrap()
);
let mut url = format!("/indexes/{}/documents?", urlencode(self.uid.as_ref()));
if let Some(limit) = options.limit {
let _ = write!(url, "limit={}&", limit);
}
if let Some(offset) = options.offset {
let _ = write!(url, "offset={}&", offset);
}
if let Some(attributes_to_retrieve) = options.attributes_to_retrieve {
let _ = write!(url, "fields={}&", attributes_to_retrieve.join(","));
}
self.service.get(url).await
}
@ -424,11 +435,13 @@ impl Index<'_> {
}
}
#[derive(Debug, Default, serde::Serialize)]
#[serde(rename_all = "camelCase")]
pub struct GetDocumentOptions {
pub fields: Option<Vec<&'static str>>,
}
#[derive(Debug, Default)]
pub struct GetAllDocumentsOptions {
pub limit: Option<usize>,
pub offset: Option<usize>,
pub retrieve_vectors: bool,
pub fields: Option<Vec<&'static str>>,
pub attributes_to_retrieve: Option<Vec<&'static str>>,
}
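As an illustrative sketch (assuming the helper and struct above), a test would drive the new hand-rolled query-string builder like this:

// Builds a URL such as /indexes/<uid>/documents?limit=10&offset=0&fields=id,name&
// (note the trailing separator the builder leaves behind).
let (response, code) = index
    .get_all_documents(GetAllDocumentsOptions {
        limit: Some(10),
        offset: Some(0),
        attributes_to_retrieve: Some(vec!["id", "name"]),
    })
    .await;
assert_eq!(code, 200);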


@ -6,7 +6,7 @@ pub mod service;
use std::fmt::{self, Display};
#[allow(unused)]
pub use index::GetAllDocumentsOptions;
pub use index::{GetAllDocumentsOptions, GetDocumentOptions};
use meili_snap::json_string;
use serde::{Deserialize, Serialize};
#[allow(unused)]
@ -65,7 +65,7 @@ impl Display for Value {
write!(
f,
"{}",
json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]", ".processingTimeMs" => "[duration]" })
json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" })
)
}
}
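A hedged sketch of the redaction this `Display` impl relies on: `json_string!` is assumed (from its uses throughout this diff) to replace the matched paths with stable placeholders, so snapshots do not churn on timestamps.

let task = json!({ "uid": 0, "enqueuedAt": "2024-07-04T11:18:45Z" });
let redacted = json_string!(task, { ".enqueuedAt" => "[date]" });
// `redacted` now contains "enqueuedAt": "[date]" regardless of the real time.
assert!(redacted.contains("[date]"));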


@ -795,70 +795,3 @@ async fn fetch_document_by_filter() {
}
"###);
}
#[actix_rt::test]
async fn retrieve_vectors() {
let server = Server::new().await;
let index = server.index("doggo");
// GET ALL DOCUMENTS BY QUERY
let (response, _code) = index.get_all_documents_raw("?retrieveVectors=tamo").await;
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`",
"code": "invalid_document_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
}
"###);
let (response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await;
snapshot!(json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
// FETCH ALL DOCUMENTS BY POST
let (response, _code) =
index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await;
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`",
"code": "invalid_document_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
}
"###);
let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": true })).await;
snapshot!(json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
// GET A SINGLE DOCUMENT
let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await;
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`",
"code": "invalid_document_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors"
}
"###);
let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await;
snapshot!(json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
}
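The parameter errors above suggest strict boolean parsing; a tiny stand-alone parser (an assumption matching the snapshotted error message, not the actual deserializer) would look like:

fn parse_bool_param(raw: &str) -> Result<bool, String> {
    match raw {
        "true" => Ok(true),
        "false" => Ok(false),
        // mirrors the message snapshotted above
        other => Err(format!(
            "could not parse `{other}` as a boolean, expected either `true` or `false`"
        )),
    }
}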


@ -4,7 +4,7 @@ use meili_snap::*;
use urlencoding::encode as urlencode;
use crate::common::encoder::Encoder;
use crate::common::{GetAllDocumentsOptions, Server, Value};
use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server, Value};
use crate::json;
// TODO: partial test since we are testing errors, and errors are not yet fully implemented in
@ -59,7 +59,8 @@ async fn get_document() {
})
);
let (response, code) = index.get_document(0, Some(json!({ "fields": ["id"] }))).await;
let (response, code) =
index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["id"]) })).await;
assert_eq!(code, 200);
assert_eq!(
response,
@ -68,8 +69,9 @@ async fn get_document() {
})
);
let (response, code) =
index.get_document(0, Some(json!({ "fields": ["nested.content"] }))).await;
let (response, code) = index
.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["nested.content"]) }))
.await;
assert_eq!(code, 200);
assert_eq!(
response,
@ -209,7 +211,7 @@ async fn test_get_all_documents_attributes_to_retrieve() {
let (response, code) = index
.get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["name"]),
attributes_to_retrieve: Some(vec!["name"]),
..Default::default()
})
.await;
@ -223,19 +225,9 @@ async fn test_get_all_documents_attributes_to_retrieve() {
assert_eq!(response["limit"], json!(20));
assert_eq!(response["total"], json!(77));
let (response, code) = index.get_all_documents_raw("?fields=").await;
assert_eq!(code, 200);
assert_eq!(response["results"].as_array().unwrap().len(), 20);
for results in response["results"].as_array().unwrap() {
assert_eq!(results.as_object().unwrap().keys().count(), 0);
}
assert_eq!(response["offset"], json!(0));
assert_eq!(response["limit"], json!(20));
assert_eq!(response["total"], json!(77));
let (response, code) = index
.get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["wrong"]),
attributes_to_retrieve: Some(vec![]),
..Default::default()
})
.await;
@ -250,7 +242,22 @@ async fn test_get_all_documents_attributes_to_retrieve() {
let (response, code) = index
.get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["name", "tags"]),
attributes_to_retrieve: Some(vec!["wrong"]),
..Default::default()
})
.await;
assert_eq!(code, 200);
assert_eq!(response["results"].as_array().unwrap().len(), 20);
for results in response["results"].as_array().unwrap() {
assert_eq!(results.as_object().unwrap().keys().count(), 0);
}
assert_eq!(response["offset"], json!(0));
assert_eq!(response["limit"], json!(20));
assert_eq!(response["total"], json!(77));
let (response, code) = index
.get_all_documents(GetAllDocumentsOptions {
attributes_to_retrieve: Some(vec!["name", "tags"]),
..Default::default()
})
.await;
@ -263,7 +270,10 @@ async fn test_get_all_documents_attributes_to_retrieve() {
}
let (response, code) = index
.get_all_documents(GetAllDocumentsOptions { fields: Some(vec!["*"]), ..Default::default() })
.get_all_documents(GetAllDocumentsOptions {
attributes_to_retrieve: Some(vec!["*"]),
..Default::default()
})
.await;
assert_eq!(code, 200);
assert_eq!(response["results"].as_array().unwrap().len(), 20);
@ -273,7 +283,7 @@ async fn test_get_all_documents_attributes_to_retrieve() {
let (response, code) = index
.get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["*", "wrong"]),
attributes_to_retrieve: Some(vec!["*", "wrong"]),
..Default::default()
})
.await;
@ -306,10 +316,12 @@ async fn get_document_s_nested_attributes_to_retrieve() {
assert_eq!(code, 202);
index.wait_task(1).await;
let (response, code) = index.get_document(0, Some(json!({ "fields": ["content"] }))).await;
let (response, code) =
index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await;
assert_eq!(code, 200);
assert_eq!(response, json!({}));
let (response, code) = index.get_document(1, Some(json!({ "fields": ["content"] }))).await;
let (response, code) =
index.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await;
assert_eq!(code, 200);
assert_eq!(
response,
@ -321,7 +333,9 @@ async fn get_document_s_nested_attributes_to_retrieve() {
})
);
let (response, code) = index.get_document(0, Some(json!({ "fields": ["content.truc"] }))).await;
let (response, code) = index
.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) }))
.await;
assert_eq!(code, 200);
assert_eq!(
response,
@ -329,7 +343,9 @@ async fn get_document_s_nested_attributes_to_retrieve() {
"content.truc": "foobar",
})
);
let (response, code) = index.get_document(1, Some(json!({ "fields": ["content.truc"] }))).await;
let (response, code) = index
.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) }))
.await;
assert_eq!(code, 200);
assert_eq!(
response,
@ -524,217 +540,3 @@ async fn get_document_by_filter() {
}
"###);
}
#[actix_rt::test]
async fn get_document_with_vectors() {
let server = Server::new().await;
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
{"id": 1, "name": "echo", "_vectors": { "manual": null }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
// by default you shouldn't see the `_vectors` object
let (documents, _code) = index.get_all_documents(Default::default()).await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir"
},
{
"id": 1,
"name": "echo"
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) = index.get_document(0, None).await;
snapshot!(json_string!(documents), @r###"
{
"id": 0,
"name": "kefir"
}
"###);
// if we try to retrieve the vectors with the `fields` parameter they
// still shouldn't be displayed
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions {
fields: Some(vec!["name", "_vectors"]),
..Default::default()
})
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"name": "kefir"
},
{
"name": "echo"
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) =
index.get_document(0, Some(json!({"fields": ["name", "_vectors"]}))).await;
snapshot!(json_string!(documents), @r###"
{
"name": "kefir"
}
"###);
// If we specify the retrieve vectors boolean and nothing else we should get the vectors
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {
"manual": {
"embeddings": [],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await;
snapshot!(json_string!(documents), @r###"
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
}
"###);
// If we specify the retrieve vectors boolean and exclude vectors from the `fields` we should still get the vectors
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions {
retrieve_vectors: true,
fields: Some(vec!["name"]),
..Default::default()
})
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"name": "echo",
"_vectors": {
"manual": {
"embeddings": [],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (documents, _code) =
index.get_document(0, Some(json!({"retrieveVectors": true, "fields": ["name"]}))).await;
snapshot!(json_string!(documents), @r###"
{
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
}
"###);
}


@ -1938,210 +1938,3 @@ async fn import_dump_v6_containing_experimental_features() {
})
.await;
}
// In this test we must generate the dump ourselves to ensure the
// `user provided` vectors are correctly set
#[actix_rt::test]
#[cfg_attr(target_os = "windows", ignore)]
async fn generate_and_import_dump_containing_vectors() {
let temp = tempfile::tempdir().unwrap();
let mut opt = default_settings(temp.path());
let server = Server::new_with_options(opt.clone()).await.unwrap();
let (response, _code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(response, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let index = server.index("pets");
let (response, code) = index
.update_settings(json!(
{
"embedders": {
"doggo_embedder": {
"source": "huggingFace",
"model": "sentence-transformers/all-MiniLM-L6-v2",
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
"documentTemplate": "{{doc.doggo}}",
}
}
}
))
.await;
snapshot!(code, @"202 Accepted");
let response = index.wait_task(response.uid()).await;
snapshot!(response);
let (response, code) = index
.add_documents(
json!([
{"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }},
{"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "regenerate": false, "embeddings": vec![1; 384] }}},
{"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "regenerate": true, "embeddings": vec![2; 384] }}},
{"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "regenerate": true }}},
{"id": 4, "doggo": "max" },
]),
None,
)
.await;
snapshot!(code, @"202 Accepted");
let response = index.wait_task(response.uid()).await;
snapshot!(response);
let (response, code) = server.create_dump().await;
snapshot!(code, @"202 Accepted");
let response = index.wait_task(response.uid()).await;
snapshot!(response["status"], @r###""succeeded""###);
// ========= We made a dump, now we should clear the DB and try to import our dump
drop(server);
tokio::fs::remove_dir_all(&opt.db_path).await.unwrap();
let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap());
let dump_path = opt.dump_dir.join(dump_name);
assert!(dump_path.exists(), "path: `{}`", dump_path.display());
opt.import_dump = Some(dump_path);
// NOTE: We shouldn't have to change the database path but I lost one hour
// because of a « bad path » error and that fixed it.
opt.db_path = temp.path().join("data.ms");
let mut server = Server::new_auth_with_options(opt, temp).await;
server.use_api_key("MASTER_KEY");
let (indexes, code) = server.list_indexes(None, None).await;
assert_eq!(code, 200, "{indexes}");
snapshot!(indexes["results"].as_array().unwrap().len(), @"1");
snapshot!(indexes["results"][0]["uid"], @r###""pets""###);
snapshot!(indexes["results"][0]["primaryKey"], @r###""id""###);
let (response, code) = server.get_features().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let index = server.index("pets");
let (response, code) = index.settings().await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"displayedAttributes": [
"*"
],
"searchableAttributes": [
"*"
],
"filterableAttributes": [],
"sortableAttributes": [],
"rankingRules": [
"words",
"typo",
"proximity",
"attribute",
"sort",
"exactness"
],
"stopWords": [],
"nonSeparatorTokens": [],
"separatorTokens": [],
"dictionary": [],
"synonyms": {},
"distinctAttribute": null,
"proximityPrecision": "byWord",
"typoTolerance": {
"enabled": true,
"minWordSizeForTypos": {
"oneTypo": 5,
"twoTypos": 9
},
"disableOnWords": [],
"disableOnAttributes": []
},
"faceting": {
"maxValuesPerFacet": 100,
"sortFacetValuesBy": {
"*": "alpha"
}
},
"pagination": {
"maxTotalHits": 1000
},
"embedders": {
"doggo_embedder": {
"source": "huggingFace",
"model": "sentence-transformers/all-MiniLM-L6-v2",
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
"documentTemplate": "{{doc.doggo}}"
}
},
"searchCutoffMs": null
}
"###);
index
.search(json!({"retrieveVectors": true}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###"
[
{
"id": 0,
"doggo": "kefir",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": false
}
}
},
{
"id": 1,
"doggo": "echo",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": false
}
}
},
{
"id": 2,
"doggo": "intel",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": true
}
}
},
{
"id": 3,
"doggo": "bill",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": true
}
}
},
{
"id": 4,
"doggo": "max",
"_vectors": {
"doggo_embedder": {
"embeddings": "[vector]",
"regenerate": true
}
}
}
]
"###);
})
.await;
}
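The five documents added above cover every `regenerate` combination; here is a compact sketch of the rule the final search snapshot confirms (stated as an assumption derived from the payloads, not from the indexing code):

// Raw embeddings or an explicit `regenerate: false` => treated as user-provided.
// An explicit `regenerate: true`, or no vectors at all => the embedder regenerates.
fn expected_regenerate(has_user_embeddings: bool, explicit: Option<bool>) -> bool {
    explicit.unwrap_or(!has_user_embeddings)
}
// kefir: raw vec, None   => false; echo: Some(false) => false
// intel/bill: Some(true) => true;  max: no vectors   => true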


@ -1,25 +0,0 @@
---
source: meilisearch/tests/dumps/mod.rs
---
{
"uid": 0,
"indexUid": "pets",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"doggo_embedder": {
"source": "huggingFace",
"model": "sentence-transformers/all-MiniLM-L6-v2",
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
"documentTemplate": "{{doc.doggo}}"
}
}
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}


@ -1,19 +0,0 @@
---
source: meilisearch/tests/dumps/mod.rs
---
{
"uid": 1,
"indexUid": "pets",
"status": "succeeded",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 5,
"indexedDocuments": 5
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}


@ -13,7 +13,6 @@ mod snapshot;
mod stats;
mod swap_indexes;
mod tasks;
mod vector;
// Tests are isolated by features in different modules to allow better readability, test
// targetability, and improved incremental compilation times.


@ -107,39 +107,6 @@ static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
])
});
static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"id": 1,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": { "main": "Brown", "pattern": "stripped" },
},
{
"id": 2,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": { "main": "Black", "pattern": "stripped" },
},
{
"id": 3,
"description": "Leather Jacket",
"brand": "Lee Jeans",
"product_id": "123456",
"color": { "main": "Blue", "pattern": "used" },
},
{
"id": 4,
"description": "T-Shirt",
"brand": "Nike",
"product_id": "789012",
"color": { "main": "Blue", "pattern": "stripped" },
}
])
});
static DOCUMENT_PRIMARY_KEY: &str = "id";
static DOCUMENT_DISTINCT_KEY: &str = "product_id";
@ -272,35 +239,3 @@ async fn distinct_search_with_pagination_no_ranking() {
snapshot!(response["totalPages"], @"2");
snapshot!(response["totalHits"], @"6");
}
#[actix_rt::test]
async fn distinct_at_search_time() {
let server = Server::new().await;
let index = server.index("tamo");
let documents = NESTED_DOCUMENTS.clone();
index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
let (task, _) = index.update_settings_filterable_attributes(json!(["color.main"])).await;
let task = index.wait_task(task.uid()).await;
snapshot!(task, name: "succeed");
fn get_hits(response: &Value) -> Vec<String> {
let hits_array = response["hits"]
.as_array()
.unwrap_or_else(|| panic!("{}", &serde_json::to_string_pretty(&response).unwrap()));
hits_array
.iter()
.map(|h| h[DOCUMENT_PRIMARY_KEY].as_number().unwrap().to_string())
.collect::<Vec<_>>()
}
let (response, code) =
index.search_post(json!({"page": 1, "hitsPerPage": 3, "distinct": "color.main"})).await;
let hits = get_hits(&response);
snapshot!(code, @"200 OK");
snapshot!(hits.len(), @"3");
snapshot!(format!("{:?}", hits), @r###"["1", "2", "3"]"###);
snapshot!(response["page"], @"1");
snapshot!(response["totalPages"], @"1");
snapshot!(response["totalHits"], @"3");
}
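Conceptually, `distinct` keeps only the first hit per distinct key in ranking order; a minimal stand-alone sketch (not Meilisearch's implementation) shows why the four `NESTED_DOCUMENTS` above collapse to the three hits asserted:

use std::collections::HashSet;

fn dedupe_by_key<'a>(hits: &[(&'a str, &'a str)]) -> Vec<&'a str> {
    let mut seen = HashSet::new();
    hits.iter()
        // keep a hit only the first time its key (here `color.main`) appears
        .filter(|&&(_, key)| seen.insert(key))
        .map(|&(id, _)| id)
        .collect()
}

// ("id", "color.main") pairs in ranking order:
// dedupe_by_key(&[("1", "Brown"), ("2", "Black"), ("3", "Blue"), ("4", "Blue")])
// yields ["1", "2", "3"], matching `totalHits: 3` above.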


@ -167,74 +167,6 @@ async fn search_bad_hits_per_page() {
"###);
}
#[actix_rt::test]
async fn search_bad_attributes_to_retrieve() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index.search_post(json!({"attributesToRetrieve": "doggo"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.attributesToRetrieve`: expected an array, but found a string: `\"doggo\"`",
"code": "invalid_search_attributes_to_retrieve",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_retrieve"
}
"###);
// Can't make the `attributes_to_retrieve` fail with a get search since it'll accept anything as an array of strings.
}
#[actix_rt::test]
async fn search_bad_retrieve_vectors() {
let server = Server::new().await;
let index = server.index("test");
let (response, code) = index.search_post(json!({"retrieveVectors": "doggo"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
let (response, code) = index.search_post(json!({"retrieveVectors": [true]})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
let (response, code) = index.search_get("retrieveVectors=").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
let (response, code) = index.search_get("retrieveVectors=doggo").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`",
"code": "invalid_search_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors"
}
"###);
}
#[actix_rt::test]
async fn search_bad_attributes_to_crop() {
let server = Server::new().await;
@ -1140,66 +1072,3 @@ async fn search_on_unknown_field_plus_joker() {
)
.await;
}
#[actix_rt::test]
async fn distinct_at_search_time() {
let server = Server::new().await;
let index = server.index("tamo");
let (task, _) = index.create(None).await;
let task = index.wait_task(task.uid()).await;
snapshot!(task, name: "task-succeed");
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. This index does not have configured filterable attributes.",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
let (task, _) = index.update_settings_filterable_attributes(json!(["color", "machin"])).await;
index.wait_task(task.uid()).await;
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, machin`.",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
let (task, _) = index.update_settings_displayed_attributes(json!(["color"])).await;
index.wait_task(task.uid()).await;
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": "doggo.truc"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Attribute `doggo.truc` is not filterable and thus, cannot be used as distinct attribute. Available filterable attributes are: `color, <..hidden-attributes>`.",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
let (response, code) =
index.search_post(json!({"page": 0, "hitsPerPage": 2, "distinct": true})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(response, @r###"
{
"message": "Invalid value type at `.distinct`: expected a string, but found a boolean: `true`",
"code": "invalid_search_distinct",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_distinct"
}
"###);
}


@ -124,61 +124,32 @@ async fn simple_search() {
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}}}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###);
snapshot!(response["semanticHitCount"], @"0");
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true, "retrieveVectors": true}),
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"2");
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true, "retrieveVectors": true}),
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3");
}
#[actix_rt::test]
async fn limit_offset() {
let server = Server::new().await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true, "offset": 1, "limit": 1}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###);
snapshot!(response["semanticHitCount"], @"0");
assert_eq!(response["hits"].as_array().unwrap().len(), 1);
let server = Server::new().await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.9}, "retrieveVectors": true, "offset": 1, "limit": 1}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}}]"###);
snapshot!(response["semanticHitCount"], @"1");
assert_eq!(response["hits"].as_array().unwrap().len(), 1);
}
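In `limit_offset` above, pagination applies to the already-fused hybrid ranking; an illustrative slice (not the engine code) of the window the first assertion checks:

let ranked = ["2", "3", "1"]; // ids in fused order at semanticRatio 0.2
let page: Vec<_> = ranked.iter().skip(1).take(1).collect();
assert_eq!(page, [&"3"]); // the single "Captain Marvel" hit asserted above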
#[actix_rt::test]
async fn simple_search_hf() {
let server = Server::new().await;
@ -233,10 +204,10 @@ async fn distribution_shift() {
let server = Server::new().await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true});
let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}});
let (response, code) = index.search_post(search.clone()).await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
let (response, code) = index
.update_settings(json!({
@ -257,7 +228,7 @@ async fn distribution_shift() {
let (response, code) = index.search_post(search).await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.1920928955078125e-7}]"###);
}
#[actix_rt::test]
@ -268,23 +239,20 @@ async fn highlighter() {
let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 0.2},
"retrieveVectors": true,
"attributesToHighlight": [
"desc",
"_vectors",
"attributesToHighlight": [
"desc"
],
"highlightPreTag": "**BEGIN**",
"highlightPostTag": "**END**",
"highlightPreTag": "**BEGIN**",
"highlightPostTag": "**END**"
}))
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###);
snapshot!(response["semanticHitCount"], @"0");
let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 0.8},
"retrieveVectors": true,
"showRankingScore": true,
"attributesToHighlight": [
"desc"
@ -294,14 +262,13 @@ async fn highlighter() {
}))
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3");
// no highlighting on full semantic
let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 1.0},
"retrieveVectors": true,
"showRankingScore": true,
"attributesToHighlight": [
"desc"
@ -311,7 +278,7 @@ async fn highlighter() {
}))
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3");
}
@ -394,12 +361,12 @@ async fn single_document() {
let (response, code) = index
.search_post(
json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}),
json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0}"###);
snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0}"###);
snapshot!(response["semanticHitCount"], @"1");
}
@ -410,25 +377,25 @@ async fn query_combination() {
// search without query and vector, but with hybrid => still placeholder
let (response, code) = index
.search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
.search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}))
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###);
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null");
// same with a different semantic ratio
let (response, code) = index
.search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true, "retrieveVectors": true}))
.search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true}))
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###);
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null");
// wrong vector dimensions
let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
.search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}))
.await;
snapshot!(code, @"400 Bad Request");
@ -443,34 +410,34 @@ async fn query_combination() {
// full vector
let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
.search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}))
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.6581138968467712}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712}]"###);
snapshot!(response["semanticHitCount"], @"3");
// full keyword, without a query
let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true}))
.search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true}))
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###);
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###);
snapshot!(response["semanticHitCount"], @"null");
// query + vector, full keyword => keyword
let (response, code) = index
.search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true}))
.search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true}))
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###);
snapshot!(response["semanticHitCount"], @"null");
// query + vector, no hybrid => error
let (response, code) = index
.search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true, "retrieveVectors": true}))
.search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true}))
.await;
snapshot!(code, @"400 Bad Request");
@ -486,7 +453,7 @@ async fn query_combination() {
// full semantic ratio, but without a vector => error
let (response, code) = index
.search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}),
json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}),
)
.await;
@ -503,93 +470,11 @@ async fn query_combination() {
// hybrid without a vector => full keyword
let (response, code) = index
.search_post(
json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true, "retrieveVectors": true}),
json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###);
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###);
snapshot!(response["semanticHitCount"], @"0");
}
#[actix_rt::test]
async fn retrieve_vectors() {
let server = Server::new().await;
let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
)
.await;
snapshot!(code, @"200 OK");
insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
[
{
"title": "Captain Planet",
"desc": "He's not part of the Marvel Cinematic Universe",
"id": "2",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
},
{
"title": "Captain Marvel",
"desc": "a Shazam ersatz",
"id": "3",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
},
{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
"id": "1",
"_vectors": {
"default": {
"embeddings": "[vectors]",
"regenerate": true
}
}
}
]
"###);
// remove `_vectors` from displayed attributes
let (response, code) =
index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}),
)
.await;
snapshot!(code, @"200 OK");
insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
[
{
"title": "Captain Planet",
"desc": "He's not part of the Marvel Cinematic Universe",
"id": "2"
},
{
"title": "Captain Marvel",
"desc": "a Shazam ersatz",
"id": "3"
},
{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
"id": "1"
}
]
"###);
}


@ -301,7 +301,7 @@ async fn negative_special_cases_search() {
index.add_documents(documents, None).await;
index.wait_task(0).await;
index.update_settings(json!({"synonyms": { "escape": ["gläss"] }})).await;
index.update_settings(json!({"synonyms": { "escape": ["glass"] }})).await;
index.wait_task(1).await;
// There is a synonym for escape -> glass but we don't want "escape", only its derivatives: glass
@ -1290,38 +1290,21 @@ async fn experimental_feature_vector_store() {
index.add_documents(json!(documents), None).await;
index.wait_task(0).await;
index
.search(json!({
let (response, code) = index
.search_post(json!({
"vector": [1.0, 2.0, 3.0],
"showRankingScore": true
}), |response, code|{
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Passing `vector` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
})
.await;
index
.search(json!({
"retrieveVectors": true,
"showRankingScore": true
}), |response, code|{
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
})
}))
.await;
meili_snap::snapshot!(code, @"400 Bad Request");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"message": "Passing `vector` as a query parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677",
"code": "feature_not_enabled",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
}
"###);
let (response, code) = server.set_features(json!({"vectorStore": true})).await;
meili_snap::snapshot!(code, @"200 OK");
@ -1354,7 +1337,6 @@ async fn experimental_feature_vector_store() {
.search_post(json!({
"vector": [1.0, 2.0, 3.0],
"showRankingScore": true,
"retrieveVectors": true,
}))
.await;
@ -1366,16 +1348,11 @@ async fn experimental_feature_vector_store() {
"title": "Shazam!",
"id": "287947",
"_vectors": {
"manual": {
"embeddings": [
[
1.0,
2.0,
3.0
]
],
"regenerate": false
}
"manual": [
1.0,
2.0,
3.0
]
},
"_rankingScore": 1.0
},
@ -1383,16 +1360,11 @@ async fn experimental_feature_vector_store() {
"title": "Captain Marvel",
"id": "299537",
"_vectors": {
"manual": {
"embeddings": [
[
1.0,
2.0,
54.0
]
],
"regenerate": false
}
"manual": [
1.0,
2.0,
54.0
]
},
"_rankingScore": 0.9129111766815186
},
@ -1400,16 +1372,11 @@ async fn experimental_feature_vector_store() {
"title": "Gläss",
"id": "450465",
"_vectors": {
"manual": {
"embeddings": [
[
-100.0,
340.0,
90.0
]
],
"regenerate": false
}
"manual": [
-100.0,
340.0,
90.0
]
},
"_rankingScore": 0.8106412887573242
},
@ -1417,16 +1384,11 @@ async fn experimental_feature_vector_store() {
"title": "How to Train Your Dragon: The Hidden World",
"id": "166428",
"_vectors": {
"manual": {
"embeddings": [
[
-100.0,
231.0,
32.0
]
],
"regenerate": false
}
"manual": [
-100.0,
231.0,
32.0
]
},
"_rankingScore": 0.7412010431289673
},
@ -1434,16 +1396,11 @@ async fn experimental_feature_vector_store() {
"title": "Escape Room",
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
10.0,
-23.0,
32.0
]
],
"regenerate": false
}
"manual": [
10.0,
-23.0,
32.0
]
},
"_rankingScore": 0.6972063183784485
}


@ -1,20 +0,0 @@
---
source: meilisearch/tests/search/distinct.rs
---
{
"uid": 1,
"indexUid": "tamo",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"filterableAttributes": [
"color.main"
]
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}


@ -1,18 +0,0 @@
---
source: meilisearch/tests/search/errors.rs
---
{
"uid": 0,
"indexUid": "tamo",
"status": "succeeded",
"type": "indexCreation",
"canceledBy": null,
"details": {
"primaryKey": null
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}


@ -756,54 +756,3 @@ async fn filter_reserved_geo_point_string() {
})
.await;
}
#[actix_rt::test]
async fn similar_bad_retrieve_vectors() {
let server = Server::new().await;
server.set_features(json!({"vectorStore": true})).await;
let index = server.index("test");
let (response, code) = index.similar_post(json!({"retrieveVectors": "doggo"})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
let (response, code) = index.similar_post(json!({"retrieveVectors": [true]})).await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
let (response, code) = index.similar_get("retrieveVectors=").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
let (response, code) = index.similar_get("retrieveVectors=doggo").await;
snapshot!(code, @"400 Bad Request");
snapshot!(json_string!(response), @r###"
{
"message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`",
"code": "invalid_similar_retrieve_vectors",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors"
}
"###);
}
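// For contrast, a well-formed request passes `retrieveVectors` as a bare
// boolean, as exercised by the tests further below, e.g.:
//
//     index.similar_post(json!({"id": 143, "retrieveVectors": true})).await;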

View File

@ -78,7 +78,7 @@ async fn basic() {
index.wait_task(value.uid()).await;
index
.similar(json!({"id": 143, "retrieveVectors": true}), |response, code| {
.similar(json!({"id": 143}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
@ -87,16 +87,11 @@ async fn basic() {
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
"manual": [
0.1,
0.6,
0.8
]
}
},
{
@ -104,16 +99,11 @@ async fn basic() {
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": {
"embeddings": [
[
0.6000000238418579,
0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
"manual": [
0.6,
0.8,
-0.2
]
}
},
{
@ -121,16 +111,11 @@ async fn basic() {
"release_year": 2019,
"id": "166428",
"_vectors": {
"manual": {
"embeddings": [
[
0.699999988079071,
0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
"manual": [
0.7,
0.7,
-0.4
]
}
},
{
@ -138,16 +123,11 @@ async fn basic() {
"release_year": 2019,
"id": "287947",
"_vectors": {
"manual": {
"embeddings": [
[
0.800000011920929,
0.4000000059604645,
-0.5
]
],
"regenerate": false
}
"manual": [
0.8,
0.4,
-0.5
]
}
}
]
@ -156,7 +136,7 @@ async fn basic() {
.await;
index
.similar(json!({"id": "299537", "retrieveVectors": true}), |response, code| {
.similar(json!({"id": "299537"}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
@ -165,16 +145,11 @@ async fn basic() {
"release_year": 2019,
"id": "166428",
"_vectors": {
"manual": {
"embeddings": [
[
0.699999988079071,
0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
"manual": [
0.7,
0.7,
-0.4
]
}
},
{
@ -182,16 +157,11 @@ async fn basic() {
"release_year": 2019,
"id": "287947",
"_vectors": {
"manual": {
"embeddings": [
[
0.800000011920929,
0.4000000059604645,
-0.5
]
],
"regenerate": false
}
"manual": [
0.8,
0.4,
-0.5
]
}
},
{
@ -199,16 +169,11 @@ async fn basic() {
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
"manual": [
0.1,
0.6,
0.8
]
}
},
{
@ -216,16 +181,11 @@ async fn basic() {
"release_year": 1930,
"id": "143",
"_vectors": {
"manual": {
"embeddings": [
[
-0.5,
0.30000001192092896,
0.8500000238418579
]
],
"regenerate": false
}
"manual": [
-0.5,
0.3,
0.85
]
}
}
]
@ -268,7 +228,7 @@ async fn ranking_score_threshold() {
index
.similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0, "retrieveVectors": true}),
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0}),
|response, code| {
snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4");
@ -279,16 +239,11 @@ async fn ranking_score_threshold() {
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
"manual": [
0.1,
0.6,
0.8
]
},
"_rankingScore": 0.890957772731781
},
@ -297,16 +252,11 @@ async fn ranking_score_threshold() {
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": {
"embeddings": [
[
0.6000000238418579,
0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
"manual": [
0.6,
0.8,
-0.2
]
},
"_rankingScore": 0.39060014486312866
},
@ -315,16 +265,11 @@ async fn ranking_score_threshold() {
"release_year": 2019,
"id": "166428",
"_vectors": {
"manual": {
"embeddings": [
[
0.699999988079071,
0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
"manual": [
0.7,
0.7,
-0.4
]
},
"_rankingScore": 0.2819308042526245
},
@ -333,16 +278,11 @@ async fn ranking_score_threshold() {
"release_year": 2019,
"id": "287947",
"_vectors": {
"manual": {
"embeddings": [
[
0.800000011920929,
0.4000000059604645,
-0.5
]
],
"regenerate": false
}
"manual": [
0.8,
0.4,
-0.5
]
},
"_rankingScore": 0.1662663221359253
}
@ -354,7 +294,7 @@ async fn ranking_score_threshold() {
index
.similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2, "retrieveVectors": true}),
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2}),
|response, code| {
snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3");
@ -365,16 +305,11 @@ async fn ranking_score_threshold() {
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
"manual": [
0.1,
0.6,
0.8
]
},
"_rankingScore": 0.890957772731781
},
@ -383,16 +318,11 @@ async fn ranking_score_threshold() {
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": {
"embeddings": [
[
0.6000000238418579,
0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
"manual": [
0.6,
0.8,
-0.2
]
},
"_rankingScore": 0.39060014486312866
},
@ -401,16 +331,11 @@ async fn ranking_score_threshold() {
"release_year": 2019,
"id": "166428",
"_vectors": {
"manual": {
"embeddings": [
[
0.699999988079071,
0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
"manual": [
0.7,
0.7,
-0.4
]
},
"_rankingScore": 0.2819308042526245
}
@ -422,7 +347,7 @@ async fn ranking_score_threshold() {
index
.similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3, "retrieveVectors": true}),
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3}),
|response, code| {
snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2");
@ -433,16 +358,11 @@ async fn ranking_score_threshold() {
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
"manual": [
0.1,
0.6,
0.8
]
},
"_rankingScore": 0.890957772731781
},
@ -451,16 +371,11 @@ async fn ranking_score_threshold() {
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": {
"embeddings": [
[
0.6000000238418579,
0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
"manual": [
0.6,
0.8,
-0.2
]
},
"_rankingScore": 0.39060014486312866
}
@ -472,7 +387,7 @@ async fn ranking_score_threshold() {
index
.similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6, "retrieveVectors": true}),
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6}),
|response, code| {
snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1");
@ -483,16 +398,11 @@ async fn ranking_score_threshold() {
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
"manual": [
0.1,
0.6,
0.8
]
},
"_rankingScore": 0.890957772731781
}
@ -504,7 +414,7 @@ async fn ranking_score_threshold() {
index
.similar(
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9, "retrieveVectors": true}),
json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9}),
|response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @"[]");
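// Taken together, these runs show `rankingScoreThreshold` acting as a lower
// bound on `_rankingScore`: with scores of roughly 0.89, 0.39, 0.28 and 0.17,
// thresholds of 0, 0.2, 0.3, 0.6 and 0.9 keep 4, 3, 2, 1 and 0 hits
// respectively.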
@ -546,97 +456,71 @@ async fn filter() {
index.wait_task(value.uid()).await;
index
.similar(
json!({"id": 522681, "filter": "release_year = 2019", "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "Captain Marvel",
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": {
"embeddings": [
[
0.6000000238418579,
0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
}
},
{
"title": "How to Train Your Dragon: The Hidden World",
"release_year": 2019,
"id": "166428",
"_vectors": {
"manual": {
"embeddings": [
[
0.699999988079071,
0.699999988079071,
-0.4000000059604645
]
],
"regenerate": false
}
}
},
{
"title": "Shazam!",
"release_year": 2019,
"id": "287947",
"_vectors": {
"manual": {
"embeddings": [
[
0.800000011920929,
0.4000000059604645,
-0.5
]
],
"regenerate": false
}
}
}
]
"###);
},
)
.similar(json!({"id": 522681, "filter": "release_year = 2019"}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "Captain Marvel",
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": [
0.6,
0.8,
-0.2
]
}
},
{
"title": "How to Train Your Dragon: The Hidden World",
"release_year": 2019,
"id": "166428",
"_vectors": {
"manual": [
0.7,
0.7,
-0.4
]
}
},
{
"title": "Shazam!",
"release_year": 2019,
"id": "287947",
"_vectors": {
"manual": [
0.8,
0.4,
-0.5
]
}
}
]
"###);
})
.await;
index
.similar(
json!({"id": 522681, "filter": "release_year < 2000", "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "All Quiet on the Western Front",
"release_year": 1930,
"id": "143",
"_vectors": {
"manual": {
"embeddings": [
[
-0.5,
0.30000001192092896,
0.8500000238418579
]
],
"regenerate": false
}
}
}
]
"###);
},
)
.similar(json!({"id": 522681, "filter": "release_year < 2000"}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "All Quiet on the Western Front",
"release_year": 1930,
"id": "143",
"_vectors": {
"manual": [
-0.5,
0.3,
0.85
]
}
}
]
"###);
})
.await;
}
@ -673,7 +557,7 @@ async fn limit_and_offset() {
index.wait_task(value.uid()).await;
index
.similar(json!({"id": 143, "limit": 1, "retrieveVectors": true}), |response, code| {
.similar(json!({"id": 143, "limit": 1}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
@ -682,16 +566,11 @@ async fn limit_and_offset() {
"release_year": 2019,
"id": "522681",
"_vectors": {
"manual": {
"embeddings": [
[
0.10000000149011612,
0.6000000238418579,
0.800000011920929
]
],
"regenerate": false
}
"manual": [
0.1,
0.6,
0.8
]
}
}
]
@ -700,32 +579,24 @@ async fn limit_and_offset() {
.await;
index
.similar(
json!({"id": 143, "limit": 1, "offset": 1, "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "Captain Marvel",
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": {
"embeddings": [
[
0.6000000238418579,
0.800000011920929,
-0.20000000298023224
]
],
"regenerate": false
}
}
}
]
"###);
},
)
.similar(json!({"id": 143, "limit": 1, "offset": 1}), |response, code| {
snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###"
[
{
"title": "Captain Marvel",
"release_year": 2019,
"id": "299537",
"_vectors": {
"manual": [
0.6,
0.8,
-0.2
]
}
}
]
"###);
})
.await;
}

View File

@ -1,603 +0,0 @@
mod settings;
use meili_snap::{json_string, snapshot};
use crate::common::index::Index;
use crate::common::{GetAllDocumentsOptions, Server};
use crate::json;
#[actix_rt::test]
async fn add_remove_user_provided() {
let server = Server::new().await;
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {
"manual": {
"embeddings": [
[
1.0,
1.0,
1.0
]
],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [10, 10, 10] }},
{"id": 1, "name": "echo", "_vectors": { "manual": null }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
10.0,
10.0,
10.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {
"manual": {
"embeddings": [],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 2
}
"###);
let (value, code) = index.delete_document(0).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 1,
"name": "echo",
"_vectors": {
"manual": {
"embeddings": [],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 1
}
"###);
}
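// Two behaviours worth noting from the snapshots above: replacing
// `_vectors.manual` with `null` keeps the document but empties its embeddings
// (`"embeddings": []`), while deleting document 0 removes it from the results
// entirely.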
async fn generate_default_user_provided_documents(server: &Server) -> Index {
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
{"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }},
{"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }},
{"id": 3, "name": "intel", "_vectors": { "manual": { "regenerate": false, "embeddings": [3, 3, 3] }}},
{"id": 4, "name": "max", "_vectors": { "manual": { "regenerate": false, "embeddings": [[4, 4, 4], [4, 4, 5]] }}},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
index.wait_task(value.uid()).await;
index
}
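// This fixture covers every accepted `_vectors` shape for a `userProvided`
// embedder: a single embedding (`[0, 0, 0]`), several embeddings
// (`[[2, 2, 2], [2, 2, 3]]`), and the explicit object form carrying
// `regenerate` plus `embeddings`.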
#[actix_rt::test]
async fn user_provided_embeddings_error() {
let server = Server::new().await;
let index = generate_default_user_provided_documents(&server).await;
// First case, we forget to specify `regenerate`
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [0, 0, 0] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 2,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Second case, we don't specify anything
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": {}}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 3,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Third case, we specify an invalid value for `regenerate`
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": "yes please" }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 4,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.regenerate`: expected a boolean, but found a string: `\"yes please\"`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 5,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings`: expected null or an array, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 6,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 7,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [23, 0.1, -12], "regenerate": true }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task["status"], @r###""succeeded""###);
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task["status"], @r###""succeeded""###);
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [0.1, [0.2, 0.3]] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 10,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, 0.2], 0.3] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 11,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected an array, but found a number: `0.3`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, true], 0.3] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 12,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}
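// Summing up the failure cases above: in the object form, `regenerate` is the
// one mandatory key and must be a boolean, while `embeddings`, when present,
// must be a number array (one embedding) or an array of number arrays (several
// embeddings) and may not mix the two nesting levels. A minimal payload that
// passes, as the two succeeding tasks show, is:
//
//     json!({"id": 0, "_vectors": { "manual": { "regenerate": false }}})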
#[actix_rt::test]
async fn clear_documents() {
let server = Server::new().await;
let index = generate_default_user_provided_documents(&server).await;
let (value, _code) = index.clear_all_documents().await;
index.wait_task(value.uid()).await;
// Make sure the documents DB has been cleared
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [],
"offset": 0,
"limit": 20,
"total": 0
}
"###);
// Make sure the arroy DB has been cleared
let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await;
snapshot!(documents, @r###"
{
"hits": [],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 0,
"semanticHitCount": 0
}
"###);
}
#[actix_rt::test]
async fn add_remove_one_vector_4588() {
// https://github.com/meilisearch/meilisearch/issues/4588
let server = Server::new().await;
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, name: "settings-processed");
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, name: "document-added");
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": null }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, name: "document-deleted");
let (documents, _code) = index.search_post(json!({"vector": [1, 1, 1] })).await;
snapshot!(documents, @r###"
{
"hits": [
{
"id": 0,
"name": "kefir"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 1,
"semanticHitCount": 1
}
"###);
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 1
}
"###);
}

View File

@ -1,228 +0,0 @@
use meili_snap::{json_string, snapshot};
use crate::common::{GetAllDocumentsOptions, Server};
use crate::json;
use crate::vector::generate_default_user_provided_documents;
#[actix_rt::test]
async fn update_embedder() {
let server = Server::new().await;
let index = server.index("doggo");
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
let (response, code) = index
.update_settings(json!({
"embedders": { "manual": {}},
}))
.await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 2,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
let ret = server.wait_task(response.uid()).await;
snapshot!(ret, @r###"
{
"uid": 1,
"indexUid": "doggo",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 2
}
}
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}
#[actix_rt::test]
async fn reset_embedder_documents() {
let server = Server::new().await;
let index = generate_default_user_provided_documents(&server).await;
let (response, code) = index.delete_settings().await;
snapshot!(code, @"202 Accepted");
server.wait_task(response.uid()).await;
// Make sure the documents are still present
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions {
limit: None,
offset: None,
retrieve_vectors: false,
fields: None,
})
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir"
},
{
"id": 1,
"name": "echo"
},
{
"id": 2,
"name": "billou"
},
{
"id": 3,
"name": "intel"
},
{
"id": 4,
"name": "max"
}
],
"offset": 0,
"limit": 20,
"total": 5
}
"###);
// Make sure we are still able to retrieve their vectors
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {
"manual": {
"embeddings": [
[
0.0,
0.0,
0.0
]
],
"regenerate": false
}
}
},
{
"id": 1,
"name": "echo",
"_vectors": {
"manual": {
"embeddings": [
[
1.0,
1.0,
1.0
]
],
"regenerate": false
}
}
},
{
"id": 2,
"name": "billou",
"_vectors": {
"manual": {
"embeddings": [
[
2.0,
2.0,
2.0
],
[
2.0,
2.0,
3.0
]
],
"regenerate": false
}
}
},
{
"id": 3,
"name": "intel",
"_vectors": {
"manual": {
"embeddings": [
[
3.0,
3.0,
3.0
]
],
"regenerate": false
}
}
},
{
"id": 4,
"name": "max",
"_vectors": {
"manual": {
"embeddings": [
[
4.0,
4.0,
4.0
],
[
4.0,
4.0,
5.0
]
],
"regenerate": false
}
}
}
],
"offset": 0,
"limit": 20,
"total": 5
}
"###);
// Make sure the arroy DB has been cleared
let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await;
snapshot!(json_string!(documents), @r###"
{
"message": "Cannot find embedder with name `default`.",
"code": "invalid_embedder",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_embedder"
}
"###);
}
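// The error message shows that a raw `vector` search with no explicit embedder
// is resolved against one named `default`; with the settings deleted, no such
// embedder exists, so the request is rejected.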

View File

@ -1,19 +0,0 @@
---
source: meilisearch/tests/vector/mod.rs
---
{
"uid": 1,
"indexUid": "doggo",
"status": "succeeded",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 1
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -1,19 +0,0 @@
---
source: meilisearch/tests/vector/mod.rs
---
{
"uid": 2,
"indexUid": "doggo",
"status": "succeeded",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 1
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -1,23 +0,0 @@
---
source: meilisearch/tests/vector/mod.rs
---
{
"uid": 0,
"indexUid": "doggo",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3
}
}
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@ -44,7 +44,7 @@ once_cell = "1.19.0"
ordered-float = "4.2.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.8.0"
roaring = { version = "0.10.2", features = ["serde"] }
roaring = "0.10.2"
rstar = { version = "0.11.0", features = ["serde"] }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = { version = "1.0.111", features = ["preserve_order"] }
@ -71,15 +71,15 @@ csv = "1.3.0"
candle-core = { version = "0.4.1" }
candle-transformers = { version = "0.4.1" }
candle-nn = { version = "0.4.1" }
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default_features = false, features = [
"onig",
] }
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [
"online",
] }
tiktoken-rs = "0.5.8"
liquid = "0.26.4"
arroy = "0.4.0"
arroy = "0.3.1"
rand = "0.8.5"
tracing = "0.1.40"
ureq = { version = "2.9.7", features = ["json"] }
@ -141,6 +141,3 @@ swedish-recomposition = ["charabia/swedish-recomposition"]
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
cuda = ["candle-core/cuda"]
[lints.rust]
unexpected_cfgs = { level = "warn", check-cfg = ['cfg(fuzzing)'] }

View File

@ -59,7 +59,6 @@ fn main() -> Result<(), Box<dyn Error>> {
false,
universe,
&None,
&None,
GeoSortStrategy::default(),
0,
20,

View File

@ -119,8 +119,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
InvalidVectorDimensions { expected: usize, found: usize },
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
InvalidVectorsMapType { document_id: String, value: Value },
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError },
#[error("{0}")]
InvalidFilter(String),
#[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
@ -136,17 +134,6 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
}
)]
InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool },
#[error("Attribute `{}` is not filterable and thus, cannot be used as distinct attribute. {}",
.field,
match .valid_fields.is_empty() {
true => "This index does not have configured filterable attributes.".to_string(),
false => format!("Available filterable attributes are: `{}{}`.",
valid_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", "),
.hidden_fields.then_some(", <..hidden-attributes>").unwrap_or(""),
),
}
)]
InvalidDistinctAttribute { field: String, valid_fields: BTreeSet<String>, hidden_fields: bool },
#[error("Attribute `{}` is not facet-searchable. {}",
.field,
match .valid_fields.is_empty() {
@ -283,9 +270,8 @@ impl From<arroy::Error> for Error {
arroy::Error::DatabaseFull
| arroy::Error::InvalidItemAppend
| arroy::Error::UnmatchingDistance { .. }
| arroy::Error::NeedBuild(_)
| arroy::Error::MissingKey { .. }
| arroy::Error::MissingMetadata(_) => {
| arroy::Error::MissingNode
| arroy::Error::MissingMetadata => {
Error::InternalError(InternalError::ArroyError(value))
}
}

View File

@ -4,7 +4,6 @@ use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::{FieldId, FieldsIdsMap, Weight};
#[derive(Debug, Default, Serialize, Deserialize)]
@ -24,13 +23,7 @@ impl FieldidsWeightsMap {
/// Should only be called when there are NO searchable attributes.
/// All the fields will be inserted in the order of the fields ids map with a weight of 0.
pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self {
FieldidsWeightsMap {
map: fid_map
.iter()
.filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
.map(|(fid, _name)| (fid, 0))
.collect(),
}
FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() }
}
/// Removes a field id from the map, returning the associated weight previously in the map.

View File

@ -41,16 +41,6 @@ impl FieldsIdsMap {
}
}
/// Get the ids of a field and all its nested fields based on its name.
pub fn nested_ids(&self, name: &str) -> Vec<FieldId> {
self.names_ids
.range(name.to_string()..)
.take_while(|(key, _)| key.starts_with(name))
.filter(|(key, _)| crate::is_faceted_by(key, name))
.map(|(_name, id)| *id)
.collect()
}
/// Get the id of a field based on its name.
pub fn id(&self, name: &str) -> Option<FieldId> {
self.names_ids.get(name).copied()
@ -136,32 +126,4 @@ mod tests {
assert_eq!(iter.next(), Some((3, "title")));
assert_eq!(iter.next(), None);
}
#[test]
fn nested_fields() {
let mut map = FieldsIdsMap::new();
assert_eq!(map.insert("id"), Some(0));
assert_eq!(map.insert("doggo"), Some(1));
assert_eq!(map.insert("doggo.name"), Some(2));
assert_eq!(map.insert("doggolution"), Some(3));
assert_eq!(map.insert("doggo.breed.name"), Some(4));
assert_eq!(map.insert("description"), Some(5));
insta::assert_debug_snapshot!(map.nested_ids("doggo"), @r###"
[
1,
4,
2,
]
"###);
insta::assert_debug_snapshot!(map.nested_ids("doggo.breed"), @r###"
[
4,
]
"###);
insta::assert_debug_snapshot!(map.nested_ids("_vector"), @"[]");
}
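// The snapshot order follows `names_ids`, which is keyed by field name:
// "doggo" (1) sorts before "doggo.breed.name" (4) before "doggo.name" (2),
// and "doggolution" (3) is skipped because it is not faceted by "doggo".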
}

View File

@ -9,7 +9,6 @@ use heed::types::*;
use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified};
use roaring::RoaringBitmap;
use rstar::RTree;
use serde::{Deserialize, Serialize};
use time::OffsetDateTime;
use crate::documents::PrimaryKey;
@ -24,7 +23,6 @@ use crate::heed_codec::{
};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::vector::{Embedding, EmbeddingConfig};
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@ -646,7 +644,6 @@ impl Index {
&self,
wtxn: &mut RwTxn,
user_fields: &[&str],
non_searchable_fields_ids: &[FieldId],
fields_ids_map: &FieldsIdsMap,
) -> Result<()> {
// We can write the user defined searchable fields as-is.
@ -665,7 +662,6 @@ impl Index {
for (weight, user_field) in user_fields.iter().enumerate() {
if crate::is_faceted_by(field_from_map, user_field)
&& !real_fields.contains(&field_from_map)
&& !non_searchable_fields_ids.contains(&id)
{
real_fields.push(field_from_map);
@ -712,7 +708,6 @@ impl Index {
Ok(self
.fields_ids_map(rtxn)?
.names()
.filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME))
.map(|field| Cow::Owned(field.to_string()))
.collect())
})
@ -1230,11 +1225,6 @@ impl Index {
)
}
/// Deletes the FST which is the words prefixes dictionary of the engine.
pub fn delete_words_prefixes_fst(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
self.main.remap_key_type::<Str>().delete(wtxn, main_key::WORDS_PREFIXES_FST_KEY)
}
/// Returns the FST which is the words prefixes dictionary of the engine.
pub fn words_prefixes_fst<'t>(&self, rtxn: &'t RoTxn) -> Result<fst::Set<Cow<'t, [u8]>>> {
match self.main.remap_types::<Str, Bytes>().get(rtxn, main_key::WORDS_PREFIXES_FST_KEY)? {
@ -1578,16 +1568,12 @@ impl Index {
Ok(script_language)
}
/// Put the embedding configs:
/// 1. The name of the embedder
/// 2. The configuration option for this embedder
/// 3. The list of documents with a user provided embedding
pub(crate) fn put_embedding_configs(
&self,
wtxn: &mut RwTxn<'_>,
configs: Vec<IndexEmbeddingConfig>,
configs: Vec<(String, EmbeddingConfig)>,
) -> heed::Result<()> {
self.main.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>().put(
self.main.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>().put(
wtxn,
main_key::EMBEDDING_CONFIGS,
&configs,
@ -1598,10 +1584,13 @@ impl Index {
self.main.remap_key_type::<Str>().delete(wtxn, main_key::EMBEDDING_CONFIGS)
}
pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result<Vec<IndexEmbeddingConfig>> {
pub fn embedding_configs(
&self,
rtxn: &RoTxn<'_>,
) -> Result<Vec<(String, crate::vector::EmbeddingConfig)>> {
Ok(self
.main
.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>()
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>()
.get(rtxn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default())
}
@ -1615,7 +1604,7 @@ impl Index {
arroy::Reader::open(rtxn, k, self.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata(_) => Ok(None),
arroy::Error::MissingMetadata => Ok(None),
e => Err(e.into()),
})
.transpose()
@ -1648,7 +1637,7 @@ impl Index {
let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata(_) => Ok(None),
arroy::Error::MissingMetadata => Ok(None),
e => Err(e),
})
.transpose();
@ -1665,19 +1654,14 @@ impl Index {
}
}
res.insert(embedder_name.to_owned(), embeddings);
if !embeddings.is_empty() {
res.insert(embedder_name.to_owned(), embeddings);
}
}
Ok(res)
}
}
#[derive(Debug, Deserialize, Serialize)]
pub struct IndexEmbeddingConfig {
pub name: String,
pub config: EmbeddingConfig,
pub user_provided: RoaringBitmap,
}
#[cfg(test)]
pub(crate) mod tests {
use std::collections::HashSet;
@ -1685,17 +1669,15 @@ pub(crate) mod tests {
use big_s::S;
use heed::{EnvOpenOptions, RwTxn};
use maplit::{btreemap, hashset};
use maplit::hashset;
use tempfile::TempDir;
use crate::documents::DocumentsBatchReader;
use crate::error::{Error, InternalError};
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::update::{
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
Settings,
self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
};
use crate::vector::settings::{EmbedderSource, EmbeddingSettings};
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
pub(crate) struct TempIndex {
@ -2801,95 +2783,4 @@ pub(crate) mod tests {
]
"###);
}
#[test]
fn vectors_are_never_indexed_as_searchable_or_filterable() {
let index = TempIndex::new();
index
.add_documents(documents!([
{ "id": 0, "_vectors": { "doggo": [2345] } },
{ "id": 1, "_vectors": { "doggo": [6789] } },
]))
.unwrap();
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @r###"["id"]"###);
db_snap!(index, fieldids_weights_map, @r###"
fid weight
0 0 |
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
drop(rtxn);
index
.update_settings(|settings| {
settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]);
settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]);
})
.unwrap();
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @"[]");
db_snap!(index, fieldids_weights_map, @r###"
fid weight
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
let mut search = index.search(&rtxn);
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
assert!(results.candidates.is_empty());
index
.update_settings(|settings| {
settings.set_embedder_settings(btreemap! {
S("doggo") => Setting::Set(EmbeddingSettings {
dimensions: Setting::Set(1),
source: Setting::Set(EmbedderSource::UserProvided),
..EmbeddingSettings::default()}),
});
})
.unwrap();
db_snap!(index, fields_ids_map, @r###"
0 id |
1 _vectors |
2 _vectors.doggo |
"###);
db_snap!(index, searchable_fields, @"[]");
db_snap!(index, fieldids_weights_map, @r###"
fid weight
"###);
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
let results = search.query("2345").execute().unwrap();
assert!(results.candidates.is_empty());
let mut search = index.search(&rtxn);
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
assert!(results.candidates.is_empty());
}
}

View File

@ -6,11 +6,9 @@ use heed::Result;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, DocumentId};
use crate::DocumentId;
/// Call the given closure on the facet distribution of the candidate documents.
///
@ -33,9 +31,12 @@ pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
let highest_level = get_highest_level(rtxn, db, field_id)?;
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
@ -74,8 +75,11 @@ where
// Represents the list of keys that we must explore.
let mut heap = BinaryHeap::new();
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
let highest_level = get_highest_level(rtxn, db, field_id)?;
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
// We first fill the heap with values from the highest level
@ -88,10 +92,7 @@ where
if key.field_id != field_id {
break;
}
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
let intersection = value.bitmap & candidates;
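// Both sides of this hunk compute the same intersection; judging by its name
// and arguments, `intersection_with_serialized` does so against the still
// serialized bitmap bytes, skipping the full decode that yields `value.bitmap`:
//
//     // eager: decode the whole bitmap, then intersect
//     let eager = value.bitmap & candidates;
//     // lazy: intersect while reading the serialized bytes (assumed semantics)
//     let lazy = CboRoaringBitmapCodec::intersection_with_serialized(
//         value.bitmap_bytes,
//         candidates,
//     )?;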
let count = intersection.len();
if count != 0 {
heap.push(LevelEntry {
@ -120,10 +121,7 @@ where
if key.field_id != field_id {
break;
}
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
let intersection = value.bitmap & candidates;
let count = intersection.len();
if count != 0 {
heap.push(LevelEntry {
@ -148,7 +146,7 @@ where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
field_id: u16,
callback: CB,
}
@ -173,10 +171,7 @@ where
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
let docids_in_common = value.bitmap & candidates;
if !docids_in_common.is_empty() {
let any_docid_in_common = docids_in_common.min().unwrap();
match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
@ -210,10 +205,7 @@ where
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
let docids_in_common = value.bitmap & candidates;
if !docids_in_common.is_empty() {
let cf = self.iterate(
&docids_in_common,

View File

@ -159,7 +159,6 @@ impl<'a> Search<'a> {
offset: 0,
limit: self.limit + self.offset,
sort_criteria: self.sort_criteria.clone(),
distinct: self.distinct.clone(),
searchable_attributes: self.searchable_attributes,
geo_strategy: self.geo_strategy,
terms_matching_strategy: self.terms_matching_strategy,
@ -178,16 +177,16 @@ impl<'a> Search<'a> {
// completely skip semantic search if the results of the keyword search are good enough
if self.results_good_enough(&keyword_results, semantic_ratio) {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
return Ok((keyword_results, Some(0)));
}
// no vector search against placeholder search
let Some(query) = search.query.take() else {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
return Ok((keyword_results, Some(0)));
};
// no embedder, no semantic search
let Some(SemanticSearch { vector, embedder_name, embedder }) = semantic else {
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
return Ok((keyword_results, Some(0)));
};
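// In each early return above, the `Some(0)` reported alongside the keyword
// results is, going by the `semanticHitCount` field in the search snapshots,
// the number of semantic hits: zero, since the request degraded to a pure
// keyword search.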
let vector_query = match vector {
@ -239,44 +238,3 @@ impl<'a> Search<'a> {
true
}
}
fn return_keyword_results(
limit: usize,
offset: usize,
SearchResult {
matching_words,
candidates,
mut documents_ids,
mut document_scores,
degraded,
used_negative_operator,
}: SearchResult,
) -> (SearchResult, Option<u32>) {
let (documents_ids, document_scores) = if offset >= documents_ids.len() ||
// technically redundant because documents_ids.len() == document_scores.len(),
// defensive programming
offset >= document_scores.len()
{
(vec![], vec![])
} else {
// PANICS: offset < len
documents_ids.rotate_left(offset);
documents_ids.truncate(limit);
// PANICS: offset < len
document_scores.rotate_left(offset);
document_scores.truncate(limit);
(documents_ids, document_scores)
};
(
SearchResult {
matching_words,
candidates,
documents_ids,
document_scores,
degraded,
used_negative_operator,
},
Some(0),
)
}
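// Worked example of the offset/limit handling above: with
// documents_ids = [d0, d1, d2, d3], offset = 1 and limit = 2,
// rotate_left(1) gives [d1, d2, d3, d0] and truncate(2) gives [d1, d2],
// i.e. at most `limit` hits starting at `offset`.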

View File

@ -11,8 +11,8 @@ use self::new::{execute_vector_search, PartialSearchResult};
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::vector::Embedder;
use crate::{
execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index,
Result, SearchContext, TimeBudget, UserError,
execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Index, Result,
SearchContext, TimeBudget,
};
// Building these factories is not free.
@ -40,7 +40,6 @@ pub struct Search<'a> {
offset: usize,
limit: usize,
sort_criteria: Option<Vec<AscDesc>>,
distinct: Option<String>,
searchable_attributes: Option<&'a [String]>,
geo_strategy: new::GeoSortStrategy,
terms_matching_strategy: TermsMatchingStrategy,
@ -62,7 +61,6 @@ impl<'a> Search<'a> {
offset: 0,
limit: 20,
sort_criteria: None,
distinct: None,
searchable_attributes: None,
geo_strategy: new::GeoSortStrategy::default(),
terms_matching_strategy: TermsMatchingStrategy::default(),
@ -107,11 +105,6 @@ impl<'a> Search<'a> {
self
}
pub fn distinct(&mut self, distinct: String) -> &mut Search<'a> {
self.distinct = Some(distinct);
self
}
pub fn searchable_attributes(&mut self, searchable: &'a [String]) -> &mut Search<'a> {
self.searchable_attributes = Some(searchable);
self
@ -176,19 +169,6 @@ impl<'a> Search<'a> {
ctx.attributes_to_search_on(searchable_attributes)?;
}
if let Some(distinct) = &self.distinct {
let filterable_fields = ctx.index.filterable_fields(ctx.txn)?;
if !crate::is_faceted(distinct, &filterable_fields) {
let (valid_fields, hidden_fields) =
ctx.index.remove_hidden_fields(ctx.txn, filterable_fields)?;
return Err(Error::UserError(UserError::InvalidDistinctAttribute {
field: distinct.clone(),
valid_fields,
hidden_fields,
}));
}
}
let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;
let PartialSearchResult {
located_query_terms,
@ -205,7 +185,6 @@ impl<'a> Search<'a> {
self.scoring_strategy,
universe,
&self.sort_criteria,
&self.distinct,
self.geo_strategy,
self.offset,
self.limit,
@ -223,7 +202,6 @@ impl<'a> Search<'a> {
self.exhaustive_number_hits,
universe,
&self.sort_criteria,
&self.distinct,
self.geo_strategy,
self.offset,
self.limit,
@ -260,7 +238,6 @@ impl fmt::Debug for Search<'_> {
offset,
limit,
sort_criteria,
distinct,
searchable_attributes,
geo_strategy: _,
terms_matching_strategy,
@ -280,7 +257,6 @@ impl fmt::Debug for Search<'_> {
.field("offset", offset)
.field("limit", limit)
.field("sort_criteria", sort_criteria)
.field("distinct", distinct)
.field("searchable_attributes", searchable_attributes)
.field("terms_matching_strategy", terms_matching_strategy)
.field("scoring_strategy", scoring_strategy)

View File

@ -22,7 +22,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ctx: &mut SearchContext<'ctx>,
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
query: &Q,
distinct: Option<&str>,
universe: &RoaringBitmap,
from: usize,
length: usize,
@ -35,12 +34,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
logger.ranking_rules(&ranking_rules);
logger.initial_universe(universe);
let distinct_field = match distinct {
Some(distinct) => Some(distinct),
None => ctx.index.distinct_field(ctx.txn)?,
};
let distinct_fid = if let Some(field) = distinct_field {
let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? {
ctx.index.fields_ids_map(ctx.txn)?.id(field)
} else {
None

View File

@ -22,7 +22,7 @@ pub enum SearchEvents {
RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 },
RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 },
RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 },
RankingRuleEndIteration { ranking_rule_idx: usize },
RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 },
ExtendResults { new: Vec<u32> },
ProximityGraph { graph: RankingRuleGraph<ProximityGraph> },
ProximityPaths { paths: Vec<Vec<Interned<ProximityCondition>>> },
@ -123,9 +123,12 @@ impl SearchLogger<QueryGraph> for VisualSearchLogger {
&mut self,
ranking_rule_idx: usize,
_ranking_rule: &dyn RankingRule<QueryGraph>,
_universe: &RoaringBitmap,
universe: &RoaringBitmap,
) {
self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx });
self.events.push(SearchEvents::RankingRuleEndIteration {
ranking_rule_idx,
universe_len: universe.len(),
});
self.location.pop();
}
fn add_to_results(&mut self, docids: &[u32]) {
@ -323,7 +326,7 @@ impl<'ctx> DetailedLoggerFinish<'ctx> {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_skip_bucket(bucket_len)?;
}
SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => {
SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => {
assert!(ranking_rule_idx == self.rr_action_counter.len() - 1);
self.write_end_iteration()?;
}

View File

@ -516,7 +516,6 @@ mod tests {
false,
universe,
&None,
&None,
crate::search::new::GeoSortStrategy::default(),
0,
100,

View File

@ -568,7 +568,6 @@ pub fn execute_vector_search(
scoring_strategy: ScoringStrategy,
universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
geo_strategy: geo_sort::Strategy,
from: usize,
length: usize,
@ -599,7 +598,6 @@ pub fn execute_vector_search(
ctx,
ranking_rules,
&PlaceholderQuery,
distinct.as_deref(),
&universe,
from,
length,
@ -629,7 +627,6 @@ pub fn execute_search(
exhaustive_number_hits: bool,
mut universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
geo_strategy: geo_sort::Strategy,
from: usize,
length: usize,
@ -720,7 +717,6 @@ pub fn execute_search(
ctx,
ranking_rules,
&graph,
distinct.as_deref(),
&universe,
from,
length,
@ -736,7 +732,6 @@ pub fn execute_search(
ctx,
ranking_rules,
&PlaceholderQuery,
distinct.as_deref(),
&universe,
from,
length,
@ -753,12 +748,7 @@ pub fn execute_search(
// The candidates are the universe unless the exhaustive number of hits
// is requested and a distinct attribute is set.
if exhaustive_number_hits {
let distinct_field = match distinct.as_deref() {
Some(distinct) => Some(distinct),
None => ctx.index.distinct_field(ctx.txn)?,
};
if let Some(f) = distinct_field {
if let Some(f) = ctx.index.distinct_field(ctx.txn)? {
if let Some(distinct_fid) = fields_ids_map.id(f) {
all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
}

View File

@ -205,18 +205,8 @@ fn create_index() -> TempIndex {
index
}
fn verify_distinct(
index: &Index,
txn: &RoTxn,
distinct: Option<&str>,
docids: &[u32],
) -> Vec<String> {
let vs = collect_field_values(
index,
txn,
distinct.or_else(|| index.distinct_field(txn).unwrap()).unwrap(),
docids,
);
fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec<String> {
let vs = collect_field_values(index, txn, index.distinct_field(txn).unwrap().unwrap(), docids);
let mut unique = HashSet::new();
for v in vs.iter() {
@ -233,49 +223,12 @@ fn verify_distinct(
fn test_distinct_placeholder_no_ranking_rules() {
let index = create_index();
// Set the letter as filterable and unset the distinct attribute.
index
.update_settings(|s| {
s.set_filterable_fields(hashset! { S("letter") });
s.reset_distinct_field();
})
.unwrap();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.distinct(S("letter"));
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
let distinct_values = verify_distinct(&index, &txn, Some("letter"), &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"A\"",
"\"B\"",
"\"C\"",
"\"D\"",
"\"E\"",
"\"F\"",
"\"G\"",
"\"H\"",
"\"I\"",
"__does_not_exist__",
"__does_not_exist__",
"__does_not_exist__",
]
"###);
}
#[test]
fn test_distinct_at_search_placeholder_no_ranking_rules() {
let index = create_index();
let txn = index.read_txn().unwrap();
let s = Search::new(&txn, &index);
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"A\"",
@ -310,7 +263,7 @@ fn test_distinct_placeholder_sort() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"E\"",
@ -350,7 +303,7 @@ fn test_distinct_placeholder_sort() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"I\"",
@ -393,7 +346,7 @@ fn test_distinct_placeholder_sort() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"I\"",
@ -446,7 +399,7 @@ fn test_distinct_words() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"A\"",
@ -500,7 +453,7 @@ fn test_distinct_sort_words() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"I\"",
@ -596,7 +549,7 @@ fn test_distinct_typo() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"B\"",

View File

@ -0,0 +1,244 @@
---
source: milli/src/search/new/tests/attribute_fid.rs
expression: "format!(\"{document_ids_scores:#?}\")"
---
[
(
2,
[
Fid(
Rank {
rank: 19,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
(
6,
[
Fid(
Rank {
rank: 15,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
5,
[
Fid(
Rank {
rank: 14,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
4,
[
Fid(
Rank {
rank: 13,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
3,
[
Fid(
Rank {
rank: 12,
max_rank: 19,
},
),
Position(
Rank {
rank: 83,
max_rank: 91,
},
),
],
),
(
9,
[
Fid(
Rank {
rank: 11,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
8,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
7,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 73,
max_rank: 91,
},
),
],
),
(
11,
[
Fid(
Rank {
rank: 7,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
10,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
13,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
12,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 78,
max_rank: 91,
},
),
],
),
(
14,
[
Fid(
Rank {
rank: 5,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
0,
[
Fid(
Rank {
rank: 1,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
]
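
One plausible reading of the snapshot above (an interpretive assumption, not necessarily milli's exact scoring): normalize each rank by its max_rank and average across the two rules, which reproduces why document 2 outranks document 0:

fn normalized(rank: u32, max_rank: u32) -> f64 {
    f64::from(rank) / f64::from(max_rank)
}

fn main() {
    // Document 2 from the snapshot: fid 19/19 and position 91/91, a perfect score.
    let doc2 = (normalized(19, 19) + normalized(91, 91)) / 2.0;
    // Document 0: fid 1/19, position 91/91.
    let doc0 = (normalized(1, 19) + normalized(91, 91)) / 2.0;
    assert!(doc2 > doc0);
    println!("doc2 = {doc2:.3}, doc0 = {doc0:.3}");
}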

View File

@ -13,7 +13,7 @@ use std::collections::BTreeSet;
use std::iter::FromIterator;
use crate::index::tests::TempIndex;
use crate::{Search, SearchResult, TermsMatchingStrategy};
use crate::{db_snap, Search, SearchResult, TermsMatchingStrategy};
fn create_index() -> TempIndex {
let index = TempIndex::new();
@ -66,10 +66,9 @@ fn create_index() -> TempIndex {
}
#[test]
#[cfg(not(feature = "swedish-recomposition"))]
fn test_stop_words_not_indexed() {
let index = create_index();
crate::db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264");
db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264");
}
#[test]
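
What the snapshot hash in `test_stop_words_not_indexed` pins down is that stop words never reach the word_docids database. A dependency-free sketch of that filtering, using a HashSet where milli actually uses an fst::Set:

use std::collections::HashSet;

fn indexable_words<'a>(tokens: &[&'a str], stop_words: &HashSet<&str>) -> Vec<&'a str> {
    // Only tokens absent from the stop list are indexed.
    tokens.iter().copied().filter(|t| !stop_words.contains(t)).collect()
}

fn main() {
    let stop: HashSet<&str> = ["the", "a", "of"].into_iter().collect();
    assert_eq!(indexable_words(&["the", "quick", "fox"], &stop), vec!["quick", "fox"]);
}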

View File

@ -0,0 +1,7 @@
---
source: milli/src/index.rs
---
age 1 |
id 2 |
name 2 |

View File

@ -0,0 +1,7 @@
---
source: milli/src/index.rs
---
age 1 |
id 2 |
name 2 |

View File

@ -64,13 +64,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
// Remove all user-provided bits from the configs
let mut configs = self.index.embedding_configs(self.wtxn)?;
for config in configs.iter_mut() {
config.user_provided.clear();
}
self.index.put_embedding_configs(self.wtxn, configs)?;
// Clear the other databases.
external_documents_ids.clear(self.wtxn)?;
word_docids.clear(self.wtxn)?;
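
For context, the loop removed above reset the user_provided bitmap of every embedder config during a full clear. A reduced sketch (the struct is a stand-in; milli's IndexEmbeddingConfig carries more fields):

use roaring::RoaringBitmap;

struct EmbeddingConfigSketch {
    user_provided: RoaringBitmap,
}

// After ClearDocuments no document remains, so no vector can stay user-provided.
fn clear_user_provided(configs: &mut [EmbeddingConfigSketch]) {
    for config in configs.iter_mut() {
        config.user_provided.clear();
    }
}

fn main() {
    let mut configs = vec![EmbeddingConfigSketch { user_provided: (0..3).collect() }];
    clear_user_provided(&mut configs);
    assert!(configs[0].user_provided.is_empty());
}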

View File

@ -8,19 +8,18 @@ use std::sync::Arc;
use bytemuck::cast_slice;
use grenad::Writer;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::Value;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::index::IndexEmbeddingConfig;
use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME};
use crate::vector::settings::{EmbedderAction, ReindexAction};
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
use crate::vector::Embedder;
use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort};
use crate::{DocumentId, Result, ThreadPoolNoAbort};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@ -36,8 +35,6 @@ pub struct ExtractedVectorPoints {
// embedder
pub embedder_name: String,
pub embedder: Arc<Embedder>,
pub add_to_user_provided: RoaringBitmap,
pub remove_from_user_provided: RoaringBitmap,
}
enum VectorStateDelta {
@ -45,7 +42,12 @@ enum VectorStateDelta {
// Remove all vectors, generated or manual, from this document
NowRemoved,
NowManual(Vec<Vec<f32>>),
// Add the manually specified vectors, passed in the other grenad
// Remove any previously generated vectors
// Note: changing the value of the manually specified vector **should not record** this delta
WasGeneratedNowManual(Vec<Vec<f32>>),
ManualDelta(Vec<Vec<f32>>, Vec<Vec<f32>>),
// Add the vector computed from the specified prompt
// Remove any previous vector
@ -54,12 +56,14 @@ enum VectorStateDelta {
}
impl VectorStateDelta {
fn into_values(self) -> (bool, String, Vec<Vec<f32>>) {
fn into_values(self) -> (bool, String, (Vec<Vec<f32>>, Vec<Vec<f32>>)) {
match self {
VectorStateDelta::NoChange => Default::default(),
VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
// We always delete the previous vectors
VectorStateDelta::NowManual(add) => (true, Default::default(), add),
VectorStateDelta::WasGeneratedNowManual(add) => {
(true, Default::default(), (Default::default(), add))
}
VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)),
VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
}
}
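
A tiny model of the `into_values` change in this hunk: the delta now yields both the deleted and the added manual vectors instead of additions only. `Vec<u32>` stands in for `Vec<Vec<f32>>`, and the prompt component is omitted, to keep the sketch self-contained:

enum Delta {
    NoChange,
    NowRemoved,
    ManualDelta(Vec<u32>, Vec<u32>),
}

fn into_values(delta: Delta) -> (bool, (Vec<u32>, Vec<u32>)) {
    match delta {
        Delta::NoChange => Default::default(),
        // A removal clears everything, so no per-vector delta is needed.
        Delta::NowRemoved => (true, Default::default()),
        Delta::ManualDelta(del, add) => (false, (del, add)),
    }
}

fn main() {
    assert!(!into_values(Delta::NoChange).0);
    assert!(into_values(Delta::NowRemoved).0);
    let (_, (del, add)) = into_values(Delta::ManualDelta(vec![1], vec![2]));
    assert_eq!((del, add), (vec![1], vec![2]));
}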
@ -70,27 +74,12 @@ struct EmbedderVectorExtractor {
embedder: Arc<Embedder>,
prompt: Arc<Prompt>,
// (docid, _index) -> KvWriterDelAdd -> Vector
manual_vectors_writer: Writer<BufWriter<File>>,
// (docid) -> (prompt)
prompts_writer: Writer<BufWriter<File>>,
// (docid) -> ()
remove_vectors_writer: Writer<BufWriter<File>>,
// (docid, _index) -> KvWriterDelAdd -> Vector
manual_vectors_writer: Writer<BufWriter<File>>,
// The docids of the documents that contains a user defined embedding
add_to_user_provided: RoaringBitmap,
action: ExtractionAction,
}
struct DocumentOperation {
// The docids of the documents that contains an auto-generated embedding
remove_from_user_provided: RoaringBitmap,
}
enum ExtractionAction {
SettingsFullReindex,
SettingsRegeneratePrompts { old_prompt: Arc<Prompt> },
DocumentOperation(DocumentOperation),
}
/// Extracts the embedding vector contained in each document under the `_vectors` field.
@ -100,7 +89,6 @@ enum ExtractionAction {
pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
embedders_configs: &[IndexEmbeddingConfig],
settings_diff: &InnerIndexSettingsDiff,
) -> Result<Vec<ExtractedVectorPoints>> {
let reindex_vectors = settings_diff.reindex_vectors();
@ -109,207 +97,153 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
// the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
// filter out the old vector fid if the settings have changed, forcing a reindex.
let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors);
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
let mut extractors = Vec::new();
for (embedder_name, (embedder, prompt)) in
settings_diff.new.embedding_configs.clone().into_iter()
{
// (docid, _index) -> KvWriterDelAdd -> Vector
let manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut configs = settings_diff.new.embedding_configs.clone().into_inner();
let old_configs = &settings_diff.old.embedding_configs;
// (docid) -> (prompt)
let prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
if reindex_vectors {
for (name, action) in settings_diff.embedding_config_updates.iter() {
match action {
EmbedderAction::WriteBackToDocuments(_) => continue, // already deleted
EmbedderAction::Reindex(action) => {
let Some((embedder_name, (embedder, prompt))) = configs.remove_entry(name)
else {
tracing::error!(embedder = name, "Requested embedder config not found");
continue;
};
// (docid) -> ()
let remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid, _index) -> KvWriterDelAdd -> Vector
let manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> (prompt)
let prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> ()
let remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let action = match action {
ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex,
ReindexAction::RegeneratePrompts => {
let Some((_, old_prompt)) = old_configs.get(name) else {
tracing::error!(embedder = name, "Old embedder config not found");
continue;
};
ExtractionAction::SettingsRegeneratePrompts { old_prompt }
}
};
extractors.push(EmbedderVectorExtractor {
embedder_name,
embedder,
prompt,
prompts_writer,
remove_vectors_writer,
manual_vectors_writer,
add_to_user_provided: RoaringBitmap::new(),
action,
});
}
}
}
} else {
// document operation
for (embedder_name, (embedder, prompt)) in configs.into_iter() {
// (docid, _index) -> KvWriterDelAdd -> Vector
let manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> (prompt)
let prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> ()
let remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
extractors.push(EmbedderVectorExtractor {
embedder_name,
embedder,
prompt,
prompts_writer,
remove_vectors_writer,
manual_vectors_writer,
add_to_user_provided: RoaringBitmap::new(),
action: ExtractionAction::DocumentOperation(DocumentOperation {
remove_from_user_provided: RoaringBitmap::new(),
}),
});
}
extractors.push(EmbedderVectorExtractor {
embedder_name,
embedder,
prompt,
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
});
}
let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?;
while let Some((key, value)) = cursor.move_on_next()? {
// this must always be serialized as (docid, external_docid);
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
let (docid_bytes, external_id_bytes) =
try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
debug_assert!(from_utf8(external_id_bytes).is_ok());
let docid = DocumentId::from_be_bytes(docid_bytes);
let obkv = obkv::KvReader::new(value);
key_buffer.clear();
key_buffer.extend_from_slice(docid_bytes.as_slice());
key_buffer.extend_from_slice(docid_bytes);
// since we only need the primary key when we throw an error, we create this getter to
// lazily get it when needed
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
let mut parsed_vectors = ParsedVectorsDiff::new(
docid,
embedders_configs,
obkv,
old_vectors_fid,
new_vectors_fid,
)
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid)
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
for EmbedderVectorExtractor {
embedder_name,
embedder: _,
prompt,
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
manual_vectors_writer,
add_to_user_provided,
action,
} in extractors.iter_mut()
{
let (old, new) = parsed_vectors.remove(embedder_name);
let delta = match action {
ExtractionAction::SettingsFullReindex => match old {
// A full reindex can be triggered either by:
// 1. a new embedder
// 2. an existing embedder changed so that it must regenerate all generated embeddings.
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
VectorState::Inline(vectors) => {
if !vectors.must_regenerate() {
add_to_user_provided.insert(docid);
}
let delta = match parsed_vectors.remove(embedder_name) {
(Some(old), Some(new)) => {
// no autogeneration
let del_vectors = old.into_array_of_vectors();
let add_vectors = new.into_array_of_vectors();
match vectors.into_array_of_vectors() {
Some(add_vectors) => {
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(
crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
),
));
}
VectorStateDelta::NowManual(add_vectors)
}
None => VectorStateDelta::NoChange,
}
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
VectorState::Manual => VectorStateDelta::NoChange,
// generated vectors must be regenerated
VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?,
},
// prompt regeneration is only triggered for existing embedders
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
if old.must_regenerate() {
regenerate_if_prompt_changed(
VectorStateDelta::ManualDelta(del_vectors, add_vectors)
}
(Some(_old), None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
(old_prompt, prompt),
(old_fields_ids_map, new_fields_ids_map),
)?
DelAdd::Addition,
new_fields_ids_map,
)?)
} else {
// we can simply ignore user provided vectors as they are not regenerated and are
// already in the DB since this is an existing embedder
VectorStateDelta::NoChange
VectorStateDelta::NowRemoved
}
}
(None, Some(new)) => {
// was possibly autogenerated, remove all vectors for that document
let add_vectors = new.into_array_of_vectors();
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
VectorStateDelta::WasGeneratedNowManual(add_vectors)
}
(None, None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = Some(&prompt)
// TODO: this filter works because we erase the vec database when an embedding setting changes.
// When the vector pipeline is optimized, this should be removed.
.filter(|_| !settings_diff.reindex_vectors())
.map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map)
.unwrap_or_default()
});
let new_prompt =
prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
VectorStateDelta::NowGenerated(new_prompt)
} else {
tracing::trace!("⏭️ Prompt unmodified, skipping");
VectorStateDelta::NoChange
}
} else {
VectorStateDelta::NowRemoved
}
}
ExtractionAction::DocumentOperation(DocumentOperation {
remove_from_user_provided,
}) => extract_vector_document_diff(
docid,
obkv,
prompt,
(add_to_user_provided, remove_from_user_provided),
(old, new),
(old_fields_ids_map, new_fields_ids_map),
document_id,
)?,
};
// and we finally push the unique vectors into the writer
push_vectors_diff(
remove_vectors_writer,
@ -317,6 +251,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
manual_vectors_writer,
&mut key_buffer,
delta,
reindex_vectors,
)?;
}
}
@ -327,185 +262,43 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
embedder_name,
embedder,
prompt: _,
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
action,
manual_vectors_writer,
add_to_user_provided,
} in extractors
{
let remove_from_user_provided =
if let ExtractionAction::DocumentOperation(DocumentOperation {
remove_from_user_provided,
}) = action
{
remove_from_user_provided
} else {
Default::default()
};
results.push(ExtractedVectorPoints {
// docid, _index -> KvWriterDelAdd -> Vector
manual_vectors: writer_into_reader(manual_vectors_writer)?,
// docid -> ()
remove_vectors: writer_into_reader(remove_vectors_writer)?,
// docid -> prompt
prompts: writer_into_reader(prompts_writer)?,
embedder,
embedder_name,
add_to_user_provided,
remove_from_user_provided,
})
}
Ok(results)
}
fn extract_vector_document_diff(
docid: DocumentId,
obkv: obkv::KvReader<'_, FieldId>,
prompt: &Prompt,
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
(old, new): (VectorState, VectorState),
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
document_id: impl Fn() -> Value,
) -> Result<VectorStateDelta> {
match (old.must_regenerate(), new.must_regenerate()) {
(true, true) | (false, false) => {}
(true, false) => {
add_to_user_provided.insert(docid);
}
(false, true) => {
remove_from_user_provided.insert(docid);
}
}
let delta = match (old, new) {
// regardless of the previous state, if a document now contains inline _vectors, they must
// be extracted manually
(_old, VectorState::Inline(new)) => match new.into_array_of_vectors() {
Some(add_vectors) => {
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
VectorStateDelta::NowManual(add_vectors)
}
None => VectorStateDelta::NoChange,
},
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
// document changed
(VectorState::Generated, VectorState::Generated) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = Some(&prompt).map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
});
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
VectorStateDelta::NowGenerated(new_prompt)
} else {
tracing::trace!("⏭️ Prompt unmodified, skipping");
VectorStateDelta::NoChange
}
} else {
VectorStateDelta::NowRemoved
}
}
// inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from
// the previous version of the document.
// Manual -> Generated is also not possible without an Inline to the right (which is handled above)
// Generated -> Generated is handled above, so not possible
// As a result, this code is unreachable
(_not_generated, VectorState::Generated) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
DelAdd::Addition,
new_fields_ids_map,
)?)
} else {
// make sure the document is always removed from user provided on removal
remove_from_user_provided.insert(docid);
VectorStateDelta::NowRemoved
}
}
// inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous
// version of the document.
// however the Rust type system cannot know that.
(_manual, VectorState::Manual) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// if the new version of documents has the vectors in the DB,
// then they are user-provided and nothing possibly changed
VectorStateDelta::NoChange
} else {
// make sure the document is always removed from user provided on removal
remove_from_user_provided.insert(docid);
VectorStateDelta::NowRemoved
}
}
};
Ok(delta)
}
fn regenerate_if_prompt_changed(
obkv: obkv::KvReader<'_, FieldId>,
(old_prompt, new_prompt): (&Prompt, &Prompt),
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
) -> Result<VectorStateDelta> {
let old_prompt =
old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default());
let new_prompt = new_prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if new_prompt == old_prompt {
return Ok(VectorStateDelta::NoChange);
}
Ok(VectorStateDelta::NowGenerated(new_prompt))
}
fn regenerate_prompt(
obkv: obkv::KvReader<'_, FieldId>,
prompt: &Prompt,
new_fields_ids_map: &FieldsIdsMap,
) -> Result<VectorStateDelta> {
let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
Ok(VectorStateDelta::NowGenerated(prompt))
}
/// We cannot compute the diff between both Del and Add vectors.
/// We'll push every vector and compute the difference later in TypedChunk.
/// Computes the diff between both Del and Add vectors and
/// only inserts the parts that differ in the writer.
fn push_vectors_diff(
remove_vectors_writer: &mut Writer<BufWriter<File>>,
prompts_writer: &mut Writer<BufWriter<File>>,
manual_vectors_writer: &mut Writer<BufWriter<File>>,
key_buffer: &mut Vec<u8>,
delta: VectorStateDelta,
reindex_vectors: bool,
) -> Result<()> {
let (must_remove, prompt, mut add_vectors) = delta.into_values();
if must_remove {
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
if must_remove
// TODO: the below condition works because we erase the vec database when an embedding setting changes.
// When the vector pipeline is optimized, this should be removed.
&& !reindex_vectors
{
key_buffer.truncate(TRUNCATE_SIZE);
remove_vectors_writer.insert(&key_buffer, [])?;
}
@ -515,22 +308,44 @@ fn push_vectors_diff(
}
// We sort and dedup the vectors
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
let merged_vectors_iter =
itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
// insert vectors into the writer
for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
// Generate the key by appending the unique index to it.
key_buffer.truncate(TRUNCATE_SIZE);
let index = u16::try_from(i).unwrap();
key_buffer.extend_from_slice(&index.to_be_bytes());
// We insert only the Add part of the Obkv to inform
// that we only want to add all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left(vector) => {
// TODO: the below condition works because we erase the vec database when an embedding setting changes.
// When the vector pipeline is optimized, this should be removed.
if !reindex_vectors {
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
}
}
EitherOrBoth::Right(vector) => {
// We insert only the Add part of the Obkv to inform
// that we only want to add all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
}
}
}
Ok(())
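
The restored merge_join_by logic boils down to a three-way diff over two sorted, deduplicated sequences: entries present on both sides are untouched, left-only entries become deletions, right-only entries become additions. A self-contained sketch with u32 keys in place of float vectors and their custom comparator:

use itertools::{merge_join_by, EitherOrBoth};

fn diff(del: Vec<u32>, add: Vec<u32>) -> (Vec<u32>, Vec<u32>) {
    let mut deletions = Vec::new();
    let mut additions = Vec::new();
    // Both inputs must already be sorted and deduplicated, as in the hunk above.
    for eob in merge_join_by(del, add, |d, a| d.cmp(a)) {
        match eob {
            EitherOrBoth::Both(_, _) => (), // present on both sides: nothing to write
            EitherOrBoth::Left(d) => deletions.push(d),
            EitherOrBoth::Right(a) => additions.push(a),
        }
    }
    (deletions, additions)
}

fn main() {
    let (del, add) = diff(vec![1, 2, 3], vec![2, 3, 4]);
    assert_eq!(del, vec![1]);
    assert_eq!(add, vec![4]);
}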

View File

@ -8,10 +8,11 @@ mod extract_vector_points;
mod extract_word_docids;
mod extract_word_pair_proximity_docids;
mod extract_word_position_docids;
// mod searchable;
use std::fs::File;
use std::io::BufReader;
use std::sync::{Arc, OnceLock};
use std::sync::Arc;
use crossbeam_channel::Sender;
use rayon::prelude::*;
@ -30,9 +31,8 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::index::IndexEmbeddingConfig;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
/// Extract data for each databases from obkv documents in parallel.
/// Send data in grenad file over provided Sender.
@ -44,7 +44,6 @@ pub(crate) fn data_from_obkv_documents(
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
primary_key_id: FieldId,
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>,
) -> Result<()> {
@ -57,7 +56,6 @@ pub(crate) fn data_from_obkv_documents(
original_documents_chunk,
indexer,
lmdb_writer_sx.clone(),
embedders_configs.clone(),
settings_diff.clone(),
)
})
@ -207,47 +205,33 @@ fn run_extraction_task<FE, FS, M>(
})
}
fn request_threads() -> &'static ThreadPoolNoAbort {
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
REQUEST_THREADS.get_or_init(|| {
ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()
.unwrap()
})
}
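
The request_threads helper shown above is the standard lazily-initialized, process-wide singleton via OnceLock: the closure runs once, every later call returns the same instance. A reduced, runnable sketch holding a String instead of a ThreadPoolNoAbort:

use std::sync::OnceLock;

fn request_name() -> &'static String {
    static NAME: OnceLock<String> = OnceLock::new();
    NAME.get_or_init(|| format!("embedding-request-{}", 0))
}

fn main() {
    assert_eq!(request_name(), "embedding-request-0");
    // The second call does not re-run the closure; the same allocation comes back.
    assert!(std::ptr::eq(request_name(), request_name()));
}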
/// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents
fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
settings_diff: Arc<InnerIndexSettingsDiff>,
) -> Result<()> {
let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
let request_threads = ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
// no point in indexing vectors without embedders
&& (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
if index_vectors {
let settings_diff = settings_diff.clone();
let embedders_configs = embedders_configs.clone();
let original_documents_chunk = original_documents_chunk.clone();
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
match extract_vector_points(
original_documents_chunk.clone(),
indexer,
&embedders_configs,
&settings_diff,
) {
match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) {
Ok(extracted_vectors) => {
for ExtractedVectorPoints {
manual_vectors,
@ -255,15 +239,13 @@ fn send_original_documents_data(
prompts,
embedder_name,
embedder,
add_to_user_provided,
remove_from_user_provided,
} in extracted_vectors
{
let embeddings = match extract_embeddings(
prompts,
indexer,
embedder.clone(),
request_threads(),
&request_threads,
) {
Ok(results) => Some(results),
Err(error) => {
@ -281,8 +263,6 @@ fn send_original_documents_data(
expected_dimension: embedder.dimensions(),
manual_vectors,
embedder_name,
add_to_user_provided,
remove_from_user_provided,
}));
}
}

View File

@ -0,0 +1,211 @@
use std::collections::HashMap;
use charabia::normalizer::NormalizedTokenIter;
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
use roaring::RoaringBitmap;
use serde_json::Value;
use crate::update::settings::InnerIndexSettings;
use crate::{InternalError, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
pub struct FieldWordPositionExtractorBuilder<'a> {
max_positions_per_attributes: u16,
stop_words: Option<&'a fst::Set<Vec<u8>>>,
separators: Option<Vec<&'a str>>,
dictionary: Option<Vec<&'a str>>,
}
impl<'a> FieldWordPositionExtractorBuilder<'a> {
pub fn new(
max_positions_per_attributes: Option<u32>,
settings: &'a InnerIndexSettings,
) -> Result<Self> {
let stop_words = settings.stop_words.as_ref();
let separators: Option<Vec<_>> =
settings.allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
let dictionary: Option<Vec<_>> =
settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
Ok(Self {
max_positions_per_attributes: max_positions_per_attributes
.map_or(MAX_POSITION_PER_ATTRIBUTE as u16, |max| {
max.min(MAX_POSITION_PER_ATTRIBUTE) as u16
}),
stop_words,
separators,
dictionary,
})
}
pub fn build(&'a self) -> FieldWordPositionExtractor<'a> {
let builder = tokenizer_builder(
self.stop_words,
self.separators.as_deref(),
self.dictionary.as_deref(),
None,
);
FieldWordPositionExtractor {
tokenizer: builder.into_tokenizer(),
max_positions_per_attributes: self.max_positions_per_attributes,
}
}
}
pub struct FieldWordPositionExtractor<'a> {
tokenizer: Tokenizer<'a>,
max_positions_per_attributes: u16,
}
impl<'a> FieldWordPositionExtractor<'a> {
pub fn extract<'b>(
&'a self,
field_bytes: &[u8],
buffer: &'b mut String,
) -> Result<ExtractedFieldWordPosition<'a, 'b>> {
let field_value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
Ok(ExtractedFieldWordPosition {
tokenizer: &self.tokenizer,
max_positions_per_attributes: self.max_positions_per_attributes,
field_value,
buffer,
})
}
}
pub struct ExtractedFieldWordPosition<'a, 'b> {
tokenizer: &'a Tokenizer<'a>,
max_positions_per_attributes: u16,
field_value: Value,
buffer: &'b mut String,
}
impl<'a> ExtractedFieldWordPosition<'a, '_> {
pub fn iter<'o>(&'o mut self) -> FieldWordPositionIter<'o> {
self.buffer.clear();
let inner = match json_to_string(&self.field_value, &mut self.buffer) {
Some(field) => Some(self.tokenizer.tokenize(field)),
None => None,
};
// create an iterator of tokens with their positions.
FieldWordPositionIter {
inner,
max_positions_per_attributes: self.max_positions_per_attributes,
position: 0,
prev_kind: None,
}
}
}
pub struct FieldWordPositionIter<'a> {
inner: Option<NormalizedTokenIter<'a, 'a>>,
max_positions_per_attributes: u16,
position: u16,
prev_kind: Option<TokenKind>,
}
impl<'a> Iterator for FieldWordPositionIter<'a> {
type Item = (u16, Token<'a>);
fn next(&mut self) -> Option<Self::Item> {
if self.position >= self.max_positions_per_attributes {
return None;
}
let token = self.inner.as_mut().and_then(|i| i.next())?;
match token.kind {
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
self.position += match self.prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1,
None => 0,
};
self.prev_kind = Some(token.kind)
}
TokenKind::Separator(_) if self.position == 0 => {
return self.next();
}
TokenKind::Separator(SeparatorKind::Hard) => {
self.prev_kind = Some(token.kind);
}
TokenKind::Separator(SeparatorKind::Soft)
if self.prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
{
self.prev_kind = Some(token.kind);
}
_ => return self.next(),
}
if !token.is_word() {
return self.next();
}
// keep a word only if it is not empty and fits in an LMDB key.
let lemma = token.lemma().trim();
if !lemma.is_empty() && lemma.len() <= MAX_WORD_LENGTH {
Some((self.position, token))
} else {
self.next()
}
}
}
/// Factorize tokenizer building.
pub fn tokenizer_builder<'a>(
stop_words: Option<&'a fst::Set<Vec<u8>>>,
allowed_separators: Option<&'a [&str]>,
dictionary: Option<&'a [&str]>,
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
) -> TokenizerBuilder<'a, Vec<u8>> {
let mut tokenizer_builder = TokenizerBuilder::new();
if let Some(stop_words) = stop_words {
tokenizer_builder.stop_words(stop_words);
}
if let Some(dictionary) = dictionary {
tokenizer_builder.words_dict(dictionary);
}
if let Some(separators) = allowed_separators {
tokenizer_builder.separators(separators);
}
if let Some(script_language) = script_language {
tokenizer_builder.allow_list(script_language);
}
tokenizer_builder
}
/// Transform a JSON value into a string that can be indexed.
fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> {
fn inner(value: &Value, output: &mut String) -> bool {
use std::fmt::Write;
match value {
Value::Null | Value::Object(_) => false,
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
Value::Number(number) => write!(output, "{}", number).is_ok(),
Value::String(string) => write!(output, "{}", string).is_ok(),
Value::Array(array) => {
let mut count = 0;
for value in array {
if inner(value, output) {
output.push_str(". ");
count += 1;
}
}
// check that at least one value was written
count != 0
}
}
}
if let Value::String(string) = value {
Some(string)
} else if inner(value, buffer) {
Some(buffer)
} else {
None
}
}
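
The position bookkeeping in FieldWordPositionIter::next follows a simple rule: the very first word sits at position 0, a word after a hard separator jumps by 8, and a word after any other token advances by 1. A standalone model of that bump:

#[derive(Clone, Copy)]
enum Prev {
    None,
    HardSeparator,
    Other,
}

fn bump(position: u16, prev: Prev) -> u16 {
    position
        + match prev {
            Prev::HardSeparator => 8, // hard separators open a gap in positions
            Prev::Other => 1,
            Prev::None => 0, // first word of the field
        }
}

fn main() {
    let mut pos = 0u16;
    pos = bump(pos, Prev::None); // "the" -> 0
    pos = bump(pos, Prev::Other); // "quick" -> 1
    pos = bump(pos, Prev::HardSeparator); // "Fox" after "." -> 9
    assert_eq!(pos, 9);
}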

View File

@ -0,0 +1,114 @@
use std::collections::{BTreeMap, BTreeSet};
use std::convert::TryInto;
use std::fs::File;
use std::io;
use std::io::BufReader;
use field_word_position::FieldWordPositionExtractorBuilder;
use obkv::KvReader;
use roaring::RoaringBitmap;
use word_docids::{WordDocidsDump, WordDocidsExtractor};
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::update::index_documents::extract::extract_docid_word_positions::ScriptLanguageDocidsMap;
use crate::update::index_documents::GrenadParameters;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result, SerializationError};
mod field_word_position;
mod word_docids;
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
pub fn extract_searchable_data<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
max_positions_per_attributes: Option<u32>,
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
let searchable_fields_to_index = settings_diff.searchable_fields_to_index();
let mut documents_ids = RoaringBitmap::new();
let add_builder =
FieldWordPositionExtractorBuilder::new(max_positions_per_attributes, &settings_diff.new)?;
let add_token_positions_extractor = add_builder.build();
let del_builder;
let del_token_positions_extractor = if settings_diff.settings_update_only {
del_builder = FieldWordPositionExtractorBuilder::new(
max_positions_per_attributes,
&settings_diff.old,
)?;
del_builder.build()
} else {
add_builder.build()
};
let token_positions_extractor = &[del_token_positions_extractor, add_token_positions_extractor];
let mut word_map = BTreeMap::new();
let mut word_docids_extractor = WordDocidsExtractor::new(settings_diff);
let mut cursor = obkv_documents.into_cursor()?;
// loop over documents
while let Some((key, value)) = cursor.move_on_next()? {
let document_id = key
.try_into()
.map(u32::from_be_bytes)
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
let obkv = KvReader::<FieldId>::new(value);
// if the searchable fields didn't change, skip the searchable indexing for this document.
if !settings_diff.reindex_searchable()
&& !searchable_fields_changed(&obkv, &searchable_fields_to_index)
{
continue;
}
documents_ids.push(document_id);
let mut buffer = String::new();
for field_id in searchable_fields_to_index.iter() {
let Some(field_obkv) = obkv.get(*field_id).map(KvReaderDelAdd::new) else { continue };
for (deladd, field_bytes) in field_obkv {
let mut extracted_positions =
token_positions_extractor[deladd as usize].extract(field_bytes, &mut buffer)?;
for (position, token) in extracted_positions.iter() {
let word = token.lemma().trim();
if !word_map.contains_key(word) {
word_map.insert(word.to_string(), word_map.len() as u32);
}
let word_id = word_map.get(word).unwrap();
word_docids_extractor.insert(*word_id, *field_id, document_id, deladd);
}
}
}
if word_docids_extractor.rough_size_estimate()
> indexer.max_memory.map_or(512 * 1024 * 1024, |s| s.min(512 * 1024 * 1024))
{
let WordDocidsDump { .. } =
word_docids_extractor.dump(&word_map, &searchable_fields_to_index, indexer)?;
}
}
todo!()
}
/// Check if any searchable fields of a document changed.
fn searchable_fields_changed(
obkv: &KvReader<FieldId>,
searchable_fields: &BTreeSet<FieldId>,
) -> bool {
for field_id in searchable_fields {
let Some(field_obkv) = obkv.get(*field_id).map(KvReaderDelAdd::new) else { continue };
match (field_obkv.get(DelAdd::Deletion), field_obkv.get(DelAdd::Addition)) {
// if both fields are None, check the next field.
(None, None) => (),
// if both contains a value and values are the same, check the next field.
(Some(del), Some(add)) if del == add => (),
// otherwise the fields are different, return true.
_otherwise => return true,
}
}
false
}
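
The DelAdd comparison in searchable_fields_changed reduces to: a field forces reindexing exactly when its Deletion and Addition payloads differ. Sketched with byte slices standing in for the obkv values:

fn field_changed(del: Option<&[u8]>, add: Option<&[u8]>) -> bool {
    match (del, add) {
        // absent on both sides: unchanged
        (None, None) => false,
        // identical payloads on both sides: unchanged
        (Some(d), Some(a)) if d == a => false,
        // added, removed, or modified: the document must be reindexed
        _ => true,
    }
}

fn main() {
    assert!(!field_changed(None, None));
    assert!(!field_changed(Some(b"x".as_slice()), Some(b"x".as_slice())));
    assert!(field_changed(Some(b"x".as_slice()), Some(b"y".as_slice())));
    assert!(field_changed(None, Some(b"x".as_slice())));
}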

View File

@ -0,0 +1,203 @@
use std::collections::hash_map::Entry::{Occupied, Vacant};
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fs::File;
use std::hash::Hash;
use std::io::BufReader;
use std::mem::size_of;
use roaring::RoaringBitmap;
use crate::update::del_add::KvWriterDelAdd;
use crate::update::index_documents::extract::searchable::DelAdd;
use crate::update::index_documents::{create_writer, writer_into_reader, GrenadParameters};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
pub struct WordDocidsExtractor<'a> {
word_fid_docids: RevertedIndex<(u32, FieldId)>,
settings_diff: &'a InnerIndexSettingsDiff,
}
impl<'a> WordDocidsExtractor<'a> {
pub fn new(settings_diff: &'a InnerIndexSettingsDiff) -> Self {
Self { word_fid_docids: RevertedIndex::new(), settings_diff }
}
pub fn insert(&mut self, wordid: u32, fieldid: FieldId, docid: DocumentId, del_add: DelAdd) {
self.word_fid_docids.insert((wordid, fieldid), docid, del_add);
}
pub fn rough_size_estimate(&self) -> usize {
self.word_fid_docids.rough_size_estimate()
}
pub fn dump(
&mut self,
word_map: &BTreeMap<String, u32>,
fields: &BTreeSet<FieldId>,
indexer: GrenadParameters,
) -> Result<WordDocidsDump> {
let mut word_fid_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut word_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut exact_word_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut exact_word_deletion = RoaringBitmap::new();
let mut exact_word_addition = RoaringBitmap::new();
let mut word_deletion = RoaringBitmap::new();
let mut word_addition = RoaringBitmap::new();
let mut key_buffer = Vec::new();
let mut bitmap_buffer = Vec::new();
let mut obkv_buffer = Vec::new();
for (word, wid) in word_map {
exact_word_deletion.clear();
exact_word_addition.clear();
word_deletion.clear();
word_addition.clear();
for fid in fields {
if let Some((deletion, addition)) = self.word_fid_docids.inner.get(&(*wid, *fid)) {
if self.settings_diff.old.exact_attributes.contains(fid) {
exact_word_deletion |= deletion;
} else {
word_deletion |= deletion;
}
if self.settings_diff.new.exact_attributes.contains(fid) {
exact_word_addition |= addition;
} else {
word_addition |= addition;
}
if deletion != addition {
key_buffer.clear();
key_buffer.extend_from_slice(word.as_bytes());
key_buffer.push(0);
key_buffer.extend_from_slice(&fid.to_be_bytes());
let value = bitmaps_into_deladd_obkv(
deletion,
addition,
&mut obkv_buffer,
&mut bitmap_buffer,
)?;
word_fid_docids_writer.insert(&key_buffer, value)?;
}
}
}
key_buffer.clear();
key_buffer.extend_from_slice(word.as_bytes());
if exact_word_deletion != exact_word_addition {
let value = bitmaps_into_deladd_obkv(
&exact_word_deletion,
&exact_word_addition,
&mut obkv_buffer,
&mut bitmap_buffer,
)?;
exact_word_docids_writer.insert(&key_buffer, value)?;
}
if word_deletion != word_addition {
let value = bitmaps_into_deladd_obkv(
&word_deletion,
&word_addition,
&mut obkv_buffer,
&mut bitmap_buffer,
)?;
word_docids_writer.insert(&key_buffer, value)?;
}
}
self.word_fid_docids.clear();
Ok(WordDocidsDump {
word_fid_docids: writer_into_reader(word_fid_docids_writer)?,
word_docids: writer_into_reader(word_docids_writer)?,
exact_word_docids: writer_into_reader(exact_word_docids_writer)?,
})
}
}
fn bitmaps_into_deladd_obkv<'a>(
deletion: &RoaringBitmap,
addition: &RoaringBitmap,
obkv_buffer: &'a mut Vec<u8>,
bitmap_buffer: &mut Vec<u8>,
) -> Result<&'a mut Vec<u8>> {
obkv_buffer.clear();
let mut value_writer = KvWriterDelAdd::new(obkv_buffer);
if !deletion.is_empty() {
bitmap_buffer.clear();
CboRoaringBitmapCodec::serialize_into(deletion, bitmap_buffer);
value_writer.insert(DelAdd::Deletion, &*bitmap_buffer)?;
}
if !addition.is_empty() {
bitmap_buffer.clear();
CboRoaringBitmapCodec::serialize_into(addition, bitmap_buffer);
value_writer.insert(DelAdd::Addition, &*bitmap_buffer)?;
}
Ok(value_writer.into_inner()?)
}
#[derive(Debug)]
struct RevertedIndex<K> {
inner: HashMap<K, (RoaringBitmap, RoaringBitmap)>,
max_value_size: usize,
}
impl<K: PartialEq + Eq + Hash> RevertedIndex<K> {
pub fn insert(&mut self, key: K, docid: DocumentId, del_add: DelAdd) {
let size = match self.inner.entry(key) {
Occupied(mut entry) => {
let (ref mut del, ref mut add) = entry.get_mut();
match del_add {
DelAdd::Deletion => del.insert(docid),
DelAdd::Addition => add.insert(docid),
};
del.serialized_size() + add.serialized_size()
}
Vacant(entry) => {
let mut bitmap = RoaringBitmap::new();
bitmap.insert(docid);
let size = bitmap.serialized_size();
match del_add {
DelAdd::Deletion => entry.insert((bitmap, RoaringBitmap::new())),
DelAdd::Addition => entry.insert((RoaringBitmap::new(), bitmap)),
};
size * 2
}
};
self.max_value_size = self.max_value_size.max(size);
}
pub fn new() -> Self {
Self { inner: HashMap::new(), max_value_size: 0 }
}
pub fn rough_size_estimate(&self) -> usize {
self.inner.len() * size_of::<K>() + self.inner.len() * self.max_value_size
}
fn clear(&mut self) {
self.max_value_size = 0;
self.inner.clear();
}
}
pub struct WordDocidsDump {
pub word_fid_docids: grenad::Reader<BufReader<File>>,
pub word_docids: grenad::Reader<BufReader<File>>,
pub exact_word_docids: grenad::Reader<BufReader<File>>,
}
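
The heart of RevertedIndex is routing each docid into the deletion or addition half of a per-key pair, while tracking sizes for the memory-based dump trigger. A dependency-free sketch of the routing itself, with Vec<u32> standing in for RoaringBitmap:

use std::collections::HashMap;

struct RevertedIndexSketch {
    inner: HashMap<u32, (Vec<u32>, Vec<u32>)>,
}

impl RevertedIndexSketch {
    fn new() -> Self {
        Self { inner: HashMap::new() }
    }

    fn insert(&mut self, key: u32, docid: u32, is_deletion: bool) {
        // One (deletions, additions) pair per key, created on first use.
        let (del, add) = self.inner.entry(key).or_default();
        if is_deletion {
            del.push(docid);
        } else {
            add.push(docid);
        }
    }
}

fn main() {
    let mut index = RevertedIndexSketch::new();
    index.insert(0, 7, false);
    index.insert(0, 3, true);
    assert_eq!(index.inner[&0], (vec![3], vec![7]));
}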

View File

@ -85,7 +85,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> {
embedders: EmbeddingConfigs,
}
#[derive(Debug, Clone)]
#[derive(Default, Debug, Clone)]
pub struct IndexDocumentsConfig {
pub words_prefix_threshold: Option<u32>,
pub max_prefix_length: Option<usize>,
@ -93,21 +93,6 @@ pub struct IndexDocumentsConfig {
pub words_positions_min_level_size: Option<NonZeroU32>,
pub update_method: IndexDocumentsMethod,
pub autogenerate_docids: bool,
pub compute_prefix_databases: bool,
}
impl Default for IndexDocumentsConfig {
fn default() -> Self {
Self {
words_prefix_threshold: Default::default(),
max_prefix_length: Default::default(),
words_positions_level_group_size: Default::default(),
words_positions_min_level_size: Default::default(),
update_method: Default::default(),
autogenerate_docids: Default::default(),
compute_prefix_databases: true,
}
}
}
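
The impl removed above existed so that a single field, compute_prefix_databases, could default to true while the others kept their type-level defaults. The pattern, reduced to two fields:

struct Config {
    autogenerate_docids: bool,
    compute_prefix_databases: bool,
}

impl Default for Config {
    fn default() -> Self {
        Self {
            // bool's own default, i.e. false
            autogenerate_docids: Default::default(),
            // the one field that must start enabled
            compute_prefix_databases: true,
        }
    }
}

fn main() {
    let config = Config::default();
    assert!(!config.autogenerate_docids);
    assert!(config.compute_prefix_databases);
}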
impl<'t, 'i, 'a, FP, FA> IndexDocuments<'t, 'i, 'a, FP, FA>
@ -301,7 +286,6 @@ where
settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
let settings_diff = Arc::new(settings_diff);
let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?);
let backup_pool;
let pool = match self.indexer_config.thread_pool {
@ -415,7 +399,6 @@ where
pool_params,
lmdb_writer_sx.clone(),
primary_key_id,
embedders_configs.clone(),
settings_diff_cloned,
max_positions_per_attributes,
)
@ -518,8 +501,6 @@ where
embeddings,
manual_vectors,
embedder_name,
add_to_user_provided,
remove_from_user_provided,
} => {
dimension.insert(embedder_name.clone(), expected_dimension);
TypedChunk::VectorPoints {
@ -528,8 +509,6 @@ where
expected_dimension,
manual_vectors,
embedder_name,
add_to_user_provided,
remove_from_user_provided,
}
}
otherwise => otherwise,
@ -562,31 +541,22 @@ where
pool.install(|| {
for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
let writer = arroy::Writer::new(vector_arroy, k, dimension);
if writer.need_build(wtxn)? {
writer.build(wtxn, &mut rng, None)?;
} else if writer.is_empty(wtxn)? {
if writer.is_empty(wtxn)? {
break;
}
writer.build(wtxn, &mut rng, None)?;
}
Result::Ok(())
})
.map_err(InternalError::from)??;
}
if self.config.compute_prefix_databases {
self.execute_prefix_databases(
word_docids.map(MergerBuilder::build),
exact_word_docids.map(MergerBuilder::build),
word_position_docids.map(MergerBuilder::build),
word_fid_docids.map(MergerBuilder::build),
)?;
} else {
self.index.words_prefixes_fst(self.wtxn)?;
self.index.word_prefix_docids.clear(self.wtxn)?;
self.index.exact_word_prefix_docids.clear(self.wtxn)?;
self.index.word_prefix_position_docids.clear(self.wtxn)?;
self.index.word_prefix_fid_docids.clear(self.wtxn)?;
}
self.execute_prefix_databases(
word_docids.map(MergerBuilder::build),
exact_word_docids.map(MergerBuilder::build),
word_position_docids.map(MergerBuilder::build),
word_fid_docids.map(MergerBuilder::build),
)?;
Ok(number_of_documents)
}
@ -811,7 +781,6 @@ mod tests {
use super::*;
use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex;
use crate::index::IndexEmbeddingConfig;
use crate::search::TermsMatchingStrategy;
use crate::update::Setting;
use crate::{db_snap, Filter, Search};
@ -2203,6 +2172,33 @@ mod tests {
index.add_documents(doc1).unwrap();
}
#[cfg(feature = "default")]
#[test]
fn store_detected_script_and_language_per_document_during_indexing() {
use charabia::{Language, Script};
let index = TempIndex::new();
index
.add_documents(documents!([
{ "id": 1, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
{ "id": 2, "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
{ "id": 3, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
{ "id": 4, "title": "関西国際空港限定トートバッグ すもももももももものうち" },
{ "id": 5, "title": "ภาษาไทยง่ายนิดเดียว" },
{ "id": 6, "title": "The quick 在尊嚴和權利上一律平等。" },
]))
.unwrap();
let rtxn = index.read_txn().unwrap();
let key_jpn = (Script::Cj, Language::Jpn);
let key_cmn = (Script::Cj, Language::Cmn);
let cj_jpn_docs = index.script_language_documents_ids(&rtxn, &key_jpn).unwrap().unwrap();
let cj_cmn_docs = index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap();
let expected_cj_jpn_docids = [3].iter().collect();
assert_eq!(cj_jpn_docs, expected_cj_jpn_docids);
let expected_cj_cmn_docids = [1, 5].iter().collect();
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
}
#[test]
fn add_and_delete_documents_in_single_transform() {
let mut index = TempIndex::new();
@ -2620,12 +2616,10 @@ mod tests {
let rtxn = index.read_txn().unwrap();
let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } =
embedding_configs.pop().unwrap();
insta::assert_snapshot!(embedder_name, @"manual");
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>");
let (embedder_name, embedder) = embedding_configs.pop().unwrap();
let embedder =
std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap());
assert_eq!("manual", embedder_name);
let res = index
.search(&rtxn)
.semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec()))

View File

@ -1,7 +1,7 @@
use std::borrow::Cow;
use std::collections::btree_map::Entry as BEntry;
use std::collections::hash_map::Entry as HEntry;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{Read, Seek};
@ -27,8 +27,6 @@ use crate::update::del_add::{
use crate::update::index_documents::GrenadParameters;
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
use crate::vector::settings::{EmbedderAction, WriteBackToDocuments};
use crate::{
is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result,
};
@ -53,6 +51,7 @@ pub struct Transform<'a, 'i> {
fields_ids_map: FieldsIdsMap,
indexer_settings: &'a IndexerConfig,
pub autogenerate_docids: bool,
pub index_documents_method: IndexDocumentsMethod,
available_documents_ids: AvailableDocumentsIds,
@ -106,7 +105,7 @@ impl<'a, 'i> Transform<'a, 'i> {
index: &'i Index,
indexer_settings: &'a IndexerConfig,
index_documents_method: IndexDocumentsMethod,
_autogenerate_docids: bool,
autogenerate_docids: bool,
) -> Result<Self> {
// We must choose the appropriate merge function for when two or more documents
// with the same user id must be merged or fully replaced in the same batch.
@ -140,6 +139,7 @@ impl<'a, 'i> Transform<'a, 'i> {
index,
fields_ids_map: index.fields_ids_map(wtxn)?,
indexer_settings,
autogenerate_docids,
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
original_sorter,
flattened_sorter,
@ -808,13 +808,13 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut new_inner_settings = old_inner_settings.clone();
new_inner_settings.fields_ids_map = fields_ids_map;
let embedding_config_updates = Default::default();
let embedding_configs_updated = false;
let settings_update_only = false;
let settings_diff = InnerIndexSettingsDiff::new(
old_inner_settings,
new_inner_settings,
primary_key_id,
embedding_config_updates,
embedding_configs_updated,
settings_update_only,
);
@ -835,13 +835,10 @@ impl<'a, 'i> Transform<'a, 'i> {
/// Rebind the field_ids of the provided document to their values
/// based on the field_ids_maps difference between the old and the new settings,
/// then fill the provided buffers with delta documents using KvWriterDelAdd.
#[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo
fn rebind_existing_document(
old_obkv: KvReader<FieldId>,
settings_diff: &InnerIndexSettingsDiff,
modified_faceted_fields: &HashSet<String>,
mut injected_vectors: serde_json::Map<String, serde_json::Value>,
old_vectors_fid: Option<FieldId>,
original_obkv_buffer: Option<&mut Vec<u8>>,
flattened_obkv_buffer: Option<&mut Vec<u8>>,
) -> Result<()> {
@ -864,49 +861,9 @@ impl<'a, 'i> Transform<'a, 'i> {
// The operations that we must perform on the different fields.
let mut operations = HashMap::new();
let mut error_seen = false;
let mut obkv_writer = KvWriter::<_, FieldId>::memory();
'write_fid: for (id, val) in old_obkv.iter() {
if !injected_vectors.is_empty() {
'inject_vectors: {
let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors };
if id < vectors_fid {
break 'inject_vectors;
}
let mut existing_vectors = if id == vectors_fid {
let existing_vectors: std::result::Result<
serde_json::Map<String, serde_json::Value>,
serde_json::Error,
> = serde_json::from_slice(val);
match existing_vectors {
Ok(existing_vectors) => existing_vectors,
Err(error) => {
if !error_seen {
tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map");
error_seen = true;
}
Default::default()
}
}
} else {
Default::default()
};
existing_vectors.append(&mut injected_vectors);
operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition);
obkv_writer
.insert(vectors_fid, serde_json::to_vec(&existing_vectors).unwrap())?;
if id == vectors_fid {
continue 'write_fid;
}
}
}
for (id, val) in old_obkv.iter() {
if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors {
operations.insert(id, DelAddOperation::DeletionAndAddition);
obkv_writer.insert(id, val)?;
@ -915,15 +872,6 @@ impl<'a, 'i> Transform<'a, 'i> {
obkv_writer.insert(id, val)?;
}
}
if !injected_vectors.is_empty() {
'inject_vectors: {
let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors };
operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition);
obkv_writer.insert(vectors_fid, serde_json::to_vec(&injected_vectors).unwrap())?;
}
}
let data = obkv_writer.into_inner()?;
let obkv = KvReader::<FieldId>::new(&data);
@ -989,35 +937,6 @@ impl<'a, 'i> Transform<'a, 'i> {
None
};
let readers: Result<
BTreeMap<&str, (Vec<arroy::Reader<arroy::distances::Angular>>, &RoaringBitmap)>,
> = settings_diff
.embedding_config_updates
.iter()
.filter_map(|(name, action)| {
if let EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
embedder_id,
user_provided,
}) = action
{
let readers: Result<Vec<_>> =
self.index.arroy_readers(wtxn, *embedder_id).collect();
match readers {
Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))),
Err(error) => Some(Err(error)),
}
} else {
None
}
})
.collect();
let readers = readers?;
let old_vectors_fid = settings_diff
.old
.fields_ids_map
.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
// We initialize the sorter with the user indexing settings.
let mut flattened_sorter =
if settings_diff.reindex_searchable() || settings_diff.reindex_facets() {
@ -1044,50 +963,10 @@ impl<'a, 'i> Transform<'a, 'i> {
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?;
let injected_vectors: std::result::Result<
serde_json::Map<String, serde_json::Value>,
arroy::Error,
> = readers
.iter()
.filter_map(|(name, (readers, user_provided))| {
if !user_provided.contains(docid) {
return None;
}
let mut vectors = Vec::new();
for reader in readers {
let Some(vector) = reader.item_vector(wtxn, docid).transpose() else {
break;
};
match vector {
Ok(vector) => vectors.push(vector),
Err(error) => return Some(Err(error)),
}
}
if vectors.is_empty() {
return None;
}
Some(Ok((
name.to_string(),
serde_json::to_value(ExplicitVectors {
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
vectors,
)),
regenerate: false,
})
.unwrap(),
)))
})
.collect();
let injected_vectors = injected_vectors?;
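// A hedged sketch of the value injected under `_vectors.<embedder>` above,
// assuming the serialized `ExplicitVectors` shape with `regenerate: false`
// (vectors written back from arroy are no longer regenerated automatically):
fn example_written_back_entry() -> serde_json::Value {
    serde_json::json!({
        "embeddings": [[0.1, 0.2, 0.3]],
        "regenerate": false
    })
}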
Self::rebind_existing_document(
old_obkv,
&settings_diff,
&modified_faceted_fields,
injected_vectors,
old_vectors_fid,
Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()),
Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()),
)?;
@ -1104,23 +983,6 @@ impl<'a, 'i> Transform<'a, 'i> {
}
}
let mut writers = Vec::new();
// delete all vectors from the embedders that need removal
for (_, (readers, _)) in readers {
for reader in readers {
let dimensions = reader.dimensions();
let arroy_index = reader.index();
drop(reader);
let writer = arroy::Writer::new(self.index.vector_arroy, arroy_index, dimensions);
writers.push(writer);
}
}
for writer in writers {
writer.clear(wtxn)?;
}
let grenad_params = GrenadParameters {
chunk_compression_type: self.indexer_settings.chunk_compression_type,
chunk_compression_level: self.indexer_settings.chunk_compression_level,

View File

@ -20,7 +20,6 @@ use super::MergeFn;
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
use crate::facet::FacetType;
use crate::index::db_name::DOCUMENTS;
use crate::index::IndexEmbeddingConfig;
use crate::proximity::MAX_DISTANCE;
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
use crate::update::facet::FacetsUpdate;
@ -91,8 +90,6 @@ pub(crate) enum TypedChunk {
expected_dimension: usize,
manual_vectors: grenad::Reader<BufReader<File>>,
embedder_name: String,
add_to_user_provided: RoaringBitmap,
remove_from_user_provided: RoaringBitmap,
},
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
}
@ -157,11 +154,8 @@ pub(crate) fn write_typed_chunk_into_index(
let mut docids = index.documents_ids(wtxn)?;
let mut iter = merger.into_stream_merger_iter()?;
let embedders: BTreeSet<_> = index
.embedding_configs(wtxn)?
.into_iter()
.map(|IndexEmbeddingConfig { name, .. }| name)
.collect();
let embedders: BTreeSet<_> =
index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect();
let mut vectors_buffer = Vec::new();
while let Some((key, reader)) = iter.next()? {
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@ -187,7 +181,7 @@ pub(crate) fn write_typed_chunk_into_index(
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
break 'vectors Some(addition);
};
vectors.retain_not_embedded_vectors(&embedders);
vectors.retain_user_provided_vectors(&embedders);
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
if vectors.is_empty() {
// skip writing empty `_vectors` map
@ -625,8 +619,6 @@ pub(crate) fn write_typed_chunk_into_index(
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
let mut add_to_user_provided = RoaringBitmap::new();
let mut remove_from_user_provided = RoaringBitmap::new();
let mut params = None;
for typed_chunk in typed_chunks {
let TypedChunk::VectorPoints {
@ -635,8 +627,6 @@ pub(crate) fn write_typed_chunk_into_index(
embeddings,
expected_dimension,
embedder_name,
add_to_user_provided: aud,
remove_from_user_provided: rud,
} = typed_chunk
else {
unreachable!();
@ -649,23 +639,11 @@ pub(crate) fn write_typed_chunk_into_index(
if let Some(embeddings) = embeddings {
embeddings_builder.push(embeddings.into_cursor()?);
}
add_to_user_provided |= aud;
remove_from_user_provided |= rud;
}
// typed_chunks always contains at least one chunk.
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
let mut embedding_configs = index.embedding_configs(wtxn)?;
let index_embedder_config = embedding_configs
.iter_mut()
.find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name)
.unwrap();
index_embedder_config.user_provided -= remove_from_user_provided;
index_embedder_config.user_provided |= add_to_user_provided;
index.put_embedding_configs(wtxn, embedding_configs)?;
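// A minimal sketch of the bitmap bookkeeping above, assuming roaring's
// operator overloads: removals are applied before additions, so a docid
// present in both sets ends up marked as user-provided.
fn merge_user_provided(
    mut user_provided: roaring::RoaringBitmap,
    remove: &roaring::RoaringBitmap,
    add: &roaring::RoaringBitmap,
) -> roaring::RoaringBitmap {
    user_provided -= remove;
    user_provided |= add;
    user_provided
}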
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
)?;

View File

@ -6,7 +6,6 @@ use std::sync::Arc;
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
use deserr::{DeserializeError, Deserr};
use itertools::{EitherOrBoth, Itertools};
use roaring::RoaringBitmap;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime;
@ -15,18 +14,12 @@ use super::index_documents::{IndexDocumentsConfig, Transform};
use super::IndexerConfig;
use crate::criterion::Criterion;
use crate::error::UserError;
use crate::index::{
IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS,
};
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::vector::settings::{
check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction,
WriteBackToDocuments,
};
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
use crate::{FieldId, FieldsIdsMap, Index, Result};
@ -497,7 +490,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.index.put_all_searchable_fields_from_fields_ids_map(
self.wtxn,
&names,
&fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME),
&fields_ids_map,
)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
@ -927,177 +919,92 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Ok(changed)
}
fn update_embedding_configs(&mut self) -> Result<BTreeMap<String, EmbedderAction>> {
match std::mem::take(&mut self.embedder_settings) {
Setting::Set(configs) => self.update_embedding_configs_set(configs),
Setting::Reset => {
// all vectors should be written back to documents
fn update_embedding_configs(&mut self) -> Result<bool> {
let update = match std::mem::take(&mut self.embedder_settings) {
Setting::Set(configs) => {
let mut changed = false;
let old_configs = self.index.embedding_configs(self.wtxn)?;
let remove_all: Result<BTreeMap<String, EmbedderAction>> = old_configs
let old_configs: BTreeMap<String, Setting<EmbeddingSettings>> =
old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect();
let mut new_configs = BTreeMap::new();
for joined in old_configs
.into_iter()
.map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> {
let embedder_id =
self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
crate::InternalError::DatabaseMissingEntry {
db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
key: None,
},
)?;
Ok((
name,
EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
embedder_id,
user_provided,
}),
))
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
{
match joined {
// updated config
EitherOrBoth::Both((name, mut old), (_, new)) => {
changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new);
if changed {
tracing::debug!(embedder = name, "need reindex");
} else {
tracing::debug!(embedder = name, "skip reindex");
}
let new = validate_embedding_settings(old, &name)?;
new_configs.insert(name, new);
}
// unchanged config
EitherOrBoth::Left((name, setting)) => {
new_configs.insert(name, setting);
}
// new config
EitherOrBoth::Right((name, mut setting)) => {
// apply the default source if none was set, so that it gets validated
crate::vector::settings::EmbeddingSettings::apply_default_source(
&mut setting,
);
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
&mut setting,
);
let setting = validate_embedding_settings(setting, &name)?;
changed = true;
new_configs.insert(name, setting);
}
}
}
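// A small sketch of the merge-join used above (relying on the `itertools`
// items already imported in this file), assuming both maps iterate in
// ascending key order: `Left` is an embedder present only in the old config,
// `Right` only in the new one, and `Both` is present in both.
fn diff_names(
    old: &std::collections::BTreeMap<String, u32>,
    new: &std::collections::BTreeMap<String, u32>,
) -> (Vec<String>, Vec<String>, Vec<String>) {
    let (mut removed, mut added, mut kept) = (Vec::new(), Vec::new(), Vec::new());
    for joined in old.iter().merge_join_by(new.iter(), |(l, _), (r, _)| l.cmp(r)) {
        match joined {
            EitherOrBoth::Left((name, _)) => removed.push(name.clone()),
            EitherOrBoth::Right((name, _)) => added.push(name.clone()),
            EitherOrBoth::Both((name, _), _) => kept.push(name.clone()),
        }
    }
    (removed, added, kept)
}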
let new_configs: Vec<(String, EmbeddingConfig)> = new_configs
.into_iter()
.filter_map(|(name, setting)| match setting {
Setting::Set(value) => Some((name, value.into())),
Setting::Reset => None,
Setting::NotSet => Some((name, EmbeddingSettings::default().into())),
})
.collect();
let remove_all = remove_all?;
self.index.embedder_category_id.clear(self.wtxn)?;
for (index, (embedder_name, _)) in new_configs.iter().enumerate() {
self.index.embedder_category_id.put_with_flags(
self.wtxn,
heed::PutFlags::APPEND,
embedder_name,
&index
.try_into()
.map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?,
)?;
}
if new_configs.is_empty() {
self.index.delete_embedding_configs(self.wtxn)?;
} else {
self.index.put_embedding_configs(self.wtxn, new_configs)?;
}
changed
}
Setting::Reset => {
self.index.delete_embedding_configs(self.wtxn)?;
Ok(remove_all)
true
}
Setting::NotSet => Ok(Default::default()),
}
}
Setting::NotSet => false,
};
fn update_embedding_configs_set(
&mut self,
configs: BTreeMap<String, Setting<EmbeddingSettings>>,
) -> Result<BTreeMap<String, EmbedderAction>> {
use crate::vector::settings::SettingsDiff;
// if any change forces a reindexing,
// clear the vector database.
if update {
self.index.vector_arroy.clear(self.wtxn)?;
}
let old_configs = self.index.embedding_configs(self.wtxn)?;
let old_configs: BTreeMap<String, (EmbeddingSettings, RoaringBitmap)> = old_configs
.into_iter()
.map(|IndexEmbeddingConfig { name, config, user_provided }| {
(name, (config.into(), user_provided))
})
.collect();
let mut updated_configs = BTreeMap::new();
let mut embedder_actions = BTreeMap::new();
for joined in old_configs
.into_iter()
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
{
match joined {
// updated config
EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => {
let settings_diff = SettingsDiff::from_settings(old, new);
match settings_diff {
SettingsDiff::Remove => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
"removing embedder"
);
let embedder_id =
self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
crate::InternalError::DatabaseMissingEntry {
db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
key: None,
},
)?;
// free id immediately
self.index.embedder_category_id.delete(self.wtxn, &name)?;
embedder_actions.insert(
name,
EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
embedder_id,
user_provided,
}),
);
}
SettingsDiff::Reindex { action, updated_settings } => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
?action,
"reindex embedder"
);
embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action));
let new =
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
updated_configs.insert(name, (new, user_provided));
}
SettingsDiff::UpdateWithoutReindex { updated_settings } => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
"update without reindex embedder"
);
let new =
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
updated_configs.insert(name, (new, user_provided));
}
}
}
// unchanged config
EitherOrBoth::Left((name, (setting, user_provided))) => {
tracing::debug!(embedder = name, "unchanged embedder");
updated_configs.insert(name, (Setting::Set(setting), user_provided));
}
// new config
EitherOrBoth::Right((name, mut setting)) => {
tracing::debug!(embedder = name, "new embedder");
// apply the default source if none was set, so that it gets validated
crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting);
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
&mut setting,
);
let setting = validate_embedding_settings(setting, &name)?;
embedder_actions
.insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex));
updated_configs.insert(name, (setting, RoaringBitmap::new()));
}
}
}
let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize];
for res in self.index.embedder_category_id.iter(self.wtxn)? {
let (_name, id) = res?;
free_indices[id as usize] = false;
}
let mut free_indices = free_indices.iter_mut().enumerate();
let mut find_free_index =
move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8);
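// A minimal sketch of the free-slot scan above: mark every id already in use,
// then hand out the first still-free slot. Ids are u8, so at most
// `u8::MAX as usize` embedders can coexist (an assumption mirrored from the
// array size above).
fn first_free_id(taken: &[u8]) -> Option<u8> {
    let mut free = [true; u8::MAX as usize];
    for &id in taken {
        free[id as usize] = false; // ids are < u8::MAX by construction
    }
    free.iter().position(|&is_free| is_free).map(|i| i as u8)
}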
for (name, action) in embedder_actions.iter() {
match action {
EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => {
/* cannot be a new embedder, so has to have an id already */
}
EmbedderAction::Reindex(ReindexAction::FullReindex) => {
if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() {
let id = find_free_index()
.ok_or(UserError::TooManyEmbedders(updated_configs.len()))?;
tracing::debug!(embedder = name, id, "assigning free id to new embedder");
self.index.embedder_category_id.put(self.wtxn, name, &id)?;
}
}
EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ }
}
}
let updated_configs: Vec<IndexEmbeddingConfig> = updated_configs
.into_iter()
.filter_map(|(name, (config, user_provided))| match config {
Setting::Set(config) => {
Some(IndexEmbeddingConfig { name, config: config.into(), user_provided })
}
Setting::Reset => None,
Setting::NotSet => Some(IndexEmbeddingConfig {
name,
config: EmbeddingSettings::default().into(),
user_provided,
}),
})
.collect();
if updated_configs.is_empty() {
self.index.delete_embedding_configs(self.wtxn)?;
} else {
self.index.put_embedding_configs(self.wtxn, updated_configs)?;
}
Ok(embedder_actions)
Ok(update)
}
fn update_search_cutoff(&mut self) -> Result<bool> {
@ -1151,8 +1058,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.update_searchable()?;
self.update_exact_attributes()?;
self.update_proximity_precision()?;
let embedding_config_updates = self.update_embedding_configs()?;
// TODO: very rough approximation of the reindexing needs: any change currently results in
// a full reindexing.
// What could be done instead:
// 1. Only change the distance on a distance change
// 2. Only change the name -> embedder mapping on a name change
// 3. Keep the old vectors but reattempt indexing on a prompt change: only the documents whose prompt actually changed need embedding + storage
let embedding_configs_updated = self.update_embedding_configs()?;
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
new_inner_settings.recompute_facets(self.wtxn, self.index)?;
@ -1166,7 +1078,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
old_inner_settings,
new_inner_settings,
primary_key_id,
embedding_config_updates,
embedding_configs_updated,
settings_update_only,
);
@ -1182,7 +1094,8 @@ pub struct InnerIndexSettingsDiff {
pub(crate) old: InnerIndexSettings,
pub(crate) new: InnerIndexSettings,
pub(crate) primary_key_id: Option<FieldId>,
pub(crate) embedding_config_updates: BTreeMap<String, EmbedderAction>,
// TODO: compare directly the embedders.
pub(crate) embedding_configs_updated: bool,
pub(crate) settings_update_only: bool,
/// The set of only the additional searchable fields.
/// If any other searchable field has been modified, is set to None.
@ -1203,7 +1116,7 @@ impl InnerIndexSettingsDiff {
old_settings: InnerIndexSettings,
new_settings: InnerIndexSettings,
primary_key_id: Option<FieldId>,
embedding_config_updates: BTreeMap<String, EmbedderAction>,
embedding_configs_updated: bool,
settings_update_only: bool,
) -> Self {
let only_additional_fields = match (
@ -1240,7 +1153,7 @@ impl InnerIndexSettingsDiff {
old: old_settings,
new: new_settings,
primary_key_id,
embedding_config_updates,
embedding_configs_updated,
settings_update_only,
only_additional_fields,
cache_reindex_searchable_without_user_defined,
@ -1249,6 +1162,18 @@ impl InnerIndexSettingsDiff {
}
}
pub fn searchable_fields_to_index(&self) -> BTreeSet<FieldId> {
if self.settings_update_only {
self.new
.fields_ids_map
.ids()
.filter(|id| self.reindex_searchable_id(*id).is_some())
.collect()
} else {
self.new.searchable_fields_ids.iter().copied().collect()
}
}
pub fn any_reindexing_needed(&self) -> bool {
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
}
@ -1307,7 +1232,7 @@ impl InnerIndexSettingsDiff {
}
pub fn reindex_vectors(&self) -> bool {
!self.embedding_config_updates.is_empty()
self.embedding_configs_updated
}
pub fn settings_update_only(&self) -> bool {
@ -1339,8 +1264,6 @@ pub(crate) struct InnerIndexSettings {
pub embedding_configs: EmbeddingConfigs,
pub existing_fields: HashSet<String>,
pub geo_fields_ids: Option<(FieldId, FieldId)>,
pub non_searchable_fields_ids: Vec<FieldId>,
pub non_faceted_fields_ids: Vec<FieldId>,
}
impl InnerIndexSettings {
@ -1354,8 +1277,8 @@ impl InnerIndexSettings {
let user_defined_searchable_fields =
user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect());
let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?;
let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
let searchable_fields_ids = index.searchable_fields_ids(rtxn)?;
let faceted_fields_ids = index.faceted_fields_ids(rtxn)?;
let exact_attributes = index.exact_attributes_ids(rtxn)?;
let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default();
let embedding_configs = embedders(index.embedding_configs(rtxn)?)?;
@ -1383,10 +1306,6 @@ impl InnerIndexSettings {
None => None,
};
let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME);
searchable_fields_ids.retain(|id| !vectors_fids.contains(id));
faceted_fields_ids.retain(|id| !vectors_fids.contains(id));
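// A hedged note on the two `retain` calls above (assumption: `nested_ids`
// returns the fid of `_vectors` itself plus every nested `_vectors.<embedder>`
// fid), so the reserved vectors fields are excluded from both search and
// faceting even though they remain in the fields-ids map.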
Ok(Self {
stop_words,
allowed_separators,
@ -1401,8 +1320,6 @@ impl InnerIndexSettings {
embedding_configs,
existing_fields,
geo_fields_ids,
non_searchable_fields_ids: vectors_fids.clone(),
non_faceted_fields_ids: vectors_fids.clone(),
})
}
@ -1410,10 +1327,9 @@ impl InnerIndexSettings {
pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> {
let new_facets = self
.fields_ids_map
.iter()
.filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid))
.filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields))
.map(|(_fid, field)| field.to_string())
.names()
.filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields))
.map(|field| field.to_string())
.collect();
index.put_faceted_fields(wtxn, &new_facets)?;
@ -1433,7 +1349,6 @@ impl InnerIndexSettings {
index.put_all_searchable_fields_from_fields_ids_map(
wtxn,
&searchable_fields,
&self.non_searchable_fields_ids,
&self.fields_ids_map,
)?;
}
@ -1444,25 +1359,19 @@ impl InnerIndexSettings {
}
}
fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<EmbeddingConfigs> {
fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> {
let res: Result<_> = embedding_configs
.into_iter()
.map(
|IndexEmbeddingConfig {
name,
config: EmbeddingConfig { embedder_options, prompt },
..
}| {
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
.map(|(name, EmbeddingConfig { embedder_options, prompt })| {
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
let embedder = Arc::new(
Embedder::new(embedder_options.clone())
.map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?,
);
Ok((name, (embedder, prompt)))
},
)
let embedder = Arc::new(
Embedder::new(embedder_options.clone())
.map_err(crate::vector::Error::from)
.map_err(crate::Error::from)?,
);
Ok((name, (embedder, prompt)))
})
.collect();
res.map(EmbeddingConfigs::new)
}

View File

@ -152,10 +152,6 @@ impl EmbeddingConfigs {
&self.0
}
pub fn into_inner(self) -> HashMap<String, (Arc<Embedder>, Arc<Prompt>)> {
self.0
}
/// Get the name of the default embedder configuration.
///
/// The default embedder is determined as follows:

View File

@ -1,119 +1,51 @@
use std::collections::{BTreeMap, BTreeSet};
use deserr::{take_cf_content, DeserializeError, Deserr, Sequence};
use obkv::KvReader;
use serde_json::{from_slice, Value};
use super::Embedding;
use crate::index::IndexEmbeddingConfig;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::{DocumentId, FieldId, InternalError, UserError};
use crate::{FieldId, InternalError, UserError};
pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";
#[derive(serde::Serialize, Debug)]
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(untagged)]
pub enum Vectors {
ImplicitlyUserProvided(VectorOrArrayOfVectors),
Explicit(ExplicitVectors),
}
impl<E: DeserializeError> Deserr<E> for Vectors {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: deserr::ValuePointerRef,
) -> Result<Self, E> {
match value {
deserr::Value::Sequence(_) | deserr::Value::Null => {
Ok(Vectors::ImplicitlyUserProvided(VectorOrArrayOfVectors::deserialize_from_value(
value, location,
)?))
}
deserr::Value::Map(_) => {
Ok(Vectors::Explicit(ExplicitVectors::deserialize_from_value(value, location)?))
}
value => Err(take_cf_content(E::error(
None,
deserr::ErrorKind::IncorrectValueKind {
actual: value,
accepted: &[
deserr::ValueKind::Sequence,
deserr::ValueKind::Map,
deserr::ValueKind::Null,
],
},
location,
))),
}
}
}
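// A hedged summary of the dispatch above:
//   [0.1, ..] or null                       -> Vectors::ImplicitlyUserProvided (user vectors, never regenerated)
//   { "embeddings": .., "regenerate": .. }  -> Vectors::Explicit
// Anything else is rejected with an IncorrectValueKind error.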
impl Vectors {
pub fn must_regenerate(&self) -> bool {
pub fn into_array_of_vectors(self) -> Vec<Embedding> {
match self {
Vectors::ImplicitlyUserProvided(_) => false,
Vectors::Explicit(ExplicitVectors { regenerate, .. }) => *regenerate,
}
}
pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
match self {
Vectors::ImplicitlyUserProvided(embeddings) => {
Some(embeddings.into_array_of_vectors().unwrap_or_default())
}
Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => {
embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default())
Vectors::ImplicitlyUserProvided(embeddings)
| Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => {
embeddings.into_array_of_vectors().unwrap_or_default()
}
}
}
}
#[derive(serde::Serialize, Deserr, Debug)]
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct ExplicitVectors {
#[serde(default)]
#[deserr(default)]
pub embeddings: Option<VectorOrArrayOfVectors>,
pub regenerate: bool,
}
pub enum VectorState {
Inline(Vectors),
Manual,
Generated,
}
impl VectorState {
pub fn must_regenerate(&self) -> bool {
match self {
VectorState::Inline(vectors) => vectors.must_regenerate(),
VectorState::Manual => false,
VectorState::Generated => true,
}
}
}
pub enum VectorsState {
NoVectorsFid,
NoVectorsFieldInDocument,
Vectors(BTreeMap<String, Vectors>),
pub embeddings: VectorOrArrayOfVectors,
pub user_provided: bool,
}
pub struct ParsedVectorsDiff {
old: BTreeMap<String, VectorState>,
new: VectorsState,
pub old: Option<BTreeMap<String, Vectors>>,
pub new: Option<BTreeMap<String, Vectors>>,
}
impl ParsedVectorsDiff {
pub fn new(
docid: DocumentId,
embedders_configs: &[IndexEmbeddingConfig],
documents_diff: KvReader<'_, FieldId>,
old_vectors_fid: Option<FieldId>,
new_vectors_fid: Option<FieldId>,
) -> Result<Self, Error> {
let mut old = match old_vectors_fid
let old = match old_vectors_fid
.and_then(|vectors_fid| documents_diff.get(vectors_fid))
.map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion))
@ -129,84 +61,48 @@ impl ParsedVectorsDiff {
return Err(error);
}
}
.flatten()
.map_or(BTreeMap::default(), |del| {
    del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()
});
for embedding_config in embedders_configs {
if embedding_config.user_provided.contains(docid) {
old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual);
}
}
let new = 'new: {
let Some(new_vectors_fid) = new_vectors_fid else {
break 'new VectorsState::NoVectorsFid;
};
let Some(bytes) = documents_diff.get(new_vectors_fid) else {
break 'new VectorsState::NoVectorsFieldInDocument;
};
let obkv = KvReaderDelAdd::new(bytes);
match to_vector_map(obkv, DelAdd::Addition)? {
Some(new) => VectorsState::Vectors(new),
None => VectorsState::NoVectorsFieldInDocument,
}
};
.flatten();
let new = new_vectors_fid
.and_then(|vectors_fid| documents_diff.get(vectors_fid))
.map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Addition))
.transpose()?
.flatten();
Ok(Self { old, new })
}
pub fn remove(&mut self, embedder_name: &str) -> (VectorState, VectorState) {
let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated);
let state_from_old = match old {
// assume a userProvided vector is still userProvided
VectorState::Manual => VectorState::Manual,
// generated is still generated
VectorState::Generated => VectorState::Generated,
// weird case that shouldn't happen, where the previous version of the document had inline vectors
// but they were removed in the new version.
// Since they are not in the new version, we switch to generated
VectorState::Inline(_) => VectorState::Generated,
};
let new = match &mut self.new {
VectorsState::Vectors(new) => {
new.remove(embedder_name).map(VectorState::Inline).unwrap_or(state_from_old)
}
_ =>
// if no `_vectors` field is present in the new document,
// the state depends on the previous version of the document
{
state_from_old
}
};
pub fn remove(&mut self, embedder_name: &str) -> (Option<Vectors>, Option<Vectors>) {
let old = self.old.as_mut().and_then(|old| old.remove(embedder_name));
let new = self.new.as_mut().and_then(|new| new.remove(embedder_name));
(old, new)
}
}
pub struct ParsedVectors(pub BTreeMap<String, Vectors>);
impl<E: DeserializeError> Deserr<E> for ParsedVectors {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: deserr::ValuePointerRef,
) -> Result<Self, E> {
let value = <BTreeMap<String, Vectors>>::deserialize_from_value(value, location)?;
Ok(ParsedVectors(value))
}
}
impl ParsedVectors {
pub fn from_bytes(value: &[u8]) -> Result<Self, Error> {
let value: serde_json::Value = from_slice(value).map_err(Error::InternalSerdeJson)?;
deserr::deserialize(value).map_err(|error| Error::InvalidEmbedderConf { error })
let Ok(value) = from_slice(value) else {
let value = from_slice(value).map_err(Error::InternalSerdeJson)?;
return Err(Error::InvalidMap(value));
};
Ok(ParsedVectors(value))
}
pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet<String>) {
self.0.retain(|k, _v| !embedders.contains(k))
pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet<String>) {
self.0.retain(|k, v| match v {
Vectors::ImplicitlyUserProvided(_) => true,
Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => {
*user_provided
// if the embedder is not in the config, then never touch it
|| !embedders.contains(k)
}
});
}
}
pub enum Error {
InvalidMap(Value),
InvalidEmbedderConf { error: deserr::errors::JsonError },
InternalSerdeJson(serde_json::Error),
}
@ -216,12 +112,6 @@ impl Error {
Error::InvalidMap(value) => {
crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value })
}
Error::InvalidEmbedderConf { error } => {
crate::Error::UserError(UserError::InvalidVectorsEmbedderConf {
document_id,
error,
})
}
Error::InternalSerdeJson(error) => {
crate::Error::InternalError(InternalError::SerdeJson(error))
}
@ -242,84 +132,13 @@ fn to_vector_map(
}
/// Represents either a vector or an array of multiple vectors.
#[derive(serde::Serialize, Debug)]
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(transparent)]
pub struct VectorOrArrayOfVectors {
#[serde(with = "either::serde_untagged_optional")]
inner: Option<either::Either<Vec<Embedding>, Embedding>>,
}
impl<E: DeserializeError> Deserr<E> for VectorOrArrayOfVectors {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: deserr::ValuePointerRef,
) -> Result<Self, E> {
match value {
deserr::Value::Null => Ok(VectorOrArrayOfVectors { inner: None }),
deserr::Value::Sequence(seq) => {
let mut iter = seq.into_iter();
match iter.next().map(|v| v.into_value()) {
None => {
// Given the strange way serde serializes the `Either`, we must return the left part,
// otherwise it'll consider we returned [[]]
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(Vec::new())) })
}
Some(val @ deserr::Value::Sequence(_)) => {
let first = Embedding::deserialize_from_value(val, location.push_index(0))?;
let mut collect = vec![first];
let mut tail = iter
.enumerate()
.map(|(i, v)| {
Embedding::deserialize_from_value(
v.into_value(),
location.push_index(i + 1),
)
})
.collect::<Result<Vec<_>, _>>()?;
collect.append(&mut tail);
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(collect)) })
}
Some(
val @ deserr::Value::Integer(_)
| val @ deserr::Value::NegativeInteger(_)
| val @ deserr::Value::Float(_),
) => {
let first = <f32>::deserialize_from_value(val, location.push_index(0))?;
let mut embedding = iter
.enumerate()
.map(|(i, v)| {
<f32>::deserialize_from_value(
v.into_value(),
location.push_index(i + 1),
)
})
.collect::<Result<Vec<_>, _>>()?;
embedding.insert(0, first);
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Right(embedding)) })
}
Some(value) => Err(take_cf_content(E::error(
None,
deserr::ErrorKind::IncorrectValueKind {
actual: value,
accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Float],
},
location.push_index(0),
))),
}
}
value => Err(take_cf_content(E::error(
None,
deserr::ErrorKind::IncorrectValueKind {
actual: value,
accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Null],
},
location,
))),
}
}
}
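// A hedged summary of the shapes accepted by the deserializer above:
//   null                     -> inner: None
//   []                       -> Left(vec![])  (zero embeddings, not one empty embedding)
//   [0.1, 0.2]               -> Right(a single embedding)
//   [[0.1, 0.2], [0.3, 0.4]] -> Left(two embeddings)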
impl VectorOrArrayOfVectors {
pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
match self.inner? {
@ -331,41 +150,21 @@ impl VectorOrArrayOfVectors {
pub fn from_array_of_vectors(array_of_vec: Vec<Embedding>) -> Self {
Self { inner: Some(either::Either::Left(array_of_vec)) }
}
pub fn from_vector(vec: Embedding) -> Self {
Self { inner: Some(either::Either::Right(vec)) }
}
}
impl From<Embedding> for VectorOrArrayOfVectors {
fn from(vec: Embedding) -> Self {
Self::from_vector(vec)
}
}
impl From<Vec<Embedding>> for VectorOrArrayOfVectors {
fn from(vec: Vec<Embedding>) -> Self {
Self::from_array_of_vectors(vec)
}
}
#[cfg(test)]
mod test {
use super::VectorOrArrayOfVectors;
fn embedding_from_str(s: &str) -> Result<VectorOrArrayOfVectors, deserr::errors::JsonError> {
let value: serde_json::Value = serde_json::from_str(s).unwrap();
deserr::deserialize(value)
}
#[test]
fn array_of_vectors() {
let null = embedding_from_str("null").unwrap();
let empty = embedding_from_str("[]").unwrap();
let one = embedding_from_str("[0.1]").unwrap();
let two = embedding_from_str("[0.1, 0.2]").unwrap();
let one_vec = embedding_from_str("[[0.1, 0.2]]").unwrap();
let two_vecs = embedding_from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap();
let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap();
let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap();
let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap();
let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap();
let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap();
let two_vecs: VectorOrArrayOfVectors =
serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap();
insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null");
insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]");

View File

@ -1,5 +1,4 @@
use deserr::Deserr;
use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize};
use super::rest::InputType;
@ -73,238 +72,6 @@ pub fn check_unset<T>(
}
}
/// Indicates what action should take place during a reindexing operation for an embedder
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ReindexAction {
/// An indexing operation should take place for this embedder, keeping existing vectors
/// and checking whether the document template changed or not
RegeneratePrompts,
/// An indexing operation should take place for all documents for this embedder, removing existing vectors
/// (except userProvided ones)
FullReindex,
}
pub enum SettingsDiff {
Remove,
Reindex { action: ReindexAction, updated_settings: EmbeddingSettings },
UpdateWithoutReindex { updated_settings: EmbeddingSettings },
}
pub enum EmbedderAction {
WriteBackToDocuments(WriteBackToDocuments),
Reindex(ReindexAction),
}
pub struct WriteBackToDocuments {
pub embedder_id: u8,
pub user_provided: RoaringBitmap,
}
impl SettingsDiff {
pub fn from_settings(old: EmbeddingSettings, new: Setting<EmbeddingSettings>) -> Self {
match new {
Setting::Set(new) => {
let EmbeddingSettings {
mut source,
mut model,
mut revision,
mut api_key,
mut dimensions,
mut document_template,
mut url,
mut query,
mut input_field,
mut path_to_embeddings,
mut embedding_object,
mut input_type,
mut distribution,
} = old;
let EmbeddingSettings {
source: new_source,
model: new_model,
revision: new_revision,
api_key: new_api_key,
dimensions: new_dimensions,
document_template: new_document_template,
url: new_url,
query: new_query,
input_field: new_input_field,
path_to_embeddings: new_path_to_embeddings,
embedding_object: new_embedding_object,
input_type: new_input_type,
distribution: new_distribution,
} = new;
let mut reindex_action = None;
// **Warning**: do not use short-circuiting || here, we want all these operations applied
if source.apply(new_source) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
// when the source changes, we need to reapply the default settings for the new source
apply_default_for_source(
&source,
&mut model,
&mut revision,
&mut dimensions,
&mut url,
&mut query,
&mut input_field,
&mut path_to_embeddings,
&mut embedding_object,
&mut input_type,
&mut document_template,
)
}
if model.apply(new_model) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if revision.apply(new_revision) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if dimensions.apply(new_dimensions) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if url.apply(new_url) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if query.apply(new_query) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if input_field.apply(new_input_field) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if path_to_embeddings.apply(new_path_to_embeddings) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if embedding_object.apply(new_embedding_object) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if input_type.apply(new_input_type) {
ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex);
}
if document_template.apply(new_document_template) {
ReindexAction::push_action(
&mut reindex_action,
ReindexAction::RegeneratePrompts,
);
}
distribution.apply(new_distribution);
api_key.apply(new_api_key);
let updated_settings = EmbeddingSettings {
source,
model,
revision,
api_key,
dimensions,
document_template,
url,
query,
input_field,
path_to_embeddings,
embedding_object,
input_type,
distribution,
};
match reindex_action {
Some(action) => Self::Reindex { action, updated_settings },
None => Self::UpdateWithoutReindex { updated_settings },
}
}
Setting::Reset => Self::Remove,
Setting::NotSet => Self::UpdateWithoutReindex { updated_settings: old },
}
}
}
impl ReindexAction {
fn push_action(this: &mut Option<Self>, other: Self) {
*this = match (*this, other) {
(_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex),
(Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex),
(_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts),
}
}
}
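// A usage sketch (assuming module-local visibility of `push_action`):
// `FullReindex` always wins over `RegeneratePrompts`, and any action wins
// over `None`.
fn push_action_example() {
    let mut action = None;
    ReindexAction::push_action(&mut action, ReindexAction::RegeneratePrompts);
    ReindexAction::push_action(&mut action, ReindexAction::FullReindex);
    assert_eq!(action, Some(ReindexAction::FullReindex));
}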
#[allow(clippy::too_many_arguments)] // private function
fn apply_default_for_source(
source: &Setting<EmbedderSource>,
model: &mut Setting<String>,
revision: &mut Setting<String>,
dimensions: &mut Setting<usize>,
url: &mut Setting<String>,
query: &mut Setting<serde_json::Value>,
input_field: &mut Setting<Vec<String>>,
path_to_embeddings: &mut Setting<Vec<String>>,
embedding_object: &mut Setting<Vec<String>>,
input_type: &mut Setting<InputType>,
document_template: &mut Setting<String>,
) {
match source {
Setting::Set(EmbedderSource::HuggingFace) => {
*model = Setting::Reset;
*revision = Setting::Reset;
*dimensions = Setting::NotSet;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
}
Setting::Set(EmbedderSource::Ollama) => {
*model = Setting::Reset;
*revision = Setting::NotSet;
*dimensions = Setting::Reset;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
}
Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => {
*model = Setting::Reset;
*revision = Setting::NotSet;
*dimensions = Setting::NotSet;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
}
Setting::Set(EmbedderSource::Rest) => {
*model = Setting::NotSet;
*revision = Setting::NotSet;
*dimensions = Setting::Reset;
*url = Setting::Reset;
*query = Setting::Reset;
*input_field = Setting::Reset;
*path_to_embeddings = Setting::Reset;
*embedding_object = Setting::Reset;
*input_type = Setting::Reset;
}
Setting::Set(EmbedderSource::UserProvided) => {
*model = Setting::NotSet;
*revision = Setting::NotSet;
*dimensions = Setting::Reset;
*url = Setting::NotSet;
*query = Setting::NotSet;
*input_field = Setting::NotSet;
*path_to_embeddings = Setting::NotSet;
*embedding_object = Setting::NotSet;
*input_type = Setting::NotSet;
*document_template = Setting::NotSet;
}
Setting::NotSet => {}
}
}
pub fn check_set<T>(
key: &Setting<T>,
field: &'static str,
@ -443,6 +210,66 @@ impl EmbeddingSettings {
*model = Setting::Set(openai::EmbeddingModel::default().name().to_owned())
}
}
pub(crate) fn apply_and_need_reindex(
old: &mut Setting<EmbeddingSettings>,
new: Setting<EmbeddingSettings>,
) -> bool {
match (old, new) {
(
Setting::Set(EmbeddingSettings {
source: old_source,
model: old_model,
revision: old_revision,
api_key: old_api_key,
dimensions: old_dimensions,
document_template: old_document_template,
url: old_url,
query: old_query,
input_field: old_input_field,
path_to_embeddings: old_path_to_embeddings,
embedding_object: old_embedding_object,
input_type: old_input_type,
distribution: old_distribution,
}),
Setting::Set(EmbeddingSettings {
source: new_source,
model: new_model,
revision: new_revision,
api_key: new_api_key,
dimensions: new_dimensions,
document_template: new_document_template,
url: new_url,
query: new_query,
input_field: new_input_field,
path_to_embeddings: new_path_to_embeddings,
embedding_object: new_embedding_object,
input_type: new_input_type,
distribution: new_distribution,
}),
) => {
let mut needs_reindex = false;
needs_reindex |= old_source.apply(new_source);
needs_reindex |= old_model.apply(new_model);
needs_reindex |= old_revision.apply(new_revision);
needs_reindex |= old_dimensions.apply(new_dimensions);
needs_reindex |= old_document_template.apply(new_document_template);
needs_reindex |= old_url.apply(new_url);
needs_reindex |= old_query.apply(new_query);
needs_reindex |= old_input_field.apply(new_input_field);
needs_reindex |= old_path_to_embeddings.apply(new_path_to_embeddings);
needs_reindex |= old_embedding_object.apply(new_embedding_object);
needs_reindex |= old_input_type.apply(new_input_type);
old_distribution.apply(new_distribution);
old_api_key.apply(new_api_key);
needs_reindex
}
(Setting::Reset, Setting::Reset) | (_, Setting::NotSet) => false,
_ => true,
}
}
}
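// A minimal sketch of the `apply` contract assumed above, with `Option`
// standing in for `Setting`: apply the incoming value and report whether
// anything changed, so `|=` can accumulate "needs reindex" across every
// vector-relevant field without short-circuiting.
fn apply_and_report<T: PartialEq>(old: &mut Option<T>, new: Option<T>) -> bool {
    match new {
        Some(value) if old.as_ref() != Some(&value) => {
            *old = Some(value);
            true
        }
        _ => false,
    }
}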
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]

View File

@ -21,7 +21,7 @@ reqwest = { version = "0.11.23", features = [
"stream",
"json",
"rustls-tls",
], default-features = false }
], default_features = false }
serde = { version = "1.0.195", features = ["derive"] }
serde_json = "1.0.111"
sha2 = "0.10.8"